# LSAP Data Preprocessing
---
This notebook contains code for combining the pretraining datasets and creating training, testing, and validation splits for them.

In [5]:
from dataclasses import dataclass
import os, json
import pandas as pd

def green(text):
    """Wrap ``text`` in ANSI escape codes for bright-green, bold terminal output."""
    return "\033[92m\033[1m{}\033[0m".format(text)

def highlight(text):
    """Wrap ``text`` in the ANSI escape sequence ``6;30;42`` and reset formatting afterwards."""
    return '\x1b[6;30;42m{}\x1b[0m'.format(text)


# Path settings: DATA_PATH is derived from the root path CURR_PATH
CURR_PATH = ""
DATA_PATH = os.path.join(CURR_PATH, "dataset")

# Output locations for the generated CSV and JSON files
csv_cache = DATA_PATH + "/csv"
json_cache = DATA_PATH + "/json"

def create_dir( path ):
    """Create the directory ``path`` (and any missing parents); a directory that already exists is left as is."""
    os.makedirs( name=path, exist_ok=True )

# Make sure both output directories exist before anything is written to them
for _cache_dir in (csv_cache, json_cache):
    create_dir(_cache_dir)

# JSON path settings
JSON_PATH = json_cache
COMBINED_JSON_PATH = JSON_PATH + "/combined"

# Names of all dataset folders to process
data_folders = ["ATIS", "SNIPS", "TOPS_Reminder", "TOPS_Weather"]

### Data Handler
---
Below, we use a class to easily handle writing/reading/preprocessing data.

In [6]:
@dataclass
class DFHandler:
    """Converts the raw CSV files of one dataset folder into cached CSV and JSON files.

    Attributes:
        folder_name: Name of the dataset folder. Input CSVs are read from
            ``<CURR_PATH>/<folder_name>/data``; outputs are written below
            ``<csv_cache>/<folder_name>`` and ``<json_cache>/<folder_name>``.
        data: DataFrame accumulated through :meth:`add_data` (starts empty).
    """
    folder_name: str

    def __post_init__(self):
        # Assigned here rather than declared as a dataclass field, so it is not
        # part of the generated __init__/__eq__/__repr__.
        self.data = pd.DataFrame()

    def add_data(self, data):
        """Append the rows of ``data`` to ``self.data``."""
        self.data = pd.concat([self.data, data])

    def get_data(self):
        """Read every CSV file in the dataset's ``data`` directory.

        Returns:
            dict mapping each file name (extension included) to the DataFrame
            loaded from it, in sorted file-name order.
        """
        data_dir = os.path.join(CURR_PATH, self.folder_name, 'data')
        # os.listdir returns entries in arbitrary order and includes anything
        # that happens to be in the directory (hidden files, checkpoint
        # folders, ...); keep only CSV files and sort them for a stable order.
        csv_files = sorted(
            name for name in os.listdir(data_dir)
            if name.lower().endswith('.csv')
        )
        return {name: pd.read_csv(os.path.join(data_dir, name)) for name in csv_files}

    def write_to_json( self, df, output_file ):
        """Write ``df`` to ``output_file`` as one JSON object per line.

        Each line has the form
        ``{"translation": {"src": <text>, "tgt": <intent>, "prefix": "intent classification: "}}``.

        Args:
            df: DataFrame with a ``text`` and an ``intent`` column.
            output_file: Path of the file to create or overwrite.

        Raises:
            KeyError: If ``df`` lacks the ``text`` or ``intent`` column. The
                columns are looked up before the file is opened, so no empty
                output file is left behind in that case.
        """
        utterances = df["text"]
        intents    = df["intent"]

        with open( output_file, 'w', encoding='utf-8' ) as out_data:
            # Iterate the two columns directly instead of building a Series
            # per row with iterrows().
            for utterance, intent in zip( utterances, intents ):
                json_obj = json.dumps({"translation":
                    {"src": utterance, "tgt": intent, "prefix": "intent classification: "}
                })
                out_data.write(json_obj + '\n')

    def create_datasets( self ):
        """Load every input CSV of this dataset and save it as CSV and JSON.

        For an input file ``<name>.csv`` the outputs are
        ``<csv_cache>/<folder_name>/<folder_name>_<name>.csv`` and
        ``<json_cache>/<folder_name>/<folder_name>_<name>.json``.
        """
        # Load first so that a missing input folder fails before any output
        # directory is created.
        all_data = self.get_data()

        csv_dir  = f"{csv_cache}/{self.folder_name}"
        json_dir = f"{json_cache}/{self.folder_name}"

        # The target directories do not depend on the file being processed, so
        # create them once; exist_ok avoids a separate existence check.
        os.makedirs( csv_dir, exist_ok=True )
        os.makedirs( json_dir, exist_ok=True )

        for file, df in all_data.items():
            # Naming scheme "<folder>_<file name without extension>": strip
            # only the trailing extension, not every ".csv" in the name.
            stem = os.path.splitext( file )[0]
            folder_file = f"{self.folder_name}_{stem}"

            # Save to csv and json. to_csv keeps its default index=True, so
            # the row index is written as the first column.
            df.to_csv( f'{csv_dir}/{folder_file}.csv' )
            self.write_to_json( df, f'{json_dir}/{folder_file}.json' )

### Write to JSON
---

Below, we write each dataset to its respective CSV and JSON files.

In [7]:
# Convert every dataset folder and report where its outputs were written
for folder in data_folders:
    print(green('Creating dataset for {}'.format(folder)))

    dh = DFHandler( folder )
    dh.create_datasets()

    print(
        f"Dataset created for {folder}.",
        f"CSV Location: {csv_cache}/{folder}",
        f"JSON Location: {json_cache}/{folder}\n",
        sep="\n",
    )

[92m[1mCreating dataset for ATIS[0m
Dataset created for ATIS.
CSV Location: dataset/csv/ATIS
JSON Location: dataset/json/ATIS

[92m[1mCreating dataset for SNIPS[0m
Dataset created for SNIPS.
CSV Location: dataset/csv/SNIPS
JSON Location: dataset/json/SNIPS

[92m[1mCreating dataset for TOPS_Reminder[0m
Dataset created for TOPS_Reminder.
CSV Location: dataset/csv/TOPS_Reminder
JSON Location: dataset/json/TOPS_Reminder

[92m[1mCreating dataset for TOPS_Weather[0m
Dataset created for TOPS_Weather.
CSV Location: dataset/csv/TOPS_Weather
JSON Location: dataset/json/TOPS_Weather

