# LSAP Data Preprocessing
---
This notebook combines the pretraining datasets and creates training, validation, and test splits for them.

In [5]:
import os
import pandas as pd

# Folders that hold the source datasets (one dataset per folder).
data_folders = ['ATIS', 'SNIPS']

# Whether the data should be split into train, val, and test sets.
SPLIT = False

# Fraction of the data assigned to the train, val, and test splits, in that order.
TRAIN_SPLIT, VAL_SPLIT, TEST_SPLIT = 0.6, 0.2, 0.2

### Load and Combine Data
---

Below, we read each dataset's `data.csv` file and concatenate them into a single combined dataset.

In [6]:
# Dictionary of loaded datasets, keyed by folder name. It is filled as the
# files are read, so no placeholder frames are needed.
dataframes = {}

for folder in data_folders:
    # each dataset is stored as '<folder>/data.csv'
    folder_path = os.path.join( folder, 'data.csv' )

    # the first CSV column is used as the row index
    df = pd.read_csv( folder_path, index_col=0 )

    # tag the frame with its dataset name; this is a plain attribute set on
    # this DataFrame object only
    df.name = folder

    dataframes[ folder ] = df

# Stack the individual datasets into a single frame. Row labels are kept as
# read from each file, so the same label may appear more than once.
combined_df = pd.concat( dataframes.values() )
combined_df.name = 'combined'

# register the combined frame next to the per-dataset frames
dataframes['combined'] = combined_df

# Preview up to ten random rows without shuffling the whole frame first.
combined_df.sample( n = min( 10, len( combined_df ) ) )

Unnamed: 0,utterance,intent
4470,give me all the flights from new york to miami...,Flight.
129,"Book a reservation for 8 people in Wardville ,...",Book Restaurant.
9206,Rate this album two out of 6 stars,Rate Book.
501,list airports in arizona nevada and california...,Airport.
10098,Show Panic in the Streets,Search Creative Work.
3903,sunday 's flights between tampa and charlotte,Flight.
10977,Play The News Virginian song,Search Creative Work.
5762,Play Punk Essentials on google music,Play Music.
7629,Rate Paradise News 2 out of 6,Rate Book.
12853,When will The Boys Next Door be playing at the...,Search Screening Event.


### Clean up Data

In [10]:
from datasets import  ClassLabel
import json, re

def convert_to_json( csv_file ):
    """Writes the rows of a CSV file as JSON lines.

    Each row becomes one line of the form
    ``{"translation": {"src": <utterance>, "tgt": <intent>, "prefix": "intent classification: "}}``.

    The output path is ``csv_file`` with every occurrence of ``csv`` replaced
    by ``json``, so ``dataset/csv/ATIS.csv`` is written to
    ``dataset/json/ATIS.json``. The output directory is created when missing.

    Args:
        csv_file: Path (string) of a CSV file with 'utterance' and 'intent' columns.

    Returns:
        None. The JSON-lines file is written as a side effect.
    """
    df = pd.read_csv( csv_file )

    # NOTE: the substitution hits the directory part of the path as well as the extension.
    json_file = re.sub( r'csv', 'json', csv_file )

    # Make sure the target directory exists; dirname is '' for a bare file name.
    out_dir = os.path.dirname( json_file )
    if out_dir:
        os.makedirs( out_dir, exist_ok=True )

    with open( json_file, 'w' ) as out_data:
        # Walk the two needed columns directly instead of building a Series per row.
        for utterance, intent in zip( df["utterance"], df["intent"] ):
            json_obj = json.dumps({"translation":
                {"src": utterance, "tgt": intent, "prefix": "intent classification: "}
            })
            out_data.write(json_obj + '\n')

def convert_intent_labels_to_integers(df):
  """Adds an integer 'label' column that encodes each row's intent.

  Ids are assigned in order of first appearance of each distinct intent in
  ``df`` (first distinct intent -> 0, the next one -> 1, ...), so the ids are
  specific to the frame passed in. The input frame is not modified.

  Args:
    df: The DataFrame to convert. Must contain an 'intent' column.

  Returns:
    A new DataFrame with a fresh 0..n-1 index and an added (or overwritten)
    'label' column. Columns named 'text' / 'label_name', if present, are
    renamed to 'utterance' / 'intent'.

  Raises:
    KeyError: If ``df`` has no 'intent' column.
    ValueError: If any intent value is missing.
  """
  intents = df['intent']

  # A missing intent cannot be given a class id; fail loudly rather than
  # emitting factorize's -1 sentinel as if it were a real label.
  if intents.isna().any():
    raise ValueError("Cannot encode labels: the 'intent' column contains missing values.")

  # factorize numbers the distinct values in order of first appearance.
  codes, _ = pd.factorize(intents)

  # reset_index returns a new frame, so the caller's DataFrame is left as is.
  converted = df.reset_index(drop=True)
  converted['label'] = codes

  # Rename 'text' -> 'utterance' and 'label_name' -> 'intent' (no-op when absent).
  converted = converted.rename(columns={'text': 'utterance', 'label_name': 'intent'})

  return converted

### Write to JSON
---

Below, we add integer labels to each dataset and write it to its respective CSV and JSON files.

In [11]:
# The CSVs go to 'dataset/csv'; convert_to_json swaps every 'csv' in the path
# for 'json', so the JSON files land in 'dataset/json'. Create both up front so
# the cell also works on a fresh checkout.
for out_dir in ( 'dataset/csv', 'dataset/json' ):
  os.makedirs( out_dir, exist_ok=True )

# list() snapshots the entries so they can be reassigned while looping
for folder, df in list( dataframes.items() ):
  # replace each frame with its labelled version
  dataframes[ folder ] = convert_intent_labels_to_integers( df )

  #save to csv (the row index is written as the first column)
  csv_path = f'dataset/csv/{folder}.csv'
  dataframes[ folder ].to_csv( csv_path )

  #save to json
  convert_to_json( csv_path )

# Preview up to ten random rows without shuffling the whole frame first.
dataframes["combined"].sample( n = min( 10, len( dataframes["combined"] ) ) )

Unnamed: 0,utterance,intent,label
11376,What will be the wind speed around Pembina Gor...,Get Weather.,28
16859,"Please find the movie , A Jingle with Jillian .",Search Creative Work.,31
3591,what are the flights on tuesday october first ...,Flight.,0
20089,Give The Giant Devil Dingo 4 points .,Rate Book.,30
7862,I would like reservations for Cliff House in s...,Book Restaurant.,27
15084,Rate Deception a one,Rate Book.,30
16869,find a soundtrack called Fast as You,Search Creative Work.,31
2912,what airline uses the code hp,Airline.,6
1877,what is the ground transportation available in...,Ground Service.,4
14075,give the current book 0 out of 6 points,Rate Book.,30
