In [159]:
!pip install scikit-learn
!pip install tqdm



# IR training
#### In this notebook, an Intent Recognition (IR) model is trained on a combination of the ATIS (Airline Travel Information System), Ubuntu chat, and DailyDialog datasets. All the data is stored in the `training_data` folder.

# Data preprocessing
## Since the data comes from different sources, it must be processed thoroughly

In [1]:
import random
import itertools 
from sklearn.model_selection import train_test_split

In [2]:
ls training_data/

atis_test.json   end_dialogue_data.txt  test.json:Zone.Identifier
atis_train.json  grateful_data.txt      train.json:Zone.Identifier
dialogues.txt    misc.csv


## ATIS data (flight details)

In [3]:
import json
# Load the raw ATIS train/test payloads. The encoding is given explicitly
# because JSON interchange files are UTF-8, while open() would otherwise fall
# back to the platform's locale encoding.
with open("training_data/atis_train.json", encoding="utf-8") as json_file:
    loaded_train_dict = json.load(json_file)
with open("training_data/atis_test.json", encoding="utf-8") as json_file:
    loaded_test_dict = json.load(json_file)

In [4]:
# Replace each loaded payload with its nested list of examples, found under
# 'rasa_nlu_data' -> 'common_examples'.
# NOTE: the names are rebound, so re-running this cell without reloading the
# JSON files first will fail.
loaded_train_dict, loaded_test_dict = (
    payload['rasa_nlu_data']['common_examples']
    for payload in (loaded_train_dict, loaded_test_dict)
)

In [5]:
def prepare_ATIS_data(loaded_dict):
    """Group example texts by their intent.

    Args:
        loaded_dict: iterable of mappings, each providing an 'intent' key
            and a 'text' key.

    Returns:
        dict mapping every intent to the list of texts that carry it.
        Intents appear in first-seen order and the texts keep their
        input order.
    """
    texts_by_intent = {}
    for entry in loaded_dict:
        # setdefault creates the bucket the first time an intent is seen.
        texts_by_intent.setdefault(entry['intent'], []).append(entry['text'])
    return texts_by_intent

In [6]:
# Group both ATIS sets by intent and keep only the 'flight' utterances.
flight_train_data, flight_test_data = (
    by_intent['flight']
    for by_intent in map(prepare_ATIS_data, (loaded_train_dict, loaded_test_dict))
)
len(flight_train_data), len(flight_test_data)

(3666, 632)

In [7]:
# Every flight utterance gets class label 0.
flight_train_labels = [0] * len(flight_train_data)
flight_test_labels = [0] * len(flight_test_data)
len(flight_train_labels), len(flight_test_labels)

(3666, 632)

## Misc data from unrelated chats (Ubuntu chat and DailyDialog)

In [8]:
import pandas as pd

In [9]:
# Load the misc-class source table; its 'text' column is sampled in the next cell.
df = pd.read_csv('./training_data/misc.csv')

In [10]:
# Draw 2500 utterances from the 'text' column. The draw is seeded so that a
# fresh "Restart & Run All" rebuilds the same dataset, in line with the seeded
# train/test splits used elsewhere in this notebook.
misc = df['text'].sample(2500, random_state=42).tolist()

In [11]:
# Read the whole dialogue file as text. The encoding is pinned so the result
# does not depend on the platform's locale default.
# NOTE(review): assumes the file is UTF-8 encoded - confirm.
with open('./training_data/dialogues.txt', mode='r', encoding='utf-8') as f:
    dialogues = f.read()
# Split on the '__eou__' marker and keep only the first 2500 pieces.
list_ = dialogues.split('__eou__')
dialogues = list_[:2500]

In [12]:
# Trim surrounding whitespace and remove embedded newlines from every piece,
# updating the existing list in place.
dialogues[:] = [piece.strip().replace('\n', '') for piece in dialogues]
# Keep only pieces longer than 15 characters.
dialogues_processed = [piece for piece in dialogues if len(piece) > 15]
len(dialogues_processed)

2344

In [13]:
# Append the cleaned dialogue pieces to the sampled misc utterances.
# NOTE: this grows `misc` in place, so running the cell twice adds them twice.
misc += dialogues_processed
# Every misc utterance gets class label 1.
misc_labels = [1] * len(misc)
len(misc), len(misc_labels)


(4844, 4844)

In [14]:
# Seeded, shuffled 70/30 train/test split of the misc class.
misc_train, misc_test, misc_train_labels, misc_test_labels = train_test_split(
    misc,
    misc_labels,
    train_size=0.7,
    shuffle=True,
    random_state=42,
)
tuple(map(len, (misc_train, misc_test, misc_train_labels, misc_test_labels)))

(3390, 1454, 3390, 1454)

## Grateful messages, if the user wants to thank the model

In [15]:
# Read the grateful-message file as one string. The encoding is pinned so the
# result does not depend on the platform's locale default.
# NOTE(review): assumes the file is UTF-8 encoded - confirm.
with open('./training_data/grateful_data.txt', mode='r', encoding='utf-8') as f:
    grateful_messages = f.read()

In [16]:
# One message per line. Blank lines (for example the empty string left behind
# by a trailing newline) are dropped so they are not labelled as examples.
grateful_messages = [line for line in grateful_messages.split('\n') if line.strip()]
len(grateful_messages)

708

In [17]:
# Remove duplicate messages, keeping the first occurrence of each in its
# original position. dict.fromkeys does this in a single pass instead of
# scanning the result list for every message.
grateful = list(dict.fromkeys(grateful_messages))

In [18]:
# Every grateful message gets class label 2.
grateful_labels = [2] * len(grateful)
len(grateful), len(grateful_labels)

(708, 708)

In [19]:
# Seeded, shuffled 70/30 train/test split of the grateful class.
grateful_train, grateful_test, grateful_train_labels, grateful_test_labels = train_test_split(
    grateful,
    grateful_labels,
    train_size=0.7,
    shuffle=True,
    random_state=42,
)
tuple(map(len, (grateful_train, grateful_test, grateful_train_labels, grateful_test_labels)))

(495, 213, 495, 213)

## End of dialogue messages, if the user wants to end the conversation

In [20]:
# Read the end-of-dialogue file as one string. The encoding is pinned so the
# result does not depend on the platform's locale default.
# NOTE(review): assumes the file is UTF-8 encoded - confirm.
with open('./training_data/end_dialogue_data.txt', mode='r', encoding='utf-8') as f:
    dialogue_end_texts = f.read()

In [21]:
# One message per line. Blank lines (for example the empty string left behind
# by a trailing newline) are dropped so they are not labelled as examples.
dialogue_end_texts = [line for line in dialogue_end_texts.split('\n') if line.strip()]

In [22]:
# Strip double quotes from every message, then remove duplicates while keeping
# the first occurrence of each in its original position. dict.fromkeys does
# this in a single pass instead of scanning the result list for every message.
dialogue_end = list(
    dict.fromkeys(text.replace('"', '') for text in dialogue_end_texts)
)

In [23]:
# Every end-of-dialogue message gets class label 3.
dialogue_end_labels = [3] * len(dialogue_end)
len(dialogue_end), len(dialogue_end_labels)

(618, 618)

In [24]:
# Seeded, shuffled 70/30 train/test split of the end-of-dialogue class.
(
    dialogue_end_train,
    dialogue_end_test,
    dialogue_end_train_labels,
    dialogue_end_test_labels,
) = train_test_split(
    dialogue_end,
    dialogue_end_labels,
    train_size=0.7,
    shuffle=True,
    random_state=42,
)
len(dialogue_end_train), len(dialogue_end_test), len(dialogue_end_train_labels), len(dialogue_end_test_labels)

(432, 186, 432, 186)

## Combining all together

In [25]:
# Concatenate the four classes in a fixed order (flight, misc, grateful,
# end-of-dialogue) so texts and labels stay aligned index by index.
X_train = [*flight_train_data, *misc_train, *grateful_train, *dialogue_end_train]
X_test = [*flight_test_data, *misc_test, *grateful_test, *dialogue_end_test]
Y_train = [*flight_train_labels, *misc_train_labels, *grateful_train_labels, *dialogue_end_train_labels]
Y_test = [*flight_test_labels, *misc_test_labels, *grateful_test_labels, *dialogue_end_test_labels]

In [26]:
# Show the sizes of the combined splits; the X and Y sizes should match per split.
tuple(len(part) for part in (X_train, X_test, Y_train, Y_test))

(7983, 2485, 7983, 2485)

In [28]:
import json

# Bundle the four splits under their own names and write them to one JSON file.
dict_to_save = {
    "X_train": X_train,
    "X_test": X_test,
    "Y_train": Y_train,
    "Y_test": Y_test,
}

with open('./training_data/final_dataset.json', 'w') as f:
    json.dump(dict_to_save, f)