In [1]:
import warnings
warnings.filterwarnings("ignore")

import os
import re
import pandas as pd
import numpy as np
import random
import json
from tqdm.notebook import tqdm
from typing import List, Any, Iterable

from datasets import list_datasets, load_dataset

In [2]:
path = r"dataset"
os.listdir(path)

['Chatbots Intent Recognition Dataset', 'Dataset for chatbot']

In [3]:
ds_1 = os.listdir(os.path.join(path,os.listdir(path)[0]))
ds_2 = os.listdir(os.path.join(path,os.listdir(path)[1]))

#### Dataframe

In [4]:
df_dialogs = pd.read_csv(os.path.join(os.path.join(path,os.listdir(path)[1]), ds_2[0]), delimiter="\t", 
                         names=['sentence_1', "sentence_2"], header=None
                        )
display(df_dialogs)

Unnamed: 0,sentence_1,sentence_2
0,"hi, how are you doing?",i'm fine. how about yourself?
1,i'm fine. how about yourself?,i'm pretty good. thanks for asking.
2,i'm pretty good. thanks for asking.,no problem. so how have you been?
3,no problem. so how have you been?,i've been great. what about you?
4,i've been great. what about you?,i've been good. i'm in school right now.
...,...,...
3720,that's a good question. maybe it's not old age.,are you right-handed?
3721,are you right-handed?,yes. all my life.
3722,yes. all my life.,you're wearing out your right hand. stop using...
3723,you're wearing out your right hand. stop using...,but i do all my writing with my right hand.


#### JSON

In [5]:
with open(os.path.join(os.path.join(path,os.listdir(path)[0]), ds_1[0]), 'r') as f:
    js_data = json.load(f)

js_data

{'intents': [{'intent': 'Greeting',
   'text': ['Hi',
    'Hi there',
    'Hola',
    'Hello',
    'Hello there',
    'Hya',
    'Hya there'],
   'responses': ['Hi human, please tell me your AVTI user',
    'Hello human, please tell me your AVTI user',
    'Hola human, please tell me your AVTI user'],
   'extension': {'function': '', 'entities': False, 'responses': []},
   'context': {'in': '', 'out': 'GreetingUserRequest', 'clear': False},
   'entityType': 'NA',
   'entities': []},
  {'intent': 'GreetingResponse',
   'text': ['My user is Adam',
    'This is Adam',
    'I am Adam',
    'It is Adam',
    'My user is Bella',
    'This is Bella',
    'I am Bella',
    'It is Bella'],
   'responses': ['Great! Hi <HUMAN>! How can I help?',
    'Good! Hi <HUMAN>, how can I help you?',
    'Cool! Hello <HUMAN>, what can I do for you?',
    'OK! Hola <HUMAN>, how can I help you?',
    'OK! hi <HUMAN>, what can I do for you?'],
   'extension': {'function': 'extensions.gHumans.updateHuman',
    

In [6]:
dizio = {}

for intent in tqdm(js_data['intents']):
    text, response = [], []
    for txt in intent['text']:
        text.append(txt)
    for res in intent['responses']:
        response.append(res)
    dizio[intent['intent']] = {"text":text, "responses": response}

100%|██████████████████████████████████████████████████████████████████████████████| 22/22 [00:00<00:00, 105698.38it/s]


In [7]:
dizio

{'Greeting': {'text': ['Hi',
   'Hi there',
   'Hola',
   'Hello',
   'Hello there',
   'Hya',
   'Hya there'],
  'responses': ['Hi human, please tell me your AVTI user',
   'Hello human, please tell me your AVTI user',
   'Hola human, please tell me your AVTI user']},
 'GreetingResponse': {'text': ['My user is Adam',
   'This is Adam',
   'I am Adam',
   'It is Adam',
   'My user is Bella',
   'This is Bella',
   'I am Bella',
   'It is Bella'],
  'responses': ['Great! Hi <HUMAN>! How can I help?',
   'Good! Hi <HUMAN>, how can I help you?',
   'Cool! Hello <HUMAN>, what can I do for you?',
   'OK! Hola <HUMAN>, how can I help you?',
   'OK! hi <HUMAN>, what can I do for you?']},
 'CourtesyGreeting': {'text': ['How are you?',
   'Hi how are you?',
   'Hello how are you?',
   'Hola how are you?',
   'How are you doing?',
   'Hope you are doing well?',
   'Hello hope you are doing well?'],
  'responses': ['Hello, I am great, how are you? Please tell me your AVTI user',
   'Hello, how ar

In [8]:
list_of_intent = [
                  "Greeting", "GreetingResponse", "CourtesyGreeting", "CourtesyGreetingResponse", "CurrentHumanQuery",
                  "NameQuery", "RealNameQuery", "TimeQuery", "Thanks", "NotTalking2U", "UnderstandQuery", "Shutup",
                  "Swearing", "GoodBye", "CourtesyGoodBye", "WhoAmI", "Clever", "Gossip", "Jokes", "SelfAware"
                  
                 ]

In [9]:
all_texts, all_responses = [], []

random.seed(11)
for inte in tqdm(list_of_intent):
    t = dizio[inte]["text"]
    r = dizio[inte]["responses"]
    if len(t) > len(r):
        sampling = list(np.random.choice(r, len(t)-len(r)))
        [r.append(s) for s in sampling]
    elif len(t) < len(r):
        sampling = list(np.random.choice(t, len(r)-len(t)))
        [t.append(s) for s in sampling]
    all_texts.append(t)
    all_responses.append(r)

100%|███████████████████████████████████████████████████████████████████████████████| 20/20 [00:00<00:00, 19742.55it/s]


In [10]:
def flatten(lst: List[Any]) -> Iterable[Any]:
    """Flatten a list using generators comprehensions.
        Returns a flattened version of list lst.
    """

    for sublist in lst:
        if isinstance(sublist, list):
            for item in sublist:
                yield item
        else:
            yield sublist

In [11]:
all_texts = list(flatten(all_texts))
all_responses = list(flatten(all_responses))

len(all_texts), len(all_responses)

(360, 360)

In [12]:
df_dialogs_2 = pd.DataFrame({"sentence_1":all_texts, "sentence_2":all_responses})
display(df_dialogs_2)

Unnamed: 0,sentence_1,sentence_2
0,Hi,"Hi human, please tell me your AVTI user"
1,Hi there,"Hello human, please tell me your AVTI user"
2,Hola,"Hola human, please tell me your AVTI user"
3,Hello,"Hola human, please tell me your AVTI user"
4,Hello there,"Hello human, please tell me your AVTI user"
...,...,...
355,Can you prove you have a conscious,"That depends, can you prove that you are?"
356,Can you prove you are self-aware please,"That is an difficult question, can you prove t..."
357,Can you prove you are self aware please,"That is an interesting question, can you prove..."
358,Can you prove you have a conscious please,"That is an interesting question, can you prove..."


#### Final DataFrame

In [13]:
df = pd.concat([df_dialogs, df_dialogs_2], ignore_index=True)
display(df)

Unnamed: 0,sentence_1,sentence_2
0,"hi, how are you doing?",i'm fine. how about yourself?
1,i'm fine. how about yourself?,i'm pretty good. thanks for asking.
2,i'm pretty good. thanks for asking.,no problem. so how have you been?
3,no problem. so how have you been?,i've been great. what about you?
4,i've been great. what about you?,i've been good. i'm in school right now.
...,...,...
4080,Can you prove you have a conscious,"That depends, can you prove that you are?"
4081,Can you prove you are self-aware please,"That is an difficult question, can you prove t..."
4082,Can you prove you are self aware please,"That is an interesting question, can you prove..."
4083,Can you prove you have a conscious please,"That is an interesting question, can you prove..."


In [14]:
df.to_csv("dataset/processed_df.csv", sep="\t", index=False)

#### Hugging face dataset

In [2]:
datasets_list = list_datasets(with_details=True)
datasets_list

[DatasetInfo: {
 	id: acronym_identification
 	sha: 6e4e8bda901160e9e0b8ce47ca791607f08ce72c
 	lastModified: 2022-07-01T11:49:45.000Z
 	tags: ['arxiv:2010.14678', 'annotations_creators:expert-generated', 'language_creators:found', 'language:en', 'license:mit', 'multilinguality:monolingual', 'size_categories:10K<n<100K', 'source_datasets:original', 'task_categories:token-classification', 'task_ids:token-classification-other-acronym-identification']
 	private: False
 	author: None
 	description: Acronym identification training and development sets for the acronym identification task at SDU@AAAI-21.
 	citation: @inproceedings{veyseh-et-al-2020-what,
    title={{What Does This Acronym Mean? Introducing a New Dataset for Acronym Identification and Disambiguation}},
    author={Amir Pouran Ben Veyseh and Franck Dernoncourt and Quan Hung Tran and Thien Huu Nguyen},
    year={2020},
    booktitle={Proceedings of COLING},
    link={https://arxiv.org/pdf/2010.14678v1.pdf}
 }
 	cardData: {'annota

In [3]:
dataset = load_dataset('empathetic_dialogues','binary')

Using custom data configuration binary
Found cached dataset empathetic_dialogues (C:/Users/Davide/.cache/huggingface/datasets/empathetic_dialogues/binary/0.1.0/09bbeed3882a67db98c73952fb3c1c9a85af83dc78f81454c2454382fd03f6cf)


  0%|          | 0/3 [00:00<?, ?it/s]

In [4]:
dataset

DatasetDict({
    train: Dataset({
        features: ['conv_id', 'utterance_idx', 'context', 'prompt', 'speaker_idx', 'utterance', 'selfeval', 'tags'],
        num_rows: 76673
    })
    validation: Dataset({
        features: ['conv_id', 'utterance_idx', 'context', 'prompt', 'speaker_idx', 'utterance', 'selfeval', 'tags'],
        num_rows: 12030
    })
    test: Dataset({
        features: ['conv_id', 'utterance_idx', 'context', 'prompt', 'speaker_idx', 'utterance', 'selfeval', 'tags'],
        num_rows: 10943
    })
})

In [5]:
dataset.set_format(type='pandas', columns=['conv_id',  'utterance_idx', 'context', 'prompt', 'speaker_idx', 'utterance', 
                                    'selfeval', 'tags']
                   )

Training

In [6]:
df_train = dataset['train'][0]
df_train

Unnamed: 0,conv_id,utterance_idx,context,prompt,speaker_idx,utterance,selfeval,tags
0,hit:0_conv:1,1,sentimental,I remember going to the fireworks with my best...,1,I remember going to see the fireworks with my ...,5|5|5_2|2|5,


In [7]:
for i in tqdm(range(1, len(dataset['train']))):
    df_train = df_train.append(dataset['train'][i])

  0%|          | 0/76672 [00:00<?, ?it/s]

In [8]:
df_train.to_csv("dataset/HuggingFace Datasets/empathetic_dialogues_Train.csv", index=False)

In [9]:
df_train

Unnamed: 0,conv_id,utterance_idx,context,prompt,speaker_idx,utterance,selfeval,tags
0,hit:0_conv:1,1,sentimental,I remember going to the fireworks with my best...,1,I remember going to see the fireworks with my ...,5|5|5_2|2|5,
0,hit:0_conv:1,2,sentimental,I remember going to the fireworks with my best...,0,Was this a friend you were in love with_comma_...,5|5|5_2|2|5,
0,hit:0_conv:1,3,sentimental,I remember going to the fireworks with my best...,1,This was a best friend. I miss her.,5|5|5_2|2|5,
0,hit:0_conv:1,4,sentimental,I remember going to the fireworks with my best...,0,Where has she gone?,5|5|5_2|2|5,
0,hit:0_conv:1,5,sentimental,I remember going to the fireworks with my best...,1,We no longer talk.,5|5|5_2|2|5,
...,...,...,...,...,...,...,...,...
0,hit:12424_conv:24848,5,sentimental,I found some pictures of my grandma in the att...,389,Yeah reminds me of the good old days. I miss ...,5|5|5_5|5|5,
0,hit:12424_conv:24849,1,surprised,I woke up this morning to my wife telling me s...,294,I woke up this morning to my wife telling me s...,5|5|5_5|5|5,
0,hit:12424_conv:24849,2,surprised,I woke up this morning to my wife telling me s...,389,Oh hey that's awesome! That is awesome right?,5|5|5_5|5|5,
0,hit:12424_conv:24849,3,surprised,I woke up this morning to my wife telling me s...,294,It is soooo awesome. We have been wanting a b...,5|5|5_5|5|5,


Validation

In [10]:
df_validation = dataset['validation'][0]
df_validation

Unnamed: 0,conv_id,utterance_idx,context,prompt,speaker_idx,utterance,selfeval,tags
0,hit:3_conv:6,1,terrified,Today_comma_as i was leaving for work in the m...,6,Today_comma_as i was leaving for work in the m...,4|5|5_5|5|5,


In [11]:
for i in tqdm(range(1, len(dataset['validation']))):
    df_validation = df_validation.append(dataset['validation'][i])

  0%|          | 0/12029 [00:00<?, ?it/s]

In [12]:
df_validation.to_csv("dataset/HuggingFace Datasets/empathetic_dialogues_Validation.csv", index=False)

In [13]:
df_validation

Unnamed: 0,conv_id,utterance_idx,context,prompt,speaker_idx,utterance,selfeval,tags
0,hit:3_conv:6,1,terrified,Today_comma_as i was leaving for work in the m...,6,Today_comma_as i was leaving for work in the m...,4|5|5_5|5|5,
0,hit:3_conv:6,2,terrified,Today_comma_as i was leaving for work in the m...,7,Are you fine now?,4|5|5_5|5|5,
0,hit:3_conv:6,3,terrified,Today_comma_as i was leaving for work in the m...,6,Yeah_comma_i'm doing alright now_comma_ but wi...,4|5|5_5|5|5,
0,hit:3_conv:6,4,terrified,Today_comma_as i was leaving for work in the m...,7,Cool :) Is your car damaged a lot?,4|5|5_5|5|5,<IRREGULAR_COLON_FORMAT>
0,hit:3_conv:6,5,terrified,Today_comma_as i was leaving for work in the m...,6,The car was badly damaged_comma_i veered outsi...,4|5|5_5|5|5,
...,...,...,...,...,...,...,...,...
0,hit:12361_conv:24722,4,prepared,One time I studied all night for my final exam!,46,tha is really cool what was your grade,4|4|5_4|4|3,
0,hit:12392_conv:24785,1,furious,One of my coworkers has been arguing with his ...,791,One of my coworkers has been arguing with his ...,4|5|5_5|5|5,
0,hit:12392_conv:24785,2,furious,One of my coworkers has been arguing with his ...,829,What are they arguing about?,4|5|5_5|5|5,
0,hit:12392_conv:24785,3,furious,One of my coworkers has been arguing with his ...,791,Everything and anything. It's annoying_comma_ ...,4|5|5_5|5|5,


Test

In [14]:
df_test = dataset['test'][0]
df_test

Unnamed: 0,conv_id,utterance_idx,context,prompt,speaker_idx,utterance,selfeval,tags
0,hit:0_conv:0,1,guilty,I felt guilty when I was driving home one nigh...,0,Yeah about 10 years ago I had a horrifying exp...,2|2|5_5|5|5,


In [15]:
for i in tqdm(range(1, len(dataset['test']))):
    df_test = df_test.append(dataset['test'][i])

  0%|          | 0/10942 [00:00<?, ?it/s]

In [16]:
df_test.to_csv("dataset/HuggingFace Datasets/empathetic_dialogues_Test.csv", index=False)

In [17]:
df_test

Unnamed: 0,conv_id,utterance_idx,context,prompt,speaker_idx,utterance,selfeval,tags
0,hit:0_conv:0,1,guilty,I felt guilty when I was driving home one nigh...,0,Yeah about 10 years ago I had a horrifying exp...,2|2|5_5|5|5,
0,hit:0_conv:0,2,guilty,I felt guilty when I was driving home one nigh...,1,Did you suffer any injuries?,2|2|5_5|5|5,
0,hit:0_conv:0,3,guilty,I felt guilty when I was driving home one nigh...,0,No I wasn't hit. It turned out they were drunk...,2|2|5_5|5|5,
0,hit:0_conv:0,4,guilty,I felt guilty when I was driving home one nigh...,1,Why did you feel guilty? People really shouldn...,2|2|5_5|5|5,
0,hit:0_conv:0,5,guilty,I felt guilty when I was driving home one nigh...,0,I don't know I was new to driving and hadn't e...,2|2|5_5|5|5,
...,...,...,...,...,...,...,...,...
0,hit:12416_conv:24832,4,disgusted,I saw a huge cockroach outside my house today....,46,I live in Texas to so i know those feels,5|5|5_4|3|4,
0,hit:12423_conv:24847,1,anxious,I have a big test on Monday. I am so nervous_c...,481,I have a big test on Monday_comma_ I am so ner...,5|5|5_5|5|5,
0,hit:12423_conv:24847,2,anxious,I have a big test on Monday. I am so nervous_c...,375,What is the test on?,5|5|5_5|5|5,
0,hit:12423_conv:24847,3,anxious,I have a big test on Monday. I am so nervous_c...,481,It's for my Chemistry class. I haven't slept m...,5|5|5_5|5|5,


#### Final Dataframe V.2

Previous final df + train

In [38]:
df_train_ = df_train[["prompt", "utterance"]]
df_train_.rename({"prompt":"sentence_1", "utterance":"sentence_2"}, axis=1, inplace=True)
df_validation_ = df_validation[["prompt", "utterance"]]
df_validation_.rename({"prompt":"sentence_1", "utterance":"sentence_2"}, axis=1, inplace=True)
df_test_ = df_test[["prompt", "utterance"]]
df_test_.rename({"prompt":"sentence_1", "utterance":"sentence_2"}, axis=1, inplace=True)

In [40]:
df_train_['sentence_1'] =  df_train_['sentence_1'].apply(lambda x: re.sub('_comma_',',', str(x)))
df_train_['sentence_2'] =  df_train_['sentence_2'].apply(lambda x: re.sub('_comma_',',', str(x)))
df_validation_['sentence_1'] =  df_validation_['sentence_1'].apply(lambda x: re.sub('_comma_',',', str(x)))
df_validation_['sentence_2'] =  df_validation_['sentence_2'].apply(lambda x: re.sub('_comma_',',', str(x)))
df_test_['sentence_1'] =  df_test_['sentence_1'].apply(lambda x: re.sub('_comma_',',', str(x)))
df_test_['sentence_2'] =  df_test_['sentence_2'].apply(lambda x: re.sub('_comma_',',', str(x)))

In [42]:
df = pd.read_csv("dataset/processed_df.csv", sep='\t')

In [43]:
df

Unnamed: 0,sentence_1,sentence_2
0,"hi, how are you doing?",i'm fine. how about yourself?
1,i'm fine. how about yourself?,i'm pretty good. thanks for asking.
2,i'm pretty good. thanks for asking.,no problem. so how have you been?
3,no problem. so how have you been?,i've been great. what about you?
4,i've been great. what about you?,i've been good. i'm in school right now.
...,...,...
4080,Can you prove you have a conscious,"That depends, can you prove that you are?"
4081,Can you prove you are self-aware please,"That is an difficult question, can you prove t..."
4082,Can you prove you are self aware please,"That is an interesting question, can you prove..."
4083,Can you prove you have a conscious please,"That is an interesting question, can you prove..."


In [44]:
df_v2 = pd.concat([df, df_train_, df_validation_, df_test_])
display(df_v2)

Unnamed: 0,sentence_1,sentence_2
0,"hi, how are you doing?",i'm fine. how about yourself?
1,i'm fine. how about yourself?,i'm pretty good. thanks for asking.
2,i'm pretty good. thanks for asking.,no problem. so how have you been?
3,no problem. so how have you been?,i've been great. what about you?
4,i've been great. what about you?,i've been good. i'm in school right now.
...,...,...
0,I saw a huge cockroach outside my house today....,I live in Texas to so i know those feels
0,"I have a big test on Monday. I am so nervous, ...","I have a big test on Monday, I am so nervous."
0,"I have a big test on Monday. I am so nervous, ...",What is the test on?
0,"I have a big test on Monday. I am so nervous, ...",It's for my Chemistry class. I haven't slept m...


In [45]:
df_v2.to_csv("dataset/processed_df_V2.csv", sep="\t", index=False)