# Formatage des données dans un format approprié pour être consommé par LUIS

## Imports

In [1]:
# Library imports
import json
import os

import pandas as pd

In [2]:
# Load the raw dialogue file.
# The location can be overridden with the FRAMES_JSON_PATH environment variable
# so that the notebook is not tied to one machine; when the variable is not
# set, the original local path is used.
data_path = os.environ.get(
    "FRAMES_JSON_PATH",
    r"D:\Data\Google Drive\Openclassrooms\P10\data\frames.json",
)
df = pd.read_json(data_path)

## Analyse exploratoire des données

**Exemples d'enregistrements du fichier de données**

In [3]:
# Display the first rows of the loaded DataFrame
df.head()

Unnamed: 0,user_id,turns,wizard_id,id,labels
0,U22HTHYNP,[{'text': 'I'd like to book a trip to Atlantis...,U21DKG18C,e2c0fc6c-2134-4891-8353-ef16d8412c9a,"{'userSurveyRating': 4.0, 'wizardSurveyTaskSuc..."
1,U21E41CQP,"[{'text': 'Hello, I am looking to book a vacat...",U21DMV0KA,4a3bfa39-2c22-42c8-8694-32b4e34415e9,"{'userSurveyRating': 3.0, 'wizardSurveyTaskSuc..."
2,U21RP4FCY,[{'text': 'Hello there i am looking to go on a...,U21E0179B,6e67ed28-e94c-4fab-96b6-68569a92682f,"{'userSurveyRating': 2.0, 'wizardSurveyTaskSuc..."
3,U22HTHYNP,[{'text': 'Hi I'd like to go to Caprica from B...,U21DKG18C,5ae76e50-5b48-4166-9f6d-67aaabd7bcaa,"{'userSurveyRating': 5.0, 'wizardSurveyTaskSuc..."
4,U21E41CQP,"[{'text': 'Hello, I am looking to book a trip ...",U21DMV0KA,24603086-bb53-431e-a0d8-1dcc63518ba9,"{'userSurveyRating': 5.0, 'wizardSurveyTaskSuc..."


**Exploration de la structure des conversations (colonne 'turns')**

In [4]:
# Structure of the json file: collect the distinct keys and values found at
# each nesting level of the 'turns' column.
utterance_keys = set()
labels_keys = set()
acts_keys = set()
acts_names = set()
args_keys = set()
args_vals = set()
intent_vals = set()

for dialog in df['turns'].to_list():
    for utterance in dialog:
        utterance_keys.update(utterance.keys())
        # Only the user turns are inspected below the top level
        if utterance['author'] != 'user':
            continue
        labels = utterance['labels']
        labels_keys.update(labels.keys())
        for act in labels['acts']:
            acts_keys.update(act.keys())
            acts_names.add(act['name'])
            for arg in act['args']:
                args_keys.add(arg['key'])
                if arg['key'] == 'intent':
                    intent_vals.add(arg['val'])
                try:
                    args_vals.add(arg['val'])
                except (KeyError, TypeError):
                    # The argument has no 'val' entry, or its value cannot be
                    # stored in a set (unhashable): skip it, but let any other
                    # kind of error surface instead of hiding it.
                    pass

print(f"utterance keys => {utterance_keys}")
print(f"\nlabels keys => {labels_keys}")
print(f"\nacts keys => {acts_keys}")
print(f"\nacts names => {acts_names}")
print(f"\nargs keys => {args_keys}")
# print(f"\nargs values => {args_vals}")
print(f"\nintent values => {intent_vals}")

utterance keys => {'timestamp', 'db', 'author', 'text', 'labels'}

labels keys => {'acts_without_refs', 'frames', 'acts', 'active_frame'}

acts keys => {'name', 'args'}

acts names => {'negate', 'thankyou', 'request_alts', 'switch_frame', 'inform', 'request_compare', 'request', 'confirm', 'goodbye', 'greeting', 'moreinfo', 'affirm'}

args keys => {'ref_anaphora', 'gst_rating', 'market', 'shopping', 'max_duration', 'budget_ok', 'price', 'university', 'cathedral', 'gym', 'action', 'n_children', 'ref', 'downtown', 'count_seat', 'dep_time_dst', 'str_date', 'parking', 'beach', 'arr_time_dst', 'n_adults_ok', 'wifi', 'dst_city', 'category', 'flex', 'name', 'dep_time_or', 'duration', 'end_date', 'seat_ok', 'n_adults', 'count_dst_city', 'airport', 'seat', 'intent_ok', 'museum', 'mall', 'vicinity', 'park', 'impl_anaphora', 'count', 'min_duration', 'breakfast', 'palace', 'spa', 'or_city', 'budget', 'count_name', 'arr_time_or', 'intent', 'amenities'}

intent values => {'book'}


In [5]:
# Display the element at position 6 of the 'turns' list of the row labelled 0
df['turns'][0][6]

{'text': "I suppose I'll speak with my husband to see if we can choose other dates, and then I'll come back to you.Thanks for your help",
 'labels': {'acts': [{'args': [], 'name': 'thankyou'}],
  'acts_without_refs': [{'args': [], 'name': 'thankyou'}],
  'active_frame': 3,
  'frames': [{'info': {'intent': [{'val': 'book', 'negated': False}],
     'budget': [{'val': '1700.0', 'negated': False}],
     'dst_city': [{'val': 'Atlantis', 'negated': False}],
     'or_city': [{'val': 'Caprica', 'negated': True}],
     'str_date': [{'val': 'august 13', 'negated': False}],
     'n_adults': [{'val': '8', 'negated': False}]},
    'frame_id': 1,
    'requests': [],
    'frame_parent_id': None,
    'binary_questions': [],
    'compare_requests': []},
   {'info': {'intent': [{'val': 'book', 'negated': False}],
     'budget': [{'val': '1900.0', 'negated': False}],
     'dst_city': [{'val': 'Neverland', 'negated': False}],
     'or_city': [{'val': 'Caprica', 'negated': False}],
     'str_date': [{'val'

**Extraction des dialogues de meilleure qualité**

In [6]:
# Number of dialogues for each user rating (1 to 5)
user_ratings = [df['labels'][row]['userSurveyRating'] for row in range(len(df))]
df['user_rating'] = user_ratings
for rating in range(1, 6):
    rated = df[df['user_rating'] == rating]
    print(rating, len(rated))

1 25
2 28
3 83
4 215
5 982


In [7]:
# Keep only the dialogues rated 5 by the user and renumber the rows from 0
df5 = df[df['user_rating'] == 5].reset_index(drop=True)

**Exemple de dialogue**

In [8]:
# Print one dialogue turn by turn; under each turn, print the set of act names
# and the set of argument keys attached to it.
# Dedicated names are used so that neither the dialogue index nor the sets
# computed by the structure-exploration cell are overwritten.
dialog_idx = 1
conv = df5.loc[dialog_idx]['turns']
for turn_idx, turn in enumerate(conv):
    print(f"{turn_idx}, [{turn['author']:>6}]: {turn['text']}")
    turn_acts_names = set()
    turn_args_keys = set()
    for act in turn['labels']['acts']:
        turn_acts_names.add(act['name'])
        for arg in act['args']:
            turn_args_keys.add(arg['key'])
    print(f"{turn_acts_names}")
    print(f"{turn_args_keys}")

0, [  user]: Hello, I am looking to book a trip for 2 adults and 6 children for $21,300 or less. We are departing from Kochi for Denver.
{'inform', 'greeting'}
{'dst_city', 'n_adults', 'budget', 'or_city', 'n_children', 'intent'}
1, [wizard]: I have several options available within your budget. How long would you like to travel for? And do you have dates in mind?
{'request', 'inform'}
{'duration', 'end_date', 'str_date', 'count'}
2, [  user]: I do not have any dates in mind. I would like to spend as much time in Denver as my budget will allow.
{'inform'}
{'end_date', 'str_date', 'ref', 'max_duration'}
3, [wizard]: I can book 7 days at a 4.0 star hotel for 19028.93USD. I can also book 7 days at a 3.0 star hotel for 12824.84USD.
{'offer'}
{'duration', 'id', 'price', 'category'}
4, [  user]: Do these packages have different departure dates? When would I be leaving for each of them?
{'request_compare'}
{'ref', 'str_date'}
5, [wizard]: The 3.0 star trip leaves Kochi August 26 and returns Au

## Définition d'une classe d'exemples de phrases labellisées compatibles avec le format LUIS 
La classe permet d'extraire du dataset initial des exemples particuliers en les formatant pour LUIS.  

Par exemple, le code suivant:  
 
   entities_set = {'or_city', 'str_date', 'dst_city', 'end_date', 'budget'}  
   examples.select_train_utterances(10, 'inform', entities_set)  
   entities_set = {'or_city', 'dst_city'}  
   examples.select_train_utterances(10, 'inform', entities_set)  
   examples.update_test_utterances(30)   

va extraire jusqu'à 10 exemples dont les entités trouvées sont exactement 'or_city', 'str_date', 'dst_city', 'end_date', 'budget',  
puis jusqu'à 10 exemples supplémentaires dont les entités sont exactement 'or_city', 'dst_city',  
puis jusqu'à 30 exemples de test pour chacun de ces cas, choisis parmi les phrases non retenues pour l'entraînement.

In [9]:
# Class definition
class UtteranceExamples():
    """Build labelled example utterances from dialogue data, in two key layouts.

    The user turns of the dialogues stored in ``df['turns']`` are converted into
    dictionaries holding the text, one intent and the character spans of the
    selected entities found in the text. Two layouts are produced, 'train' and
    'test'; they differ only by their key names (see the ``*_key`` mappings set
    in ``__init__``). Examples are then picked by (intent, exact entity set)
    for training, and other examples of the same kinds are picked for testing.
    """

    # Fraction of the number of training examples already selected that may be
    # taken when the requested intent is 'None'.
    NONE_RATIO = 0.1

    def __init__(self, df):
        """Store the dialogue DataFrame and initialise empty result containers.

        Args:
            df: DataFrame with a 'turns' column; each cell is a list of turn
                dictionaries with 'author', 'text' and 'labels' entries.
        """
        self.utterances_df = df

        # Key names used in the output dictionaries, per layout
        self.text_key = {'train': 'text', 'test': 'text'}
        self.intent_key = {'train': 'intent_name', 'test': 'intent'}
        self.entities_key = {'train': 'entity_labels', 'test': 'entities'}
        self.entity_name_key = {'train': 'entity_name', 'test': 'entity'}
        self.start_char_key = {'train': 'start_char_index', 'test': 'startPos'}
        self.end_char_key = {'train': 'end_char_index', 'test': 'endPos'}
        self.entity_keys_key = {'train': 'entity_keys', 'test': 'entity_keys'}

        # Act names accepted by set_acts, and the subset currently selected
        self.all_acts = {
            'request_alts', 'confirm', 'goodbye', 'greeting', 'thankyou',
            'request_compare', 'negate', 'request', 'switch_frame', 'affirm',
            'moreinfo', 'inform',
        }
        self.my_acts = set()

        # Entity names accepted by set_entities, and the subset currently selected
        self.all_entities = {
            'gst_rating', 'price', 'seat', 'breakfast', 'airport', 'gym',
            'university', 'park', 'shopping', 'intent_ok', 'spa', 'vicinity',
            'beach', 'name', 'max_duration', 'budget_ok', 'flex', 'mall',
            'min_duration', 'parking', 'dep_time_or', 'ref', 'budget', 'palace',
            'count_seat', 'impl_anaphora', 'count', 'str_date', 'n_adults_ok',
            'end_date', 'n_adults', 'wifi', 'arr_time_dst', 'category',
            'count_name', 'action', 'market', 'n_children', 'ref_anaphora',
            'cathedral', 'seat_ok', 'amenities', 'museum', 'downtown',
            'duration', 'dep_time_dst', 'intent', 'arr_time_or',
            'count_dst_city', 'or_city', 'dst_city',
        }
        self.my_entities = set()

        # Formatted utterances, with and without the helper 'entity_keys' set
        self.utterances_train_format = []
        self.utterances_test_format = []
        self.utterances_without_keys_train_format = []
        self.utterances_without_keys_test_format = []
        # Selected examples, the positions of the training ones, and the
        # (intent, entity set) pairs that were requested for training
        self.train_utterances = []
        self.train_utterances_idx = []
        self.train_params = []
        self.test_utterances = []

    def set_acts(self, acts):
        """Select the acts that are kept as intents; each must be in all_acts."""
        for act in acts:
            assert act in self.all_acts, f"unknown act: {act!r}"
        self.my_acts = acts

    def set_entities(self, entities):
        """Select the entities to label; each must be in all_entities."""
        for entity in entities:
            assert entity in self.all_entities, f"unknown entity: {entity!r}"
        self.my_entities = entities

    def _find_entities(self, text, acts, type):
        """Locate the values of the selected entities inside ``text``.

        Returns:
            A pair (entities, entity_keys): the list of span dictionaries in the
            key layout of ``type``, and the set of entity names that were found.
        """
        entities = []
        entity_keys = set()
        for act in acts:
            for arg in act.get('args') or []:
                key = arg.get('key')
                if key not in self.my_entities:
                    continue
                val = arg.get('val')
                # Only a non-empty string can be searched for in the text.
                # Such arguments are skipped one by one, so that a single
                # unusable argument does not discard the following ones.
                if not isinstance(val, str) or not val:
                    continue
                start = text.find(val)
                if start == -1:
                    continue
                # The end offset is exclusive (start + length of the value).
                # NOTE(review): LUIS appears to expect the index of the last
                # character (inclusive) - confirm before uploading.
                entities.append({self.entity_name_key[type]: key,
                                 self.start_char_key[type]: start,
                                 self.end_char_key[type]: start + len(val)})
                entity_keys.add(key)
        return entities, entity_keys

    def format_utterances(self, type):
        """Convert every user turn into a labelled example in one layout.

        The results are appended to the ``utterances_*_format`` lists of the
        requested layout. The intent of a turn is the first of its acts that
        belongs to the selected acts, or "None" when there is none.

        Args:
            type: 'train' or 'test'.

        Raises:
            ValueError: if ``type`` is neither 'train' nor 'test'.
        """
        if type == 'train':
            with_keys = self.utterances_train_format
            without_keys = self.utterances_without_keys_train_format
        elif type == 'test':
            with_keys = self.utterances_test_format
            without_keys = self.utterances_without_keys_test_format
        else:
            raise ValueError(f"type must be 'train' or 'test', got {type!r}")

        # Iterate over the values so that the row labels of the DataFrame do
        # not matter (a filtered frame may not be numbered from 0).
        for dialog in self.utterances_df['turns']:
            for turn in dialog:
                if turn['author'] != 'user':
                    continue

                text = turn['text']
                acts = turn['labels']['acts']
                intent = next(
                    (act['name'] for act in acts if act['name'] in self.my_acts),
                    "None")
                entities, entity_keys = self._find_entities(text, acts, type)

                example = {self.text_key[type]: text,
                           self.intent_key[type]: intent,
                           self.entities_key[type]: entities}
                without_keys.append(example)
                with_keys.append({**example,
                                  self.entity_keys_key[type]: entity_keys})

    def _selection_size(self, n, intent, available):
        """Return how many of ``available`` candidates are taken for ``intent``."""
        if intent == 'None':
            # 'None' examples are capped relative to what is already selected;
            # n is not used in that case.
            wanted = int(len(self.train_utterances) * self.NONE_RATIO)
        else:
            wanted = n
        return max(0, min(wanted, available))

    def _matching_positions(self, utterances, type, intent, entities_set, excluded):
        """Positions of the utterances with this intent and exactly these entities."""
        intent_key = self.intent_key[type]
        keys_key = self.entity_keys_key[type]
        return [pos for pos, utterance in enumerate(utterances)
                if pos not in excluded
                and utterance[intent_key] == intent
                and utterance[keys_key] == entities_set]

    def select_train_utterances(self, n, intent, entities_set):
        """Add up to ``n`` training examples of one kind.

        An utterance qualifies when its intent equals ``intent`` and the set of
        entities found in it equals ``entities_set``. Utterances that are
        already selected are not selected again. For the 'None' intent, the
        number taken is limited to NONE_RATIO times the number of training
        examples selected so far instead of ``n``.

        Args:
            n: maximum number of examples to add (ignored for intent 'None').
            intent: intent name to match.
            entities_set: exact set of entity names to match.
        """
        entities_set = set(entities_set)
        if (intent, entities_set) not in self.train_params:
            self.train_params.append((intent, entities_set))

        positions = self._matching_positions(
            self.utterances_train_format, 'train', intent, entities_set,
            set(self.train_utterances_idx))
        chosen = positions[:self._selection_size(n, intent, len(positions))]

        self.train_utterances.extend(
            self.utterances_without_keys_train_format[pos] for pos in chosen)
        self.train_utterances_idx.extend(chosen)

    def update_test_utterances(self, n):
        """Append up to ``n`` test examples for each kind requested for training.

        Utterances at the positions already used for training are excluded.
        For the 'None' intent the same cap as in select_train_utterances
        applies. The results are appended to ``test_utterances``.

        Args:
            n: maximum number of test examples per (intent, entity set) pair.
        """
        used_for_training = set(self.train_utterances_idx)
        for intent, entities_set in self.train_params:
            positions = self._matching_positions(
                self.utterances_test_format, 'test', intent, entities_set,
                used_for_training)
            chosen = positions[:self._selection_size(n, intent, len(positions))]
            self.test_utterances.extend(
                self.utterances_without_keys_test_format[pos] for pos in chosen)


In [11]:
# Build the training and test examples
examples = UtteranceExamples(df5)
examples.set_acts({'inform'})
examples.set_entities({'or_city', 'str_date', 'dst_city', 'end_date', 'budget'})
for layout in ('train', 'test'):
    examples.format_utterances(layout)

# Kinds of training examples to select, in order: (intent, exact entity set)
train_selections = [
    ('inform', {'or_city', 'str_date', 'dst_city', 'end_date', 'budget'}),
    ('inform', {'or_city', 'str_date', 'dst_city', 'end_date'}),
    ('inform', {'or_city', 'str_date', 'dst_city'}),
    ('inform', {'or_city', 'dst_city'}),
    ('inform', set()),
    ('None', set()),
]
for intent, entities_set in train_selections:
    examples.select_train_utterances(10, intent, entities_set)

examples.update_test_utterances(30)

# Write both example lists as JSON files
outputs = (
    ('./data/train_utterances.json', examples.train_utterances),
    ('./data/test_utterances.json', examples.test_utterances),
)
for out_path, utterances in outputs:
    with open(out_path, 'w') as f:
        json.dump(utterances, f)