## LSTM Bot QA

El objetivo de este notebook es utilizar los datos del challenge ConvAI2 (Conversational Intelligence Challenge 2) con conversaciones en inglés para desarrollar un chatbot capaz de responder preguntas de usuario.

### Librerías

In [1]:
!pip install gdown

import os
import json
import re

import numpy as np
import pandas as pd

import gdown


[notice] A new release of pip is available: 24.0 -> 24.1
[notice] To update, run: python.exe -m pip install --upgrade pip


Collecting gdown
  Downloading gdown-5.2.0-py3-none-any.whl.metadata (5.8 kB)
Collecting beautifulsoup4 (from gdown)
  Using cached beautifulsoup4-4.12.3-py3-none-any.whl.metadata (3.8 kB)
Collecting filelock (from gdown)
  Downloading filelock-3.15.4-py3-none-any.whl.metadata (2.9 kB)
Collecting requests[socks] (from gdown)
  Downloading requests-2.32.3-py3-none-any.whl.metadata (4.6 kB)
Collecting soupsieve>1.2 (from beautifulsoup4->gdown)
  Using cached soupsieve-2.5-py3-none-any.whl.metadata (4.7 kB)
Collecting charset-normalizer<4,>=2 (from requests[socks]->gdown)
  Using cached charset_normalizer-3.3.2-cp312-cp312-win_amd64.whl.metadata (34 kB)
Collecting idna<4,>=2.5 (from requests[socks]->gdown)
  Using cached idna-3.7-py3-none-any.whl.metadata (9.9 kB)
Collecting certifi>=2017.4.17 (from requests[socks]->gdown)
  Downloading certifi-2024.6.2-py3-none-any.whl.metadata (2.2 kB)
Collecting PySocks!=1.5.7,>=1.5.6 (from requests[socks]->gdown)
  Downloading PySocks-1.7.1-py3-none-a

### Carga de datos

In [2]:
if os.access('data_volunteers.json', os.F_OK) is False:
    url = 'https://drive.google.com/uc?id=1awUxYwImF84MIT5-jCaYAPe2QwSgS1hN&export=download'
    output = 'data_volunteers.json'
    gdown.download(url, output, quiet=False)
else:
    print('El dataset ya se encuentra descargado')

text_file = 'data_volunteers.json'
with open(text_file) as f:
    data = json.load(f)

Downloading...
From: https://drive.google.com/uc?id=1awUxYwImF84MIT5-jCaYAPe2QwSgS1hN&export=download
To: c:\Users\Enzo\Documents\CEIA - UBA\Procesamiento del Lenguaje Natural\NLP_Challenges\Challenge 4\data_volunteers.json
100%|██████████| 2.58M/2.58M [00:00<00:00, 6.44MB/s]


In [3]:
# Observar los campos disponibles en cada linea del dataset
data[0].keys()

dict_keys(['dialog', 'start_time', 'end_time', 'bot_profile', 'user_profile', 'eval_score', 'profile_match', 'participant1_id', 'participant2_id'])

In [7]:
chat_in = []
chat_out = []

input_sentences = []
output_sentences = []
output_sentences_inputs = []
max_len = 30

def clean_text(txt):
    txt = txt.lower()    
    txt.replace('\'d', ' had')
    txt.replace('\'s', ' is')
    txt.replace('\'m', ' am')
    txt.replace('don\'t', 'do not')
    txt = re.sub(r'\W+', ' ', txt)
    
    return txt

for line in data:
    for i in range(len(line['dialog']) - 1):
        # vamos separando el texto en "preguntas" (chat_in)
        # y "respuestas" (chat_out)
        chat_in = clean_text(line['dialog'][i]['text'])
        chat_out = clean_text(line['dialog'][i + 1]['text'])

        if len(chat_in) >= max_len or len(chat_out) >= max_len:
            continue

        input_sentence, output = chat_in, chat_out
        
        # output sentence (decoder_output) tiene <eos>
        output_sentence = output + ' <eos>'
        # output sentence input (decoder_input) tiene <sos>
        output_sentence_input = '<sos> ' + output

        input_sentences.append(input_sentence)
        output_sentences.append(output_sentence)
        output_sentences_inputs.append(output_sentence_input)

print('Cantidad de rows utilizadas:', len(input_sentences))

Cantidad de rows utilizadas: 6033


In [8]:
input_sentences[1], output_sentences[1], output_sentences_inputs[1]

('hi how are you ', 'not bad and you  <eos>', '<sos> not bad and you ')

### Preprocesamiento

In [18]:
# ESTA CELDA ESTÁ MAL. USAR TOKENIZER?

# Diccionarios para mapear índices a palabras
word2idx_inputs = {k: v for v, k in enumerate(input_sentences)}
word2idx_outputs = {k: v for v, k in enumerate(output_sentences)}
word2idx_outputs_inputs = {k: v for v, k in enumerate(output_sentences_inputs)}

# Diccionarios para mapear palabras a índices
idx2word_inputs = {v: k for k, v in word2idx_inputs.items()}
idx2word_outputs = {v: k for k, v in word2idx_outputs.items()}
idx2word__outputs_inputs = {v: k for k, v in word2idx_outputs_inputs.items()}

print('Palabras a índices:')
print(list(word2idx_inputs.items())[:5])
print(list(word2idx_outputs.items())[:5])
print(list(word2idx_outputs_inputs.items())[:5])

print('\nÍndices a palabras:')
print(list(idx2word_inputs.items())[:5])
print(list(idx2word_outputs.items())[:5])
print(list(idx2word__outputs_inputs.items())[:5])

Palabras a índices:
[('hello ', 5432), ('hi how are you ', 5763), ('hi ', 5638), ('where are you working ', 5), ('bro ', 6)]
[('hi how are you  <eos>', 5367), ('not bad and you  <eos>', 1), ('hello  <eos>', 5882), ('hello how are you today  <eos>', 5807), ('bro  <eos>', 5118)]
[('<sos> hi how are you ', 5367), ('<sos> not bad and you ', 1), ('<sos> hello ', 5882), ('<sos> hello how are you today ', 5807), ('<sos> bro ', 5118)]

Índices a palabras:
[(5432, 'hello '), (5763, 'hi how are you '), (5638, 'hi '), (5, 'where are you working '), (6, 'bro ')]
[(5367, 'hi how are you  <eos>'), (1, 'not bad and you  <eos>'), (5882, 'hello  <eos>'), (5807, 'hello how are you today  <eos>'), (5118, 'bro  <eos>')]
[(5367, '<sos> hi how are you '), (1, '<sos> not bad and you '), (5882, '<sos> hello '), (5807, '<sos> hello how are you today '), (5118, '<sos> bro ')]


In [None]:
#encoder_input_sequences = 