In [1]:
import os

# Directorio con los archivos del corpus
corpus_dir = "cornell movie-dialogs corpus"
lines_file = os.path.join(corpus_dir, "movie_lines.txt")
conversations_file = os.path.join(corpus_dir, "movie_conversations.txt")

In [None]:
# Cada línea tiene 5 campos separados por " +++$+++ "
id2line = {}
with open(lines_file, encoding='utf-8', errors='ignore') as file:
    for line in file:
        parts = line.strip().split(" +++$+++ ")
        if len(parts) == 5:
            line_id = parts[0]
            text = parts[4]
            id2line[line_id] = text

In [3]:
pairs = []
with open(conversations_file, encoding='utf-8', errors='ignore') as file:
    for line in file:
        parts = line.strip().split(" +++$+++ ")
        if len(parts) == 4:
            utterance_ids = eval(parts[3])  # convierte lista de IDs en lista de Python
            for i in range(len(utterance_ids) - 1):
                q = id2line.get(utterance_ids[i])
                a = id2line.get(utterance_ids[i+1])
                if q and a:
                    pairs.append((q.strip(), a.strip()))

In [7]:
import pandas as pd

# Crear el DataFrame
df_chat = pd.DataFrame(pairs, columns=["pregunta", "respuesta"])

# Mostrar las primeras filas
print(df_chat.head())

                                            pregunta  \
0  Can we make this quick?  Roxanne Korrine and A...   
1  Well, I thought we'd start with pronunciation,...   
2  Not the hacking and gagging and spitting part....   
3  You're asking me out.  That's so cute. What's ...   
4  No, no, it's my fault -- we didn't have a prop...   

                                           respuesta  
0  Well, I thought we'd start with pronunciation,...  
1  Not the hacking and gagging and spitting part....  
2  Okay... then how 'bout we try out some French ...  
3                                         Forget it.  
4                                           Cameron.  


In [8]:
df_chat.shape

(221282, 2)

In [None]:
# df_chat.to_csv("dataset_chatbot.csv", index=False)

## Preparación de datos

In [3]:
import pandas as pd

# Cargar el archivo (ajusta el nombre si es necesario)
df = pd.read_csv("dataset_chatbot.csv")

# Mostrar ejemplo
print(df.head())

                                            pregunta  \
0  Can we make this quick?  Roxanne Korrine and A...   
1  Well, I thought we'd start with pronunciation,...   
2  Not the hacking and gagging and spitting part....   
3  You're asking me out.  That's so cute. What's ...   
4  No, no, it's my fault -- we didn't have a prop...   

                                           respuesta  
0  Well, I thought we'd start with pronunciation,...  
1  Not the hacking and gagging and spitting part....  
2  Okay... then how 'bout we try out some French ...  
3                                         Forget it.  
4                                           Cameron.  


In [4]:
import re

def remove_tags(string):
    removelist = ""
    p = re.compile(r'<.*?>')
    result = p.sub('',string) #remove HTML tags
    result = re.sub(r'http\S+', '',result) #remove URLs
    result = re.sub(r'[\W_'+removelist+']', ' ', result) #remove non-alphanumeric characters
    result = result.lower()
    return result

df["pregunta"] = df["pregunta"].apply(lambda cw : remove_tags(cw))
df["respuesta"] = df["respuesta"].apply(lambda cw : remove_tags(cw))
df.head()

Unnamed: 0,pregunta,respuesta
0,can we make this quick roxanne korrine and a...,well i thought we d start with pronunciation ...
1,well i thought we d start with pronunciation ...,not the hacking and gagging and spitting part ...
2,not the hacking and gagging and spitting part ...,okay then how bout we try out some french ...
3,you re asking me out that s so cute what s ...,forget it
4,no no it s my fault we didn t have a prop...,cameron


In [5]:
df["respuesta"] = df["respuesta"].apply(lambda txt: "<start> " + txt + " <end>")

In [6]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Combinar pregunta y respuesta para tokenizar todo el vocabulario
tokenizer = Tokenizer(oov_token="<OOV>")
tokenizer.fit_on_texts(df["pregunta"].tolist() + df["respuesta"].tolist())

# Mapear texto a secuencias
secuencias_preguntas = tokenizer.texts_to_sequences(df["pregunta"])
secuencias_respuestas = tokenizer.texts_to_sequences(df["respuesta"])

In [7]:
max_len_input = max(len(seq) for seq in secuencias_preguntas)
max_len_output = max(len(seq) for seq in secuencias_respuestas)

X = pad_sequences(secuencias_preguntas, maxlen=max_len_input, padding='post')
Y = pad_sequences(secuencias_respuestas, maxlen=max_len_output, padding='post')

In [8]:
import numpy as np

Y_input = Y[:, :-1]  # todo menos la última palabra
Y_target = Y[:, 1:]  # todo menos la primera palabra


In [9]:
vocab_size = len(tokenizer.word_index) + 1  # +1 por el padding
print(f"Tamaño del vocabulario: {vocab_size}")
print(f"Longitud máxima de input: {max_len_input}")
print(f"Longitud máxima de output: {max_len_output}")

Tamaño del vocabulario: 49237
Longitud máxima de input: 319
Longitud máxima de output: 584


## Definición del modelo

In [13]:
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense, AdditiveAttention, Concatenate
from tensorflow.keras.models import Model

# Parámetros
embedding_dim = 256
units = 512

# Input del encoder
encoder_inputs = Input(shape=(None,))
encoder_embedding = Embedding(vocab_size, embedding_dim, mask_zero=True)(encoder_inputs)
encoder_outputs, state_h, state_c = LSTM(units, return_sequences=True, return_state=True)(encoder_embedding)

# Input del decoder
decoder_inputs = Input(shape=(None,))
decoder_embedding = Embedding(vocab_size, embedding_dim, mask_zero=True)(decoder_inputs)
decoder_lstm_outputs, _, _ = LSTM(units, return_sequences=True, return_state=True)(
    decoder_embedding, initial_state=[state_h, state_c]
)

# Atención
attention = AdditiveAttention()
context_vector = attention([decoder_lstm_outputs, encoder_outputs])

# Combinar contexto y salida del decoder
decoder_combined_context = Concatenate(axis=-1)([context_vector, decoder_lstm_outputs])

# Capa final
output = Dense(vocab_size, activation="softmax")(decoder_combined_context)

# Modelo final
model = Model([encoder_inputs, decoder_inputs], output)
model.compile(optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"])
model.summary()

In [None]:
Y_target_expanded = np.expand_dims(Y_target, -1)

model.fit([X, Y_input], Y_target_expanded,
          batch_size=64, epochs=20, validation_split=0.1)