# Importing Necessary Libraries

In [52]:
import re
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Dense,LSTM,Embedding,Input
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.models import Model

# Pre-procesing the data

In [2]:
lines = open('/content/movie_lines.txt',encoding='utf-8',errors='ignore').read().split('\n')
conversations = open('/content/movie_conversations.txt',encoding='utf-8',errors='ignore').read().split('\n')
lines[1:5]

['L1044 +++$+++ u2 +++$+++ m0 +++$+++ CAMERON +++$+++ They do to!',
 'L985 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ I hope so.',
 'L984 +++$+++ u2 +++$+++ m0 +++$+++ CAMERON +++$+++ She okay?',
 "L925 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ Let's go."]

# Mapping the Sentences with the ID 

In [3]:
id_map = {}

for line in lines:
    sen = line.split(' +++$+++ ')
    if len(sen) == 5:
        id_map[sen[0]] = sen[4]

In [4]:
for i, (key, value) in enumerate(id_map.items()):
    if i == 5:
        break
    print(f'{key}: {value}')

L1045: They do not!
L1044: They do to!
L985: I hope so.
L984: She okay?
L925: Let's go.


# Isolating the Conversation ID

In [5]:
conversation_id = []
for conversation in conversations:
    con = conversation.split(' +++$+++ ')[-1][1:-1].replace("'", "").replace("[]", "").split(',')
    conversation_id.append(con)

In [6]:
conversation_id[3]

['L204', ' L205', ' L206']

# Spliting Questions and Answers

In [7]:
questions = []
answers = []
for conversation in conversation_id:
    for i in range (len(conversation)-1):
        q_id = conversation[i].strip()
        a_id = conversation[i+1].strip()
        questions.append(id_map[q_id])
        answers.append(id_map[a_id])

In [8]:
print(questions[:4])
print(answers[:4])

['Can we make this quick?  Roxanne Korrine and Andrew Barrett are having an incredibly horrendous public break- up on the quad.  Again.', "Well, I thought we'd start with pronunciation, if that's okay with you.", 'Not the hacking and gagging and spitting part.  Please.', "You're asking me out.  That's so cute. What's your name again?"]
["Well, I thought we'd start with pronunciation, if that's okay with you.", 'Not the hacking and gagging and spitting part.  Please.', "Okay... then how 'bout we try out some French cuisine.  Saturday?  Night?", 'Forget it.']


# Cleaning the data

In [9]:
def clean_text(text):
    text = text.lower()
    text = re.sub(r"i'm", "i am", text)
    text = re.sub(r"he's", "he is", text)
    text = re.sub(r"she's", "she is", text)
    text = re.sub(r"it's", "it is", text)
    text = re.sub(r"that's", "that is", text)
    text = re.sub(r"what's", "what is", text)
    text = re.sub(r"where's", "where is", text)
    text = re.sub(r"don't", "do not", text)
    text = re.sub(r"doesn't", "does not", text)
    text = re.sub(r"can't", "cannot", text)
    text = re.sub(r"couldn't", "could not", text)
    text = re.sub(r"won't", "will not", text)
    text = re.sub(r"[-()\"#/@;:<>{}`+=~|.!?,]", "", text)
    return text

p_questions = [clean_text(q) for q in questions]
p_answers = [clean_text(a) for a in answers]

In [10]:
p_questions[:4]

['can we make this quick  roxanne korrine and andrew barrett are having an incredibly horrendous public break up on the quad  again',
 "well i thought we'd start with pronunciation if that is okay with you",
 'not the hacking and gagging and spitting part  please',
 "you're asking me out  that is so cute what is your name again"]

# Tokenization of the Data

In [51]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(p_questions + p_answers)
q_sequence = tokenizer.texts_to_sequences(p_questions)
a_sequence = tokenizer.texts_to_sequences(p_answers)

In [12]:
ip_questions = pad_sequences(q_sequence,maxlen = 50,padding = 'post')
ip_answers = pad_sequences(a_sequence,maxlen= 50, padding = 'post')

# Construction of Model

In [42]:
vocab_size = len(tokenizer.word_index) + 3
embedding_dim = 256
units = 50
batch_size = 64

In [43]:
encoder_input = Input(shape=(None,))
encoder_embedding = Embedding(vocab_size,embedding_dim)(encoder_input)
encoder_lstm = LSTM(units,return_state=True)
encoder_output,h_state,c_state = encoder_lstm(encoder_embedding)
encoder_states = [h_state,c_state]

In [44]:
decoder_input = Input(shape=(None,))
decoder_embedding = Embedding(vocab_size,embedding_dim)(decoder_input)
decoder_ltsm = LSTM(units,return_sequences=True,return_state=True)
decoder_output,_,_ = decoder_ltsm(decoder_embedding,initial_state=encoder_states)
decoder_dense = Dense(vocab_size,activation='softmax')
decoder_output = decoder_dense(decoder_output)

In [45]:
model = Model([encoder_input,decoder_input],
              decoder_output)
model.compile(optimizer = 'rmsprop',
              loss = 'sparse_categorical_crossentropy',metrics = ['accuracy'])

In [46]:
model.summary()

In [47]:
start_token = 68885
end_token = 68886

ip_answers_input = np.zeros_like(ip_answers)
ip_answers_input[:, 1:] = ip_answers[:, :-1]
ip_answers_input[:,0] = start_token

In [48]:
ip_answers_target = ip_answers
for i in range(len(ip_answers_target)):
    ip_answers_target[i, -1] = end_token

# Training of Model

In [50]:
model.fit([ip_questions, ip_answers_input], ip_answers_target,
          batch_size=batch_size,
          epochs=1,
          validation_split=0.2)

[1m2771/2771[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m660s[0m 238ms/step - accuracy: 0.7909 - loss: 1.3843 - val_accuracy: 0.7968 - val_loss: 1.3790


<keras.src.callbacks.history.History at 0x7cca5154abf0>

In [53]:
model.save('model.keras')