In [44]:
import tensorflow as tf
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
import pandas as pd
import spacy
import os
import sys
import matplotlib
import matplotlib.pyplot as plt
from tqdm import tqdm
import pickle as pkl
import random

# Which tokenizer to use? TweetTokenizer is more robust than the vanilla tokenizer, but then,
# will the intelligence of tokenization matter in the long run when trained using DL?
from nltk.tokenize import word_tokenize, TweetTokenizer
tokenizer = TweetTokenizer(preserve_case = False)

from gensim.models import Word2Vec, KeyedVectors

In [2]:
base_path = "../data/"
cornell_folder = os.path.join(base_path, "cornell movie-dialogs corpus")

In [4]:
movie_lines = os.path.join(cornell_folder, "movie_lines.txt")
movie_lines

'../data/cornell movie-dialogs corpus/movie_lines.txt'

In [5]:
movie_lines_features = ["LineID", "Character", "Movie", "Name", "Line"]
movie_lines = pd.read_csv(movie_lines, sep = "\+\+\+\$\+\+\+", engine = "python", index_col = False, names = movie_lines_features)

# Using only the required columns, namely, "LineID" and "Line"
movie_lines = movie_lines[["LineID", "Line"]]

# Strip the space from "LineID" for further usage and change the datatype of "Line"
movie_lines["LineID"] = movie_lines["LineID"].apply(str.strip)

In [7]:
movie_lines.head()

Unnamed: 0,LineID,Line
0,L1045,They do not!
1,L1044,They do to!
2,L985,I hope so.
3,L984,She okay?
4,L925,Let's go.


In [11]:
movie_conversations_file = os.path.join(cornell_folder, "movie_conversations.txt")

In [12]:
movie_conversations_features = ["Character1", "Character2", "Movie", "Conversation"]
movie_conversations = pd.read_csv(movie_conversations_file, sep = "\+\+\+\$\+\+\+", engine = "python", index_col = False, names = movie_conversations_features)

# Again using the required feature, "Conversation"
movie_conversations = movie_conversations["Conversation"]

In [13]:
movie_conversations.head()

0     ['L194', 'L195', 'L196', 'L197']
1                     ['L198', 'L199']
2     ['L200', 'L201', 'L202', 'L203']
3             ['L204', 'L205', 'L206']
4                     ['L207', 'L208']
Name: Conversation, dtype: object

In [23]:
conversation = [[str(list(movie_lines.loc[movie_lines["LineID"] == u.strip().strip("'"), "Line"])[0]).strip() 
                 for u in c.strip().strip('[').strip(']').split(',')] for c in tqdm(movie_conversations)]

100%|██████████| 83097/83097 [1:41:36<00:00, 13.63it/s]  


In [22]:
conversation[3]

['Why?',
 'Unsolved mystery.  She used to be really popular when she started high school, then it was just like she got sick of it or something.',
 "That's a shame."]

In [26]:
import pickle as pkl
with open("conversations.pkl", "wb") as handle:
    pkl.dump(conversation, handle)

In [36]:
with open("conversations.pkl", "rb") as handle:
    conversation = pkl.load(handle)

In [37]:
len(conversation)

83097

In [38]:
# Calculate the dialogue length statistics

dialogue_lengths = [len(dialogue) for dialogue in conversation]
pd.Series(dialogue_lengths).describe()

count    83097.000000
mean         3.666955
std          2.891798
min          2.000000
25%          2.000000
50%          3.000000
75%          4.000000
max         89.000000
dtype: float64

In [40]:
import random
# Generate 50 sample pairs - 14/03/2019
indices = random.sample(range(len(conversation)), 50)
sample_context_list = []
sample_response_list = []

for index in indices:
    
    response = conversation[index][-1]
        
    context = "FS: " + conversation[index][0] + "\n"
    for i in range(1, len(conversation[index]) - 1):
        
        if i % 2 == 0:
            prefix = "FS: "
        else:
            prefix = "SS: "
            
        context += prefix + conversation[index][i] + "\n"
        
    sample_context_list.append(context)
    sample_response_list.append(response)

with open("cornell_movie_dialogue_sample.csv", "w") as handle:
    for c, r in zip(sample_context_list, sample_response_list):
        handle.write('"' + c + '"' + "#" + r + "\n")

In [57]:
def generate_pairs(conversation):
    
    context_list = []
    response_list = []
    
    for dialogue in tqdm(conversation):
        
        response = dialogue[-1]
        
        context = dialogue[0]
        for index in range(1, len(dialogue) - 1):
            context += dialogue[index]
        
        context_list.append(context)
        response_list.append(response)
        
    return context_list, response_list

In [58]:
context_list, response_list = generate_pairs(conversation)

100%|██████████| 83097/83097 [00:00<00:00, 425083.55it/s]


In [48]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/alditopalli/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [71]:
input_docs = context_list
target_docs = response_list

In [72]:
with open("input_docs.pkl", "wb") as handle:
    pkl.dump(input_docs, handle)

In [73]:
with open("target_docs.pkl", "wb") as handle:
    pkl.dump(target_docs, handle)

In [1]:
from tensorflow.keras.preprocessing.text import Tokenizer

tokenizer = Tokenizer(num_words=2)
texts = ["my name is aldi", "my name is john", "my name is albert"]
tokenizer.num_words = 2
tokenizer.fit_on_texts(texts)
tokenizer.word_index

{'my': 1, 'name': 2, 'is': 3, 'aldi': 4, 'john': 5, 'albert': 6}

In [7]:
tokenizer.num_words = 4
tokenizer.texts_to_sequences(texts)

[[1, 2, 3], [1, 2, 3], [1, 2, 3]]

In [5]:
tokenizer.word_index

{'my': 1, 'name': 2, 'is': 3, 'aldi': 4, 'john': 5, 'albert': 6}

In [8]:
tokenizer.texts_to_sequences(texts)

[[1, 2, 3], [1, 2, 3], [1, 2, 3]]