In [122]:
import numpy as np
import tensorflow as tf
import re
import time
import os
from collections import defaultdict
from joblib import dump

#### Setting up working directory information

In [2]:
# Anchor every path on the directory the notebook kernel was started in.
working_directory = os.getcwd()
# The corpus files are read from <working_directory>/data.
data_directory = os.path.join(working_directory, "data")

#### Load the data

In [4]:
def _read_corpus_lines(filename):
    """Return the contents of ``data_directory/filename`` split on newlines.

    The file is decoded as UTF-8 and undecodable bytes are dropped
    (``errors='ignore'``). If the file ends with a newline, the last element
    of the returned list is an empty string.
    """
    corpus_path = os.path.join(data_directory, filename)
    # The context manager closes the handle deterministically instead of
    # leaving it open until garbage collection.
    with open(corpus_path, encoding='utf-8', errors='ignore') as corpus_file:
        return corpus_file.read().split('\n')


lines = _read_corpus_lines('movie_lines.txt')
conversations = _read_corpus_lines('movie_conversations.txt')

In [42]:
# Field delimiter used inside each record of both corpus files
# (see the sample records displayed in this notebook).
data_separator = " +++$+++ "

In [43]:
# Peek at the first five raw records to see the field layout
lines[:5]

['L1045 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ They do not!',
 'L1044 +++$+++ u2 +++$+++ m0 +++$+++ CAMERON +++$+++ They do to!',
 'L985 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ I hope so.',
 'L984 +++$+++ u2 +++$+++ m0 +++$+++ CAMERON +++$+++ She okay?',
 "L925 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ Let's go."]

In [44]:
# Build a lookup from line id (first field) to the spoken text (last field).
id_to_line = {}
for raw_record in lines:
    fields = raw_record.split(data_separator)
    # Only records with exactly five fields are usable; anything else
    # (for example an empty string) is skipped.
    if len(fields) != 5:
        continue
    line_id, utterance = fields[0], fields[-1]
    id_to_line[line_id] = utterance.strip()

In [45]:
# Peek at the first five conversation records
conversations[:5]

["u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L194', 'L195', 'L196', 'L197']",
 "u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L198', 'L199']",
 "u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L200', 'L201', 'L202', 'L203']",
 "u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L204', 'L205', 'L206']",
 "u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L207', 'L208']"]

In [46]:
# Characters stripped from the serialized id list: single quotes, whitespace
# and square brackets.
junk_characters = r"['\s\[\]]"

# The last field of a record looks like "['L194', 'L195', ...]". Removing the
# junk characters leaves "L194,L195,...", which splits cleanly on commas.
# Blank records (such as the empty string a trailing newline produces) are
# skipped explicitly instead of assuming exactly one of them sits at the end
# of the list, so a file without a final newline keeps its last conversation.
conversations_ids = [
    re.sub(junk_characters, "", conv.split(data_separator)[-1]).split(",")
    for conv in conversations
    if conv.strip()
]

In [47]:
# Spot-check the parsed id list of the first conversation
conversations_ids[0]

['L194', 'L195', 'L196', 'L197']

#### Create Questions and Answers data
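Each entry of `conversations_ids` is the ordered list of line ids for one conversation. Every pair of consecutive lines (line *i*, line *i+1*) becomes one (question, answer) example, so each interior line is used both as an answer and as the next question.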

In [48]:
# Parallel lists: questions[k] is answered by answers[k].
questions = []
answers = []

for line_ids in conversations_ids:
    # Walk over consecutive (current, next) id pairs of the conversation.
    for prompt_id, reply_id in zip(line_ids, line_ids[1:]):
        questions.append(id_to_line[prompt_id])
        answers.append(id_to_line[reply_id])

##### Clean the text

In [68]:
# Regex pattern -> replacement, applied in insertion order by clean_text.
# Order matters: the final punctuation pattern removes apostrophes, so every
# contraction pattern has to run before it.
common_contractions = {
    r"i'm": "i am",
    r"he's": "he is",
    r"she's": "she is",
    r"that's": "that is",
    r"what's": "what is",
    r"where's": "where is",
    r"\'ll": " will",
    r"\'ve": " have",
    r"\'re": " are",
    r"\'d": " would",
    r"won't": "will not",
    r"can't": "can not",
    r"&": "and",
    # Punctuation to delete. The hyphen is escaped so that it is a literal
    # character: unescaped, "=-`" is a range from "=" to "`" that also covers
    # "A"-"Z", "^" and "\". "^" and "\" are listed explicitly so the result on
    # lowercased text is the same as before.
    # NOTE(review): "!" and "~" are not in this set and therefore survive
    # cleaning - confirm that this is intended.
    r"[$()\"#/@;:<>{}+=\-`|.?,\'*%_\[\]\\^]": ""
}

In [69]:
def clean_text(text):
    """Lowercase ``text`` and apply every substitution in ``common_contractions``.

    ``text`` is converted with ``str()`` first, so non-string input is accepted.
    The patterns are applied one after another in the dictionary's iteration
    order, each one operating on the output of the previous substitution.
    Returns the resulting string.
    """
    cleaned = str(text).lower()
    for pattern in common_contractions:
        cleaned = re.sub(pattern, common_contractions[pattern], cleaned)
    return cleaned

In [71]:
# Normalise both sides of every pair with the same cleaning routine.
cleaned_questions = list(map(clean_text, questions))
cleaned_answers = list(map(clean_text, answers))

###### Removing infrequent words

In [104]:
# Frequency of every whitespace-separated token across all questions and answers.
word_counts = defaultdict(int)
for q, a in zip(cleaned_questions, cleaned_answers):
    # Tokenise the question and the answer separately. Concatenating them as
    # q+a (with no space) would fuse the last word of the question and the
    # first word of the answer into one bogus token.
    for word in q.split() + a.split():
        word_counts[word] += 1

# Highest count reached by any single word; stays at 1 when nothing was counted.
max_word_count = max(word_counts.values(), default=1)

In [105]:
# Frequency window for the vocabulary: a word is kept only when its count lies
# in the inclusive range [thresh_lower, thresh_upper].
thresh_lower = 20 # Approx 5% - NOTE(review): unclear what this is 5% of; confirm
thresh_upper = 500000

###### Create word-index maps

In [106]:
# Words whose frequency falls inside the inclusive threshold window, in the
# order they were first counted.
kept_words = [
    word
    for word, count in word_counts.items()
    if thresh_lower <= count <= thresh_upper
]

# Consecutive ids starting at 0.
word_to_idx_map = {word: position for position, word in enumerate(kept_words)}

# Next unused id, i.e. the number of words kept.
idx = len(word_to_idx_map)

In [107]:
# Special tokens: padding, end-of-string, unknown (word filtered out by the
# frequency thresholds), start-of-string.
additional_tokens = ["<PAD>", "<EOS>", "<UNK>", "<SOS>"]
for token in additional_tokens:
    # The existing ids run from 0 to len-1, so len(word_to_idx_map) is the next
    # free id. The previous "+1" skipped one id and left the largest id equal
    # to the vocabulary size. setdefault also keeps the cell idempotent:
    # re-running it no longer reassigns all four tokens to one shared id.
    word_to_idx_map.setdefault(token, len(word_to_idx_map))

In [108]:
# Reverse lookup: id -> word (for a duplicated id the later entry wins).
idx_to_word_map = dict(zip(word_to_idx_map.values(), word_to_idx_map.keys()))

##### Add EOS token to all answers

In [109]:
# Mark the end of every answer with a space-separated <EOS> token.
cleaned_answers_with_eos = [answer + " <EOS>" for answer in cleaned_answers]

##### Convert question and answer words to indexes

In [113]:
all_words = word_to_idx_map.keys()


def _words_to_indexes(sentence):
    """Map each whitespace-separated token of ``sentence`` to its vocabulary id.

    Tokens that are not in the vocabulary are encoded with the id of "<UNK>".
    """
    encoded = []
    for token in sentence.split():
        key = token if token in all_words else "<UNK>"
        encoded.append(word_to_idx_map[key])
    return encoded


questions_to_index = [_words_to_indexes(q) for q in cleaned_questions]
answers_to_index = [_words_to_indexes(a) for a in cleaned_answers_with_eos]

###### Sort pairs by question length; drop empty questions and questions longer than the threshold

In [119]:
# Longest question (in tokens) that is kept.
MAX_LEN = 30

# Positions of the pairs to keep: non-empty questions of at most MAX_LEN tokens.
# The upper bound is inclusive, so only questions longer than the threshold are
# dropped; the previous range(1, MAX_LEN) also discarded questions of exactly
# MAX_LEN tokens.
# NOTE(review): only the question length is bounded; answers of any length are
# kept - confirm that this is intended.
kept_positions = [
    position
    for position, question in enumerate(questions_to_index)
    if 1 <= len(question) <= MAX_LEN
]

# list.sort is stable, so pairs with equal question length keep their original
# relative order - the same ordering the per-length rescans produced, obtained
# in a single sort instead of one full pass over the data per length.
kept_positions.sort(key=lambda position: len(questions_to_index[position]))

# Four parallel views of the same ordering: encoded and text, question and answer.
sorted_questions_idx = [questions_to_index[p] for p in kept_positions]
sorted_answers_idx = [answers_to_index[p] for p in kept_positions]
sorted_questions = [cleaned_questions[p] for p in kept_positions]
sorted_answers = [cleaned_answers_with_eos[p] for p in kept_positions]

In [None]:
# Persist the length-sorted, index-encoded questions under the data directory.
# NOTE(review): joblib's dump writes its own pickle-based format, not HDF5,
# despite the ".h5" suffix - confirm the loader uses joblib.load.
questions_dump_path = os.path.join(data_directory, "questions_to_idx.h5")
dump(sorted_questions_idx, questions_dump_path)