In [1]:
import numpy as np
import tensorflow as tf
import re
import time
import os

#### Setting up working directory information

In [2]:
# The dataset is expected in a 'data' folder inside the directory the
# notebook process is running from (the current working directory).
working_directory = os.getcwd()
data_directory = os.path.join(working_directory, 'data')

#### Load the data

In [4]:
# Read both corpus files into lists of raw records, one entry per line.
# Context managers guarantee each file handle is closed as soon as it has been
# read; errors='ignore' skips bytes that are not valid UTF-8 instead of raising.
with open(os.path.join(data_directory, 'movie_lines.txt'), encoding='utf-8', errors='ignore') as lines_file:
    lines = lines_file.read().split('\n')

with open(os.path.join(data_directory, 'movie_conversations.txt'), encoding='utf-8', errors='ignore') as conversations_file:
    conversations = conversations_file.read().split('\n')

In [42]:
# Field delimiter used inside every record of the corpus files
# (the surrounding spaces are part of the delimiter).
data_separator = " +++$+++ "

In [43]:
# Preview the first five raw records of the movie-lines file
lines[:5]

['L1045 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ They do not!',
 'L1044 +++$+++ u2 +++$+++ m0 +++$+++ CAMERON +++$+++ They do to!',
 'L985 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ I hope so.',
 'L984 +++$+++ u2 +++$+++ m0 +++$+++ CAMERON +++$+++ She okay?',
 "L925 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ Let's go."]

In [44]:
# Build a lookup from line id (first field) to the spoken text (last field,
# whitespace-trimmed). Records that do not split into exactly five fields
# are ignored.
id_to_line = {
    fields[0]: fields[-1].strip()
    for fields in (record.split(data_separator) for record in lines)
    if len(fields) == 5
}

In [45]:
# Preview the first five raw records of the conversations file
conversations[:5]

["u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L194', 'L195', 'L196', 'L197']",
 "u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L198', 'L199']",
 "u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L200', 'L201', 'L202', 'L203']",
 "u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L204', 'L205', 'L206']",
 "u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L207', 'L208']"]

In [46]:
# Characters stripped from the serialized id list: quotes, whitespace, brackets.
junk_characters = r"['\s\[\]]"

# The last field of each record looks like "['L194', 'L195']"; removing the
# junk characters leaves "L194,L195", which is then split on commas.
# Blank records (such as the empty string left by the file's trailing newline)
# are filtered out explicitly instead of unconditionally dropping the final
# element, so the last conversation survives when the file has no trailing
# newline.
conversations_ids = [
    re.sub(junk_characters, "", conv.split(data_separator)[-1]).split(",")
    for conv in conversations
    if conv.strip()
]

In [47]:
# Sanity check: parsed line ids of the first conversation
conversations_ids[0]

['L194', 'L195', 'L196', 'L197']

#### Create Questions and Answers data
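Each entry in `conversations_ids` is the ordered list of line ids for one conversation. Every utterance is treated as the answer to the utterance before it and as the question for the utterance after it, so each pair of consecutive lines becomes one question/answer example.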

In [48]:
# Parallel lists: questions[k] is answered by answers[k].
questions, answers = [], []

for conv in conversations_ids:
    # Walk over consecutive pairs of line ids: the earlier line is the
    # question, the line that follows it is the answer.
    for question_id, answer_id in zip(conv, conv[1:]):
        questions.append(id_to_line[question_id])
        answers.append(id_to_line[answer_id])

##### Clean the text

In [68]:
# Regex pattern -> replacement, applied in insertion order.
# Order matters: the contraction rules must come before the final punctuation
# rule, because that rule strips the apostrophes the contraction rules match on.
common_contractions = {
    r"i'm": "i am",
    r"he's": "he is",
    r"she's": "she is",
    r"that's": "that is",
    r"what's": "what is",
    r"where's": "where is",
    r"\'ll": " will",
    r"\'ve": " have",
    r"\'re": " are",
    r"\'d": " would",
    r"won't": "will not",
    r"can't": "can not",
    r"&": "and",
    # Punctuation to delete. The hyphen is escaped so it is a literal '-' and
    # not a range operator: an unescaped "=-`" spans '=' through '`', which
    # also covers A-Z and would delete uppercase letters. '^' and '\' fall in
    # that span as well, so they are listed explicitly to keep the result on
    # lowercased text the same.
    # NOTE(review): '!' is not in this set, so exclamation marks are kept -
    # confirm that is intended.
    r"[$()\"#/@;:<>{}+=\-`|.?,\'*%_\[\]^\\]": ""
}

In [69]:
def clean_text(text):
    """Lowercase ``text`` and apply every substitution in ``common_contractions``.

    The input is coerced with ``str()`` before lowercasing. The patterns are
    applied one after another in the dictionary's iteration order, each one
    operating on the output of the previous substitution.

    Returns the resulting string.
    """
    cleaned = str(text).lower()
    for pattern in common_contractions:
        cleaned = re.sub(pattern, common_contractions[pattern], cleaned)
    return cleaned