In [135]:
import os
import re
import string
import unicodedata

import inflect
import nltk
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [126]:
# Load the conversations file: each row's last '+++$+++'-separated field is
# turned into a list of line codes, and one such list is stored per row.
conv_path = os.path.join('datasets', 'movie_conversations.txt')
convs = []
with open(conv_path) as file:
    for row in file:
        # Keep only the last field; presumably a stringified list such as
        # "['L194', 'L195']" -- verify against the dataset file.
        one_line = row.split('+++$+++')[-1].strip()
        # Raw string: '\[' and '\]' are regex escapes, not valid Python string
        # escapes, so a plain literal triggers an invalid-escape warning.
        one_line = re.sub(r"[\[\]']", '', one_line)
        convs.append(one_line.split(', '))

In [127]:
# Expand every conversation into consecutive (question, answer) code pairs:
# each line code is paired with the code that immediately follows it.
pair_rows = []
for codes in convs:
    pair_rows.extend([first, second] for first, second in zip(codes, codes[1:]))
convs_pairs = pd.DataFrame(pair_rows,
                           columns=['question_code', 'answer_code'])

In [128]:
# Build a (code, text) lookup table from the movie lines file: the first
# '+++$+++'-separated field of a row is used as the code, the last as the text.
lines_path = os.path.join('datasets', 'movie_lines.txt')
# NOTE(review): no explicit encoding is passed to open(), so the platform
# default is used -- confirm it matches the dataset file.
with open(lines_path) as file:
    lines_convs = [
        [fields[0].strip(), fields[-1].strip()]
        for fields in (row.split('+++$+++') for row in file)
    ]
code_lines = pd.DataFrame(lines_convs, columns=['code', 'text'])

In [129]:
# Attach the text of each line code to the pairs table: the first merge
# resolves 'question_code', the second resolves 'answer_code'. Both merges
# bring in a 'code' column, which pandas suffixes to 'code_x' / 'code_y';
# those helper columns are dropped at the end.
text_df = (
    convs_pairs
    .merge(code_lines, left_on='question_code', right_on='code', how='left')
    .rename(columns={'text': 'question'})
    .merge(code_lines, left_on='answer_code', right_on='code', how='left')
    .rename(columns={'text': 'answer'})
    .drop(columns=['code_x', 'code_y'])
)

In [130]:
# Preview the first rows of the merged question/answer frame.
text_df.head()

Unnamed: 0,question_code,answer_code,question,answer
0,L194,L195,Can we make this quick? Roxanne Korrine and A...,"Well, I thought we'd start with pronunciation,..."
1,L195,L196,"Well, I thought we'd start with pronunciation,...",Not the hacking and gagging and spitting part....
2,L196,L197,Not the hacking and gagging and spitting part....,Okay... then how 'bout we try out some French ...
3,L198,L199,You're asking me out. That's so cute. What's ...,Forget it.
4,L200,L201,"No, no, it's my fault -- we didn't have a prop...",Cameron.


For background on the text-cleaning steps used below, check out [this gist](https://gist.github.com/MrEliptik/b3f16179aa2f530781ef8ca9a16499af).

In [137]:
def make_lowercase(text):
    """Return ``text`` with every cased character converted to lowercase."""
    lowered = text.lower()
    return lowered


def remove_urls(text):
    """Delete every run of non-whitespace characters that starts with 'http'.

    Surrounding whitespace is left untouched, so removing a URL in the middle
    of a sentence leaves two adjacent spaces behind.
    """
    url_pattern = re.compile(r"http\S+")
    return url_pattern.sub("", text)


def remove_punctuations(words):
    """Drop tokens that consist solely of punctuation characters.

    Args:
        words: iterable of string tokens.

    Returns:
        A new list with the same order, without tokens made up entirely of
        characters from ``string.punctuation`` (e.g. ".", "...", "--", "``").
        Empty tokens are dropped as well. Tokens that merely contain
        punctuation (e.g. "n't") are kept.
    """
    punctuation_chars = set(string.punctuation)
    # A plain `word in string.punctuation` is a substring test against one
    # long string, so multi-character tokens such as "..." or "--" would slip
    # through. Checking character by character handles them; all() over an
    # empty token is True, so empty strings are removed too.
    return [
        word for word in words
        if not all(char in punctuation_chars for char in word)
    ]


def remove_stopwords(words, stop_words=None):
    """Drop stop words from a list of tokens.

    Args:
        words: iterable of string tokens.
        stop_words: optional iterable of words to remove. When omitted, the
            English stop-word list from ``nltk.corpus.stopwords`` is used
            (the NLTK 'stopwords' corpus must be available for that).

    Returns:
        A new list containing the tokens of ``words`` that are not stop
        words, in their original order. Matching is exact and case-sensitive.
    """
    if stop_words is None:
        # Load the list once per call instead of once per token.
        stop_words = stopwords.words('english')
    # A set gives constant-time membership tests.
    stop_set = frozenset(stop_words)
    return [word for word in words if word not in stop_set]


def replace_numbers(words):
    """Spell out digit-only tokens.

    Every token for which ``str.isdigit()`` is true is replaced by the result
    of inflect's ``number_to_words``; all other tokens pass through unchanged.
    Returns a new list of the same length and order.
    """
    engine = inflect.engine()
    return [
        engine.number_to_words(token) if token.isdigit() else token
        for token in words
    ]


def remove_non_ascii(words):
    """Strip non-ASCII characters from every token.

    Each token is NFKD-normalized first, so accented letters decompose into a
    base letter plus combining marks; anything that still cannot be encoded as
    ASCII is then discarded. The output has the same length as the input, so a
    token with no ASCII content becomes an empty string.
    """
    def _to_ascii(token):
        decomposed = unicodedata.normalize('NFKD', token)
        ascii_bytes = decomposed.encode('ascii', 'ignore')
        return ascii_bytes.decode('utf-8', 'ignore')

    return [_to_ascii(token) for token in words]
    

def clean_text(text):
    """Turn a raw string into a list of cleaned tokens.

    Steps, in order: lowercase the text, remove URLs, tokenize with
    ``word_tokenize``, then pass the tokens through ``replace_numbers``,
    ``remove_non_ascii`` and ``remove_punctuations``.
    """
    normalized = remove_urls(make_lowercase(text))
    tokens = word_tokenize(normalized)
    # Token-level filters are applied one after another, in this order.
    for step in (replace_numbers, remove_non_ascii, remove_punctuations):
        tokens = step(tokens)
    return tokens

In [None]:
# Tokenize both text columns with clean_text, storing the token lists in new
# columns next to the raw text.
for source_col, token_col in (('question', 'quest_tokens'),
                              ('answer', 'answer_tokens')):
    text_df[token_col] = text_df[source_col].apply(clean_text)