In [19]:
import os
import re
import string
import unicodedata
import pandas as pd

import nltk
from nltk.tokenize import word_tokenize
import inflect

In [20]:
conv_path = os.path.join('datasets', 'movie_conversations.txt')
convs = []
with open(conv_path) as file:
    for row in file:
        one_line = row.split('+++$+++')[-1]
        one_line = one_line.strip()
        one_line = re.sub('[\[\'\]]', '', one_line)
        convs.append(one_line.split(', '))

In [21]:
convs_pairs = []
for row in convs:
    while len(row) > 1:
        convs_pairs.append(row[:2])
        row = row[1:]
convs_pairs = pd.DataFrame(convs_pairs, 
                           columns=['question_code', 'answer_code'])

In [22]:
lines_path = os.path.join('datasets', 'movie_lines.txt')
lines_convs = []
with open(lines_path) as file:
    for row in file:
        one_line = row.split('+++$+++')
        code, text = one_line[0], one_line[-1]
        lines_convs.append([code.strip(), text.strip()])
code_lines = pd.DataFrame(lines_convs, columns=['code', 'text'])

In [23]:
text_df = pd.merge(convs_pairs, code_lines, 
                   left_on='question_code', right_on='code',
                   how='left')
text_df.rename(columns={'text': 'question'},
               inplace=True)
text_df = pd.merge(text_df, code_lines, 
                   left_on='answer_code', right_on='code',
                   how='left')
text_df.rename(columns={'text': 'answer'},
               inplace=True)
text_df.drop(columns=['code_x', 'code_y'], inplace=True)

In [24]:
text_df.head()

Unnamed: 0,question_code,answer_code,question,answer
0,L194,L195,Can we make this quick? Roxanne Korrine and A...,"Well, I thought we'd start with pronunciation,..."
1,L195,L196,"Well, I thought we'd start with pronunciation,...",Not the hacking and gagging and spitting part....
2,L196,L197,Not the hacking and gagging and spitting part....,Okay... then how 'bout we try out some French ...
3,L198,L199,You're asking me out. That's so cute. What's ...,Forget it.
4,L200,L201,"No, no, it's my fault -- we didn't have a prop...",Cameron.


Check out [that](https://gist.github.com/MrEliptik/b3f16179aa2f530781ef8ca9a16499af).

In [28]:
def make_lowercase(text):
    return text.lower()


def remove_urls(text):
    return re.sub(r"http\S+", "", text)


def decontracted(text):
    # specific
    text = re.sub(r"won\'t", "will not", text)
    text = re.sub(r"can\'t", "can not", text)

    # general
    text = re.sub(r"n\'t", " not", text)
    text = re.sub(r"\'re", " are", text)
    text = re.sub(r"\'s", " is", text)
    text = re.sub(r"\'d", " would", text)
    text = re.sub(r"\'ll", " will", text)
    text = re.sub(r"\'t", " not", text)
    text = re.sub(r"\'ve", " have", text)
    text = re.sub(r"\'m", " am", text)
    return text


def remove_punctuations(words):
    clean_tokens = [word for word in words if word not in string.punctuation]
    return clean_tokens


def remove_stopwords(words):
    new_words = []
    for word in words:
        if word not in stopwords.words('english'):
            new_words.append(word)
    return new_words


def replace_numbers(words):
    p = inflect.engine()
    new_words = []
    for word in words:
        if word.isdigit():
            new_word = p.number_to_words(word)
            new_words.append(new_word)
        else:
            new_words.append(word)
    return new_words


def remove_non_ascii(words):
    new_words = []
    for word in words:
        new_word = unicodedata.normalize('NFKD', word).encode('ascii', 'ignore').decode('utf-8', 'ignore')
        new_words.append(new_word)
    return new_words
    

def clean_text(text):
    text = make_lowercase(text)
    text = remove_urls(text)
    text = decontracted(text)
    words = word_tokenize(text)
    words = replace_numbers(words)
    words = remove_non_ascii(words)
    words = remove_punctuations(words)
    return ' '.join(words)

In [29]:
text_df['clean_quest'] = text_df['question'].apply(clean_text)
text_df['clean_ans'] = text_df['answer'].apply(clean_text)

In [30]:
text_df.head()

Unnamed: 0,question_code,answer_code,question,answer,clean_quest,clean_ans
0,L194,L195,Can we make this quick? Roxanne Korrine and A...,"Well, I thought we'd start with pronunciation,...",can we make this quick roxanne korrine and and...,well i thought we would start with pronunciati...
1,L195,L196,"Well, I thought we'd start with pronunciation,...",Not the hacking and gagging and spitting part....,well i thought we would start with pronunciati...,not the hacking and gagging and spitting part ...
2,L196,L197,Not the hacking and gagging and spitting part....,Okay... then how 'bout we try out some French ...,not the hacking and gagging and spitting part ...,okay ... then how 'bout we try out some french...
3,L198,L199,You're asking me out. That's so cute. What's ...,Forget it.,you are asking me out that is so cute what is ...,forget it
4,L200,L201,"No, no, it's my fault -- we didn't have a prop...",Cameron.,no no it is my fault -- we did not have a prop...,cameron


### Saving data

In [31]:
encoder_input = text_df['clean_quest']
decoder_input = text_df['clean_ans']
encoder_input.to_csv('encoder_data.csv', index=False)
decoder_input.to_csv('decoder_data.csv', index=False)