In [1]:
import pandas as pd
from pandas import Series, DataFrame
from IPython.display import display, HTML
import numpy as np

# Reading processed data

In [2]:
delimiter='$'
columns = ['first_char_id', 'second_char_id', 'movie_id', 'source_sentence', 'target_sentence']
source_df = pd.read_csv('../../data/cornell_corpus/clean_data.csv',delimiter=delimiter,names=columns,skiprows=[0])

In [3]:
display(source_df)

Unnamed: 0,first_char_id,second_char_id,movie_id,source_sentence,target_sentence
0,u0,u2,m0,Can we make this quick? Roxanne Korrine and A...,"Well, I thought we'd start with pronunciation,..."
1,u0,u2,m0,"Well, I thought we'd start with pronunciation,...",Not the hacking and gagging and spitting part....
2,u0,u2,m0,Not the hacking and gagging and spitting part....,Okay... then how 'bout we try out some French ...
3,u0,u2,m0,You're asking me out. That's so cute. What's ...,Forget it.
4,u0,u2,m0,"No, no, it's my fault -- we didn't have a prop...",Cameron.
5,u0,u2,m0,Cameron.,"The thing is, Cameron -- I'm at the mercy of a..."
6,u0,u2,m0,"The thing is, Cameron -- I'm at the mercy of a...",Seems like she could get a date easy enough...
7,u0,u2,m0,Why?,Unsolved mystery. She used to be really popul...
8,u0,u2,m0,Unsolved mystery. She used to be really popul...,That's a shame.
9,u0,u2,m0,"Gosh, if only we could find Kat a boyfriend...",Let me see what I can do.


# Sentence Preprocessing

## Get word dictionary

In [4]:
import string
# This function is used to remove punctuation, espace in each sentence, and do the tokenization
def clean_and_tokenization(sentence):
    if not sentence or sentence == '':
        return
    
    clean_sentence = "".join(char for char in str(sentence).strip().lower() if char not in string.punctuation)
    
    return clean_sentence.split(' ')


# Unit test
sentence = "   Choose your targets men. That's right Watch for you good!!   "
print(clean_and_tokenization(sentence))

['choose', 'your', 'targets', 'men', 'thats', 'right', 'watch', 'for', 'you', 'good']


In [5]:
# Now we use this function to get word list first

word_list = []
for row in source_df.iterrows():
    try: 
        series = row[1]
        
        source_sentence = series['source_sentence']
        target_sentence = series['target_sentence']
        
        source_words = clean_and_tokenization(source_sentence)
        target_words = clean_and_tokenization(target_sentence)
        
        word_list.extend(source_words)
        word_list.extend(target_words)
        
    except Exception as e:
        print(e)
        pass

    

In [6]:
word_set = set(word_list)

In [7]:
# To know the metrics

print("Total words: {}".format(len(word_list)))
print("Unique words: {}".format(len(word_set)))

Total words: 4749484
Unique words: 66501


In [8]:
# Create word to int dic and int to word dic

word_to_int={}
int_to_word={}
index = 1

for word in word_set:
    word_to_int[word] = index
    int_to_word[str(index)] = word
    
    index = index+1


### Quick test

print(word_to_int['stay'])
print(int_to_word['25'])

15853
fare


In [9]:
# Convert the word list to index list using the word_to_int dictionary

word_index_list = [word_to_int[word] for word in word_list]

## Subsampling

### Some words like 'a', 'the', 'this' has no significant meaning, should remove them from word list in order to gain a better preformance

In [10]:
from collections import Counter

threshold=1e-5
word_counts = Counter(word_index_list)
total_count = len(word_index_list)

frequence = {word: count/total_count for word, count in word_counts.items()}
drop_prob = {word: 1 - np.sqrt(threshold / frequence[word]) for word in word_counts}

In [13]:
# Use a random to decide if we pick a word into training word
import random

trainning_word_index_list=[word for word in word_index_list if random.random() < (1 - drop_prob[word])]

In [18]:
# An unit test

test_list = trainning_word_index_list[:30]

for index in test_list:
    print(int_to_word[str(index)])

can
roxanne
korrine
andrew
barrett
incredibly
horrendous
quad
thought
wed
start
pronunciation
pronunciation
hacking
gagging
spitting
hacking
gagging
spitting
please
okay
french
cuisine
saturday
night
cute
fault
introduction
cameron
cameron
