In [51]:
import nltk
from nltk import bigrams
from nltk.tokenize import word_tokenize
import pandas as pd

# Raw movie dialogue: one row per spoken line.
MOVIE_CSV = 'data/moviedata.csv'
data = pd.read_csv(MOVIE_CSV)

In [52]:
# Preview the first five rows of the raw frame.
data.head(n=5)

Unnamed: 0,movie,character_name,line_num,line
0,Avengers,Nick Fury,0,how bad is it?
1,Avengers,Nick Fury,1,nasa didn't authorize selvig to test phase.
2,Avengers,Nick Fury,2,what are the energy levels now?
3,Avengers,Nick Fury,3,how long to get everyone out?
4,Avengers,Nick Fury,4,do better.


# Pre-Processing Data
To get the data ready for machine learning, we need to tokenize the lines, filter out tokens that occur only once, and vectorize the remaining tokens.

## Tokenizing 
Our first step is to tokenize. We will include unigrams *and* bigrams in our set.

In [53]:
# Work on a NaN-free deep copy so `data` is not affected by the
# column added at the end of this cell.
data2 = data.dropna().copy(deep=True)

# Tokenize every line once; the same tokens feed both n-gram orders.
unigrams = data2['line'].apply(word_tokenize)

# Use `nltk.bigrams` explicitly instead of the bare imported name:
# this cell rebinds `bigrams` to a Series, so the bare name no longer
# refers to the NLTK helper when the cell is run a second time.
bigrams = unigrams.apply(
    lambda tokens: ['_'.join(pair) for pair in nltk.bigrams(tokens)])

# Adding two Series of lists concatenates them row by row:
# unigrams first, then the underscore-joined bigrams.
data2['tokens'] = unigrams + bigrams

In [54]:
# Tokens of the row labelled 0 (label-based lookup, not positional).
data2.loc[0, 'tokens']

['how', 'bad', 'is', 'it', '?', 'how_bad', 'bad_is', 'is_it', 'it_?']

## Removing 1-Count Occurrences
Before including the tokens in the final data set, we need to filter out the unigrams that occur only once across all lines. Bigrams are kept as they are.

In [55]:
# The output of the code below is saved in
# data/moviedata_tokens.csv, so you don't have
# to run it. If you want to regenerate the file
# for any reason, remove the surrounding triple
# quotes and run the cell!

'''
# flatten the list of unigram tokens into a single list of words
words = [word for token_list in unigrams for word in token_list]

# create frequency distribution of the words
freq_dist = nltk.FreqDist(words)

# Filter out words with a count of 1
uni_filtered_words = [word for word in words if freq_dist[word] > 1]

# Combine filtered unigrams with bigrams.
# A set is used so that the membership test in the filter below is a
# constant-time lookup instead of a scan over every token occurrence.
bi_words = [word for token_list in bigrams for word in token_list]
filtered_words = set(uni_filtered_words).union(bi_words)

# Remove 1-count occurrences from the tokenized text column
data2['tokens_filtered'] = data2['tokens'].apply(
    lambda x: [word for word in x if word in filtered_words])

data3 = data2.drop(columns='tokens').rename(
    columns = {'tokens_filtered': 'tokens'})

data3.to_csv('data/moviedata_tokens.csv', index=False)

# Last expression of the cell, so the preview is actually displayed
data2['tokens_filtered'].head()
'''


0    [how, bad, is, it, ?, how_bad, bad_is, is_it, ...
1    [did, n't, selvig, to, test, phase, ., nasa_di...
2    [what, are, the, energy, levels, now, ?, what_...
3    [how, long, to, get, everyone, out, ?, how_lon...
4                 [do, better, ., do_better, better_.]
Name: tokens_filtered, dtype: object

In [57]:
# Uncomment if you ran the code above and need to
# re-export to csv. (not recommended)

# data3.to_csv('data/moviedata_tokens.csv', index=False)