In [306]:
import re

import numpy as np
import pandas as pd
import keras
import sklearn
import nltk
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
import gensim
from nltk.corpus import stopwords
from random import randint

# Download the NLTK 'punkt' and 'stopwords' packages into the local
# nltk_data directory (the log shows them reported as already up to date).
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     /home/bernardoabreu/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/bernardoabreu/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

## LOAD DATA

### Load movie

In [378]:
filename = '../changes/10thingsihateaboutyou.txt'

# Each line yields a (tag, text) pair: the tag is the character at index 1 and
# the text runs from index 6 up to (not including) the last two characters.
# NOTE(review): presumably those two characters are a closing delimiter plus
# the newline — confirm against the file format.
with open(filename, 'r') as f:
    movie = [(line[1], line[6:-2]) for line in f]

# Lines tagged 'P' or 'E' lose one extra character at each end of their text.
# The tags must be a real tuple: the previous ('P, E') was a single string, so
# `in` performed a substring test that also matched ',' and ' '.
movie = [
    (tag, text[1:-1]) if tag in ('P', 'E') else (tag, text)
    for tag, text in movie
]
print(len(movie))

2811


##### Tokenize each sentence

In [379]:
# Split the text of every (tag, text) pair into word tokens, keeping the tag.
movie_tokens = [
    (tag, nltk.tokenize.word_tokenize(text))
    for tag, text in movie
]
# Disabled alternatives (stop-word removal / punctuation-free tokenizer):
# movie_no_stopwords = []
# for t, m_list in movie_tokens:
#     new_movie = (t, [m for m in m_list if m not in set(stopwords.words('english'))])
#     if new_movie[1]:
#         movie_no_stopwords.append(new_movie)
# regextokenizer = nltk.tokenize.RegexpTokenizer(r'\w+')
# movie_no_punct = [(t[0], regextokenizer.tokenize(t[1])) for t in movie]

In [382]:
# Spot-check one tokenized (tag, words) entry and print how many entries exist.
print(movie_tokens[50])
print(len(movie_tokens))

('D', ['so', 'they', 'tell', 'me', '...'])
2811


## Load word2vec

In [4]:
# Load pre-trained word vectors as gensim KeyedVectors.
# NOTE(review): the file is read with the loader's default settings, i.e. it is
# presumably in plain-text word2vec format — confirm.
w2v_model = gensim.models.KeyedVectors.load_word2vec_format("../data/processed/word2vec.6b.100d.txt")

In [6]:
import pickle

# Cache the loaded vectors as a pickle. The context manager guarantees the
# file is flushed and closed; the previous bare open() left the handle dangling.
# NOTE(review): the file name says "300" while the vectors come from a "100d"
# file — confirm the intended name before relying on it.
with open('../data/processed/word2vec_model_6b_300.p', 'wb') as f:
    pickle.dump(w2v_model, f)

##### Split movie and get list of tags

In [383]:
def split_movies(movie_tokens):
    """Flatten tagged token lists into parallel word and tag lists.

    Words that are not present in ``w2v_model.vocab`` are dropped.

    Args:
        movie_tokens: iterable of ``(tag, words)`` pairs.

    Returns:
        ``(tokens, tags)``: two lists of equal length where ``tags[i]`` is the
        tag of the entry that ``tokens[i]`` came from.
    """
    kept = [
        (word, tag)
        for tag, seq in movie_tokens
        for word in seq
        if word in w2v_model.vocab
    ]
    tokens = [word for word, _ in kept]
    tags = [tag for _, tag in kept]
    return tokens, tags

In [384]:
# Flatten the whole script into one word stream plus a parallel list of tags.
tokens, tags = split_movies(movie_tokens)

### Organize into sequences of tokens

In [385]:
# Number of context words per sample; each window below holds one extra word,
# which later becomes the prediction target (the 'Y' column).
SENTENCE_LENGTH = 50
sequences = list(map(list, nltk.ngrams(tokens, SENTENCE_LENGTH + 1)))
print('Total Sequences: %d' % len(sequences))
print(sequences[1])

Total Sequences: 21736
['high', 'school', 'day', 'welcome', 'to', 'padua', 'high', 'school', ',', 'your', 'typical', 'urban', 'suburban', 'high', 'school', 'in', 'portland', ',', 'oregon', '.', 'smarties', ',', 'skids', ',', 'preppies', ',', '.', 'loners', ',', 'lovers', ',', 'the', 'in', 'and', 'the', 'out', 'crowd', 'rub', 'sleep', 'out', 'of', 'their', 'eyes', 'and', 'head', 'for', 'the', 'main', 'building', '.', 'padua']


### Save sequences to file

In [386]:
out_filename = 'movie_sequences.txt'
with open(out_filename, 'w') as f:
    # One window per line, words separated by single spaces, no trailing newline.
    f.write('\n'.join(map(' '.join, sequences)))

### Load sequences from file

In [387]:
in_filename = 'movie_sequences.txt'

# Name the columns up front: X0..X{SENTENCE_LENGTH-1} are the context words and
# the final column is the target 'Y'. This replaces a rename that relied on an
# undefined variable (`length`) and on read_csv's `prefix=` argument, which
# newer pandas releases no longer accept.
column_names = ['X%d' % i for i in range(SENTENCE_LENGTH)] + ['Y']
df_seq = pd.read_csv(
    in_filename,
    sep=' ',
    header=None,
    names=column_names,
    # Every cell is a word: keep it as text and do not let tokens such as
    # "null" or "nan" be converted to missing values.
    dtype=str,
    keep_default_na=False,
)
print(df_seq.shape)

(21736, 51)


In [388]:
# Preview the first rows of the word windows.
df_seq.head()

Unnamed: 0,X0,X1,X2,X3,X4,X5,X6,X7,X8,X9,...,X41,X42,X43,X44,X45,X46,X47,X48,X49,Y
0,padua,high,school,day,welcome,to,padua,high,school,",",...,of,their,eyes,and,head,for,the,main,building,.
1,high,school,day,welcome,to,padua,high,school,",",your,...,their,eyes,and,head,for,the,main,building,.,padua
2,school,day,welcome,to,padua,high,school,",",your,typical,...,eyes,and,head,for,the,main,building,.,padua,high
3,day,welcome,to,padua,high,school,",",your,typical,urban,...,and,head,for,the,main,building,.,padua,high,parking
4,welcome,to,padua,high,school,",",your,typical,urban,suburban,...,head,for,the,main,building,.,padua,high,parking,lot


### Integer encode sequences of words

In [389]:
# Shape before encoding: (number of windows, words per window).
print(df_seq.shape)

(21736, 51)


In [390]:
def word2idx(word):
    """Return the integer index of ``word`` in the word2vec vocabulary.

    Raises:
        KeyError: if ``word`` is not a key of ``w2v_model.vocab``.
    """
    entry = w2v_model.vocab[word]
    return entry.index

In [391]:
# Replace every word in the frame with its integer vocabulary index.
# NOTE(review): this rebinds df_seq, so re-running this cell without reloading
# the file would pass integers to word2idx — presumably a KeyError; verify.
df_seq = df_seq.applymap(word2idx)
df_seq.head()

Unnamed: 0,X0,X1,X2,X3,X4,X5,X6,X7,X8,X9,...,X41,X42,X43,X44,X45,X46,X47,X48,X49,Y
0,29318,152,164,122,3143,4,29318,152,164,1,...,3,44,2251,5,362,10,0,444,447,2
1,152,164,122,3143,4,29318,152,164,1,392,...,44,2251,5,362,10,0,444,447,2,29318
2,164,122,3143,4,29318,152,164,1,392,3682,...,2251,5,362,10,0,444,447,2,29318,152
3,122,3143,4,29318,152,164,1,392,3682,2227,...,5,362,10,0,444,447,2,29318,152,4625
4,3143,4,29318,152,164,1,392,3682,2227,5151,...,362,10,0,444,447,2,29318,152,4625,530


### Separate input and output

In [392]:
# The target is the 'Y' column; every remaining column is model input.
Y = df_seq['Y']
X = df_seq.drop(columns=['Y'])
print(X.shape, Y.shape, len(w2v_model.vocab), sep='\n')

(21736, 50)
(21736,)
400000


## Model Architecture

In [393]:
# Start from an empty Sequential model; layers are added in the cells below.
model = keras.models.Sequential()
model

<keras.engine.sequential.Sequential at 0x7fdc07471ef0>

### Add Embedding Layer

In [394]:
# The embedding dimensions are taken directly from the pre-trained matrix
# (rows = vocabulary entries, columns = vector size).
vocab_size, embedding_size = w2v_model.vectors.shape

# Initialise the embedding with the pre-trained vectors; each sample is a
# window of SENTENCE_LENGTH word indices.
model.add(keras.layers.Embedding(
    vocab_size,
    embedding_size,
    weights=[w2v_model.vectors],
    input_length=SENTENCE_LENGTH,
))

### Add LSTM Layers

In [395]:
# Two stacked LSTMs: the first returns the full sequence so the second can
# consume it; the second returns only its final output.
model.add(keras.layers.LSTM(units=50, return_sequences=True))
model.add(keras.layers.LSTM(units=50))
# Dense head ending in a softmax with one output per vocabulary entry.
model.add(keras.layers.Dense(units=50, activation='relu'))
model.add(keras.layers.Dense(units=vocab_size, activation='softmax'))

In [396]:
# summary() prints the table itself and returns None, so wrapping it in
# print() only appended a stray "None" line to the output.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_6 (Embedding)      (None, 50, 100)           40000000  
_________________________________________________________________
lstm_11 (LSTM)               (None, 50, 50)            30200     
_________________________________________________________________
lstm_12 (LSTM)               (None, 50)                20200     
_________________________________________________________________
dense_11 (Dense)             (None, 50)                2550      
_________________________________________________________________
dense_12 (Dense)             (None, 400000)            20400000  
Total params: 60,452,950
Trainable params: 60,452,950
Non-trainable params: 0
_________________________________________________________________
None


In [397]:
print('Compile model')
# Targets are integer word indices rather than one-hot vectors, hence the
# sparse variant of categorical cross-entropy.
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

Compile model


In [403]:
print('Begin training')
# Two passes over all windows, in mini-batches of 64.
model.fit(x=X, y=Y, epochs=2, batch_size=64)

Begin training
Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x7fdc29728fd0>

In [407]:
# Persist the trained model to 'model.h5' in the current working directory.
model.save('model.h5')

In [190]:
def idx2word(idx):
    """Return the vocabulary word stored at integer position ``idx``."""
    words = w2v_model.index2word
    return words[idx]

In [404]:
def generate_seq(model, seq_length, seed_text, n_words):
    """Sample ``n_words`` new words from ``model`` as a continuation of a seed.

    Args:
        model: object whose ``predict`` takes a batch holding one index
            sequence of length ``seq_length`` and returns one row of
            non-negative scores, one per vocabulary index.
        seq_length: number of most recent words fed to the model at each step.
        seed_text: list of words used as the initial context. Every word must
            be known to ``word2idx``. The list is not modified.
        n_words: number of words to generate.

    Returns:
        The generated words joined by single spaces (the seed is not included).
    """
    # Encode the seed once and keep the running context as integer ids. This
    # builds a new list, so the caller's seed_text is never mutated.
    context = [word2idx(word) for word in seed_text]
    result = []
    for _ in range(n_words):
        # Keep only the last seq_length ids of the context.
        encoded = pad_sequences([context], maxlen=seq_length, truncating='pre')
        yhat = model.predict(encoded, verbose=0)
        # Renormalise in float64: np.random.choice rejects probability vectors
        # whose sum is not close enough to 1, which float32 output can trigger.
        prob = np.asarray(yhat[0], dtype=np.float64)
        prob = prob / prob.sum()
        # Sample the next word instead of taking the arg-max.
        index = int(np.random.choice(len(prob), p=prob))
        context.append(index)
        result.append(idx2word(index))
    return ' '.join(result)

In [405]:
# Use a random training window as the seed, keeping its first SENTENCE_LENGTH
# words (the extra last word of each window is the prediction target).
# randint() includes both endpoints, so the upper bound has to be
# len(sequences) - 1; the previous bound could raise IndexError.
seed_text = sequences[randint(0, len(sequences) - 1)][:SENTENCE_LENGTH]
print(seed_text)

['walks', 'her', 'over', 'to', 'the', 'swingset', 'and', 'plops', 'her', 'down', 'in', 'a', 'swing', ',', 'moving', 'her', 'hands', 'to', 'hang', 'onto', 'the', 'chains', '.', 'patrick', 'how', "'s", 'that', '?', 'she', 'sits', 'and', 'looks', 'at', 'him', 'for', 'a', 'moment', 'with', 'a', 'smile', '.', 'then', 'falls', 'over', 'backward', '.', 'patrick', 'jesus', '.', 'you']


In [406]:
# The context length must match the model's input length, so use
# SENTENCE_LENGTH rather than a literal. A copy of the seed is passed so the
# seed printed below is exactly what the generator started from.
# The final argument is the number of words to generate.
generated = generate_seq(model, SENTENCE_LENGTH, seed_text[:SENTENCE_LENGTH], 50)
print(' '.join(seed_text) + ' ' + generated)

walks her over to the swingset and plops her down in a swing , moving her hands to hang onto the chains . patrick how 's that ? she sits and looks at him for a moment with a smile . then falls over backward . patrick jesus . you sarah cameron flood we did the with is favorite . . turns and out 's him comment you think ? never listen like knew christ her as cameron n't does stop her bogey certain look she a backpedal i takes her sister lou today opportunity i let back sassy bianca
