In [63]:
import re

import numpy as np
import pandas as pd
import keras
import sklearn
import nltk
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import gensim

nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     /home/disk2/speed/bernardoabreu/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

## LOAD DATA

### Load movie

In [25]:
filename = '../changes/10thingsihateaboutyou.txt'

with open(filename, 'r') as f:
    movie = [(line[1],line[6:-2]) for line in f]

movie = [t if t[0] not in ('P, E') else (t[0],t[1][1:-1]) for t in movie]
movie

[('A',
  "padua high school day welcome to padua high school,, your typical urban suburban high school in portland, oregon. smarties, skids, preppies, granolas. loners, lovers, the in and the out crowd rub sleep out of their eyes and head for the main building. padua high parking lot day kat stratford, eighteen, pretty but trying hard not to be in a baggy granny dress and glasses, balances a cup of coffee and a backpack as she climbs out of her battered, baby blue '75 dodge dart. a stray skateboard clips her, causing her to stumble and spill her coffee, as well as the contents of her backpack. the young rider dashes over to help, trembling when he sees who his board has hit."),
 ('C', 'rider'),
 ('D', 'hey sorry.'),
 ('A', 'cowering in fear, he attempts to scoop up her scattered belongings.'),
 ('C', 'kat'),
 ('D', 'leave it'),
 ('A', 'he persists.'),
 ('C', 'kat '),
 ('E', 'continuing'),
 ('D', 'i said, leave it!'),
 ('A',
  'she grabs his skateboard and uses it to shove him against a

##### Tokenize each sentence

In [26]:
movie = [(t[0], nltk.tokenize.word_tokenize(t[1])) for t in movie]
movie

[('A',
  ['padua',
   'high',
   'school',
   'day',
   'welcome',
   'to',
   'padua',
   'high',
   'school',
   ',',
   ',',
   'your',
   'typical',
   'urban',
   'suburban',
   'high',
   'school',
   'in',
   'portland',
   ',',
   'oregon',
   '.',
   'smarties',
   ',',
   'skids',
   ',',
   'preppies',
   ',',
   'granolas',
   '.',
   'loners',
   ',',
   'lovers',
   ',',
   'the',
   'in',
   'and',
   'the',
   'out',
   'crowd',
   'rub',
   'sleep',
   'out',
   'of',
   'their',
   'eyes',
   'and',
   'head',
   'for',
   'the',
   'main',
   'building',
   '.',
   'padua',
   'high',
   'parking',
   'lot',
   'day',
   'kat',
   'stratford',
   ',',
   'eighteen',
   ',',
   'pretty',
   'but',
   'trying',
   'hard',
   'not',
   'to',
   'be',
   'in',
   'a',
   'baggy',
   'granny',
   'dress',
   'and',
   'glasses',
   ',',
   'balances',
   'a',
   'cup',
   'of',
   'coffee',
   'and',
   'a',
   'backpack',
   'as',
   'she',
   'climbs',
   'out',
   'of'

## Load word2vec

In [5]:
w2v_model = gensim.models.KeyedVectors.load_word2vec_format("../data/processed/worde2vec.6b.300d.txt")

##### Split movie and get list of tags

In [49]:
tokens = []
tags = []
for tag, seq in movie:
    block = [word for word in seq if word in w2v_model.wv.vocab]
    tokens.extend(block)
    tags.extend(([tag]*len(block)))

tokens

  after removing the cwd from sys.path.


['padua',
 'high',
 'school',
 'day',
 'welcome',
 'to',
 'padua',
 'high',
 'school',
 ',',
 ',',
 'your',
 'typical',
 'urban',
 'suburban',
 'high',
 'school',
 'in',
 'portland',
 ',',
 'oregon',
 '.',
 'smarties',
 ',',
 'skids',
 ',',
 'preppies',
 ',',
 '.',
 'loners',
 ',',
 'lovers',
 ',',
 'the',
 'in',
 'and',
 'the',
 'out',
 'crowd',
 'rub',
 'sleep',
 'out',
 'of',
 'their',
 'eyes',
 'and',
 'head',
 'for',
 'the',
 'main',
 'building',
 '.',
 'padua',
 'high',
 'parking',
 'lot',
 'day',
 'kat',
 'stratford',
 ',',
 'eighteen',
 ',',
 'pretty',
 'but',
 'trying',
 'hard',
 'not',
 'to',
 'be',
 'in',
 'a',
 'baggy',
 'granny',
 'dress',
 'and',
 'glasses',
 ',',
 'balances',
 'a',
 'cup',
 'of',
 'coffee',
 'and',
 'a',
 'backpack',
 'as',
 'she',
 'climbs',
 'out',
 'of',
 'her',
 'battered',
 ',',
 'baby',
 'blue',
 "'75",
 'dodge',
 'dart',
 '.',
 'a',
 'stray',
 'skateboard',
 'clips',
 'her',
 ',',
 'causing',
 'her',
 'to',
 'stumble',
 'and',
 'spill',
 'her',
 '

### Organize into sequences of tokens

In [58]:
length = 50 + 1
sequences = list()
for i in range(length, len(tokens)):
    # select sequence of tokens
    seq = tokens[i-length:i]
    # convert into a line
    line = ' '.join(seq)
    # store
    sequences.append(line)
print('Total Sequences: %d' % len(sequences))

Total Sequences: 21924


### Save sequences to file

In [None]:
out_filename = 'movie_sequences.txt'
with open(out_filename, 'w') as f:
    f.write('\n'.join(sequences))

### Separate input and output

In [None]:
print('separate into input and output')
# separate into input and output
sequences = array(sequences)
X, y = sequences[:, :-1], sequences[:, -1]
y = to_categorical(y, num_classes=vocab_size)
seq_length = X.shape[1]

## Model Architecture

In [55]:
model = keras.models.Sequential()
model

<keras.engine.sequential.Sequential at 0x7f56ed376080>

### Add Embedding Layer

In [56]:
model.add(
    keras.layers.Embedding(
        input_dim=len(w2v_model.wv.vocab),
        output_dim=300,
        input_length=length         
    )
)

  This is separate from the ipykernel package so we can avoid doing imports until


### Add LSTM Layers

In [None]:
model.add(keras.layers.LSTM(300, return_sequences=True))
model.add(keras.layers.LSTM(300, return_sequences=True))

In [67]:
# define documents
docs = ['Well done!',
        'Good work',
        'Great effort',
        'nice work',
        'Excellent!',
        'Weak',
        'Poor effort!',
        'not good',
        'poor work',
        'Could have done better.']

# define class labels
labels = np.array([1,1,1,1,1,0,0,0,0,0])

# prepare tokenizer
t = Tokenizer()
t.fit_on_texts(docs)
vocab_size = len(t.word_index) + 1
print(vocab_size)

# integer encode the documents
encoded_docs = t.texts_to_sequences(docs)
print(encoded_docs)

# pad documents to a max length of 4 words
max_length = 4
padded_docs = pad_sequences(encoded_docs, maxlen=max_length, padding='post')
print(padded_docs)

15
[[6, 2], [3, 1], [7, 4], [8, 1], [9], [10], [5, 4], [11, 3], [5, 1], [12, 13, 2, 14]]
[[ 6  2  0  0]
 [ 3  1  0  0]
 [ 7  4  0  0]
 [ 8  1  0  0]
 [ 9  0  0  0]
 [10  0  0  0]
 [ 5  4  0  0]
 [11  3  0  0]
 [ 5  1  0  0]
 [12 13  2 14]]
