In [42]:
import pandas as pd
import gensim
import tensorflow_hub as hub
import tensorflow as tf
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [43]:
# Read the zipped IMDB review CSV into a DataFrame and display it.
# NOTE(review): the path is an absolute, machine-specific location.
main_data = pd.read_csv(
    filepath_or_buffer='/Users/andrewsimon/Desktop/IMDBDataset.csv.zip',
)
main_data

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
...,...,...
49995,I thought this movie did a down right good job...,positive
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,I am a Catholic taught in parochial elementary...,negative
49998,I'm going to have to disagree with the previou...,negative


In [44]:
# Encode the sentiment labels as integers: 'positive' -> 1, 'negative' -> 0
label_codes = {'positive': 1, 'negative': 0}
main_data['sentiment'] = main_data['sentiment'].map(label_codes)
main_data

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,1
1,A wonderful little production. <br /><br />The...,1
2,I thought this was a wonderful way to spend ti...,1
3,Basically there's a family where a little boy ...,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",1
...,...,...
49995,I thought this movie did a down right good job...,1
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",0
49997,I am a Catholic taught in parochial elementary...,0
49998,I'm going to have to disagree with the previou...,0


In [46]:
# Load the pretrained ELMo module from local disk, then pick its 'default'
# signature as the callable used to embed text.
elmo_module = hub.load('/Users/andrewsimon/Downloads/elmo_3')
elmo = elmo_module.signatures['default']


In [47]:
# Sample data
# Work on a random subset of 20 reviews. A fixed random_state makes the
# subset (and everything computed from it downstream) reproducible across
# kernel restarts; without it every run trains on different rows.
# NOTE: this cell rebinds main_data, so re-running it samples from the
# already-sampled frame rather than from the full dataset.
main_data = main_data.sample(n=20, random_state=1516)
# reset_index() moves the original row labels into a new 'index' column.
main_data = main_data.reset_index()
main_data

Unnamed: 0,index,review,sentiment
0,35068,Barney is just awful. As many of the other rev...,0
1,22418,This is a film about deep and unspoken human r...,1
2,7845,Once when I was in college and we had an inter...,1
3,2391,"The time I wasted seeing this movie, I demand ...",0
4,28159,"I had to watch this movie for a film class, I ...",0
5,6767,It's as if the Stay-Puffed Marshmallow Man fro...,0
6,20977,Want a great recipe for failure? Take a crappy...,0
7,10408,Given this film's incredible reviews I was exp...,0
8,2633,"When ""Madame"" decides to let her cats inherit ...",1
9,46234,"-may contain spoilers-<br /><br />Clearly, who...",0


In [48]:
# Hold out 20% of the sampled reviews for testing (fixed seed for repeatability)
reviews = main_data['review']
labels = main_data['sentiment']
X_train, X_test, y_train, y_test = train_test_split(
    reviews,
    labels,
    test_size=0.2,
    random_state=1516,
)

In [49]:
# Convert the label Series to plain NumPy arrays
y_train, y_test = y_train.to_numpy(), y_test.to_numpy()

In [50]:
# Run the train and test reviews through ELMo and keep the 'elmo' output
input_tensor_train, input_tensor_test = X_train, X_test

train_elmo_outputs = elmo(tf.constant(input_tensor_train))
embeddings_tensor_train = train_elmo_outputs['elmo']

test_elmo_outputs = elmo(tf.constant(input_tensor_test))
embeddings_tensor_test = test_elmo_outputs['elmo']

In [51]:
# Convert the embedding tensors to NumPy arrays
embeddings_train, embeddings_test = (
    embeddings_tensor_train.numpy(),
    embeddings_tensor_test.numpy(),
)

In [52]:
# Pad/truncate every embedded review to 120 time steps so the train and test
# arrays share one fixed sequence length for the dense model below.
# dtype='float32' is required here: pad_sequences defaults to dtype='int32',
# which would truncate the real-valued ELMo vectors to integers.
training_padded = pad_sequences(embeddings_train, maxlen=120, dtype='float32', truncating='post')
# Use the NumPy test embeddings (not the raw tensor), mirroring the train line.
testing_padded = pad_sequences(embeddings_test, maxlen=120, dtype='float32', truncating='post')

In [55]:
# Build a small dense classifier on the padded embeddings, train it with the
# held-out split as validation data, then print the architecture.

batch = 32
num_epochs = 10

model = tf.keras.Sequential()
model.add(tf.keras.layers.Flatten())
model.add(tf.keras.layers.Dense(6, activation='relu'))
model.add(tf.keras.layers.Dense(1, activation='sigmoid'))

tracked_metrics = ['accuracy', 'Recall', 'AUC', 'Precision', 'FalseNegatives', 'FalsePositives']
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=tracked_metrics)

model.fit(
    training_padded,
    y_train,
    epochs=num_epochs,
    batch_size=batch,
    validation_data=(testing_padded, y_test),
)
model.summary()

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Model: "sequential_6"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 flatten_6 (Flatten)         (None, 122880)            0         
                                                                 
 dense_12 (Dense)            (None, 6)                 737286    
                                                                 
 dense_13 (Dense)            (None, 1)                 7         
                                                                 
Total params: 737,293
Trainable params: 737,293
Non-trainable params: 0
_________________________________________________________________
