In [93]:
import pandas as pd
import gensim
import tensorflow_hub as hub
import tensorflow as tf
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np

In [94]:
# Load data
main_data = pd.read_csv('/Users/andrewsimon/Desktop/IMDBDataset.csv.zip')
main_data

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
...,...,...
49995,I thought this movie did a down right good job...,positive
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,I am a Catholic taught in parochial elementary...,negative
49998,I'm going to have to disagree with the previou...,negative


In [95]:
# Map sentiments to 1's and zeros
main_data['sentiment'] = main_data['sentiment'].map({'positive':1, 'negative': 0})
main_data

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,1
1,A wonderful little production. <br /><br />The...,1
2,I thought this was a wonderful way to spend ti...,1
3,Basically there's a family where a little boy ...,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",1
...,...,...
49995,I thought this movie did a down right good job...,1
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",0
49997,I am a Catholic taught in parochial elementary...,0
49998,I'm going to have to disagree with the previou...,0


In [96]:
# Preprocess data for W2v
main_data['review'] = main_data['review'].apply(lambda x: gensim.utils.simple_preprocess(x))
main_data.head()

Unnamed: 0,review,sentiment
0,"[one, of, the, other, reviewers, has, mentione...",1
1,"[wonderful, little, production, br, br, the, f...",1
2,"[thought, this, was, wonderful, way, to, spend...",1
3,"[basically, there, family, where, little, boy,...",0
4,"[petter, mattei, love, in, the, time, of, mone...",1


In [97]:
 main_data = main_data.sample(n=500)
 main_data = main_data.reset_index()

In [98]:
# Load pretrained elmo model
elmo = hub.load('/Users/andrewsimon/Downloads/elmo_3').signatures['default']


In [91]:
for i in range(len(main_data['review'])):
    try:
        embeddings_tensor = elmo(tf.constant(main_data['revew'][i]))['elmo']
        embeddings_tensor = embeddings_tensor.numpy()
        main_data['review'][i] = embeddings_tensor
    except KeyError:
        main_data['review'][i] = np.zeros(300,)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  main_data['review'][i] = np.zeros(300,)


In [99]:
for i in range(len(main_data['review'])):
    sub_list = []
    for j in main_data['review'][i]:
        try:
            embeddings_tensor = elmo(tf.constant([j]))['elmo']
            embeddings_tensor = embeddings_tensor.numpy()
            sub_list.append(embeddings_tensor)
        except KeyError:
            sub_list.append(np.zeros(300,))
    main_data['review'][i] = sub_list

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  main_data['review'][i] = sub_list


In [104]:
# Split train, test data
X_train, X_test, y_train, y_test = train_test_split(main_data['review'], main_data['sentiment'], test_size=0.1, random_state=1516)

In [105]:
# Pass to numpy arrays
y_train = y_train.to_numpy()
y_test = y_test.to_numpy()

In [106]:
# Pad data to remove ragged arrays
training_padded = pad_sequences(X_train, maxlen=120, truncating='post')
testing_padded = pad_sequences(X_test, maxlen=120, truncating='post')

In [107]:
# Create model, evaluate

batch = 32

model = tf.keras.Sequential([
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(6, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])
model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy', 'Recall', 'AUC', 'Precision', 'FalseNegatives', 'FalsePositives'])
    
num_epochs = 10
model.fit(training_padded, y_train, epochs=num_epochs,batch_size=batch, validation_data=(testing_padded, y_test))
model.summary()

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 flatten_4 (Flatten)         (None, 122880)            0         
                                                                 
 dense_8 (Dense)             (None, 6)                 737286    
                                                                 
 dense_9 (Dense)             (None, 1)                 7         
                                                                 
Total params: 737,293
Trainable params: 737,293
Non-trainable params: 0
_________________________________________________________________
