In [1]:
import os
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.python.keras.preprocessing.text import Tokenizer
from tensorflow.python.keras.preprocessing.sequence import pad_sequences
from Preprocessing.to_embedding import WordEmbedding
from Preprocessing.data_format import formatting
from Preprocessing.helper_functions import import_embedding, embedding_matrix_word2vec
from sklearn.model_selection import train_test_split

Using TensorFlow backend.


In [2]:
data = formatting("phase1_movie_reviews-train.csv")

y = pd.get_dummies(data['polarity'])
X_train, X_dev, y_train, y_dev = train_test_split(data['reviewText'], y, test_size = 0.10, random_state=42)

embedding_size = 300 #number of feature weights in embeddings
max_len = 400

In [3]:
data.head()

Unnamed: 0,polarity,summary,reviewText,year
0,negative,"[bruce, lee, the, legend, baaaaaad]","[this, was, a, horrible, movie, thats, all, i,...",2000
1,positive,"[stylish, yet, uneven, film, at, an, affordabl...","[as, a, lover, of, certian, genres, such, as, ...",2001
2,positive,"[masterful, and, commanding]","[master, and, commander, the, far, side, of, t...",2003
3,positive,"[great, special, effects, disappointed, with, ...","[what's, the, 411, on, this, movie, i'm, an, a...",2009
4,positive,"[bevare, bevare, dracula, is, suspect, here]","[when, i, bought, my, set, i, went, to, my, lo...",2004


In [4]:
embedding = WordEmbedding(num_features = embedding_size)

WordEmbedding.fit(embedding, X_train)
WordEmbedding.size(embedding)

Total number of words in the vocabulary:  (52260, 300)


In [5]:
#Save word embedding to dataframe
#train_embeddings = WordEmbedding.to_pd(embedding, X_train)

#Save Save embeddings to file
WordEmbedding.to_file(embedding)

In [6]:
embeddings_index = import_embedding('trained_embedding_word2vec.txt')

## 2. Vectorize text data

In [7]:
#Basic Vectorization of data
#Review data
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)
word_index = tokenizer.word_index

def vectorize(data, tokenizer ,max_len):
    sequences = tokenizer.texts_to_sequences(data)
    padding = pad_sequences(sequences, maxlen = max_len)
    
    return padding

X_train = vectorize(X_train, tokenizer , max_len)
X_dev = vectorize(X_dev, tokenizer, max_len)

print('Found %s unique tokens.' % len(word_index))
print('Shape of train tensor', X_train.shape)
print('Shape of dev tensor', X_dev.shape)

Found 179993 unique tokens.
Shape of train tensor (81000, 400)
Shape of dev tensor (9000, 400)


## 3. Create word vectors with the loaded word2vec model

In [8]:
embedding_matrix, num_words = embedding_matrix_word2vec(word_index, embedding_size, embeddings_index)

### Check train/dev sets

In [9]:
print('Shape of X_train:', X_train.shape)
print('Shape of y_train:', y_train.shape)
print('Shape of X_test:', X_dev.shape)
print('Shape of y_test:', y_dev.shape)

Shape of X_train: (81000, 400)
Shape of y_train: (81000, 2)
Shape of X_test: (9000, 400)
Shape of y_test: (9000, 2)


## 5. Define model

In [14]:
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, CuDNNLSTM, Bidirectional, GlobalMaxPool1D, Dropout
from keras.layers.embeddings import Embedding
from keras.initializers import Constant

# Define Model
model = Sequential()
model.add(Embedding(num_words, 
                    embedding_size,
                    input_length = max_len,
                     dropout=0.2))
model.add(Bidirectional(LSTM(128, return_sequences = True))) #CHANGE LSTM to CuDNNLSTM if Cude is available!
model.add(GlobalMaxPool1D())
model.add(Dense(20, activation="relu"))
model.add(Dropout(0.05))
model.add(Dense(2, activation="sigmoid"))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

  # This is added back by InteractiveShellApp.init_path()


In [15]:
history = model.fit(X_train, y_train, batch_size = 256, epochs = 4, validation_data = (X_dev, y_dev), verbose = 1)

Train on 81000 samples, validate on 9000 samples
Epoch 1/4


InvalidArgumentError: No OpKernel was registered to support Op 'CudnnRNN' used by node bidirectional_1/CudnnRNN (defined at /Users/davidmortensen/anaconda3/lib/python3.7/site-packages/keras/layers/cudnn_recurrent.py:517) with these attrs: [seed=87654321, dropout=0, input_mode="linear_input", T=DT_FLOAT, direction="unidirectional", rnn_mode="lstm", seed2=0, is_training=true]
Registered devices: [CPU]
Registered kernels:
  <no registered kernels>

	 [[node bidirectional_1/CudnnRNN (defined at /Users/davidmortensen/anaconda3/lib/python3.7/site-packages/keras/layers/cudnn_recurrent.py:517) ]]

Caused by op 'bidirectional_1/CudnnRNN', defined at:
  File "/Users/davidmortensen/anaconda3/lib/python3.7/runpy.py", line 193, in _run_module_as_main
    "__main__", mod_spec)
  File "/Users/davidmortensen/anaconda3/lib/python3.7/runpy.py", line 85, in _run_code
    exec(code, run_globals)
  File "/Users/davidmortensen/anaconda3/lib/python3.7/site-packages/ipykernel_launcher.py", line 16, in <module>
    app.launch_new_instance()
  File "/Users/davidmortensen/anaconda3/lib/python3.7/site-packages/traitlets/config/application.py", line 658, in launch_instance
    app.start()
  File "/Users/davidmortensen/anaconda3/lib/python3.7/site-packages/ipykernel/kernelapp.py", line 497, in start
    self.io_loop.start()
  File "/Users/davidmortensen/anaconda3/lib/python3.7/site-packages/tornado/platform/asyncio.py", line 132, in start
    self.asyncio_loop.run_forever()
  File "/Users/davidmortensen/anaconda3/lib/python3.7/asyncio/base_events.py", line 523, in run_forever
    self._run_once()
  File "/Users/davidmortensen/anaconda3/lib/python3.7/asyncio/base_events.py", line 1758, in _run_once
    handle._run()
  File "/Users/davidmortensen/anaconda3/lib/python3.7/asyncio/events.py", line 88, in _run
    self._context.run(self._callback, *self._args)
  File "/Users/davidmortensen/anaconda3/lib/python3.7/site-packages/tornado/platform/asyncio.py", line 122, in _handle_events
    handler_func(fileobj, events)
  File "/Users/davidmortensen/anaconda3/lib/python3.7/site-packages/tornado/stack_context.py", line 300, in null_wrapper
    return fn(*args, **kwargs)
  File "/Users/davidmortensen/anaconda3/lib/python3.7/site-packages/zmq/eventloop/zmqstream.py", line 450, in _handle_events
    self._handle_recv()
  File "/Users/davidmortensen/anaconda3/lib/python3.7/site-packages/zmq/eventloop/zmqstream.py", line 480, in _handle_recv
    self._run_callback(callback, msg)
  File "/Users/davidmortensen/anaconda3/lib/python3.7/site-packages/zmq/eventloop/zmqstream.py", line 432, in _run_callback
    callback(*args, **kwargs)
  File "/Users/davidmortensen/anaconda3/lib/python3.7/site-packages/tornado/stack_context.py", line 300, in null_wrapper
    return fn(*args, **kwargs)
  File "/Users/davidmortensen/anaconda3/lib/python3.7/site-packages/ipykernel/kernelbase.py", line 283, in dispatcher
    return self.dispatch_shell(stream, msg)
  File "/Users/davidmortensen/anaconda3/lib/python3.7/site-packages/ipykernel/kernelbase.py", line 233, in dispatch_shell
    handler(stream, idents, msg)
  File "/Users/davidmortensen/anaconda3/lib/python3.7/site-packages/ipykernel/kernelbase.py", line 399, in execute_request
    user_expressions, allow_stdin)
  File "/Users/davidmortensen/anaconda3/lib/python3.7/site-packages/ipykernel/ipkernel.py", line 208, in do_execute
    res = shell.run_cell(code, store_history=store_history, silent=silent)
  File "/Users/davidmortensen/anaconda3/lib/python3.7/site-packages/ipykernel/zmqshell.py", line 537, in run_cell
    return super(ZMQInteractiveShell, self).run_cell(*args, **kwargs)
  File "/Users/davidmortensen/anaconda3/lib/python3.7/site-packages/IPython/core/interactiveshell.py", line 2662, in run_cell
    raw_cell, store_history, silent, shell_futures)
  File "/Users/davidmortensen/anaconda3/lib/python3.7/site-packages/IPython/core/interactiveshell.py", line 2785, in _run_cell
    interactivity=interactivity, compiler=compiler, result=result)
  File "/Users/davidmortensen/anaconda3/lib/python3.7/site-packages/IPython/core/interactiveshell.py", line 2901, in run_ast_nodes
    if self.run_code(code, result):
  File "/Users/davidmortensen/anaconda3/lib/python3.7/site-packages/IPython/core/interactiveshell.py", line 2961, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-9-e2e64ff80ffd>", line 12, in <module>
    model.add(Bidirectional(CuDNNLSTM(128, return_sequences = True)))
  File "/Users/davidmortensen/anaconda3/lib/python3.7/site-packages/keras/engine/sequential.py", line 181, in add
    output_tensor = layer(self.outputs[0])
  File "/Users/davidmortensen/anaconda3/lib/python3.7/site-packages/keras/layers/wrappers.py", line 427, in __call__
    return super(Bidirectional, self).__call__(inputs, **kwargs)
  File "/Users/davidmortensen/anaconda3/lib/python3.7/site-packages/keras/engine/base_layer.py", line 457, in __call__
    output = self.call(inputs, **kwargs)
  File "/Users/davidmortensen/anaconda3/lib/python3.7/site-packages/keras/layers/wrappers.py", line 522, in call
    y = self.forward_layer.call(inputs, **kwargs)
  File "/Users/davidmortensen/anaconda3/lib/python3.7/site-packages/keras/layers/cudnn_recurrent.py", line 90, in call
    output, states = self._process_batch(inputs, initial_state)
  File "/Users/davidmortensen/anaconda3/lib/python3.7/site-packages/keras/layers/cudnn_recurrent.py", line 517, in _process_batch
    is_training=True)
  File "/Users/davidmortensen/anaconda3/lib/python3.7/site-packages/tensorflow/contrib/cudnn_rnn/python/ops/cudnn_rnn_ops.py", line 1636, in __call__
    input_data, input_h, input_c, params, is_training=is_training)
  File "/Users/davidmortensen/anaconda3/lib/python3.7/site-packages/tensorflow/contrib/cudnn_rnn/python/ops/cudnn_rnn_ops.py", line 1527, in __call__
    seed=self._seed)
  File "/Users/davidmortensen/anaconda3/lib/python3.7/site-packages/tensorflow/contrib/cudnn_rnn/python/ops/cudnn_rnn_ops.py", line 1014, in _cudnn_rnn
    outputs, output_h, output_c, _ = gen_cudnn_rnn_ops.cudnn_rnn(**args)
  File "/Users/davidmortensen/anaconda3/lib/python3.7/site-packages/tensorflow/python/ops/gen_cudnn_rnn_ops.py", line 142, in cudnn_rnn
    seed2=seed2, is_training=is_training, name=name)
  File "/Users/davidmortensen/anaconda3/lib/python3.7/site-packages/tensorflow/python/framework/op_def_library.py", line 788, in _apply_op_helper
    op_def=op_def)
  File "/Users/davidmortensen/anaconda3/lib/python3.7/site-packages/tensorflow/python/util/deprecation.py", line 507, in new_func
    return func(*args, **kwargs)
  File "/Users/davidmortensen/anaconda3/lib/python3.7/site-packages/tensorflow/python/framework/ops.py", line 3300, in create_op
    op_def=op_def)
  File "/Users/davidmortensen/anaconda3/lib/python3.7/site-packages/tensorflow/python/framework/ops.py", line 1801, in __init__
    self._traceback = tf_stack.extract_stack()

InvalidArgumentError (see above for traceback): No OpKernel was registered to support Op 'CudnnRNN' used by node bidirectional_1/CudnnRNN (defined at /Users/davidmortensen/anaconda3/lib/python3.7/site-packages/keras/layers/cudnn_recurrent.py:517) with these attrs: [seed=87654321, dropout=0, input_mode="linear_input", T=DT_FLOAT, direction="unidirectional", rnn_mode="lstm", seed2=0, is_training=true]
Registered devices: [CPU]
Registered kernels:
  <no registered kernels>

	 [[node bidirectional_1/CudnnRNN (defined at /Users/davidmortensen/anaconda3/lib/python3.7/site-packages/keras/layers/cudnn_recurrent.py:517) ]]


In [None]:
import matplotlib.pyplot as plt
plt.style.use('ggplot')

def plot_history(history):
    acc = history.history['acc']
    val_acc = history.history['val_acc']
    loss = history.history['loss']
    val_loss = history.history['val_loss']
    x = range(1, len(acc) + 1)

    plt.figure(figsize=(12, 5))
    plt.subplot(1, 3, 1)
    plt.plot(x, acc, 'b', label='Training acc')
    plt.plot(x, val_acc, 'r', label='Validation acc')
    plt.title('Training and validation accuracy')
    plt.legend()
    plt.subplot(1, 3, 2)
    plt.plot(x, loss, 'b', label='Training loss')
    plt.plot(x, val_loss, 'r', label='Validation loss')
    plt.title('Training and validation loss')
    plt.legend()
    plt.show()

In [25]:
loss, accuracy = model.evaluate(X_train, y_train, verbose=False)
print("Development Accuracy:  {:.4f}".format(accuracy))
loss, accuracy = model.evaluate(X_dev, y_dev, verbose=False)
print("Testing Accuracy:  {:.4f}".format(accuracy))
plot_history(history)

Development Accuracy:  0.9918
Testing Accuracy:  0.8826


NameError: name 'plot_history' is not defined

In [None]:
#Don't bother with rest from here! Since we'll switch the dataset :) 

In [134]:
movie_df = pd.read_csv("phase1_movie_reviews-test-hidden.csv")
movie_labels = pd.read_csv("true_labels/true_movie_labels.txt", header=None)
movies_test = pd.concat([movie_df, movie_labels], axis=1).drop('polarity', axis=1).rename(columns={0: "polarity"})
#movies_test.to_csv("movies_test.csv", index = False)

game_df = pd.read_csv("phase1_video_games-test-hidden.csv")
game_labels = pd.read_csv("true_labels/true_game_labels.txt", header=None)
games_test = pd.concat([game_df, game_labels], axis=1).drop('polarity', axis=1).rename(columns={0: "polarity"})
#games_test.to_csv("movies_test.csv", index = False)

In [135]:
from keras.preprocessing.text import text_to_word_sequence

movies_test['reviewText'] = movies_test['reviewText'].astype(str)
movies_test['reviewText'] = movies_test['reviewText'].apply(text_to_word_sequence)

games_test['reviewText'] = games_test['reviewText'].astype(str)
games_test['reviewText'] = games_test['reviewText'].apply(text_to_word_sequence)

In [136]:
movies_X_test = movies_test['reviewText']
movies_y_test = movies_test[['polarity']]

games_X_test = games_test['reviewText']
games_y_test = games_test[['polarity']]

In [137]:
movies_X_test = vectorize(movies_X_test, tokenizer , max_len)
games_X_test = vectorize(games_X_test, tokenizer , max_len)

In [138]:
movies_pred = model.predict_classes(movies_X_test)

games_pred = model.predict_classes(games_X_test)

In [139]:
movies_y_test["polarity"] = movies_y_test["polarity"].str.replace('positive', '1')
movies_y_test["polarity"] = movies_y_test["polarity"].str.replace('negative', '0')
movies_y_test["polarity"] = movies_y_test["polarity"].astype('int64')

games_y_test["polarity"] = games_y_test["polarity"].str.replace('positive', '1')
games_y_test["polarity"] = games_y_test["polarity"].str.replace('negative', '0')
games_y_test["polarity"] = games_y_test["polarity"].astype('int64')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See

In [140]:
from sklearn.metrics import accuracy_score

print(accuracy_score(movies_y_test, movies_pred))
print(accuracy_score(games_y_test, games_pred))

0.881
0.851527764639107
