In [0]:
# Start with loading all necessary libraries
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer
from keras.callbacks import TensorBoard
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras import regularizers
from keras.layers import Dense, Embedding, LSTM, Conv1D, Conv2D, MaxPooling1D, Reshape, Flatten, Dropout, CuDNNLSTM
from sklearn.model_selection import train_test_split
from keras.utils.np_utils import to_categorical
import re
import operator



import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer

#from appos import appos

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')


from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score

from os import path
from PIL import Image
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

import matplotlib.pyplot as plt
% matplotlib inline


Using TensorFlow backend.


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


## Loading, extracting and pre-processing the data

Load the data and extract the relevant labels.

In [0]:
# Set up Google Drive access so the dataset CSV can be pulled into Colaboratory.
# `-U` upgrades PyDrive if it is already installed, `-q` keeps pip's output quiet.
!pip install -U -q PyDrive
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials
# Authenticate and create the PyDrive client.
# Order matters: the Colab user must be authenticated before the application
# default credentials are requested and handed to PyDrive.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
# `drive` is the client used by the following cell to download the dataset.
drive = GoogleDrive(gauth)
# NOTE: Reviews.csv is read into `data` in the following cell, once it has
# been downloaded from Drive.

[?25l[K     |▎                               | 10kB 14.0MB/s eta 0:00:01[K     |▋                               | 20kB 1.5MB/s eta 0:00:01[K     |█                               | 30kB 2.3MB/s eta 0:00:01[K     |█▎                              | 40kB 1.6MB/s eta 0:00:01[K     |█▋                              | 51kB 2.0MB/s eta 0:00:01[K     |██                              | 61kB 2.4MB/s eta 0:00:01[K     |██▎                             | 71kB 2.7MB/s eta 0:00:01[K     |██▋                             | 81kB 2.2MB/s eta 0:00:01[K     |███                             | 92kB 2.4MB/s eta 0:00:01[K     |███▎                            | 102kB 2.7MB/s eta 0:00:01[K     |███▋                            | 112kB 2.7MB/s eta 0:00:01[K     |████                            | 122kB 2.7MB/s eta 0:00:01[K     |████▎                           | 133kB 2.7MB/s eta 0:00:01[K     |████▋                           | 143kB 2.7MB/s eta 0:00:01[K     |█████                     

In [0]:
from urllib.parse import parse_qs, urlparse

# Shareable Google Drive link to the reviews dataset (Reviews.csv).
link = 'https://drive.google.com/open?id=1ZfgsDimXEZ7fl5rg66Lud1-mzymldivB'

# Extract the Drive file id from the `id` query parameter.
# Parsing the query string (instead of splitting the link on '=') keeps
# working when the link carries additional parameters such as '&usp=sharing',
# and using `file_id` avoids shadowing the builtin `id`.
file_id = parse_qs(urlparse(link).query)['id'][0]
print(file_id)  # Sanity check: should be everything after 'id=' in the link

# Download the file from Drive into the working directory.
downloaded = drive.CreateFile({'id': file_id})
downloaded.GetContentFile('Reviews.csv')

# Load the CSV into a pandas DataFrame, keeping only the review text and its
# rating; no other column is used by the rest of the notebook.
data = pd.read_csv('Reviews.csv')[['Text', 'Score']]

1ZfgsDimXEZ7fl5rg66Lud1-mzymldivB


#### Convert the ratings to the sentiments negative and positive.
Each sentiment is represented by a single integer, either 0 or 1.

0: negative

1: positive


The ratings and sentiments will be mapped as follows.

Rating 1-3 -> negative (0)

Rating 4-5 -> positive (1)

In [0]:
# Change from 1-5 ratings to negative (0) or positive (1) sentiment
rating_to_sentiment = { 1: 0, 2: 0, 3: 0, 4: 1, 5: 1 }
sentiment = data['Score'].map(rating_to_sentiment)
# `map` yields NaN for any score that is not in the mapping; fail loudly
# instead of silently carrying unlabeled rows forward.
if sentiment.isna().any():
    raise ValueError('Score column contains values outside the expected 1-5 ratings')
data['Sentiment'] = sentiment.astype(int)

# Count number of negative and positive reviews (computed once and reused;
# the top-level `pd.value_counts` helper is deprecated in recent pandas).
sentiment_counts = data['Sentiment'].value_counts()
neg_num = sentiment_counts[0]
pos_num = sentiment_counts[1]

print('# negative reviews before: {}'.format(neg_num))
print('# positive reviews before: {}'.format(pos_num))

# Make the data set balanced: every class is down-sampled to the size of the
# smaller one.
balanced_sample_num = int(min(neg_num, pos_num))

# Fixed seed so that the same reviews are selected on every run.
sampling_seed = 42

# Picks <'balanced_sample_num'> numbers of negative and positive reviews at random.
# Each class is sampled explicitly and the parts are concatenated, which does
# not depend on how `groupby.apply` treats the grouping column.
data = pd.concat([
    data[data['Sentiment'] == label].sample(n = balanced_sample_num,
                                            random_state = sampling_seed)
    for label in (0, 1)
])

# Shuffle the rows so that 0's and 1's are mixed
data = data.sample(frac = 1, random_state = sampling_seed).reset_index(drop = True)

balanced_counts = data['Sentiment'].value_counts()
print('\n# negative reviews after: {}'.format(balanced_counts[0]))
print('# positive reviews after: {}'.format(balanced_counts[1]))

# Get one-hot encoding for the labels: column 0 marks negative reviews and
# column 1 marks positive ones. The explicit cast keeps the labels numeric
# on pandas versions where `get_dummies` returns booleans.
Y = pd.get_dummies(data['Sentiment']).values.astype(np.uint8)

# negative reviews before: 124677
# positive reviews before: 443777

# negative reviews after: 124677
# positive reviews after: 124677


#### Perform pre-processing on the data.

In [0]:
# Patterns used to clean the raw review text (raw strings, compiled once).
html_tag_pattern = re.compile(r'<[^<]+?>')
# The text is lower-cased before this pattern is applied, so only a-z, digits
# and whitespace need to be kept. The range `A-z` must not be used here: it
# also spans the characters [ \ ] ^ _ ` that sit between 'Z' and 'a'.
non_alnum_pattern = re.compile(r'[^a-z0-9\s]')
# Stand-alone numbers. Lookarounds are used instead of consuming the
# surrounding whitespace so that runs of adjacent numbers ("1 2 3") are all
# removed rather than only every other one.
standalone_number_pattern = re.compile(r'(?<!\S)\d+(?!\S)')

# Most commonly occurring words which are not relevant in the context of the
# data. Stored as a set so the per-token membership test is O(1).
irrelevant_words = set(stopwords.words('english'))

# Used to find the base form of each word (lemmatization)
lemma = WordNetLemmatizer()

# NOTE: expanding contractions ("it's" -> "it is") is not performed; the
# `appos` lookup table it would need is not imported in this notebook.


def _preprocess_review(text):
    """Clean one raw review and return it as a space-separated token string.

    Steps, in order: lower-case, replace HTML tags, punctuation and
    stand-alone numbers with spaces, tokenize into words, drop English stop
    words, lemmatize each remaining word.

    Args:
        text: the raw review text (str).

    Returns:
        The remaining lemmatized words joined by single spaces.
    """
    text = text.lower()
    text = html_tag_pattern.sub(' ', text)
    text = non_alnum_pattern.sub(' ', text)
    text = standalone_number_pattern.sub(' ', text)
    words = word_tokenize(text)
    return " ".join(lemma.lemmatize(word) for word in words if word not in irrelevant_words)


data['Text'] = data['Text'].apply(_preprocess_review)

# Vectorize the text by turning each review into a sequence of integers (each integer being the index of a token in a dictionary)
# Also, pad so that every review has the same length (no `maxlen` is given,
# so every sequence is padded to the length of the longest review).
num_top_words = 10000
tokenizer = Tokenizer(num_words = num_top_words, split = ' ')
tokenizer.fit_on_texts(data['Text'].values)
X = tokenizer.texts_to_sequences(data['Text'].values)
X = pad_sequences(X)

#### Print the `num_top_words` most frequent words and their counts.

In [0]:
# Rank every word seen by the tokenizer from most to least frequent ...
ranked_word_counts = sorted(tokenizer.word_counts.items(),
                            key = lambda word_and_count: word_and_count[1],
                            reverse = True)

# ... and keep the first `num_top_words` of them as a word -> count mapping.
top_words_frequency = dict(ranked_word_counts[:num_top_words])

print(top_words_frequency)



In [0]:
# Fraction of the data held out for the final test evaluation.
# NOTE: `val_size` is only kept here so the name stays defined; the training
# cell assigns its own value (0.2) before the validation split is made.
test_size, val_size = 0.1, 0.05

# A fixed `random_state` makes the split reproducible, so the reported test
# metrics always refer to the same held-out reviews.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y,
                                                    test_size = test_size,
                                                    random_state = 42)

print('X train: {}, Y train: {}'.format(X_train.shape, Y_train.shape))
print('X test: {}, Y test: {}'.format(X_test.shape, Y_test.shape))

X train: (224418, 1830), Y train: (224418, 2)
X test: (24936, 1830), Y test: (24936, 2)


## Defining different models

In [0]:
def lstm_model(num_top_words, input_length):
    """Build and compile the plain LSTM classifier.

    Architecture: Embedding (128-d) -> CuDNNLSTM (100 units) -> 2-way softmax.

    Args:
        num_top_words: vocabulary size, passed to the Embedding layer as `input_dim`.
        input_length: length of the padded input sequences.

    Returns:
        A compiled `Sequential` model (categorical cross-entropy, Adam, accuracy metric).
    """
    network = Sequential([
        Embedding(input_dim = num_top_words, output_dim = 128, input_length = input_length),
        CuDNNLSTM(100),
        Dense(2, activation = 'softmax'),
    ])
    network.compile(loss = 'categorical_crossentropy',
                    optimizer = 'adam',
                    metrics = ['accuracy'])
    return network

def cnn_model(num_top_words, input_length):
    """Build and compile the convolutional classifier.

    Architecture: Embedding (128-d) -> Dropout -> Conv1D (128 filters,
    kernel size 10, L2-regularized) -> MaxPooling1D -> Dropout -> Flatten ->
    L2-regularized 2-way softmax.

    Args:
        num_top_words: vocabulary size, passed to the Embedding layer as `input_dim`.
        input_length: length of the padded input sequences.

    Returns:
        A compiled `Sequential` model (categorical cross-entropy, Adam, accuracy metric).
    """
    model = Sequential()
    model.add(Embedding(input_dim = num_top_words, output_dim = 128, input_length = input_length))
    model.add(Dropout(0.5))
    # No `input_shape` here: Conv1D is not the first layer, so its input shape
    # is inferred from the preceding layers (the embedding output is
    # (input_length, 128), not (input_length, num_top_words)).
    model.add(Conv1D(128, kernel_size = 10, activation = 'relu', kernel_regularizer = regularizers.l2(0.01)))
    model.add(MaxPooling1D())
    model.add(Dropout(0.5))
    model.add(Flatten())
    model.add(Dense(2, activation = 'softmax', kernel_regularizer = regularizers.l2(0.01)))
    model.compile(loss = 'categorical_crossentropy', optimizer = 'adam', metrics = ['accuracy'])
    return model

def cnn_lstm_model(num_top_words, input_length):
    """Build and compile the CNN-then-LSTM classifier.

    Architecture: Embedding (128-d) -> Conv1D (128 filters, kernel size 10)
    -> MaxPooling1D -> Dropout -> CuDNNLSTM (100 units) -> 2-way softmax.

    Args:
        num_top_words: vocabulary size, passed to the Embedding layer as `input_dim`.
        input_length: length of the padded input sequences.

    Returns:
        A compiled `Sequential` model (categorical cross-entropy, Adam, accuracy metric).
    """
    model = Sequential()
    model.add(Embedding(input_dim = num_top_words, output_dim = 128, input_length = input_length))
    # No `input_shape` here: Conv1D is not the first layer, so its input shape
    # is inferred from the preceding layers (the embedding output is
    # (input_length, 128), not (input_length, num_top_words)).
    model.add(Conv1D(128, kernel_size = 10, activation = 'relu'))
    model.add(MaxPooling1D())
    model.add(Dropout(0.5))
    # The pooled feature maps are fed to the recurrent layer as a sequence.
    model.add(CuDNNLSTM(100))
    model.add(Dense(2, activation = 'softmax'))
    model.compile(loss = 'categorical_crossentropy', optimizer = 'adam', metrics = ['accuracy'])
    return model

def lstm_cnn_model(num_top_words, input_length):
    """Build and compile the LSTM-then-CNN classifier.

    Architecture: Embedding (128-d) -> CuDNNLSTM (100 units, returning the
    full sequence) -> Conv1D (128 filters, kernel size 10) -> MaxPooling1D ->
    Dropout -> Flatten -> 2-way softmax.

    Args:
        num_top_words: vocabulary size, passed to the Embedding layer as `input_dim`.
        input_length: length of the padded input sequences.

    Returns:
        A compiled `Sequential` model (categorical cross-entropy, Adam, accuracy metric).
    """
    network = Sequential([
        Embedding(input_dim = num_top_words, output_dim = 128, input_length = input_length),
        # return_sequences keeps one output per time step so that the
        # convolution has a sequence to slide over.
        CuDNNLSTM(100, return_sequences = True),
        Conv1D(128, kernel_size = 10, activation = 'relu'),
        MaxPooling1D(),
        Dropout(0.5),
        Flatten(),
        Dense(2, activation = 'softmax'),
    ])
    network.compile(loss = 'categorical_crossentropy',
                    optimizer = 'adam',
                    metrics = ['accuracy'])
    return network

## Train models

In [0]:
from datetime import datetime

In [0]:
# Training hyper-parameters shared by every model.
batch_size = 16
num_epochs = 3
val_size = 0.2  # fraction of the training data held out for validation

# Every builder takes the same arguments, and each model is registered under
# the name of the function that builds it.
sequence_length = X_train.shape[1]
model_builders = (lstm_model, cnn_model, cnn_lstm_model, lstm_cnn_model)
models = {
    builder.__name__: builder(num_top_words = num_top_words, input_length = sequence_length)
    for builder in model_builders
}

# Train the models one after another, timing each run and writing its
# TensorBoard logs to a separate sub-directory of ./logs/.
for name, model in models.items():
    print('Beginning Training', name)
    started_at = datetime.now()

    fit_options = dict(validation_split = val_size,
                       epochs = num_epochs,
                       batch_size = batch_size,
                       callbacks = [TensorBoard(log_dir = './logs/' + name)])
    model.fit(X_train, Y_train, **fit_options)

    print("Training ", name, " took time ", datetime.now() - started_at, "\n\n")

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
Beginning Training lstm_model
Instructions for updating:
Use tf.cast instead.
Train on 179534 samples, validate on 44884 samples
Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8
Training  lstm_model  took time  0:39:58.503873 


Beginning Training cnn_model
Train on 179534 samples, validate on 44884 samples
Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8
Training  cnn_model  took time  0:28:25.514079 


Beginning Training cnn_lstm_model
Train on 179534 samples, validate on 44884 samples
Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8
Training  cnn_lstm_model  took time  0:42:37.632809 


Beginning Training lstm_cnn_model
Train on 179534 samples, validate on 44884 samples
Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/

In [0]:
# One-hot encoded test labels, one row per review.
print(Y_test)

# The scikit-learn metrics used below take a 1-D vector of class labels.
# Column 1 of the one-hot encoding is exactly that: 1 for a positive review,
# 0 for a negative one.
positive_column = 1
Y_test_new = Y_test[:, positive_column]

print(Y_test_new)

In [0]:
# Evaluate every trained model on the held-out test set.
for name, model in models.items():
    # predict probabilities for test set (one column per class)
    yhat_probs = model.predict(X_test, verbose=0)
    # predict crisp classes for test set: the most probable class per row.
    # Derived from the probabilities above so the network runs over the test
    # set only once; this is what `predict_classes` computes for a softmax
    # output, without relying on that deprecated method.
    yhat_classes = np.argmax(yhat_probs, axis=-1)
    print(name)
    # accuracy: (tp + tn) / (p + n)
    accuracy = accuracy_score(Y_test_new, yhat_classes)
    print('Accuracy: %f' % accuracy)
    # precision tp / (tp + fp)
    precision = precision_score(Y_test_new, yhat_classes)
    print('Precision: %f' % precision)
    # recall: tp / (tp + fn)
    recall = recall_score(Y_test_new, yhat_classes)
    print('Recall: %f' % recall)
    # f1: 2 tp / (2 tp + fp + fn)
    f1 = f1_score(Y_test_new, yhat_classes)
    print('F1 score: %f \n\n' % f1)

lstm_model
Accuracy: 0.871671
Precision: 0.862508
Recall: 0.883328
F1 score: 0.872794 


cnn_model
Accuracy: 0.854227
Precision: 0.847247
Recall: 0.863132
F1 score: 0.855116 


cnn_lstm_model
Accuracy: 0.870509
Precision: 0.868875
Recall: 0.871741
F1 score: 0.870306 


lstm_cnn_model
Accuracy: 0.865135
Precision: 0.868826
Recall: 0.859108
F1 score: 0.863940 


