In [1]:
# importing required packages
import re
import csv
import numpy as np
import tensorflow as tf
import pandas as pd
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers.embeddings import Embedding
from keras.layers.convolutional import Conv1D
from keras.layers.convolutional import MaxPooling1D
from keras import callbacks
from keras.preprocessing import sequence
from keras.models import load_model
from keras.utils.vis_utils import plot_model
from keras.utils import np_utils

# fix random seed for reproducibility
np.random.seed(7)

Using TensorFlow backend.


# Load data

In [387]:
#Import labeled and unlabled data
df=pd.read_csv('labeled_data.csv',encoding='utf-8')
test=pd.read_csv('unlabeled_data.csv',encoding='utf-8')

In [53]:
test.head()

Unnamed: 0,text
0,Had a good experience when my wife and I sat a...
1,On my first to Montreal with my gf we came her...
2,One of our favorite places to go when it's col...
3,"The doctor was very nice, got in in a good amo..."
4,The Nook is an immediate phoenix staple! I ca...


In [388]:
#create function to remove punctuation
import re
def clean_phrase(phrase):

    #Remove punctuation (with a regular expression) and convert to lower case
    REPLACE= re.compile("[^a-zA-Z]")
    phrase = [REPLACE.sub(" ", line.lower()) for line in phrase]
    return phrase

In [389]:
dftry=df
testtry=test

In [391]:
#apply function to data
dftry.loc[:,'text']=clean_phrase(dftry.loc[:,'text'])
testtry.loc[:,'text']=clean_phrase(testtry.loc[:,'text'])

#convert 'text' to list
clean_phrase = dftry.text.tolist()
test_clean_phrase =testtry.text.tolist()

In [393]:
#Format data for analysis
all_text=' /n '.join(clean_phrase)
test_all_text=' /n '.join(test_clean_phrase)

reviews=all_text.split(' /n ')
all_text = ' '.join(reviews)

In [395]:
# split each word of the training dataset in the string to a list
words = all_text.split()

In [396]:
#repeat same process as labeled data
test_reviews=test_all_text.split(' /n ')
test_all_text = ' '.join(test_reviews)

test_words=test_all_text.split()

In [398]:
#chech number of reviews on both labeled and unlabled data (train and test data )
print("Train reviews: {}".format(len(reviews)))
print("Test reviews: {}".format(len(test_reviews)))

Train reviews: 50000
Test reviews: 600000


In [400]:
#store rating in array
labels=dftry.label.to_numpy()

In [401]:
labels

array([4, 3, 5, ..., 1, 4, 1])

In [406]:
#combine words from labeled and unlabled data
full_words = words + test_words

In [408]:
#create dictionaries that map the words in the vocabulary to integers. 
#Then we can convert each of our reviews into integers so they can be passed into the network.

from collections import Counter
counts = Counter(full_words)
vocab = sorted(counts, key=counts.get, reverse=True)

#Build a dictionary that maps words to integers
vocab_to_int = {word: ii for ii, word in enumerate(vocab, 1)}

In [409]:
#Encode the words with integers. 

reviews_ints = []
for each in reviews:
    reviews_ints.append([vocab_to_int[word] for word in each.split( )])
    
test_reviews_ints = []
for eachs in test_reviews:
    test_reviews_ints.append([vocab_to_int[word] for word in eachs.split( )])

In [410]:
#check review lengths 
review_lens = Counter([len(x) for x in reviews_ints])
print("Zero-length reviews: {}".format(review_lens[0]))
print("Maximum review length: {}".format(max(review_lens)))

Zero-length reviews: 5
Maximum review length: 1033


In [411]:
# check total no. of rows not having zero length reviews
non_zero_idx = [ii for ii, review in enumerate(reviews_ints) if len(review) != 0]
len(non_zero_idx)

49995

In [412]:
# remove zero length reviews
reviews_ints = [reviews_ints[ii] for ii in non_zero_idx]
labels = np.array([labels[ii] for ii in non_zero_idx])

In [413]:
#check again
review_lens = Counter([len(x) for x in reviews_ints])
print("Zero-length reviews: {}".format(review_lens[0]))
print("Maximum review length: {}".format(max(review_lens)))

Zero-length reviews: 0
Maximum review length: 1033


In [414]:
#As maximum review length too many steps for RNN. Let's truncate to 12 steps. 
#For reviews shorter than 12 steps, we'll pad with 0s. For reviews longer than 12 steps,
# we will truncate them to the first 12 characters.

max_review_length = 12
X_train = sequence.pad_sequences(reviews_ints, maxlen=max_review_length)
x_test = sequence.pad_sequences(test_reviews_ints, maxlen=max_review_length)

In [415]:
print(X_train.shape)

(49995, 12)


In [416]:
print(x_test.shape)

(600000, 12)


In [417]:
# check no of unique words in the corpus
# Adding 1 because we use 0's for padding, dictionary started at 1
# this value will be passed to the embedding layer
top_words = len(vocab_to_int) + 1
print(top_words)

177800


In [418]:
# One Hot Encoding the labels
y_train = np_utils.to_categorical(labels, 6)

In [419]:
#chech data shape 
y_train.shape

(49995, 6)

In [420]:
# Creating Callbacks
# ModelCheckpoints is used to save the model after every epoch
# EarlyStopping is used to stop training when the validation loss has not improved after 2 epochs
# Tensorboard is used tovisualize dynamic graphs of the training and test metrics
cbks = [callbacks.ModelCheckpoint(filepath='./checkpoint_model.h5', monitor='val_loss', save_best_only=True),
            callbacks.EarlyStopping(monitor='val_loss', patience=2),callbacks.TensorBoard(log_dir='./logs', histogram_freq=0, write_graph=True, write_images=False)]

In [426]:
# Final Model Architecture

# embedding layer size
embedding_vecor_length = 32

model = Sequential()
model.add(Embedding(122427, embedding_vecor_length, input_length=max_review_length, dropout=0.2))
model.add(Conv1D(filters=32, kernel_size=3, padding='same', activation='relu'))
model.add(MaxPooling1D(pool_size=2))
# 1 layer of 100 units in the hidden layers of the LSTM cells
model.add(LSTM(100))
model.add(Dense(6, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model.summary())
model.fit(X_train, y_train,validation_split=0.20, epochs=20,verbose=1, batch_size=32,callbacks=cbks)


  import sys


Model: "sequential_16"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_16 (Embedding)     (None, 12, 32)            3917664   
_________________________________________________________________
conv1d_16 (Conv1D)           (None, 12, 32)            3104      
_________________________________________________________________
max_pooling1d_16 (MaxPooling (None, 6, 32)             0         
_________________________________________________________________
lstm_16 (LSTM)               (None, 100)               53200     
_________________________________________________________________
dense_16 (Dense)             (None, 6)                 606       
Total params: 3,974,574
Trainable params: 3,974,574
Non-trainable params: 0
_________________________________________________________________
None


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 39996 samples, validate on 9999 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20


<keras.callbacks.callbacks.History at 0x66d211e80>

In [209]:
model = load_model('checkpoint_model.h5')

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


In [210]:
test_pred = model.predict_classes(x_test)