In [1]:
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
import re
import nltk
import os
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras import callbacks, models, layers
import matplotlib.pyplot as plt
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from nltk.corpus import stopwords
# bag of words

In [2]:
train = pd.read_csv("data/labeledTrainData.tsv", header=0, delimiter="\t", quoting=3)
test = pd.read_csv("data/testData.tsv", header=0, delimiter="\t", quoting=3)

# Splitting the dataset into training and validation sets
split_size = int(len(train)*0.8)

training_reviews = train.review[:split_size]
training_sentiments = train.sentiment[:split_size]
validation_reviews = train.review[split_size:]
validation_sentiments = train.sentiment[split_size:]

In [3]:
# Text to Sequence and Padding
vocab_size = 10000
embedding_dim = 32
max_length = 200
trunc_type='post'
oov_tok = "<OOV>"

tokenizer = Tokenizer(num_words=vocab_size)
tokenizer.fit_on_texts(training_reviews)

training_sequences = tokenizer.texts_to_sequences(training_reviews)
padded_training = pad_sequences(training_sequences, maxlen=max_length)

validation_sequences = tokenizer.texts_to_sequences(validation_reviews)
padded_validation = pad_sequences(validation_sequences, maxlen=max_length)

In [4]:
class CustomCallback(tf.keras.callbacks.Callback):
    def __init__(self, accuracy=0.90):
        self.accuracy = accuracy
        
    def on_epoch_end(self, epoch, logs={}):
        if(logs.get('val_accuracy')>=self.accuracy):
            print(f"\nReached {self.accuracy} accuracy so cancelling training!")
            self.model.stop_training = True

In [5]:
# Building the Neural Network
model = tf.keras.models.Sequential([
    tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(32, activation=tf.nn.relu),
    tf.keras.layers.Dense(1, activation=tf.nn.sigmoid)
])

callbacks = CustomCallback()

model.compile(loss='binary_crossentropy',
             optimizer=tf.optimizers.Adam(learning_rate=0.001),
             metrics=['accuracy'])

model.summary()

model.fit(padded_training, training_sentiments, epochs=50, callbacks=[callbacks], validation_data=(padded_validation, validation_sentiments))

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 200, 32)           320000    
_________________________________________________________________
flatten (Flatten)            (None, 6400)              0         
_________________________________________________________________
dense (Dense)                (None, 32)                204832    
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 33        
Total params: 524,865
Trainable params: 524,865
Non-trainable params: 0
_________________________________________________________________
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch

<keras.callbacks.History at 0x10d097957f0>

In [6]:
# Evaluating the neural network model
accuracy_score = model.evaluate(padded_validation, validation_sentiments, verbose=0)[1]
print(f"Accuracy Score: {round(accuracy_score*100, 2)}%")

Accuracy Score: 85.04%


In [7]:
# Making predictions on the test set
testing_sequences = tokenizer.texts_to_sequences(test.review)
padded_testing = pad_sequences(testing_sequences, maxlen=max_length)

sentiment_predictions = list(map(lambda sentiment: 1 if sentiment > 0.5 else 0, model.predict(padded_testing)))
pd.DataFrame({'Predictions':sentiment_predictions}).head(10)

Unnamed: 0,Predictions
0,1
1,0
2,1
3,1
4,1
5,0
6,0
7,0
8,0
9,0


In [8]:
# Positive Review
print(f"Review:\n{test.review[0]}\n\nSentiment: {'Positive' if sentiment_predictions[0] == 1 else 'Negative'}")

Review:
"Naturally in a film who's main themes are of mortality, nostalgia, and loss of innocence it is perhaps not surprising that it is rated more highly by older viewers than younger ones. However there is a craftsmanship and completeness to the film which anyone can enjoy. The pace is steady and constant, the characters full and engaging, the relationships and interactions natural showing that you do not need floods of tears to show emotion, screams to show fear, shouting to show dispute or violence to show anger. Naturally Joyce's short story lends the film a ready made structure as perfect as a polished diamond, but the small changes Huston makes such as the inclusion of the poem fit in neatly. It is truly a masterpiece of tact, subtlety and overwhelming beauty."

Sentiment: Positive


In [9]:
# Submitting the results
output = pd.DataFrame(data={"id":test.id, 
                            "sentiment":sentiment_predictions})

output.to_csv("submission.csv", index=False, quoting=3)