In [49]:
import pandas as pd
import numpy as np
import nltk
nltk.download('punkt')
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.tokenize.toktok import ToktokTokenizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report,accuracy_score

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
import torch
import torch.nn as nn
import pickle
import string


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\16514\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\16514\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### Import the dataset

In [50]:
imdb_data = pd.read_csv('./data/IMDB Dataset.csv')
imdb_data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


### Generate the Training and Testing data by dividing the dataset as 4:1

In [51]:
train_reviews=imdb_data.review[:30000]
train_sentiments=imdb_data.sentiment[:30000]

val_reviews=imdb_data.review[30000:40000]
val_sentiments=imdb_data.sentiment[30000:40000]
test_reviews=imdb_data.review[40000:]
test_sentiments=imdb_data.sentiment[40000:]



In [52]:
def preprocess(reviews):
    container = []
    for review in reviews:
        review = review.replace("<br />", "")
        review = review.lower()
        for ele in string.punctuation:
                if ele in review:
                        review = review.replace(ele, "")
        container.append(review)
    return container

def preprocess_sentiment(sentiments):
    container = []
    for sentiment in sentiments:
        container.append(sentiment)
    return container

In [53]:
train_reviews=preprocess(train_reviews)
train_sentiments=preprocess_sentiment(train_sentiments)
val_reviews=preprocess(val_reviews)
val_sentiments=preprocess_sentiment(val_sentiments)
test_reviews=preprocess(test_reviews)
test_sentiments=preprocess_sentiment(test_sentiments)

# fastText method

In [54]:
import fasttext

In [55]:
with open("data/train.txt", "w") as f:
    for review in train_reviews:
        f.write(review)
        f.write("\n")
with open("data/val.txt", "w") as f:
    for review in val_reviews:
        f.write(review)
        f.write("\n")
with open("data/test.txt", "w") as f:
    for review in test_reviews:
        f.write(review)
        f.write("\n")

# Fasttext manually modify the sentence embedding method

In [56]:
# model = fasttext.train_unsupervised("data/train.txt")
# model.save_model("data/train.bin")
model = fasttext.load_model("data/train.bin")

In [57]:
len(model.words)

34303

In [58]:
len(model.get_word_vector("the"))

100

In [59]:
word_dict = pickle.load(open("data/weight.pickle", "rb"))

def transform_sentence_embedding(data, word_dict):
    representations = []
    for sentence in data:
        nltk_tokens = sentence.strip().split()
        sum = 0
        dividor = 0.00001
        for token in nltk_tokens:
            if token in word_dict:
                sum += model.get_word_vector(token) * word_dict[token]
                dividor += word_dict[token]
            else:
                sum += model.get_word_vector(token)
                dividor += 1
        representations.append(sum / dividor)
    return representations

In [60]:
def transform_label(data):
    representations = []
    for label in data:
        if(label == 'negative'):
            representations.append(0)
        else:
            representations.append(1)
    return representations


In [106]:
for k,v in word_dict.items():
    if v < 0.5:
        word_dict[k] = 0
    if v > 0.5 and v < 1.5:
        word_dict[k] = 1
    if v > 1.5:
        word_dict[k] = 2

fasttext_train_reviews = torch.Tensor(np.array(transform_sentence_embedding(train_reviews, word_dict)))
fasttext_val_reviews = torch.Tensor(np.array(transform_sentence_embedding(val_reviews, {})))
fasttext_test_reviews = torch.Tensor(np.array(transform_sentence_embedding(test_reviews, {})))

In [107]:
fasttext_train_sentiments = nn.functional.one_hot(torch.Tensor(np.array(transform_label(train_sentiments))).to(torch.int64)).to(torch.float).cuda()
fasttext_val_sentiments = nn.functional.one_hot(torch.Tensor(np.array(transform_label(val_sentiments))).to(torch.int64)).to(torch.float).cuda()
fasttext_test_sentiments = nn.functional.one_hot(torch.Tensor(np.array(transform_label(test_sentiments))).to(torch.int64)).to(torch.float).cuda()

In [108]:
fasttext_train_reviews = fasttext_train_reviews.cuda()
fasttext_val_reviews = fasttext_val_reviews.cuda()
fasttext_test_reviews = fasttext_test_reviews.cuda()

In [109]:
fasttext_train_reviews.shape, fasttext_train_sentiments.shape, fasttext_test_reviews.shape, fasttext_test_sentiments.shape

(torch.Size([30000, 100]),
 torch.Size([30000, 2]),
 torch.Size([10000, 100]),
 torch.Size([10000, 2]))

In [110]:
class FeedforwardNeuralNetModel(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super(FeedforwardNeuralNetModel, self).__init__()
        # Linear function
        self.fc1 = nn.Linear(input_dim, hidden_dim) 

        # Non-linearity
        self.relu = nn.ReLU()

        # Linear function (readout)
        self.fc2 = nn.Linear(hidden_dim, output_dim)  

        # Non-linearity
        self.sigmoid_2 = nn.Sigmoid()

    def forward(self, x):
        # Linear function  # LINEAR
        out = self.fc1(x)

        # Non-linearity  # NON-LINEAR
        out = self.relu(out)

        # Linear function (readout)  # LINEAR
        out = self.fc2(out)

        out = self.sigmoid_2(out)
        return out

In [111]:
input_dim = 100
hidden_dim = 50
output_dim = 2

learning_rate = 0.001
n_epochs = 500
batch_size = 1024

net = FeedforwardNeuralNetModel(input_dim, hidden_dim, output_dim).cuda()
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(net.parameters(), lr=learning_rate)  


In [112]:
net.train()
# Placeholder for loss
track_loss = []

print('Epoch', '\t', 'Loss train', '\t', 'Loss test')
for i in range(n_epochs):

    # shuffle the permutation, so that the batches each time are different
    shuffle_idx = np.random.permutation(len(fasttext_train_reviews))
    #split the tensor into smaller batches
    review_batches = torch.split(fasttext_train_reviews[shuffle_idx], batch_size)
    sentiment_batches = torch.split(fasttext_train_sentiments[shuffle_idx], batch_size)
    # Mini batches
    for j in range(len(review_batches)):
        review_batch = review_batches[j]

        sentiment_batch = sentiment_batches[j]
        
        #where we train and build the network
        output_train = net(review_batch)


        loss = criterion(output_train, sentiment_batch)
    
        # compute gradients
        optimizer.zero_grad()
        loss.backward()
    
        # update the weights
        optimizer.step()

    # Keep track of loss at each epoch
    track_loss += [float(loss)]

    loss_epoch = f'{i + 1} / {n_epochs}'
    with torch.no_grad():
        output_train = net(fasttext_train_reviews)
        loss_train = criterion(output_train, fasttext_train_sentiments)
        loss_epoch += f'\t {loss_train:.4f}'

        output_test = net(fasttext_val_reviews)
        loss_test = criterion(output_test, fasttext_val_sentiments)
        loss_epoch += f'\t\t {loss_test:.4f}'

    print(loss_epoch)

Epoch 	 Loss train 	 Loss test
1 / 500	 0.6885		 0.6886
2 / 500	 0.6788		 0.6794
3 / 500	 0.6620		 0.6631
4 / 500	 0.6401		 0.6419
5 / 500	 0.6173		 0.6199
6 / 500	 0.5957		 0.5990
7 / 500	 0.5763		 0.5804
8 / 500	 0.5590		 0.5636
9 / 500	 0.5441		 0.5490
10 / 500	 0.5315		 0.5369
11 / 500	 0.5209		 0.5263
12 / 500	 0.5120		 0.5176
13 / 500	 0.5045		 0.5102
14 / 500	 0.4982		 0.5038
15 / 500	 0.4927		 0.4983
16 / 500	 0.4880		 0.4937
17 / 500	 0.4838		 0.4894
18 / 500	 0.4802		 0.4857
19 / 500	 0.4770		 0.4825
20 / 500	 0.4741		 0.4795
21 / 500	 0.4716		 0.4769
22 / 500	 0.4696		 0.4751
23 / 500	 0.4674		 0.4728
24 / 500	 0.4653		 0.4703
25 / 500	 0.4637		 0.4686
26 / 500	 0.4619		 0.4668
27 / 500	 0.4603		 0.4652
28 / 500	 0.4591		 0.4641
29 / 500	 0.4580		 0.4627
30 / 500	 0.4565		 0.4614
31 / 500	 0.4553		 0.4601
32 / 500	 0.4543		 0.4590
33 / 500	 0.4539		 0.4585
34 / 500	 0.4526		 0.4573
35 / 500	 0.4517		 0.4564
36 / 500	 0.4508		 0.4554
37 / 500	 0.4501		 0.4547
38 / 500	 0.4493

In [113]:
predict_val = net(fasttext_val_reviews)
predict_test = net(fasttext_test_reviews)

In [114]:
predict_val_result = []
for (neg_prob, pos_prob) in predict_val:
    if (neg_prob > pos_prob):
        predict = "negative"
    else:
        predict = "positive"
    predict_val_result.append(predict)

predict_test_result = []
for (neg_prob, pos_prob) in predict_test:
    if (neg_prob > pos_prob):
        predict = "negative"
    else:
        predict = "positive"
    predict_test_result.append(predict)


In [115]:
ffnn_val_score = accuracy_score(val_sentiments, predict_val_result)
ffnn_test_score = accuracy_score(test_sentiments, predict_test_result)
print(ffnn_val_score)
print(ffnn_test_score)

0.8724
0.8774


In [116]:
0.8734, 0.8775


0.8717, 0.8779

(0.8717, 0.8779)