In [95]:
import pandas as pd
import numpy as np
import nltk
nltk.download('punkt')
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.tokenize.toktok import ToktokTokenizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report,accuracy_score

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
import torch
import torch.nn as nn
import pickle
import string


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\16514\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\16514\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### Import the dataset

In [96]:
imdb_data = pd.read_csv('./data/IMDB Dataset.csv')
imdb_data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


### Generate the Training and Testing data by dividing the dataset as 4:1

In [97]:
train_reviews=imdb_data.review[:40000]
train_sentiments=imdb_data.sentiment[:40000]

test_reviews=imdb_data.review[40000:]
test_sentiments=imdb_data.sentiment[40000:]



In [98]:
def preprocess(reviews):
    container = []
    for review in reviews:
        review = review.replace("<br />", "")
        for ele in string.punctuation:
                if ele in review:
                        review = review.replace(ele, "")
        container.append(review)
    return container

def preprocess_sentiment(sentiments):
    container = []
    for sentiment in sentiments:
        container.append(sentiment)
    return container

In [99]:
train_reviews=preprocess(train_reviews)
train_sentiments=preprocess_sentiment(train_sentiments)

test_reviews=preprocess(test_reviews)
test_sentiments=preprocess_sentiment(test_sentiments)

# fastText method

In [100]:
import fasttext

In [101]:
with open("data/train.txt", "w") as f:
    for review in train_reviews:
        f.write(review)
        f.write("\n")
with open("data/test.txt", "w") as f:
    for review in test_reviews:
        f.write(review)
        f.write("\n")

# Fasttext manually modify the sentence embedding method

In [102]:
# model = fasttext.train_unsupervised("data/train.txt")
# model.save_model("data/train.bin")
model = fasttext.load_model("data/train.bin")

In [103]:
len(model.words)

45409

In [104]:
len(model.get_word_vector("the"))

100

In [105]:
word_dict = pickle.load(open("data/weight.pickle", "rb"))

def transform_sentence_embedding(data, word_dict):
    representations = []
    for sentence in data:
        nltk_tokens = sentence.strip().split()
        sum = 0
        for token in nltk_tokens:
            if token in word_dict:
                sum += model.get_word_vector(token) * word_dict[token]
            else:
                sum += model.get_word_vector(token)
        representations.append(sum / len(nltk_tokens))
    return representations

In [106]:
def transform_label(data):
    representations = []
    for label in data:
        if(label == 'negative'):
            representations.append(0)
        else:
            representations.append(1)
    return representations


In [227]:
for k,v in word_dict.items():
    if v < 0:
        word_dict[k] = -1
    if v > 0 and v < 1:
        word_dict[k] = 0
    if v > 1:
        word_dict[k] = 2

fasttext_train_reviews = torch.Tensor(np.array(transform_sentence_embedding(train_reviews, word_dict)))
fasttext_test_reviews = torch.Tensor(np.array(transform_sentence_embedding(test_reviews, word_dict)))

In [228]:
fasttext_train_sentiments = nn.functional.one_hot(torch.Tensor(np.array(transform_label(train_sentiments))).to(torch.int64)).to(torch.float).cuda()
fasttext_test_sentiments = nn.functional.one_hot(torch.Tensor(np.array(transform_label(test_sentiments))).to(torch.int64)).to(torch.float).cuda()

In [229]:
fasttext_train_reviews = fasttext_train_reviews.cuda()
fasttext_test_reviews = fasttext_test_reviews.cuda()
fasttext_train_sentiments = fasttext_train_sentiments.cuda()
fasttext_test_sentiments = fasttext_test_sentiments.cuda()

In [230]:
fasttext_train_reviews.shape, fasttext_train_sentiments.shape, fasttext_test_reviews.shape, fasttext_test_sentiments.shape

(torch.Size([40000, 100]),
 torch.Size([40000, 2]),
 torch.Size([10000, 100]),
 torch.Size([10000, 2]))

In [231]:
class FeedforwardNeuralNetModel(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super(FeedforwardNeuralNetModel, self).__init__()
        # Linear function
        self.fc1 = nn.Linear(input_dim, hidden_dim) 

        # Non-linearity
        self.relu = nn.ReLU()

        # Linear function (readout)
        self.fc2 = nn.Linear(hidden_dim, output_dim)  

        # Non-linearity
        self.sigmoid_2 = nn.Sigmoid()

    def forward(self, x):
        # Linear function  # LINEAR
        out = self.fc1(x)

        # Non-linearity  # NON-LINEAR
        out = self.relu(out)

        # Linear function (readout)  # LINEAR
        out = self.fc2(out)

        out = self.sigmoid_2(out)
        return out

In [232]:
input_dim = 100
hidden_dim = 50
output_dim = 2

learning_rate = 0.001
n_epochs = 500
batch_size = 1024

net = FeedforwardNeuralNetModel(input_dim, hidden_dim, output_dim).cuda()
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(net.parameters(), lr=learning_rate)  


In [233]:
net.train()
# Placeholder for loss
track_loss = []

print('Epoch', '\t', 'Loss train', '\t', 'Loss test')
for i in range(n_epochs):

    # shuffle the permutation, so that the batches each time are different
    shuffle_idx = np.random.permutation(len(fasttext_train_reviews))
    #split the tensor into smaller batches
    review_batches = torch.split(fasttext_train_reviews[shuffle_idx], batch_size)
    sentiment_batches = torch.split(fasttext_train_sentiments[shuffle_idx], batch_size)
    # Mini batches
    for j in range(len(review_batches)):
        review_batch = review_batches[j]

        sentiment_batch = sentiment_batches[j]
        
        #where we train and build the network
        output_train = net(review_batch)


        loss = criterion(output_train, sentiment_batch)
    
        # compute gradients
        optimizer.zero_grad()
        loss.backward()
    
        # update the weights
        optimizer.step()

    # Keep track of loss at each epoch
    track_loss += [float(loss)]

    loss_epoch = f'{i + 1} / {n_epochs}'
    with torch.no_grad():
        output_train = net(fasttext_train_reviews)
        loss_train = criterion(output_train, fasttext_train_sentiments)
        loss_epoch += f'\t {loss_train:.4f}'

        output_test = net(fasttext_test_reviews)
        loss_test = criterion(output_test, fasttext_test_sentiments)
        loss_epoch += f'\t\t {loss_test:.4f}'

    print(loss_epoch)

Epoch 	 Loss train 	 Loss test
1 / 500	 0.6841		 0.6843
2 / 500	 0.6614		 0.6620
3 / 500	 0.6304		 0.6315
4 / 500	 0.6015		 0.6032
5 / 500	 0.5761		 0.5783
6 / 500	 0.5546		 0.5573
7 / 500	 0.5365		 0.5396
8 / 500	 0.5220		 0.5253
9 / 500	 0.5100		 0.5135
10 / 500	 0.5005		 0.5043
11 / 500	 0.4920		 0.4958
12 / 500	 0.4856		 0.4896
13 / 500	 0.4802		 0.4841
14 / 500	 0.4751		 0.4791
15 / 500	 0.4714		 0.4754
16 / 500	 0.4678		 0.4717
17 / 500	 0.4647		 0.4688
18 / 500	 0.4623		 0.4664
19 / 500	 0.4599		 0.4639
20 / 500	 0.4581		 0.4622
21 / 500	 0.4570		 0.4610
22 / 500	 0.4545		 0.4586
23 / 500	 0.4530		 0.4570
24 / 500	 0.4518		 0.4558
25 / 500	 0.4503		 0.4543
26 / 500	 0.4489		 0.4529
27 / 500	 0.4481		 0.4520
28 / 500	 0.4477		 0.4517
29 / 500	 0.4464		 0.4504
30 / 500	 0.4452		 0.4491
31 / 500	 0.4444		 0.4483
32 / 500	 0.4443		 0.4482
33 / 500	 0.4430		 0.4469
34 / 500	 0.4425		 0.4463
35 / 500	 0.4434		 0.4474
36 / 500	 0.4412		 0.4449
37 / 500	 0.4407		 0.4445
38 / 500	 0.4403

In [234]:
predict_val = net(fasttext_test_reviews)

In [235]:
predict_result = []
for (neg_prob, pos_prob) in predict_val:
    if (neg_prob > pos_prob):
        predict = "negative"
    else:
        predict = "positive"
    predict_result.append(predict)


In [236]:
ffnn_score = accuracy_score(test_sentiments, predict_result)
ffnn_score

0.8817

In [237]:
0.8812

0.8812