# Introduction



This notebook builds a movie-review sentiment classifier that uses bag-of-words features and logistic regression as a baseline.

# Import

In [None]:
import pandas as pd
import matplotlib.pyplot as plt 
from gensim.parsing.preprocessing import remove_stopwords
from gensim.utils import simple_preprocess
from gensim.parsing.porter import PorterStemmer
import torch
from sklearn.model_selection import train_test_split

import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

from gensim.models import Word2Vec
import gensim
from gensim import corpora
from tqdm import tqdm
from sklearn.metrics import classification_report

import matplotlib.pyplot as plt
import numpy as np
%matplotlib inline

from collections import Counter
from sklearn.metrics import accuracy_score

# Load data and preprocess

In [None]:
# Load the review dataset; later cells read its `content` and `sentiment` columns.
df = pd.read_csv("movie_review_RT50K.csv")


In [None]:
# Tokenize each review with gensim's simple_preprocess; deacc=True strips accent marks.
df['tokenized_text'] = df['content'].map(lambda review: simple_preprocess(review, deacc=True))
df.head(3)

Unnamed: 0,content,sentiment,tokenized_text
0,"Compelling in fits and starts, actor-director ...",0,"[compelling, in, fits, and, starts, actor, dir..."
1,Quite simply one of the finest comic romances ...,1,"[quite, simply, one, of, the, finest, comic, r..."
2,A psychological thriller that dangles over the...,0,"[psychological, thriller, that, dangles, over,..."


In [None]:
# Stem every token with the Porter stemmer. This column must exist: the
# train/test split, the feature series and the dictionary builder in later
# cells all read `stemmed_tokens`.
porter_stemmer = PorterStemmer()
df['stemmed_tokens'] = [[porter_stemmer.stem(word) for word in tokens] for tokens in df['tokenized_text']]
df.head(3)

Unnamed: 0,content,sentiment,tokenized_text,stemmed_tokens
0,"Compelling in fits and starts, actor-director ...",0,"[compelling, in, fits, and, starts, actor, dir...","[compel, in, fit, and, start, actor, director,..."
1,Quite simply one of the finest comic romances ...,1,"[quite, simply, one, of, the, finest, comic, r...","[quit, simpli, on, of, the, finest, comic, rom..."
2,A psychological thriller that dangles over the...,0,"[psychological, thriller, that, dangles, over,...","[psycholog, thriller, that, dangl, over, the, ..."


In [None]:
# Hold out 20% of the reviews for testing; the fixed random_state keeps the split repeatable.
feature_columns = ['content', 'tokenized_text', 'stemmed_tokens']
X_train, X_test, Y_train, Y_test = train_test_split(
    df[feature_columns],
    df['sentiment'],
    test_size=0.2,
    shuffle=True,
    random_state=2021,
)

# Report the shape of each part of the split.
for split_part in (X_train, X_test, Y_train, Y_test):
    print(split_part.shape)

(40000, 3)
(10000, 3)
(40000,)
(10000,)


Convert the target labels to tensors.

In [None]:
# Turn the label Series into tensors so they can be sliced per batch during training.
train_y, test_y = (torch.tensor(labels.tolist()) for labels in (Y_train, Y_test))
for target in (train_y, test_y):
    print(target.shape)

torch.Size([40000])
torch.Size([10000])


In [None]:
# The model only consumes the stemmed token lists.
train_X = X_train['stemmed_tokens']
test_X = X_test['stemmed_tokens']

# Create a function to generate bag-of-words features

In [None]:
def make_dict(df, padding=True):
    """Build a gensim Dictionary from the `stemmed_tokens` column of `df`.

    Args:
        df: DataFrame whose `stemmed_tokens` column holds one token list per row.
        padding: when True, the dictionary is first created from the single
            document ['pad'] and the reviews are added afterwards.

    Returns:
        The populated `corpora.Dictionary`.
    """
    token_lists = df['stemmed_tokens']
    if not padding:
        return corpora.Dictionary(token_lists)

    # Register 'pad' before any review token is added.
    vocabulary = corpora.Dictionary([['pad']])
    vocabulary.add_documents(token_lists)
    return vocabulary

# Bag-of-words vectors need no padding token.
review_dict = make_dict(df, padding=False)

In [None]:

# Number of entries in the dictionary (24832 on this dataset).
VOCAB_SIZE = len(review_dict)#24832
# Number of output classes passed to the classifier.
NUM_LABELS = 2

def make_bow_vector(review_dict, sentence):
    """Build a bag-of-words count vector for one tokenized review.

    The vector width is derived from `review_dict` itself rather than from a
    module-level constant, so the function always agrees with the dictionary
    it is given.

    Args:
        review_dict: vocabulary exposing `token2id` (token -> integer id) and
            `len()`; the notebook passes a gensim `corpora.Dictionary`.
        sentence: iterable of token strings.

    Returns:
        A float32 tensor of shape (1, len(review_dict) + 1). Entry k holds the
        number of occurrences of the token whose id is k. Tokens missing from
        the vocabulary are ignored instead of raising KeyError.
    """
    token2id = review_dict.token2id
    # Keep the historical width (vocabulary size + 1) so the vector still
    # matches a model built with VOCAB_SIZE + 1 input features.
    vec = torch.zeros(len(review_dict) + 1, dtype=torch.float32)
    for word in sentence:
        token_id = token2id.get(word)
        if token_id is not None:
            vec[token_id] += 1
    return vec.view(1, -1)

# Smoke test on a handful of stemmed tokens: the result is a single row
# with VOCAB_SIZE+1 columns.
temp=make_bow_vector(review_dict,['compel','in','fit','and','start','actor','director'])
print(temp.shape)

torch.Size([1, 24833])


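The vocabulary contains 24,832 tokens, so each bag-of-words vector has 24,833 entries. Materializing dense vectors of that size for every review beforehand is impractical; instead, the vectors are built batch by batch during training and then moved to the GPU.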

# define logistic regression classifier

In [None]:
class LRBOW(nn.Module):
    """Logistic-regression classifier over bag-of-words vectors.

    A single linear layer maps a `vocab_size`-wide input to `num_labels`
    scores, which are then normalized to log-probabilities along dim 1.
    """

    def __init__(self, num_labels, vocab_size):
        """Create the linear layer and the log-softmax normalizer.

        Args:
            num_labels: number of output classes.
            vocab_size: width of the input feature vector.
        """
        super().__init__()
        self.linear = nn.Linear(vocab_size, num_labels)
        # Despite the attribute name, this layer computes log-softmax.
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, bow_vec):
        """Return per-class log-probabilities for a batch of feature rows."""
        return self.softmax(self.linear(bow_vec))


# tempc=make_bow_vector(review_dict,X_train.stemmed_tokens[0])
# tempc.shape
# model=LRBOW(NUM_LABELS,VOCAB_SIZE)
# model(tempc)


# training

In [None]:
# Sample counts that bound the batching loops in the training and evaluation cells.
trainLen, testLen = len(train_X), len(test_y)
print(trainLen,testLen)

40000 10000


In [None]:
# Use the GPU when one is available; otherwise fall back to the CPU so the
# notebook still runs end to end on machines without CUDA.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
%%time
BATCH_SIZE=100
VOCAB_SIZE = len(review_dict)#24832
NUM_LABELS = 2

model=LRBOW(NUM_LABELS,VOCAB_SIZE+1) #+1 for the interception
model.to(device)

loss_function = nn.NLLLoss()
optimizer = optim.SGD(model.parameters(), lr=0.01)

num_epochs=20
model.train()

for epoch in range(num_epochs):
  print("Epoch" + str(epoch + 1),'started')
  model.zero_grad()
  trainLoss=0;
  for i in tqdm(range(0,trainLen,BATCH_SIZE)):
      model.zero_grad()
  
      endIndex=min(i+BATCH_SIZE,trainLen)
      labels=train_y[i:endIndex].to(device) 

      tempX=[]

      for j in range(i,min(i+BATCH_SIZE,trainLen)):
        v = make_bow_vector(review_dict,train_X.iloc[j])
        tempX.append(v)        

      tempX=torch.cat(tempX,dim=-2).to(device)
      labels=train_y[i:endIndex].to(device)

      output = model(tempX)


      loss = loss_function(output, labels)
      trainLoss = trainLoss + loss.item()
      loss.backward()
      optimizer.step()


  trainLoss=round(trainLoss / trainLen,5)
  print('loss',loss.item())




  1%|▏         | 5/400 [00:00<00:09, 42.80it/s]

Epoch1 started


100%|██████████| 400/400 [00:10<00:00, 36.41it/s]
  1%|▏         | 5/400 [00:00<00:10, 39.09it/s]

loss 0.6891687512397766
Epoch2 started


100%|██████████| 400/400 [00:10<00:00, 37.20it/s]
  1%|          | 4/400 [00:00<00:10, 38.80it/s]

loss 0.6825233697891235
Epoch3 started


100%|██████████| 400/400 [00:11<00:00, 35.67it/s]
  1%|          | 4/400 [00:00<00:11, 35.67it/s]

loss 0.6752989292144775
Epoch4 started


100%|██████████| 400/400 [00:10<00:00, 37.03it/s]
  1%|▏         | 5/400 [00:00<00:09, 40.63it/s]

loss 0.6682677268981934
Epoch5 started


100%|██████████| 400/400 [00:10<00:00, 36.82it/s]
  1%|          | 4/400 [00:00<00:12, 32.13it/s]

loss 0.6617194414138794
Epoch6 started


100%|██████████| 400/400 [00:11<00:00, 34.61it/s]
  1%|▏         | 5/400 [00:00<00:09, 39.97it/s]

loss 0.6556941866874695
Epoch7 started


100%|██████████| 400/400 [00:10<00:00, 37.51it/s]
  1%|          | 4/400 [00:00<00:09, 39.82it/s]

loss 0.6501526832580566
Epoch8 started


100%|██████████| 400/400 [00:10<00:00, 36.98it/s]
  1%|          | 3/400 [00:00<00:16, 23.48it/s]

loss 0.6450387835502625
Epoch9 started


100%|██████████| 400/400 [00:10<00:00, 38.63it/s]
  1%|▏         | 5/400 [00:00<00:09, 42.52it/s]

loss 0.6402994394302368
Epoch10 started


100%|██████████| 400/400 [00:10<00:00, 37.43it/s]
  1%|▏         | 5/400 [00:00<00:09, 42.47it/s]

loss 0.6358882188796997
Epoch11 started


100%|██████████| 400/400 [00:11<00:00, 35.33it/s]
  1%|          | 3/400 [00:00<00:15, 25.64it/s]

loss 0.6317654848098755
Epoch12 started


100%|██████████| 400/400 [00:10<00:00, 37.67it/s]
  1%|▏         | 5/400 [00:00<00:09, 42.03it/s]

loss 0.6278976202011108
Epoch13 started


100%|██████████| 400/400 [00:10<00:00, 37.13it/s]
  1%|▏         | 5/400 [00:00<00:09, 40.15it/s]

loss 0.6242559552192688
Epoch14 started


100%|██████████| 400/400 [00:11<00:00, 35.34it/s]
  1%|          | 3/400 [00:00<00:14, 26.78it/s]

loss 0.6208153963088989
Epoch15 started


100%|██████████| 400/400 [00:11<00:00, 35.94it/s]
  1%|          | 3/400 [00:00<00:15, 25.45it/s]

loss 0.6175544857978821
Epoch16 started


100%|██████████| 400/400 [00:10<00:00, 38.46it/s]
  1%|▏         | 5/400 [00:00<00:09, 40.19it/s]

loss 0.6144548654556274
Epoch17 started


100%|██████████| 400/400 [00:10<00:00, 37.58it/s]
  1%|▏         | 5/400 [00:00<00:09, 40.57it/s]

loss 0.6115000247955322
Epoch18 started


100%|██████████| 400/400 [00:10<00:00, 38.51it/s]
  1%|          | 4/400 [00:00<00:10, 37.60it/s]

loss 0.6086758375167847
Epoch19 started


100%|██████████| 400/400 [00:10<00:00, 37.31it/s]
  1%|▏         | 5/400 [00:00<00:09, 41.84it/s]

loss 0.6059700846672058
Epoch20 started


100%|██████████| 400/400 [00:09<00:00, 41.24it/s]

loss 0.6033716797828674
CPU times: user 3min 35s, sys: 2.26 s, total: 3min 37s
Wall time: 3min 35s





In [None]:
BATCH_SIZE=100
y_pred = []
model.eval()

# Score the test reviews batch by batch without tracking gradients.
with torch.no_grad():
    for start in tqdm(range(0,testLen,BATCH_SIZE)):
        stop = min(start + BATCH_SIZE, testLen)

        # Stack the single-row bag-of-words vectors into one batch on the device.
        rows = [make_bow_vector(review_dict, test_X.iloc[row]) for row in range(start, stop)]
        batch = torch.cat(rows, dim=-2).to(device)

        # The predicted class is the one with the highest log-probability.
        log_probs = model(batch)
        y_pred.extend(torch.argmax(log_probs, dim=-1).cpu().tolist())

print()
print(classification_report(Y_test.tolist(),y_pred))

100%|██████████| 100/100 [00:02<00:00, 35.80it/s]


              precision    recall  f1-score   support

           0       0.72      0.73      0.73      5028
           1       0.72      0.71      0.72      4972

    accuracy                           0.72     10000
   macro avg       0.72      0.72      0.72     10000
weighted avg       0.72      0.72      0.72     10000




