In [None]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from collections import Counter

import seaborn as sns
import matplotlib.pyplot as plt

from tqdm.auto import tqdm
import time

In [None]:
# Fetch the NLTK resources used by clean_text below: 'stopwords' backs
# stopwords.words('english'); 'punkt' is presumably the tokenizer model that
# nltk.word_tokenize needs — verify against the installed NLTK version.
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
# Mount Google Drive into the Colab filesystem; the dataset and checkpoint
# paths used throughout this notebook live under /content/gdrive.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
# Base directory of the project on the mounted Drive. It ends with a trailing
# slash, so pieces appended to it do not need a leading "/".
path = "/content/gdrive/MyDrive/ML Project/Project Report/"

In [None]:
# Load the raw IMDB reviews. `path` already ends with "/", so the relative part
# carries no leading slash (this avoids a "//" in the resolved path and matches
# how the cleaned CSV is loaded further down).
data = pd.read_csv(path + "Datasets/IMDB Dataset.csv")

### **Cleaning the text**

In [None]:
def clean_text(text):
    """Normalize one raw review into a space-separated string of stemmed tokens.

    Steps, in order: strip HTML tags, replace every non-letter with a space,
    lowercase, tokenize with NLTK, drop English stopwords, Porter-stem.

    Args:
        text: Raw review text (str).

    Returns:
        str: The cleaned tokens joined by single spaces.
    """
    # Load the stopword list once and cache it on the function as a frozenset.
    # Calling stopwords.words('english') for every token rebuilds the list and
    # then scans it linearly, which dominates the runtime on a large corpus.
    try:
        stop_words = clean_text._stop_words
    except AttributeError:
        stop_words = clean_text._stop_words = frozenset(stopwords.words('english'))

    # Replace HTML tags with a space (not an empty string) so that the words
    # on either side of a tag such as "<br />" are not glued together.
    text = re.sub('<.*?>', ' ', text)
    # Keep letters only and lowercase; everything else becomes a separator.
    text = re.sub('[^a-zA-Z]', ' ', text).lower()

    stemmer = PorterStemmer()
    words = [
        stemmer.stem(w)
        for w in nltk.word_tokenize(text)
        if w not in stop_words  # filter before stemming, as the stopword list is unstemmed
    ]
    return ' '.join(words)

In [None]:
# Register tqdm's pandas integration (adds progress_apply), then clean every
# review. This runs one Python call per row, so it is slow on the full dataset.
tqdm.pandas()
data['cleaned_text'] = data['review'].progress_apply(clean_text)

### **Loading the cleaned comments**

In [None]:
# Rebind `data` to a previously saved CSV that already contains the
# `cleaned_text` column, presumably so the slow cleaning step does not have to
# be re-run — TODO confirm how out-2.csv was produced.
data = pd.read_csv(path+"Datasets/out-2.csv")

In [None]:
# Register tqdm's pandas integration so that progress_apply is available.
tqdm.pandas()
def transform_label(label):
  """Map a sentiment string to a binary target: 1 for "positive", 0 otherwise."""
  if label == "positive":
    return 1
  return 0
# Derive a numeric `label` column (1 = positive, 0 = negative) from
# `sentiment`, then preview the first rows of the frame.
data['label'] = data['sentiment'].progress_apply(transform_label)
data.head()

  0%|          | 0/50000 [00:00<?, ?it/s]

Unnamed: 0.1,Unnamed: 0,review,sentiment,cleaned_text,label
0,0,One of the other reviewers has mentioned that ...,positive,one review mention watch oz episod hook right ...,1
1,1,A wonderful little production. <br /><br />The...,positive,wonder littl product film techniqu unassum old...,1
2,2,I thought this was a wonderful way to spend ti...,positive,thought wonder way spend time hot summer weeke...,1
3,3,Basically there's a family where a little boy ...,negative,basic famili littl boy jake think zombi closet...,0
4,4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,petter mattei love time money visual stun film...,1


In [None]:
# Bag-of-words features limited to a 512-term vocabulary (max_features).
cv = CountVectorizer(max_features = 512)
# cv = CountVectorizer()
# NOTE(review): the vectorizer is fit on the full dataset, i.e. before the
# train/test split performed in the next cell.
X = cv.fit_transform(data['cleaned_text']).toarray()  # densify the sparse count matrix
# Targets are the raw "positive"/"negative" strings, not the numeric `label` column.
y = data['sentiment']

### **Logistic Regression for Sentiment Analysis**

In [None]:
from sklearn.model_selection import train_test_split
# shuffle=False keeps row order: the first 70% of rows are the training set
# and the last 30% are the test set.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3,shuffle=False)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Baseline classifier on the bag-of-words counts. max_iter is raised to 1000,
# presumably so the solver converges on this feature matrix.
model = LogisticRegression(max_iter = 1000)

model.fit(X_train, y_train)

y_pred = model.predict(X_test)
# Stored under a descriptive name instead of `acc`: a later cell defines a
# function called `acc` for the LSTM, and sharing the name makes the notebook
# break when cells are re-run out of order.
logreg_accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", logreg_accuracy)

Accuracy: 0.8474666666666667


**Saving the model**

In [None]:
import pickle
filename = 'ohe_model.pkl'

# Persist the fitted vectorizer together with the classifier as one
# (cv, model) tuple, so both can be restored with a single pickle.load.
with open(filename, 'wb') as fout:
    pickle.dump((cv, model), fout)

### **Sentiment Analysis using LSTM**

In [None]:
import numpy as np
from torch.utils.data import TensorDataset, DataLoader
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable

In [None]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
# Rebind X / y (previously the bag-of-words matrix and sentiment Series) to the
# cleaned review strings and sentiment strings as NumPy arrays.
X,y = data['cleaned_text'].values, data['sentiment'].values


In [None]:
from sklearn.model_selection import train_test_split
# Unshuffled 70/30 split of the text arrays: the first 70% of rows train, the last 30% test.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3,shuffle=False)

In [None]:
def tokenize(X_train,y_train,X_test,y_test,stop_words=None,vocab_size=1000):
    """Build a frequency-ranked vocabulary from the training text and index-encode both splits.

    Args:
        X_train: Iterable of training sentences (whitespace-separated tokens).
        y_train: Iterable of training labels; 'positive' maps to 1, anything else to 0.
        X_test: Iterable of test sentences, encoded with the training vocabulary.
        y_test: Iterable of test labels, encoded like y_train.
        stop_words: Optional iterable of words to exclude from the vocabulary.
            Defaults to NLTK's English stopword list.
        vocab_size: Maximum number of distinct words kept (most frequent first).

    Returns:
        Tuple ``(x_train, y_train, x_test, y_test, vocab)`` where the x arrays
        are object arrays holding one list of word indices per sentence, the y
        arrays are integer arrays of 0/1, and ``vocab`` maps word -> index
        (indices start at 1; 0 is left free for padding).
    """
    if stop_words is None:
        stop_words = stopwords.words('english')
    stop_words = set(stop_words)

    # Count tokens of the training split only, so the vocabulary never sees test data.
    corpus = Counter(
        word
        for sent in X_train
        for word in sent.lower().split()
        if word not in stop_words
    )
    # Most frequent first; sorted() is stable, so ties keep first-seen order.
    top_words = sorted(corpus, key=corpus.get, reverse=True)[:vocab_size]
    onehot_dict = {w: i + 1 for i, w in enumerate(top_words)}

    def encode(sentences):
        # Words outside the vocabulary are dropped rather than mapped to an UNK index.
        return [
            [onehot_dict[word] for word in sent.lower().split() if word in onehot_dict]
            for sent in sentences
        ]

    final_list_train = encode(X_train)
    final_list_test = encode(X_test)

    encoded_train = [1 if label == 'positive' else 0 for label in y_train]
    encoded_test = [1 if label == 'positive' else 0 for label in y_test]

    # dtype=object on BOTH splits: sentences have different lengths, and
    # building a ragged array without it warns on older NumPy and raises a
    # ValueError on NumPy >= 1.24.
    return (
        np.array(final_list_train, dtype=object),
        np.array(encoded_train),
        np.array(final_list_test, dtype=object),
        np.array(encoded_test),
        onehot_dict,
    )

In [None]:
# NOTE(review): this rebinds y_train / y_test from sentiment strings to 0/1
# integers. Re-running this cell without first re-running the split cell would
# feed integers back into tokenize, which compares against 'positive' and
# would therefore encode every label as 0.
x_train,y_train,x_test,y_test,vocab = tokenize(X_train,y_train,X_test,y_test)

  return np.array(final_list_train, dtype="object"), np.array(encoded_train),np.array(final_list_test), np.array(encoded_test),onehot_dict


In [None]:
# Sanity check on the size of the word -> index mapping returned by tokenize.
print(f'Length of vocabulary is {len(vocab)}')

Length of vocabulary is 1000


In [None]:
def padding_(sentences, seq_len):
    """Left-pad (with zeros) or truncate each index sequence to exactly seq_len.

    Sequences shorter than seq_len are right-aligned in their row; longer ones
    keep only their first seq_len entries. Empty sequences yield an all-zero row.
    Returns an integer array of shape (len(sentences), seq_len).
    """
    padded = np.zeros((len(sentences), seq_len), dtype=int)
    for row, tokens in zip(padded, sentences):
        count = len(tokens)
        if count == 0:
            continue
        # `row` is a view into `padded`, so this writes in place. A negative
        # start beyond the row length simply selects the whole row.
        row[-count:] = np.array(tokens)[:seq_len]
    return padded

In [None]:
# Very few reviews are longer than 500 tokens, so every sequence is fixed to
# length 500: shorter reviews are left-padded with zeros and longer ones keep
# only their first 500 tokens.
x_train_pad = padding_(x_train,500)
x_test_pad = padding_(x_test,500)

In [None]:
# Wrap the padded index matrices and the 0/1 labels as tensor datasets.
train_data = TensorDataset(torch.from_numpy(x_train_pad), torch.from_numpy(y_train))
valid_data = TensorDataset(torch.from_numpy(x_test_pad), torch.from_numpy(y_test))

# Batch size shared by both loaders and by the training cell further down.
batch_size = 50

# Shuffle the training data each epoch; the held-out split is shuffled too,
# which only changes the order in which its batches are visited.
train_loader = DataLoader(train_data, shuffle=True, batch_size=batch_size)
valid_loader = DataLoader(valid_data, shuffle=True, batch_size=batch_size)

In [None]:
# Pull a single batch from the training loader to sanity-check shapes and
# contents. The iterator created here is the one actually consumed (no second,
# throwaway iterator is built).
dataiter = iter(train_loader)
sample_x, sample_y = next(dataiter)

print('Sample input size: ', sample_x.size()) # batch_size, seq_length
print('Sample input: \n', sample_x)
# The second tensor holds the targets, so label it as such.
print('Sample labels: \n', sample_y)

Sample input size:  torch.Size([50, 500])
Sample input: 
 tensor([[  0,   0,   0,  ...,  16, 198,  18],
        [  0,   0,   0,  ..., 527,  53, 199],
        [  0,   0,   0,  ...,  28,  13, 760],
        ...,
        [  0,   0,   0,  ..., 679, 671,  70],
        [  0,   0,   0,  ...,  18,  70, 557],
        [  0,   0,   0,  ..., 698, 177,  70]])
Sample input: 
 tensor([1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0,
        0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1,
        0, 1])


In [None]:
# function to predict accuracy
def acc(pred,label):
    """Count the predictions in a batch that match their labels after rounding.

    Returns the number of correct predictions (a count, not a ratio), so the
    caller can sum over batches and divide by the dataset size.
    """
    rounded = pred.squeeze().round()
    hits = rounded.eq(label.squeeze())
    return hits.sum().item()

In [None]:

class SentimentRNN(nn.Module):
    """Embedding -> stacked LSTM -> dropout -> linear -> sigmoid classifier.

    Args:
        no_layers: Number of stacked LSTM layers.
        vocab_size: Number of rows in the embedding table (vocabulary size
            including the padding index).
        hidden_dim: Size of the LSTM hidden state.
        embedding_dim: Size of each word embedding.
        drop_prob: Dropout probability applied to the LSTM output before the
            linear layer.
        output_dim: Number of output units of the linear layer. It is an
            explicit argument (default 1) so the class does not depend on a
            module-level variable of the same name being defined first.
    """

    def __init__(self,no_layers,vocab_size,hidden_dim,embedding_dim,drop_prob=0.5,output_dim=1):
        super().__init__()

        self.output_dim = output_dim
        self.hidden_dim = hidden_dim
        self.no_layers = no_layers
        self.vocab_size = vocab_size

        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(input_size=embedding_dim, hidden_size=self.hidden_dim,
                            num_layers=no_layers, batch_first=True)
        # Honour the constructor argument instead of a hard-coded rate.
        self.dropout = nn.Dropout(drop_prob)
        self.fc = nn.Linear(self.hidden_dim, output_dim)
        self.sig = nn.Sigmoid()

    def forward(self,x,hidden):
        """Score a batch of index sequences.

        Args:
            x: Integer tensor of word indices, shape (batch, seq_len).
            hidden: Tuple (h, c) as produced by init_hidden.

        Returns:
            Tuple ``(sig_out, hidden)``: ``sig_out`` has shape (batch,) and holds
            the sigmoid output for the final time step; ``hidden`` is the
            updated LSTM state.
        """
        embeds = self.embedding(x)                     # (batch, seq_len, embedding_dim)
        lstm_out, hidden = self.lstm(embeds, hidden)   # (batch, seq_len, hidden_dim), batch_first=True

        # Only the final time step is returned, so select it before the
        # classifier head rather than scoring every step and discarding the rest.
        last_step = lstm_out[:, -1, :]
        sig_out = self.sig(self.fc(self.dropout(last_step)))  # (batch, output_dim)

        # Keep the last output unit per sample -> shape (batch,).
        return sig_out[:, -1], hidden

    def init_hidden(self, batch_size):
        ''' Return zero-initialised (h0, c0), each of shape (no_layers, batch_size, hidden_dim). '''
        # Allocate on the device (and dtype) of the model's own weights instead
        # of relying on a notebook-level `device` variable.
        weight = next(self.parameters())
        shape = (self.no_layers, batch_size, self.hidden_dim)
        h0 = weight.new_zeros(shape)
        c0 = weight.new_zeros(shape)
        return (h0, c0)

In [None]:
# Model hyperparameters.
no_layers = 2
vocab_size = len(vocab) + 1 # +1 because word indices start at 1 and index 0 is the padding value
embedding_dim = 64
output_dim = 1
hidden_dim = 256


model = SentimentRNN(no_layers,vocab_size,hidden_dim,embedding_dim,drop_prob=0.5)

# Move the weights to the selected device (GPU when available).
model.to(device)

print(model)

SentimentRNN(
  (embedding): Embedding(1001, 64)
  (lstm): LSTM(64, 256, num_layers=2, batch_first=True)
  (dropout): Dropout(p=0.3, inplace=False)
  (fc): Linear(in_features=256, out_features=1, bias=True)
  (sig): Sigmoid()
)


In [None]:
learning_rate = 0.001
# Binary cross-entropy on probabilities: the model's forward pass already ends
# in a sigmoid, so BCELoss (not BCEWithLogitsLoss) is the matching criterion.
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr = learning_rate)

In [None]:
clip = 5      # maximum gradient norm used for clipping
epochs = 5
valid_loss_min = np.inf  # `np.Inf` was removed in NumPy 2.0; `np.inf` works on every version
# Per-epoch history of losses and accuracies.
epoch_tr_loss,epoch_vl_loss = [],[]
epoch_tr_acc,epoch_vl_acc = [],[]

for epoch in range(epochs):
    # ---- training pass ----
    train_losses = []
    train_acc = 0.0
    model.train()
    # initialize hidden state
    h = model.init_hidden(batch_size)
    for inputs, labels in train_loader:

        inputs, labels = inputs.to(device), labels.to(device)
        # A final, shorter batch would not match the carried hidden state
        # (its batch dimension is fixed), so start from a fresh state for it.
        if inputs.size(0) != h[0].size(1):
            h = model.init_hidden(inputs.size(0))
        # Detach the hidden state from the previous batch's graph, otherwise
        # we'd backprop through the entire training history.
        h = tuple(each.detach() for each in h)

        model.zero_grad()
        output,h = model(inputs,h)

        # calculate the loss and perform backprop; `output` is already 1-D
        # (one probability per sample), matching labels.float().
        loss = criterion(output, labels.float())
        loss.backward()
        train_losses.append(loss.item())
        # number of correct predictions in this batch
        accuracy = acc(output,labels)
        train_acc += accuracy
        # `clip_grad_norm_` helps prevent the exploding gradient problem in RNNs / LSTMs.
        nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()

    # ---- validation pass ----
    val_h = model.init_hidden(batch_size)
    val_losses = []
    val_acc = 0.0
    model.eval()
    # No gradients are needed for evaluation; skipping them saves memory and time.
    with torch.no_grad():
        for inputs, labels in valid_loader:
            inputs, labels = inputs.to(device), labels.to(device)
            if inputs.size(0) != val_h[0].size(1):
                val_h = model.init_hidden(inputs.size(0))

            output, val_h = model(inputs, val_h)
            val_loss = criterion(output, labels.float())

            val_losses.append(val_loss.item())

            accuracy = acc(output,labels)
            val_acc += accuracy

    # ---- epoch summary ----
    epoch_train_loss = np.mean(train_losses)
    epoch_val_loss = np.mean(val_losses)
    # acc() returns counts, so divide by the dataset sizes to get ratios.
    epoch_train_acc = train_acc/len(train_loader.dataset)
    epoch_val_acc = val_acc/len(valid_loader.dataset)
    epoch_tr_loss.append(epoch_train_loss)
    epoch_vl_loss.append(epoch_val_loss)
    epoch_tr_acc.append(epoch_train_acc)
    epoch_vl_acc.append(epoch_val_acc)
    print(f'Epoch {epoch+1}') 
    print(f'train_loss : {epoch_train_loss} val_loss : {epoch_val_loss}')
    print(f'train_accuracy : {epoch_train_acc*100} val_accuracy : {epoch_val_acc*100}')
    # Checkpoint whenever the validation loss does not get worse.
    if epoch_val_loss <= valid_loss_min:
        torch.save(model.state_dict(), 'saved_model.pt')
        print('Validation loss decreased ({:.6f} --> {:.6f}).  Saving model ...'.format(valid_loss_min,epoch_val_loss))
        valid_loss_min = epoch_val_loss
    print(25*'==')
    

Epoch 1
train_loss : 0.6837761270999908 val_loss : 1.4686259472370147
train_accuracy : 54.15714285714286 val_accuracy : 50.06666666666667
Validation loss decreased (inf --> 1.468626).  Saving model ...
Epoch 2
train_loss : 0.4816354472083705 val_loss : 0.35993853042523066
train_accuracy : 77.09428571428572 val_accuracy : 84.44
Validation loss decreased (1.468626 --> 0.359939).  Saving model ...
Epoch 3
train_loss : 0.3399895536473819 val_loss : 0.3295093301186959
train_accuracy : 85.45142857142856 val_accuracy : 86.14
Validation loss decreased (0.359939 --> 0.329509).  Saving model ...
Epoch 4
train_loss : 0.3052210304566792 val_loss : 0.32481164346138636
train_accuracy : 87.09142857142858 val_accuracy : 86.42
Validation loss decreased (0.329509 --> 0.324812).  Saving model ...
Epoch 5
train_loss : 0.28127304433711936 val_loss : 0.314989915539821
train_accuracy : 88.25714285714285 val_accuracy : 86.78666666666666
Validation loss decreased (0.324812 --> 0.314990).  Saving model ...


**Loading the saved LSTM model**

In [None]:
model_save_name = 'saved_model_lstm_86.pt'
# Use a dedicated variable: reusing `path` would overwrite the dataset base
# directory defined near the top of the notebook and break its cells on re-run.
# NOTE(review): this location uses "My Drive" while the dataset paths use
# "MyDrive", and the file name differs from the 'saved_model.pt' written by the
# training cell — confirm this is the intended checkpoint.
model_path = f"/content/gdrive/My Drive/ML Project/{model_save_name}"
# map_location lets a checkpoint saved on a GPU load on a CPU-only runtime.
model.load_state_dict(torch.load(model_path, map_location=device))

<All keys matched successfully>

In [None]:
def predict_text(text):
    """Return the model's positive-sentiment probability for one cleaned text.

    Args:
        text: Whitespace-separated tokens, expected to be preprocessed the same
            way as the training text. Words missing from `vocab` are skipped.

    Returns:
        float: The sigmoid output of the model, between 0 and 1.
    """
    word_seq = np.array([vocab[word] for word in text.split() if word in vocab])
    word_seq = np.expand_dims(word_seq, axis=0)  # batch of one sequence
    inputs = torch.from_numpy(padding_(word_seq, 500)).to(device)

    # Inference only: switch off dropout so the result is deterministic, and
    # skip autograd bookkeeping. Without eval() the prediction depends on
    # whether a training cell happened to leave the model in eval mode.
    model.eval()
    with torch.no_grad():
        h = model.init_hidden(1)
        output, _ = model(inputs, h)
    return output.item()

In [None]:
# Spot-check one review: print its cleaned text, the true sentiment, and the
# LSTM prediction.
index = 60
print(data['cleaned_text'][index])
print('='*70)
print(f'Actual sentiment is  : {data["sentiment"][index]}')
print('='*70)
pro = predict_text(data['cleaned_text'][index])
status = "positive" if pro > 0.5 else "negative"
# Report the probability of the predicted class (1 - p when it is "negative").
pro = (1 - pro) if status == "negative" else pro
print(f'Predicted sentiment is {status} with a probability of {pro}')

happen basic solid plausibl premis decent talent cast somewher movi lose actual never realli got go littl excit find angi realli pregnant find steve martin talent person usual bring lot movi dread entir charact even close import movi make longer realli would like see interact main charact kate angi mayb tri pure comedi unfortun mayb drama comed element think movi could funni sinc actress quit funni way sit think numer scenario would riot
Actual sentiment is  : negative
Predicted sentiment is negative with a probability of 0.8907629922032356


In [None]:
# Load a CSV of movie comments; its `cleaned_comments` column is what gets
# passed to the classifier further down.
guard_gal_comments = pd.read_csv('/content/gdrive/MyDrive/ML Project/movie_comments/movieCommentsCleaned/movieCommentsCleaned1.csv')

In [None]:
# Preview the first five rows.
guard_gal_comments.head(5)

Unnamed: 0.1,Unnamed: 0,ID,comments,cleaned_comments
0,0,KIpGKumxiGg,M. Night better not fuck this up.,night better fuck
1,1,KIpGKumxiGg,The window thing made me laugh ha ha ha T_T,window thing made laugh ha ha ha
2,2,KIpGKumxiGg,Anyone else think Kevin was Michael Fassbender?,anyon els think kevin michael fassbend
3,3,KIpGKumxiGg,I was on board until he started &quot;altering...,board start quot alter bodi chemistri mind quot
4,4,KIpGKumxiGg,My god professor what happened to you?,god professor happen


In [None]:
def classify_comments(comments, threshold=0.4):
  """Print the predicted sentiment and its probability for each comment.

  Args:
    comments: Iterable of cleaned comment strings; each is passed to
      predict_text. Non-string entries (e.g. NaN, which pandas uses for empty
      CSV cells) are treated as an empty comment instead of raising.
    threshold: A comment is labelled "positive" when the predicted positive
      probability is strictly greater than this value. The default of 0.4 keeps
      the cut-off this function has always used; with a threshold below 0.5 a
      "positive" result can be printed with a probability under 0.5.

  Returns:
    None. Results are printed, one line per comment.
  """
  for comment in comments:
    text = comment if isinstance(comment, str) else ""
    pro = predict_text(text)
    status = "positive" if pro > threshold else "negative"
    # Report the probability of the predicted class.
    pro = (1 - pro) if status == "negative" else pro
    print(f'Predicted sentiment is {status} with a probability of {pro}')


In [None]:
# Classify the first five cleaned comments.
gog_comments = guard_gal_comments['cleaned_comments'].values[:5]
classify_comments(gog_comments)

Predicted sentiment is negative with a probability of 0.6350912153720856
Predicted sentiment is positive with a probability of 0.6268795728683472
Predicted sentiment is positive with a probability of 0.5481216907501221
Predicted sentiment is negative with a probability of 0.7138091027736664
Predicted sentiment is positive with a probability of 0.47551047801971436
