<a href="https://colab.research.google.com/github/Charles-Eret/Sentiment-Classifier/blob/main/TextClassification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# @title
# Mount Google Drive inside the Colab VM at /content/drive so that files kept
# on Drive (the dataset CSV is read from there further down) are reachable by path.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# @title
import os
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (eg. pd.read_csv)
import torch
import torch.nn as nn
import torch.nn.functional as F
from nltk.corpus import stopwords
from collections import Counter
import string
import re
import seaborn as sns
from tqdm import tqdm
import matplotlib.pyplot as plt
from torch.utils.data import TensorDataset, DataLoader
from sklearn.model_selection import train_test_split
import nltk
# Fetch the NLTK stop-word corpus; tokenize() reads it via stopwords.words('english').
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
# @title
# Select the compute device: CUDA when PyTorch can see a GPU, otherwise the CPU.
is_cuda = torch.cuda.is_available()
device = torch.device("cuda" if is_cuda else "cpu")
print("GPU is available" if is_cuda else "GPU not available, CPU used")

GPU is available


In [None]:
# @title
# Report where relative paths would resolve from.
print(f"Current working directory: {os.getcwd()}")

Current working directory: /content


In [None]:
# @title
# Absolute path of the IMDB reviews CSV on the mounted Drive; edit this when the
# notebook is run from a different account or folder layout.
base_csv = r"/content/drive/MyDrive/Colab Notebooks/Sentiment Analysis/IMDB Dataset.csv"
# Load the CSV into a DataFrame; the 'review' and 'sentiment' columns are used below.
df = pd.read_csv(base_csv)
# Last expression of the cell: display the first rows as a sanity check.
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [None]:
# @title
# Split the reviews and their labels 70/30 into train and test sets.
# stratify=y keeps the positive/negative ratio the same in both splits.
X,y = df['review'].values,df['sentiment'].values
# random_state pins the shuffle so that Restart & Run All reproduces the same
# split (and therefore the same vocabulary and comparable metrics) every time.
x_train,x_test,y_train,y_test = train_test_split(X,y,stratify=y,train_size=0.7,random_state=42)

In [None]:
# @title
# preprocessing
def preprocess_string(s):
  """Strip a token down to its letters (and underscores).

  Three regex passes are applied in order, each deleting what it matches:
    1. anything that is neither a word character nor whitespace (punctuation),
    2. every run of whitespace,
    3. every digit.

  Returns the cleaned string, which may be empty.
  """
  for pattern in (r"[^\w\s]", r"\s+", r"\d"):
    s = re.sub(pattern, '', s)
  return s

def padding_(sentences, seq_len):
  """Left-pad (with zeros) or truncate every sequence to exactly seq_len items.

  sentences: sequence of integer sequences (one per review).
  seq_len:   number of columns in the result.

  Returns an integer NumPy array of shape (len(sentences), seq_len). Short
  sequences sit flush against the right edge with zeros in front; sequences
  longer than seq_len keep only their first seq_len items; empty sequences
  become an all-zero row.
  """
  features = np.zeros((len(sentences), seq_len), dtype=int)
  for row, review in enumerate(sentences):
    if len(review) == 0:
      # nothing to copy; the row stays all zeros
      continue
    kept = np.array(review)[:seq_len]
    # right-align the kept items within the row
    features[row, seq_len - len(kept):] = kept
  return features

# tokenization
def tokenize(x_train,y_train,x_val,y_val,max_vocab=1000,seq_len=500):
  """Convert raw reviews and sentiment labels into padded index arrays.

  The vocabulary is built from x_train only: every review is lower-cased,
  split on whitespace and cleaned with preprocess_string; empty tokens and
  English stop words are discarded, and the max_vocab most frequent remaining
  words are numbered 1..max_vocab by descending frequency (0 is left free for
  padding). Reviews in both splits are then mapped to those indices (words
  outside the vocabulary are dropped) and padded/truncated with padding_.

  Parameters
    x_train, x_val: iterables of review strings.
    y_train, y_val: iterables of labels. The string 'positive' maps to 1 and
                    any other string to 0. Labels that are not strings are
                    treated as already encoded (1 stays 1, anything else is 0),
                    so re-running the calling cell on its own output does not
                    silently turn every label into 0.
    max_vocab:      number of most-frequent words kept (default 1000).
    seq_len:        length every encoded review is padded/truncated to
                    (default 500).

  Returns
    (train_features, train_labels, val_features, val_labels, onehot_dict)
    where the feature arrays come from padding_, the label arrays are NumPy
    arrays of 0/1, and onehot_dict maps word -> index.
  """
  # set of words to filter out of the vocabulary
  stop_words = set(stopwords.words('english'))

  def clean_tokens(sentence):
    # lower-case, split on whitespace and clean each token exactly once
    return [preprocess_string(word) for word in sentence.lower().split()]

  def encode_label(label):
    if isinstance(label, str):
      return 1 if label == 'positive' else 0
    # already-encoded label (e.g. from a previous tokenize() call)
    return 1 if label == 1 else 0

  # Clean the training reviews once and reuse the result for both the
  # frequency count and the index encoding below.
  train_tokens = [clean_tokens(sentence) for sentence in x_train]

  # dict subclass - key=word, value=count
  corpus = Counter(
    word
    for tokens in train_tokens
    for word in tokens
    if word != '' and word not in stop_words
  )
  # words ordered from most to least common, cut to the top max_vocab
  corpus_ = sorted(corpus,key=corpus.get,reverse=True)[:max_vocab]
  # word -> frequency rank, starting at 1
  onehot_dict = {w:i+1 for i,w in enumerate(corpus_)}

  # Each review becomes the list of ranks of its in-vocabulary words.
  final_list_train = [
    [onehot_dict[word] for word in tokens if word in onehot_dict]
    for tokens in train_tokens
  ]
  final_list_test = [
    [onehot_dict[word] for word in clean_tokens(sentence) if word in onehot_dict]
    for sentence in x_val
  ]

  # preparing the labels
  encoded_train = [encode_label(label) for label in y_train]
  encoded_test = [encode_label(label) for label in y_val]

  return padding_(final_list_train, seq_len), np.array(encoded_train), padding_(final_list_test, seq_len), np.array(encoded_test), onehot_dict

In [None]:
# @title
# Encode both splits. NOTE: y_train and y_test are rebound here, from the raw
# sentiment strings produced by the split cell to the 0/1 arrays returned by
# tokenize(); `vocab` is the word -> index mapping built from the training reviews.
x_train_pad,y_train,x_test_pad,y_test,vocab = tokenize(x_train,y_train,x_test,y_test)

In [None]:
# @title
# Mini-batch size used by both loaders (the training loop also reads it).
batch_size = 50

# Wrap the padded index matrices and their 0/1 labels as tensor datasets.
train_data = TensorDataset(*map(torch.from_numpy, (x_train_pad, y_train)))
valid_data = TensorDataset(*map(torch.from_numpy, (x_test_pad, y_test)))

# Shuffled, batched iterators over each dataset.
train_loader = DataLoader(dataset=train_data, batch_size=batch_size, shuffle=True)
valid_loader = DataLoader(dataset=valid_data, batch_size=batch_size, shuffle=True)

In [None]:
# @title
# Pull a single (inputs, labels) batch from the training loader so its
# contents can be inspected; sample_x / sample_y are not used by the training loop.
dataiter = iter(train_loader)
sample_x, sample_y = next(dataiter)

In [None]:
# @title
class SentimentRNN(nn.Module):
    """Embedding -> stacked LSTM -> dropout -> linear -> sigmoid classifier.

    Parameters
        no_layers:     number of stacked LSTM layers.
        vocab_size:    number of rows in the embedding table (vocabulary size
                       including the padding index).
        hidden_dim:    number of hidden units in each LSTM layer.
        embedding_dim: size of each word-embedding vector.
        drop_prob:     dropout probability applied to the LSTM output before
                       the linear layer (default 0.5).
        output_dim:    width of the linear layer's output (default 1). This
                       used to be read from a notebook-level global; it is now
                       an explicit, optional argument.
    """
    def __init__(self,no_layers,vocab_size,hidden_dim,embedding_dim,drop_prob=0.5,output_dim=1):
        super(SentimentRNN,self).__init__()

        self.output_dim = output_dim
        self.hidden_dim = hidden_dim

        self.no_layers = no_layers
        self.vocab_size = vocab_size

        # embedding and LSTM layers
        self.embedding = nn.Embedding(vocab_size, embedding_dim)

        # batch_first=True: inputs/outputs are (batch, seq, feature)
        self.lstm = nn.LSTM(input_size=embedding_dim,hidden_size=self.hidden_dim, num_layers=no_layers, batch_first=True)

        # dropout layer; honours the drop_prob argument instead of a fixed 0.3
        self.dropout = nn.Dropout(drop_prob)

        # linear and sigmoid layer
        self.fc = nn.Linear(self.hidden_dim, output_dim)
        self.sig = nn.Sigmoid()

    def forward(self,x,hidden):
        """Run a batch of index sequences through the network.

        x:      integer tensor of word indices, shape (batch, seq_len).
        hidden: (h0, c0) tuple as produced by init_hidden for this batch size.

        Returns (sig_out, hidden): sig_out has shape (batch,) and holds the
        sigmoid output for the final time step (last output unit when
        output_dim > 1); hidden is the LSTM's final (h_n, c_n).
        """
        # embeddings: (batch, seq_len, embedding_dim)
        embeds = self.embedding(x)
        # lstm_out: (batch, seq_len, hidden_dim)
        lstm_out, hidden = self.lstm(embeds, hidden)

        # Only the final time step feeds the prediction, so select it before
        # dropout/linear/sigmoid rather than computing all steps and
        # discarding all but the last.
        last_step = lstm_out[:, -1, :]

        # dropout and fully connected layer
        out = self.fc(self.dropout(last_step))
        # sigmoid function -> (batch, output_dim)
        sig_out = self.sig(out)

        # keep the last output unit -> (batch,)
        sig_out = sig_out[:, -1]

        # return last sigmoid output and hidden state
        return sig_out, hidden

    def init_hidden(self, batch_size):
        ''' Initializes hidden state '''
        # Two zero tensors of shape (no_layers, batch_size, hidden_dim) for the
        # LSTM hidden and cell state, created on whatever device the model's
        # parameters live on (no dependency on a notebook-level `device`).
        model_device = next(self.parameters()).device
        h0 = torch.zeros((self.no_layers,batch_size,self.hidden_dim), device=model_device)
        c0 = torch.zeros((self.no_layers,batch_size,self.hidden_dim), device=model_device)
        hidden = (h0,c0)
        return hidden

In [None]:
# @title
# Model hyperparameters.
no_layers = 2  # number of stacked LSTM layers
vocab_size = len(vocab) + 1 # word indices start at 1; the extra slot is index 0, used for padding
embedding_dim = 64  # size of each word-embedding vector
output_dim = 1  # intended width of the model's output layer (one probability per review)
hidden_dim = 256  # hidden units per LSTM layer

In [None]:
# @title
# Build the network from the hyperparameters above and place it on the selected device.
model = SentimentRNN(
    no_layers=no_layers,
    vocab_size=vocab_size,
    hidden_dim=hidden_dim,
    embedding_dim=embedding_dim,
    drop_prob=0.5,
).to(device)

# Last expression of the cell: shows the module summary.
model

SentimentRNN(
  (embedding): Embedding(1001, 64)
  (lstm): LSTM(64, 256, num_layers=2, batch_first=True)
  (dropout): Dropout(p=0.3, inplace=False)
  (fc): Linear(in_features=256, out_features=1, bias=True)
  (sig): Sigmoid()
)

In [None]:
# @title
# Loss and optimizer used by the training loop.
# Binary cross-entropy on probabilities: the model already applies a sigmoid,
# so its outputs are compared directly with the 0/1 labels.
criterion = nn.BCELoss()

# Adam updates all model parameters with a learning rate of 0.001.
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

def acc(pred,label):
    """Count correct predictions in a batch.

    pred holds predicted probabilities and label the 0/1 targets; both are
    squeezed, probabilities are rounded to the nearest class, and the number of
    positions where they agree is returned as a plain Python int (a count, not
    a ratio).
    """
    predicted_classes = pred.squeeze().round()
    matches = predicted_classes.eq(label.squeeze())
    return matches.sum().item()

In [None]:
# @title
# Train for a fixed number of epochs, recording per-epoch loss and accuracy
# for both the training and the validation set.
clip = 5      # max gradient norm
epochs = 5
# np.inf instead of np.Inf: the capitalised alias was removed in NumPy 2.0.
valid_loss_min = np.inf
# per-epoch history
epoch_tr_loss,epoch_vl_loss = [],[]
epoch_tr_acc,epoch_vl_acc = [],[]

for epoch in range(epochs):
    train_losses = []
    train_acc = 0.0
    #training model
    model.train()
    for inputs, labels in train_loader:

        inputs, labels = inputs.to(device), labels.to(device)
        # Fresh zero state for every batch, sized to the batch actually
        # received. Reviews are independent and shuffled, so state from the
        # previous batch must not leak in, and a short final batch no longer
        # causes a shape mismatch.
        h = model.init_hidden(inputs.size(0))

        model.zero_grad()
        output,h = model(inputs,h)

        # calculate the loss and perform backprop; output is already 1-D
        # (batch,), so it is passed as-is (squeezing would break batch size 1)
        loss = criterion(output, labels.float())
        loss.backward()
        train_losses.append(loss.item())
        # calculating accuracy (number of correct predictions in the batch)
        accuracy = acc(output,labels)
        train_acc += accuracy
        #`clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
        nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()

    val_losses = []
    val_acc = 0.0
    #model validation
    model.eval()
    # No gradients are needed for evaluation; skipping graph construction
    # saves memory and time.
    with torch.no_grad():
        for inputs, labels in valid_loader:
            inputs, labels = inputs.to(device), labels.to(device)
            val_h = model.init_hidden(inputs.size(0))

            output, val_h = model(inputs, val_h)
            val_loss = criterion(output, labels.float())

            val_losses.append(val_loss.item())

            accuracy = acc(output,labels)
            val_acc += accuracy

    # Loss is averaged over batches; accuracy is correct count / dataset size.
    epoch_train_loss = np.mean(train_losses)
    epoch_val_loss = np.mean(val_losses)
    epoch_train_acc = train_acc/len(train_loader.dataset)
    epoch_val_acc = val_acc/len(valid_loader.dataset)
    epoch_tr_loss.append(epoch_train_loss)
    epoch_vl_loss.append(epoch_val_loss)
    epoch_tr_acc.append(epoch_train_acc)
    epoch_vl_acc.append(epoch_val_acc)

In [None]:
# @title
def predict_text(text):
    """Return the model's positive-sentiment probability for one review.

    The text is lower-cased, split on whitespace and cleaned with
    preprocess_string (the same steps tokenize() applies to the training
    data), mapped to vocabulary indices (unknown words are dropped), padded to
    500 tokens and passed through the global `model`.

    The model is switched to eval mode and gradients are disabled for the
    call; the model's previous train/eval mode is restored afterwards.

    Returns a Python float in [0, 1].
    """
    # Lower-casing matters: the vocabulary was built from lower-cased text, so
    # without it every capitalised word would be treated as unknown.
    cleaned = (preprocess_string(word) for word in text.lower().split())
    word_seq = np.array([vocab[word] for word in cleaned if word in vocab], dtype=int)
    # add the batch dimension: (1, n_tokens)
    word_seq = np.expand_dims(word_seq,axis=0)
    pad =  torch.from_numpy(padding_(word_seq,500))
    inputs = pad.to(device)

    was_training = model.training
    model.eval()  # disable dropout for inference
    try:
        with torch.no_grad():
            h = model.init_hidden(1)
            output, h = model(inputs, h)
    finally:
        model.train(was_training)
    return output.item()


In [None]:
# @title
# Show one review with its true label, then the model's verdict on it.
index = 30
print(
    df['review'][index],
    '='*70,
    f'Actual sentiment is  : {df["sentiment"][index]}',
    '='*70,
    sep='\n',
)

pro = predict_text(df['review'][index])
if pro > 0.5:
    status = "positive"
else:
    status = "negative"
    # report the probability of the predicted (negative) class
    pro = 1 - pro
print(f'Predicted sentiment is {status} with a probability of {pro}')

Taut and organically gripping, Edward Dmytryk's Crossfire is a distinctive suspense thriller, an unlikely "message" movie using the look and devices of the noir cycle.<br /><br />Bivouacked in Washington, DC, a company of soldiers cope with their restlessness by hanging out in bars. Three of them end up at a stranger's apartment where Robert Ryan, drunk and belligerent, beats their host (Sam Levene) to death because he happens to be Jewish. Police detective Robert Young investigates with the help of Robert Mitchum, who's assigned to Ryan's outfit. Suspicion falls on the second of the three (George Cooper), who has vanished. Ryan slays the third buddy (Steve Brodie) to insure his silence before Young closes in.<br /><br />Abetted by a superior script by John Paxton, Dmytryk draws precise performances from his three starring Bobs. Ryan, naturally, does his prototypical Angry White Male (and to the hilt), while Mitchum underplays with his characteristic alert nonchalance (his role, howeve