Christos Christidis

Build a vocabulary based on text, then perform binary classification towards two labels positive/negative based on movie reviews taken from the IMDB database

# **Imports**

In [None]:
# %pip install torchtext==0.6.0

In [1]:
import torchtext

In [None]:
# %pip install torch==2.3.0+cu121 -f https://download.pytorch.org/whl/torch_stable.html

In [8]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import time
import random
import pandas as pd
import os

In [33]:
import sklearn
from sklearn.metrics import confusion_matrix
import numpy as np

In [None]:
#%pip install spacy

In [3]:
import spacy

In [4]:
print(torch.__version__)

2.3.0+cu121


In [5]:
torch.backends.cudnn.deterministic = True

Settings

In [10]:
path = '/Sentiment Analysis'

In [6]:
random_state = 1000
torch.manual_seed(random_state)

VOCABULARY_SIZE = 20000 # Vocabulary of 20k most used words, to avoid future overfitting
LEARNING_RATE = 0.005
BATCH_SIZE = 128
NUM_EPOCHS = 15
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

EMBEDDING_DIM = 128
HIDDEN_DIM = 256
NUM_CLASSES = 2

In [7]:
DEVICE

device(type='cuda')

# **Preparing Data**

Loading the dataset

In [None]:
!wget https://github.com/rasbt/python-machine-learning-book-3rd-edition/raw/master/ch08/movie_data.csv.gz
!gunzip -f movie_data.csv.gz

In [None]:
df = pd.read_csv('movie_data.csv')
df.head()

Unnamed: 0,review,sentiment
0,"In 1974, the teenager Martha Moxley (Maggie Gr...",1
1,OK... so... I really like Kris Kristofferson a...,0
2,"***SPOILER*** Do not read this, if you think a...",0
3,hi for all the people who have seen this wonde...,1
4,"I recently bought the DVD, forgetting just how...",0


Use Spacy tokenizer since white space default tokenizing might not be as robust

In [None]:
# Features
TEXT = torchtext.data.Field(
    tokenize='spacy', # default splits on whitespace
    tokenizer_language='en_core_web_sm'
)

# Labels
LABEL = torchtext.data.LabelField(dtype=torch.long)

In [None]:
fields = [('review', TEXT), ('sentiment', LABEL)]

In [None]:
data = torchtext.data.TabularDataset(path = 'movie_data.csv', format = 'csv',
                                     skip_header = True, fields = fields)

In [None]:
# Splitting dataset

train, validation, test = data.split(split_ratio = [0.6, 0.2, 0.2],
                                     random_state = random.seed(random_state))

In [None]:
print('Total examples: ', len(train) + len(validation) + len(test))
print('Training examples: ', len(train))
print('Validation examples: ', len(validation))
print('Test examples: ', len(test))

Total examples:  50000
Training examples:  30000
Validation examples:  10000
Test examples:  10000


Training example

In [None]:
print(vars(train.examples[0]))

{'review': ['This', 'film', 'is', 'so', 'lovingly', 'made', 'you', 'want', 'to', 'be', 'part', 'of', 'it', 'forever', '.', 'The', 'flics', 'are', 'straight', 'but', 'not', 'without', 'malice', ',', 'the', 'goods', 'are', 'transparent', 'and', 'evildoers', 'are', 'hardly', 'there', '.', 'Even', 'the', '"', 'cabaret', '"', 'are', 'so', 'naive', 'they', "'ll", 'make', 'you', 'daydream', 'with', 'nostalgia', 'in', 'comparison', 'to', 'anything', 'available', 'on', 'TV', '.', 'Blier', 'is', 'fine', ',', 'if', 'a', 'bit', 'one', 'sided', '.', 'Louis', 'Jouvet', 'is', 'perfect', ',', 'you', 'just', 'ca', "n't", 'have', 'a', 'better', 'copper', '.', 'He', 'has', 'the', 'best', 'line', ':', '"', 'My', 'dad', 'cleaned', 'other', 'people', "'s", 'dirt', ',', 'and', 'I', 'do', 'the', 'same', '"', '.', 'Susy', 'Delair', 'is', 'unbearable', ',', 'but', 'I', 'guess', 'in', 'part', 'it', "'s", 'the', 'songs', ',', 'wardrobe', 'and', 'hairdo', '.', 'Simone', 'Renant', ',', 'on', 'the', 'contrary', ',',

**Building tokens**

In [None]:
TEXT.build_vocab(train, max_size = VOCABULARY_SIZE)
LABEL.build_vocab(train)

20000 words + uknown + pad tokens

In [None]:
len(TEXT.vocab)

20002

20 most frequently used words

In [None]:
TEXT.vocab.freqs.most_common(20)

[('the', 345270),
 (',', 328225),
 ('.', 281427),
 ('and', 186348),
 ('a', 186145),
 ('of', 171732),
 ('to', 158673),
 ('is', 129156),
 ('in', 104865),
 ('I', 93555),
 ('it', 91398),
 ('that', 83253),
 ('"', 76151),
 ("'s", 73368),
 ('this', 72269),
 ('-', 63389),
 ('/><br', 60500),
 ('was', 59655),
 ('movie', 51293),
 ('as', 51019)]

In [None]:
LABEL.vocab.freqs

Counter({'1': 14988, '0': 15012})

Classes are balanced, no need to handle class imbalance

In [None]:
LABEL.vocab.stoi

defaultdict(None, {'0': 0, '1': 1})

**Data Loaders**

In [None]:
train_loader, validation_loader, test_loader = torchtext.data.BucketIterator.splits(
    (train, validation, test), batch_size = BATCH_SIZE, sort_within_batch = False,
    sort_key = lambda x: len(x.review), device = DEVICE
)

# **Model**

* TODO: Modularize it. E.g. dictionary for the dims and then hidden = [torch.nn.LSTM, torch.nn.RNN] and make it as self.rnn = hidden[0]/hidden[1]
* Try different optimizer



In [16]:
class RNN(torch.nn.Module):

  def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim):
    super().__init__()
    self.embedding = torch.nn.Embedding(input_dim, embedding_dim) # Converts the words into the real value vectors
    self.rnn = torch.nn.LSTM(embedding_dim, hidden_dim) # The lstm takes in the embeddings and produces the hidden activations
    self.fc = torch.nn.Linear(hidden_dim, output_dim) # Classification layer

  def forward(self, text):
    # text dim: [sentence length, batch size]
    embedded = self.embedding(text)
    # embedded dim: [sentence length, batch size, embedding dim]
    output, (hidden, cell) = self.rnn(embedded) # Outputs the y but also the hidden state for the next layer
    # output dim: [sentence length, batch size, hidden dim]
    # hidden dim: [1, batch size, hidden dim]

    hidden.squeeze_(0) # Removing the first dimension from the fc layer for compatibility
    # hidden dim: [batch size, hidden dim]

    output = self.fc(hidden) # The final output is the hidden state of the last cell that goes through a fully connected layer

    return output

In [None]:
torch.manual_seed(random_state)

model = RNN(input_dim = len(TEXT.vocab), embedding_dim = EMBEDDING_DIM,
            hidden_dim = HIDDEN_DIM, output_dim = NUM_CLASSES)
model = model.to(DEVICE)
optimizer = torch.optim.Adam(model.parameters(), lr = LEARNING_RATE)

# **Training**

In [13]:
def compute_accuracy(model, data_loader, device):

  with torch.no_grad():
    correct_pred, num_examples = 0, 0

    for i, (features, targets) in enumerate(data_loader):
      features = features.to(device)
      targets = targets.float().to(device)

      logits = model(features)
      _, predicted_labels = torch.max(logits, 1)

      num_examples += targets.size(0)
      correct_pred += (predicted_labels==targets).sum()
  return correct_pred.float()/num_examples * 100

In [15]:
def trainer(model, NUM_EPOCHS, DEVICE, train_loader, validation_loader,
            test_loader, optimizer):
  start_time = time.time()

  for epoch in range(NUM_EPOCHS):
      model.train()
      for batch_idx, batch_data in enumerate(train_loader):

          text = batch_data.review.to(DEVICE)
          labels = batch_data.sentiment.to(DEVICE)

          ### FORWARD AND BACK PROP
          logits = model(text)
          loss = F.cross_entropy(logits, labels)
          optimizer.zero_grad()

          loss.backward()

          ### UPDATE MODEL PARAMETERS
          optimizer.step()

          ### LOGGING
          if not batch_idx % 50:
              print (f'Epoch: {epoch+1:03d}/{NUM_EPOCHS:03d} | '
                    f'Batch {batch_idx:03d}/{len(train_loader):03d} | '
                    f'Loss: {loss:.4f}')

      with torch.set_grad_enabled(False):
          print(f'Training accuracy: '
                f'{compute_accuracy(model, train_loader, DEVICE):.2f}%'
                f'\nValidation accuracy: '
                f'{compute_accuracy(model, validation_loader, DEVICE):.2f}%')

      print(f'Time elapsed: {(time.time() - start_time)/60:.2f} min')

  print(f'Total Training Time: {(time.time() - start_time)/60:.2f} min')
  print(f'Test accuracy: {compute_accuracy(model, test_loader, DEVICE):.2f}%')

In [None]:
trainer(model, NUM_EPOCHS, DEVICE, train_loader, validation_loader, test_loader,
        optimizer)

**Saving the model**

In [None]:
file_path = 'SentimentAnalysisLSTM.pth'
torch.save(model.state_dict(), file_path)
print("Model saved successfully.")

Model saved successfully.


# **Prediction**

In [30]:
spacy.prefer_gpu()
nlp = spacy.blank("en")

def predict_sentiment(model, sentence):

    model.eval()
    tokenized = [tok.text for tok in nlp.tokenizer(sentence)]
    indexed = [TEXT.vocab.stoi[t] for t in tokenized]
    length = [len(indexed)]
    tensor = torch.LongTensor(indexed).to(DEVICE)
    tensor = tensor.unsqueeze(1)
    length_tensor = torch.LongTensor(length)
    prediction = torch.nn.functional.softmax(model(tensor), dim=1)
    return prediction[0][0].item()

In [None]:
print('Probability Negative:')
predict_sentiment(model, "Pretty good movie, although with a sad ending!")

Probability Negative:


0.24909082055091858

In [None]:
print('Probability Positive:')
1 - predict_sentiment(model, "Pretty good movie, although with a sad ending!")

Probability Positive:


0.7509091794490814

In [None]:
test_reviews = ["Mixed feelings about this.",
                "Genuinely impressed! Everything was perfect!",
                "I didn't like a single second of this movie.",
                "This movie bangs! It's awesome!"]

for idx, review in enumerate(test_reviews):
  pred = predict_sentiment(model, review)
  print(review,  "\nProbability positive: " , f"{1 - pred:.2f}",
        "\nProbability Negative" , f"{pred:.2f}", "\n")


Mixed feelings about this. 
Probability positive:  0.01 
Probability Negative 0.99 

Genuinely impressed! Everything was perfect! 
Probability positive:  0.99 
Probability Negative 0.01 

I didn't like a single second of this movie. 
Probability positive:  0.35 
Probability Negative 0.65 

This movie bangs! It's awesome! 
Probability positive:  0.47 
Probability Negative 0.53 



# **Error Analysis**

Confussion Matrix

*   TODO: Visualize confusion matrix
*   TODO: Do more error analysis





In [None]:
# %pip install scikit-learn

In [29]:
def predict(model, test_loader):
    model.eval()
    predictions = []
    true_labels = []
    with torch.no_grad():
        for batch in test_loader:
            inputs = batch.review
            outputs = model(inputs)
            predicted_labels = torch.argmax(outputs, dim=1)
            predictions.extend(predicted_labels.tolist())
            true_labels.extend(batch.sentiment.tolist())
    return predictions, true_labels

In [50]:
predictions, true_labels = predict(model, test_loader)

predictions_np = np.array(predictions)
true_labels_np = np.array(true_labels)


conf_matrix = confusion_matrix(true_labels_np, predictions_np)

print("Confusion Matrix:")
print(conf_matrix)


In [None]:
cfm = confusion_matrix(model, test_loader)

# **Amazon Product Reviews**

Loading the new dataset - Amazon product reviews and the pre-trained model

* TODO: Keep the RNN parameters saved somewhere so when loading I don't have to load the previous dataset. (input_dim, embedding_dim...)



In [12]:
import os

In [18]:
os.environ['KAGGLE_CONFIG_DIR'] = path

In [None]:
!kaggle datasets download -d yacharki/amazon-reviews-for-sa-binary-negative-positive-csv

amazon-reviews-for-sa-binary-negative-positive-csv.zip: Skipping, found more recently modified local copy (use --force to force download)


In [None]:
import zipfile

with zipfile.ZipFile('amazon-reviews-for-sa-binary-negative-positive-csv.zip', 'r') as zip_ref:
    zip_ref.extractall(path)

Loading the new dataset

In [54]:
os.listdir(cwd)

['amazon_review_sa_binary_csv',
 'amazon_test.csv',
 'amazon_train.csv',
 'Logs',
 'LSTM_Sentiment_Analyis.ipynb',
 'movie_data.csv.gz',
 'SentimentAnalysisLSTM.pth']

In [None]:
df_amazon_train = pd.read_csv('amazonreview/train.csv')

In [61]:
df_amazon_test = pd.read_csv("amazonreview/test.csv")

In [None]:
df_amazon_train.head()

Unnamed: 0,class_index,review_title,review_text
0,2,Stuning even for the non-gamer,This sound track was beautiful! It paints the ...
1,2,The best soundtrack ever to anything.,I'm reading a lot of reviews saying that this ...
2,2,Amazing!,This soundtrack is my favorite music of all ti...
3,2,Excellent Soundtrack,I truly like this soundtrack and I enjoy video...
4,2,"Remember, Pull Your Jaw Off The Floor After He...","If you've played the game, you know how divine..."


In [None]:
df_amazon_train = df_amazon_train.drop(['review_title'], axis=1)
df_amazon_test = df_amazon_test.drop(['review_title'], axis=1)

Formatting the dataset to fit the format of the previous dataset in order to be used for the data loaders.

In [None]:
df_amazon_train['class_index'] = df_amazon_train['class_index'].replace({1: 0, 2: 1})
df_amazon_train = df_amazon_train.rename(columns={'class_index': 'sentiment', 'review_text': 'review'})

df_amazon_test['class_index'] = df_amazon_test['class_index'].replace({1: 0, 2: 1})
df_amazon_test = df_amazon_test.rename(columns={'class_index': 'sentiment', 'review_text': 'review'})

In [None]:
new_order = ['review', 'sentiment'] + [col for col in df_amazon_train.columns if col not in ['review', 'sentiment']]
df_amazon_train = df_amazon_train[new_order]
new_order = ['review', 'sentiment'] + [col for col in df_amazon_test.columns if col not in ['review', 'sentiment']]
df_amazon_test = df_amazon_test[new_order]

In [None]:
df_amazon_train

Unnamed: 0,review,sentiment
0,This sound track was beautiful! It paints the ...,1
1,I'm reading a lot of reviews saying that this ...,1
2,This soundtrack is my favorite music of all ti...,1
3,I truly like this soundtrack and I enjoy video...,1
4,"If you've played the game, you know how divine...",1
...,...,...
3599995,The high chair looks great when it first comes...,0
3599996,I have used this highchair for 2 kids now and ...,0
3599997,"We have a small house, and really wanted two o...",0
3599998,not sure what this book is supposed to be. It ...,0


In [None]:
df_amazon_test

Unnamed: 0,review,sentiment
0,My lovely Pat has one of the GREAT voices of h...,1
1,Despite the fact that I have only played a sma...,1
2,I bought this charger in Jul 2003 and it worke...,0
3,Check out Maha Energy's website. Their Powerex...,1
4,Reviewed quite a bit of the combo players and ...,1
...,...,...
399995,We bought this Thomas for our son who is a hug...,0
399996,My son recieved this as a birthday gift 2 mont...,0
399997,"I bought this toy for my son who loves the ""Th...",0
399998,This is a compilation of a wide range of Mitfo...,1


In [None]:
df_amazon_train.head(50000).to_csv('amazon_train.csv', index=False) # Saving only 50k entries of the dataset
df_amazon_test.head(20000).to_csv('amazon_test.csv', index=False)

In [None]:
del df_amazon_train
del df_amazon_test
del new_order

# **Transfer Learning**

Producing the dataloaders - Run from here if train and test csv uploaded

In [18]:
TEXT = torchtext.data.Field(
    tokenize='spacy', # default splits on whitespace
    tokenizer_language='en_core_web_sm'
)

# Labels
LABEL = torchtext.data.LabelField(dtype=torch.long)

In [19]:
fields = [('review', TEXT), ('sentiment', LABEL)]

amazon_train_data = torchtext.data.TabularDataset(path = 'amazon_train.csv', format = 'csv',
                                     skip_header = True, fields = fields)
amazon_test_data = torchtext.data.TabularDataset(path = 'amazon_test.csv', format = 'csv',
                                     skip_header = True, fields = fields)

In [20]:
amazon_train_data, amazon_validation_data = amazon_train_data.split(split_ratio = [0.7, 0.3],
                                     random_state = random.seed(random_state))

In [21]:
TEXT.build_vocab(amazon_train_data, max_size = VOCABULARY_SIZE)
LABEL.build_vocab(amazon_train_data)

In [17]:
LABEL.vocab.freqs

Counter({'1': 17794, '0': 17206})

Again, classes are balanced in the new dataset so no further class imbalance methods need to be considered.

In [22]:
train_loader, validation_loader, test_loader = torchtext.data.BucketIterator.splits(
    (amazon_train_data, amazon_validation_data, amazon_test_data),
    batch_size = BATCH_SIZE, sort_within_batch = False,
    sort_key = lambda x: len(x.review), device = DEVICE
)

In [70]:
'''train_loader = torch.utils.data.DataLoader(amazon_train_data, batch_size = BATCH_SIZE, num_workers = 4)
validation_loader = torch.utils.data.DataLoader(amazon_validation_data, batch_size = BATCH_SIZE, num_workers = 4)
test_loader = torch.utils.data.DataLoader(amazon_test_data, batch_size = BATCH_SIZE, num_workers = 4)'''

In [23]:
file_name = 'SentimentAnalysisLSTM.pth'
pth_file_path = os.path.join(path, file_name)
pretrained_model = RNN(input_dim = len(TEXT.vocab), embedding_dim = EMBEDDING_DIM,
            hidden_dim = HIDDEN_DIM, output_dim = NUM_CLASSES)
pretrained_model.load_state_dict(torch.load(pth_file_path))
pretrained_model.to(DEVICE)

RNN(
  (embedding): Embedding(20002, 128)
  (rnn): LSTM(128, 256)
  (fc): Linear(in_features=256, out_features=2, bias=True)
)

Freezing the pre trained layers and evaluating the pretrained model on the new dataset

In [24]:
for param in pretrained_model.parameters():
    param.requires_grad = False

In [70]:
print(f'Test accuracy: {compute_accuracy(pretrained_model, test_loader, DEVICE):.2f}%')

Test accuracy: 51.67%


In [12]:
class PretrainedRNN(torch.nn.Module):
    def __init__(self, pretrained_model, num_layers_to_add):
        super().__init__()
        self.pretrained_model = pretrained_model
        self.num_layers_to_add = num_layers_to_add

        # Freeze the parameters of the pretrained model
        for param in self.pretrained_model.parameters():
            param.requires_grad = False

        # Add additional LSTM layers
        self.additional_layers = torch.nn.ModuleList([
            torch.nn.LSTM(pretrained_model.rnn.hidden_size, pretrained_model.rnn.hidden_size)
            for _ in range(num_layers_to_add)
        ])

    def forward(self, text):
        embedded = self.pretrained_model.embedding(text)
        output, (hidden, cell) = self.pretrained_model.rnn(embedded)

        # Pass through additional LSTM layers
        for layer in self.additional_layers:
            output, (hidden, cell) = layer(output)

        hidden.squeeze_(0)
        output = self.pretrained_model.fc(hidden)

        return output

In [56]:
ext_model = PretrainedRNN(pretrained_model, num_layers_to_add=2)
ext_model.to(DEVICE)

PretrainedRNN(
  (pretrained_model): RNN(
    (embedding): Embedding(20002, 128)
    (rnn): LSTM(128, 256)
    (fc): Linear(in_features=256, out_features=2, bias=True)
  )
  (additional_layers): ModuleList(
    (0-1): 2 x LSTM(256, 256)
  )
)

In [None]:
optimizer = torch.optim.Adam(ext_model.parameters(), lr = LEARNING_RATE)

In [26]:
trainer(ext_model, NUM_EPOCHS, DEVICE, train_loader, validation_loader, test_loader,
        optimizer)

Epoch: 001/015 | Batch 000/274 | Loss: 0.6925
Epoch: 001/015 | Batch 050/274 | Loss: 0.6974
Epoch: 001/015 | Batch 100/274 | Loss: 0.6814
Epoch: 001/015 | Batch 150/274 | Loss: 0.6921
Epoch: 001/015 | Batch 200/274 | Loss: 0.6900
Epoch: 001/015 | Batch 250/274 | Loss: 0.6893
Training accuracy: 52.69%
Validation accuracy: 51.21%
Time elapsed: 6.67 min
Epoch: 002/015 | Batch 000/274 | Loss: 0.6965
Epoch: 002/015 | Batch 050/274 | Loss: 0.6867
Epoch: 002/015 | Batch 100/274 | Loss: 0.6943
Epoch: 002/015 | Batch 150/274 | Loss: 0.6944
Epoch: 002/015 | Batch 200/274 | Loss: 0.6847
Epoch: 002/015 | Batch 250/274 | Loss: 0.6938
Training accuracy: 53.04%
Validation accuracy: 49.76%
Time elapsed: 13.32 min
Epoch: 003/015 | Batch 000/274 | Loss: 0.6651
Epoch: 003/015 | Batch 050/274 | Loss: 0.6815
Epoch: 003/015 | Batch 100/274 | Loss: 0.7039
Epoch: 003/015 | Batch 150/274 | Loss: 0.6871
Epoch: 003/015 | Batch 200/274 | Loss: 0.6929
Epoch: 003/015 | Batch 250/274 | Loss: 0.6847
Training accuracy

Saving the new model. Loading the new model below to avoid future training again

In [28]:
file_path = 'AmazonReviewLSTM.pth'
torch.save(ext_model.state_dict(), file_path)
print("Model saved successfully.")

Model saved successfully.


In [26]:
file_name = 'AmazonReviewLSTM.pth'
pth_file_path = os.path.join(path, file_name)
pretrained_model = PretrainedRNN(pretrained_model, 2)
pretrained_model.load_state_dict(torch.load(pth_file_path))
pretrained_model.to(DEVICE)

PretrainedRNN(
  (pretrained_model): RNN(
    (embedding): Embedding(20002, 128)
    (rnn): LSTM(128, 256)
    (fc): Linear(in_features=256, out_features=2, bias=True)
  )
  (additional_layers): ModuleList(
    (0-1): 2 x LSTM(256, 256)
  )
)

In [27]:
for param in pretrained_model.parameters():
    param.requires_grad = False

Evaluating and predicting with the new model

In [31]:
test_reviews = ["Mixed feelings about this.",
                "Genuinely impressed! Everything was perfect!",
                "I didn't like a single second of this movie.",
                "This movie bangs! It's awesome!"]

for idx, review in enumerate(test_reviews):
  pred = predict_sentiment(pretrained_model, review)
  print(review,  "\nProbability positive: " , f"{1 - pred:.2f}",
        "\nProbability Negative" , f"{pred:.2f}", "\n")

Mixed feelings about this. 
Probability positive:  0.25 
Probability Negative 0.75 

Genuinely impressed! Everything was perfect! 
Probability positive:  0.18 
Probability Negative 0.82 

I didn't like a single second of this movie. 
Probability positive:  0.74 
Probability Negative 0.26 

This movie bangs! It's awesome! 
Probability positive:  0.25 
Probability Negative 0.75 



In [73]:
print(f'Test accuracy: {compute_accuracy(pretrained_model, test_loader, DEVICE):.2f}%')

Test accuracy: 67.69%
