In [1]:
import pandas as pd

# Load the IMDB dataset (replace with your dataset path)
df = pd.read_csv("IMDB Dataset.csv")
# Inspect the dataset
print(df.head())

# Assume the dataset has 'review' and 'sentiment' columns
# Convert 'positive' to 1 and 'negative' to 0
df['sentiment'] = df['sentiment'].map({'positive': 1, 'negative': 0})


                                              review sentiment
0  One of the other reviewers has mentioned that ...  positive
1  A wonderful little production. <br /><br />The...  positive
2  I thought this was a wonderful way to spend ti...  positive
3  Basically there's a family where a little boy ...  negative
4  Petter Mattei's "Love in the Time of Money" is...  positive


In [2]:
# Sample a subset of 1000 examples (500 positive, 500 negative)
subset_size = 1000
df_subset = df.groupby('sentiment', group_keys=False).apply(lambda x: x.sample(subset_size // 2))

# Save the subset to a new CSV file
df_subset.to_csv("imdb_subset.csv", index=False)

# Inspect the subset
print(df_subset['sentiment'].value_counts())


sentiment
0    500
1    500
Name: count, dtype: int64


  df_subset = df.groupby('sentiment', group_keys=False).apply(lambda x: x.sample(subset_size // 2))


In [3]:
from sklearn.model_selection import train_test_split

# Split the subset
train_texts, test_texts, train_labels, test_labels = train_test_split(
    df_subset['review'], df_subset['sentiment'], test_size=0.2, random_state=42
)


In [4]:
## Training a BERT models

from transformers import BertTokenizer


In [5]:
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Tokenizing the data

def tokenize_data(texts):
    return tokenizer(
        texts.tolist(),
        max_length = 128,
        padding = "max_length",
        truncation = True,
        return_tensors = 'pt'
    )

train_encodings = tokenize_data(train_texts)
test_encodings = tokenize_data(test_texts)


In [6]:
import torch

class IMDBDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item["labels"] = torch.tensor(self.labels.iloc[idx])
        return item

# Create datasets
train_dataset = IMDBDataset(train_encodings, train_labels)
test_dataset = IMDBDataset(test_encodings, test_labels)


In [7]:
## Fine tuning the BERT model using Hugging face Trainer

from transformers import BertForSequenceClassification, Trainer, TrainingArguments

In [8]:
from transformers import BertForSequenceClassification, Trainer, TrainingArguments

# Load pre-trained BERT model
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)

# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10
)



Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [9]:
# Define the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset
)

# Train the model
trainer.train()


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch,Training Loss,Validation Loss
1,0.6678,0.653108
2,0.5328,0.502917
3,0.3851,0.395518


Attempted to log scalar metric loss:
0.6859
Attempted to log scalar metric grad_norm:
6.177656173706055
Attempted to log scalar metric learning_rate:
1.866666666666667e-05
Attempted to log scalar metric epoch:
0.2
Attempted to log scalar metric loss:
0.724
Attempted to log scalar metric grad_norm:
3.865417003631592
Attempted to log scalar metric learning_rate:
1.7333333333333336e-05
Attempted to log scalar metric epoch:
0.4
Attempted to log scalar metric loss:
0.7005
Attempted to log scalar metric grad_norm:
5.42045259475708
Attempted to log scalar metric learning_rate:
1.6000000000000003e-05
Attempted to log scalar metric epoch:
0.6
Attempted to log scalar metric loss:
0.6829
Attempted to log scalar metric grad_norm:
6.729391098022461
Attempted to log scalar metric learning_rate:
1.4666666666666666e-05
Attempted to log scalar metric epoch:
0.8
Attempted to log scalar metric loss:
0.6678
Attempted to log scalar metric grad_norm:
4.550769329071045
Attempted to log scalar metric learning

  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Attempted to log scalar metric eval_loss:
0.3955175280570984
Attempted to log scalar metric eval_runtime:
66.4223
Attempted to log scalar metric eval_samples_per_second:
3.011
Attempted to log scalar metric eval_steps_per_second:
0.376
Attempted to log scalar metric epoch:
3.0
Attempted to log scalar metric train_runtime:
3470.6258
Attempted to log scalar metric train_samples_per_second:
0.692
Attempted to log scalar metric train_steps_per_second:
0.043
Attempted to log scalar metric total_flos:
157866633216000.0
Attempted to log scalar metric train_loss:
0.5614295546213786
Attempted to log scalar metric epoch:
3.0


TrainOutput(global_step=150, training_loss=0.5614295546213786, metrics={'train_runtime': 3470.6258, 'train_samples_per_second': 0.692, 'train_steps_per_second': 0.043, 'total_flos': 157866633216000.0, 'train_loss': 0.5614295546213786, 'epoch': 3.0})

In [12]:
from sklearn.metrics import accuracy_score, classification_report

# Make predictions
predictions = trainer.predict(test_dataset)
predicted_labels = torch.argmax(torch.tensor(predictions.predictions), dim=1)

# Evaluate
accuracy = accuracy_score(test_labels, predicted_labels)
print(f"Test Accuracy: {accuracy}")
print("Classification Report:")
print(classification_report(test_labels, predicted_labels))


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Test Accuracy: 0.84
Classification Report:
              precision    recall  f1-score   support

           0       0.81      0.86      0.84        96
           1       0.87      0.82      0.84       104

    accuracy                           0.84       200
   macro avg       0.84      0.84      0.84       200
weighted avg       0.84      0.84      0.84       200



In [15]:
# Save model and tokenizer
model.save_pretrained("./results")
tokenizer.save_pretrained("./results")

('./results\\tokenizer_config.json',
 './results\\special_tokens_map.json',
 './results\\vocab.txt',
 './results\\added_tokens.json')

In [18]:
pd.set_option('display.max_colwidth', None)  # No truncation for column values
pd.set_option('display.float_format', '{:.6f}'.format)

In [20]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification
import numpy as np

# Load the trained model and tokenizer
model_path = "./results"  # Replace with the directory where your BERT model is saved
tokenizer = BertTokenizer.from_pretrained(model_path)
model = BertForSequenceClassification.from_pretrained(model_path)
model.eval()

# Input reviews
new_review_raw = [
    "Wow, this movie was just what I needed to cure my insomnia. Absolutely thrilling!",
    "Best movie ever! I loved wasting 3 hours of my life on this masterpiece.",
    "Oh sure, the acting was so 'natural' I almost believed the actors were wooden dolls.",
    "Definitely recommend this movie... if you want to bore yourself to death.",
    "The movie was about two friends who embark on a journey. It has a runtime of two hours.",
    "It is a typical superhero movie with action scenes and some emotional moments.",
    "The cinematography was colorful, and the soundtrack was loud.",
    "The second half of the movie was longer than the first.",
    "It was okay, I guess, but I wouldn’t watch it again.",
    "Not bad, but not great either.",
    "I laughed, I cried, but I still don’t know if I liked it or not.",
    "The second half was much better than the first, though the ending was questionable.",
    "It was very good, super, fantastic.",
    "It was good until the second half",
    "It was second half",
    "Second half was good",
    "Movie is amazing, especially in the Second half",
    "Terrible movie, Second half was hilarious",
    "It was okay, Second half was hilarious",
    "Best movie if you are looking for a headache",
    "Lots of fun"
]

# Tokenize the reviews
tokenized_inputs = tokenizer(
    new_review_raw,
    padding=True,
    truncation=True,
    max_length=128,
    return_tensors="pt"
)

# Predict probabilities
with torch.no_grad():
    outputs = model(**tokenized_inputs)
    logits = outputs.logits
    probabilities = torch.softmax(logits, dim=1).numpy()

# Create a DataFrame for results
validationResults = pd.DataFrame()
validationResults['Test cases'] = new_review_raw
validationResults['Negative Probabilities'] = probabilities[:, 0]  # Negative class probabilities
validationResults['Positive Probabilities'] = probabilities[:, 1]  # Positive class probabilities

# Add predictions based on the threshold
validationResults['Prediction'] = np.where(validationResults['Positive Probabilities'] > 0.5, "Positive", "Negative")

# Display the results
validationResults


Unnamed: 0,Test cases,Negative Probabilities,Positive Probabilities,Prediction
0,"Wow, this movie was just what I needed to cure my insomnia. Absolutely thrilling!",0.450943,0.549057,Positive
1,Best movie ever! I loved wasting 3 hours of my life on this masterpiece.,0.272137,0.727863,Positive
2,"Oh sure, the acting was so 'natural' I almost believed the actors were wooden dolls.",0.628583,0.371417,Negative
3,Definitely recommend this movie... if you want to bore yourself to death.,0.606512,0.393488,Negative
4,The movie was about two friends who embark on a journey. It has a runtime of two hours.,0.425967,0.574033,Positive
5,It is a typical superhero movie with action scenes and some emotional moments.,0.217186,0.782814,Positive
6,"The cinematography was colorful, and the soundtrack was loud.",0.496208,0.503792,Positive
7,The second half of the movie was longer than the first.,0.698156,0.301844,Negative
8,"It was okay, I guess, but I wouldn’t watch it again.",0.747976,0.252024,Negative
9,"Not bad, but not great either.",0.749035,0.250965,Negative
