<a href="https://colab.research.google.com/github/Arnabpauljohn/Railway-Management-System/blob/main/Movie%20review%20sentiment%20analysis%20using%20DistilBERT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install the Hugging Face libraries quietly (-q) into the Colab runtime.
# NOTE(review): `datasets` is not imported by any visible cell — confirm it is needed.
!pip install transformers datasets -q

In [None]:
# Import libraries
import os
os.environ["WANDB_DISABLED"] = "true"
import torch
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np

In [None]:
# Read the two raw review corpora from the Colab /content directory
imdb_csv = "/content/IMDB Dataset.csv"
rotten_csv = "/content/rotten_tomatoes_movie_reviews.csv"
imdb_df = pd.read_csv(imdb_csv)
rotten_df = pd.read_csv(rotten_csv)

In [None]:
# Prepare datasets
# Both corpora are reduced to the same two-column schema:
#   review -> the review text, label -> 1 for positive, 0 for negative.
SENTIMENT_TO_LABEL = {'positive': 1, 'negative': 0}


def _to_review_label(df, text_col, sentiment_col):
    """Return a new frame with columns `review` and `label` built from `df`.

    Sentiment strings are stripped and lower-cased before mapping so that
    differently-cased spellings (e.g. 'POSITIVE') are not silently discarded.
    Rows whose sentiment is anything other than positive/negative (including
    missing values) are dropped.  The input frame is not modified, which also
    avoids assigning into a filtered slice.
    """
    sentiment = df[sentiment_col].astype(str).str.strip().str.lower()
    keep = sentiment.isin(list(SENTIMENT_TO_LABEL))  # binary only
    return pd.DataFrame({
        'review': df.loc[keep, text_col],
        'label': sentiment[keep].map(SENTIMENT_TO_LABEL).astype(int),
    })


imdb_df = _to_review_label(imdb_df, 'review', 'sentiment')
rotten_df = _to_review_label(rotten_df, 'reviewText', 'scoreSentiment')


In [None]:
# Stack both corpora, discard rows missing text or label, store labels as ints
combined_df = (
    pd.concat([imdb_df, rotten_df], ignore_index=True)
    .dropna(subset=['review', 'label'])
    .astype({'label': int})
)

In [None]:
# Sample at most 2000 rows randomly for faster training.
# Capping n at the frame length avoids the ValueError that DataFrame.sample
# raises when asked for more rows than exist (sampling without replacement).
sample_size = min(2000, len(combined_df))
combined_df = combined_df.sample(n=sample_size, random_state=42).reset_index(drop=True)

In [None]:
# Hold out 20% of the sampled reviews for validation (fixed seed)
review_texts = combined_df['review'].astype(str).tolist()
review_labels = combined_df['label'].tolist()
train_texts, val_texts, train_labels, val_labels = train_test_split(
    review_texts, review_labels, test_size=0.2, random_state=42
)

In [None]:
# Load the DistilBERT tokenizer, then encode both splits with shared settings
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

encode_options = dict(truncation=True, padding=True, max_length=512)
train_encodings = tokenizer(train_texts, **encode_options)
val_encodings = tokenizer(val_texts, **encode_options)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

In [None]:
# Dataset class
class MovieDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with class labels.

    `encodings` is a mapping from field name to a per-example sequence and
    `labels` is a sequence of class ids; example `i` is built from position
    `i` of every field plus `labels[i]`.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        # One example per label.
        return len(self.labels)

    def __getitem__(self, idx):
        sample = {}
        for field_name, column in self.encodings.items():
            sample[field_name] = torch.tensor(column[idx])
        # Stored under 'labels' as an int64 tensor.
        sample['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return sample

# Wrap each split's encodings and labels in a MovieDataset
train_dataset = MovieDataset(encodings=train_encodings, labels=train_labels)
val_dataset = MovieDataset(encodings=val_encodings, labels=val_labels)

In [None]:
# Pretrained DistilBERT checkpoint configured for two output classes
checkpoint_name = 'distilbert-base-uncased'
model = DistilBertForSequenceClassification.from_pretrained(checkpoint_name, num_labels=2)


Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=1,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    save_strategy='epoch',       # write a checkpoint at the end of each epoch
    logging_dir='./logs',
    logging_steps=10,            # log the training loss every 10 steps
    save_total_limit=1,          # keep only the most recent checkpoint
    # Turn off external experiment trackers explicitly instead of relying on
    # the deprecated WANDB_DISABLED environment variable.
    report_to='none',
)

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [None]:
# Metrics function
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def compute_metrics(pred):
    """Compute binary classification metrics for an evaluation pass.

    Args:
        pred: object exposing `label_ids` (true class ids) and `predictions`
            (per-class scores, or a tuple whose first element is those scores).

    Returns:
        dict with keys 'accuracy', 'f1', 'precision' and 'recall'.  The last
        three are computed with average='binary'.
    """
    labels = pred.label_ids
    scores = pred.predictions
    if isinstance(scores, tuple):
        # Some models return extra outputs alongside the logits.
        scores = scores[0]
    preds = scores.argmax(-1)
    # zero_division=0 reports 0 for an undefined ratio (e.g. class 1 is never
    # predicted) without emitting UndefinedMetricWarning.
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, preds, average='binary', zero_division=0
    )
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }


In [None]:
# Bundle model, arguments, datasets and the metric callback into a Trainer
trainer_kwargs = dict(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)
trainer = Trainer(**trainer_kwargs)

In [None]:
# Run fine-tuning; the returned summary is echoed as the cell output
train_output = trainer.train()
train_output

Step,Training Loss
10,0.7169
20,0.6903
30,0.6275
40,0.468
50,0.5346
60,0.289
70,0.6201
80,0.3946
90,0.3847
100,0.5691


TrainOutput(global_step=400, training_loss=0.46658703256398437, metrics={'train_runtime': 85.7047, 'train_samples_per_second': 18.669, 'train_steps_per_second': 4.667, 'total_flos': 211947837849600.0, 'train_loss': 0.46658703256398437, 'epoch': 1.0})

In [None]:
# Score the validation split and print one "name: value" line per metric
eval_results = trainer.evaluate()
report_lines = ["Evaluation Results:"]
report_lines.extend(f"{key}: {value}" for key, value in eval_results.items())
print("\n".join(report_lines))

Evaluation Results:
eval_loss: 0.3625580668449402
eval_accuracy: 0.89
eval_f1: 0.8854166666666666
eval_precision: 0.8808290155440415
eval_recall: 0.8900523560209425
eval_runtime: 5.7055
eval_samples_per_second: 70.108
eval_steps_per_second: 17.527
epoch: 1.0


In [None]:
# Mount Google Drive at /content/drive (Colab only); the model is saved
# under this mount point in the following cell.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Persist the fine-tuned model and its tokenizer side by side on Drive
save_path = "/content/drive/MyDrive/bert_saved_model"
for artifact in (model, tokenizer):
    artifact.save_pretrained(save_path)

print(f"Model & tokenizer saved to {save_path}")

Model & tokenizer saved to /content/drive/MyDrive/bert_saved_model


In [None]:
import os

# Sanity check: list what was written to the save directory
if not os.path.exists(save_path):
    print("Model path not found.")
else:
    print("Saved files:")
    print(os.listdir(save_path))


Saved files:
['config.json', 'model.safetensors', 'tokenizer_config.json', 'special_tokens_map.json', 'vocab.txt', 'tokenizer.json']


In [None]:
def predict_sentiment(text):
    """Classify one movie review as "Positive" or "Negative".

    Uses the notebook-level `tokenizer` and `model`.

    Args:
        text: the review text (a single string).

    Returns:
        "Positive" when the highest-scoring class id is 1, otherwise "Negative".
    """
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)
    # Input tensors must live on the same device as the model weights.
    inputs = {k: v.to(model.device) for k, v in inputs.items()}
    # The model may still be in training mode (e.g. right after fine-tuning);
    # switch to eval mode so dropout is disabled and predictions are stable.
    model.eval()
    with torch.no_grad():
        logits = model(**inputs).logits
    # argmax along the class axis of the first (only) row rather than over the
    # flattened tensor.
    predicted_class_id = int(logits.argmax(dim=-1)[0])
    return "Positive" if predicted_class_id == 1 else "Negative"

# Read one review from the prompt and report its predicted sentiment
user_input = input("Enter a movie review: ")
sentiment = predict_sentiment(user_input)
print("Sentiment:", sentiment)


Enter a movie review: "A masterfully crafted drama that will leave you breathless. [Actor's name] delivers a stunning performance, and the direction is simply breathtaking. The film explores complex themes with nuance and grace, making it a must-watch for anyone who appreciates quality cinema."
Sentiment: Positive


In [None]:
# 1. Mount Google Drive
# Makes /content/drive available so the saved model directory can be read
# by the loading cell below (Colab only).
from google.colab import drive
drive.mount('/content/drive')


In [2]:
# 2. Load Model & Tokenizer
from transformers import (
    DistilBertForSequenceClassification,
    DistilBertTokenizer,
    DistilBertTokenizerFast,
)
import torch

# Load model and tokenizer from drive
model_path = "/content/drive/MyDrive/bert_saved_model"

# Use the same fast tokenizer class that produced the training encodings; the
# save directory contains the tokenizer.json it loads from.
tokenizer = DistilBertTokenizerFast.from_pretrained(model_path)
model = DistilBertForSequenceClassification.from_pretrained(model_path)


In [11]:
def predict_sentiment(text):
    """Classify one movie review as "Positive" or "Negative".

    Uses the notebook-level `tokenizer` and `model`.

    Args:
        text: the review text (a single string).

    Returns:
        "Positive" when the highest-scoring class id is 1, otherwise "Negative".
    """
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)
    # Keep the inputs on the model's device so the function also works when
    # the model has been moved off the CPU.
    inputs = {k: v.to(model.device) for k, v in inputs.items()}
    # Make inference mode explicit so dropout is never active here.
    model.eval()
    with torch.no_grad():
        logits = model(**inputs).logits
    # argmax along the class axis of the first (only) row rather than over the
    # flattened tensor.
    predicted_class_id = int(logits.argmax(dim=-1)[0])
    return "Positive" if predicted_class_id == 1 else "Negative"

# Read one review from the prompt and report its predicted sentiment
user_input = input("Enter a movie review: ")
sentiment = predict_sentiment(user_input)
print("Sentiment:", sentiment)

Enter a movie review: This movie is a real gem! The script is clever, the performances are outstanding, and the cinematography is breathtaking. I was completely captivated from beginning to end. If you're looking for a film that will move you and stay with you, this is it
Sentiment: Positive
