In [2]:
pip install torch transformers pandas numpy scikit-learn matplotlib




In [3]:
import matplotlib.pyplot as plt
import torch
import pandas as pd
import numpy as np
from torch.utils.data import DataLoader, Dataset
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
from torch.optim import AdamW
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from tqdm import tqdm


In [5]:
# Location of the raw IMDB reviews CSV (absolute path at the filesystem root).
csv_path = "/IMDB Dataset.csv"
df = pd.read_csv(csv_path)

In [6]:
# Work on a fixed, seeded 1000-row subset and renumber the rows from 0.
df = df.sample(n=1000, random_state=42)
df = df.reset_index(drop=True)

In [7]:
# Plain-text preview of the first five sampled rows.
print(df.head(n=5))

                                              review sentiment
0  I really liked this Summerslam due to the look...  positive
1  Not many television shows appeal to quite as m...  positive
2  The film quickly gets to a major chase scene w...  negative
3  Jane Austen would definitely approve of this o...  positive
4  Expectations were somewhat high for me when I ...  negative


In [10]:
# Fit the encoder on the sentiment strings, then store the integer codes in a
# new "label" column (the preview below shows negative -> 0, positive -> 1).
label_encoder = LabelEncoder().fit(df["sentiment"])
df["label"] = label_encoder.transform(df["sentiment"])


In [12]:
# Preview again to confirm the new integer "label" column.
print(df.head(n=5))

                                              review sentiment  label
0  I really liked this Summerslam due to the look...  positive      1
1  Not many television shows appeal to quite as m...  positive      1
2  The film quickly gets to a major chase scene w...  negative      0
3  Jane Austen would definitely approve of this o...  positive      1
4  Expectations were somewhat high for me when I ...  negative      0


In [13]:
# Hold out 20% of the rows for evaluation with a fixed seed.
# No `stratify` argument is passed, so the class balance of the two splits
# is not forced to match.
review_array = df["review"].to_numpy()
label_array = df["label"].to_numpy()
train_texts, test_texts, train_labels, test_labels = train_test_split(
    review_array,
    label_array,
    test_size=0.2,
    random_state=42,
)

In [15]:
# Load the tokenizer that matches the DistilBERT checkpoint.
tokenizer_checkpoint = 'distilbert-base-uncased'
tokenizer = DistilBertTokenizer.from_pretrained(tokenizer_checkpoint)

class SentimentDataset(Dataset):
    """Dataset that tokenizes one text per lookup and pairs it with its label.

    Args:
        texts: Indexable collection of texts; each item is passed through
            ``str()`` before tokenization.
        labels: Indexable collection of integer class labels, aligned with
            ``texts``.
        tokenizer: Callable invoked as ``tokenizer(text, truncation=True,
            padding="max_length", max_length=max_len, return_tensors="pt")``
            that returns a mapping with ``"input_ids"`` and
            ``"attention_mask"`` tensors.
        max_len: Length every sequence is padded/truncated to (default 128).
    """

    def __init__(self, texts, labels, tokenizer, max_len=128):
        self.texts, self.labels = texts, labels
        self.tokenizer, self.max_len = tokenizer, max_len

    def __len__(self):
        """Return the number of texts in the dataset."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return a dict with flattened ``input_ids``/``attention_mask`` and a long ``labels`` tensor."""
        review, target = str(self.texts[idx]), self.labels[idx]
        encoded = self.tokenizer(
            review,
            truncation=True,
            padding="max_length",
            max_length=self.max_len,
            return_tensors="pt",
        )
        # Collapse each tokenizer output to 1-D so that batching stacks the
        # samples along a new leading dimension.
        sample = {
            key: encoded[key].reshape(-1)
            for key in ("input_ids", "attention_mask")
        }
        sample["labels"] = torch.tensor(target, dtype=torch.long)
        return sample

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

In [16]:
# Wrap each split in a SentimentDataset (default max_len) that shares the tokenizer.
train_dataset = SentimentDataset(texts=train_texts, labels=train_labels, tokenizer=tokenizer)
test_dataset = SentimentDataset(texts=test_texts, labels=test_labels, tokenizer=tokenizer)

# Mini-batches of 8; only the training loader shuffles, so evaluation order is fixed.
loader_batch_size = 8
train_loader = DataLoader(dataset=train_dataset, batch_size=loader_batch_size, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=loader_batch_size, shuffle=False)

In [17]:
# Seed torch's global RNG before the model is built. The classification head
# is newly (randomly) initialised and the training DataLoader shuffles, so
# without a seed every run of the notebook trains a different model. The value
# matches the random_state used for sampling and splitting the data.
# NOTE: this improves repeatability; bit-exact results on GPU are not guaranteed.
torch.manual_seed(42)

# Prefer the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Pretrained DistilBERT encoder with a 2-class sequence-classification head.
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=2).to(device)

optimizer = AdamW(model.parameters(), lr=2e-5)

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [18]:
epochs = 2
# Put the model in training mode once, before the first epoch.
model.train()

for epoch in range(epochs):
    loop = tqdm(train_loader, leave=True)
    for batch in loop:
        # Move every tensor of the batch (input_ids, attention_mask, labels)
        # to the target device and feed them to the model by keyword.
        batch = {name: tensor.to(device) for name, tensor in batch.items()}
        # Passing `labels` makes the model return the loss alongside the logits.
        loss = model(**batch).loss

        # Standard step: clear stale gradients, backpropagate, update weights.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Show the current epoch and the latest batch loss on the progress bar.
        loop.set_description(f"Epoch {epoch+1}")
        loop.set_postfix(loss=loss.item())

Epoch 1: 100%|██████████| 100/100 [08:40<00:00,  5.20s/it, loss=0.219]
Epoch 2: 100%|██████████| 100/100 [08:29<00:00,  5.10s/it, loss=0.196]


In [19]:
# Switch to evaluation mode before scoring the held-out split.
model.eval()
predictions, real_values = [], []

# No gradients are needed for evaluation.
with torch.no_grad():
    for batch in test_loader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        # Predicted class = index of the largest logit for each example.
        preds = torch.argmax(outputs.logits, dim=1)

        predictions.extend(preds.cpu().tolist())
        # The labels are only compared on the host, so they are read straight
        # from the batch instead of being copied to the device and back.
        real_values.extend(batch["labels"].tolist())

print("\nAccuracy:", accuracy_score(real_values, predictions))
print("\nClassification Report:\n", classification_report(real_values, predictions, target_names=label_encoder.classes_))


Accuracy: 0.795

Classification Report:
               precision    recall  f1-score   support

    negative       0.76      0.93      0.84       114
    positive       0.87      0.62      0.72        86

    accuracy                           0.80       200
   macro avg       0.82      0.77      0.78       200
weighted avg       0.81      0.80      0.79       200



In [20]:
# Write the fine-tuned model and its tokenizer into the same directory so
# they can be reloaded together.
output_dir = "sentiment_model"
for artifact in (model, tokenizer):
    artifact.save_pretrained(output_dir)

print("\n✅ Model Training Complete & Saved at 'sentiment_model/'")


✅ Model Training Complete & Saved at 'sentiment_model/'


In [21]:
def predict_sentiment(text):
    """Classify one piece of text and return its sentiment label.

    Uses the notebook-level ``tokenizer``, ``model``, ``device`` and
    ``label_encoder``.

    Args:
        text: The review text to classify. It is truncated to at most 128
            tokens.

    Returns:
        The original sentiment string (one of ``label_encoder.classes_``)
        for the class with the highest logit.

    Side effects:
        Puts ``model`` into evaluation mode; call ``model.train()`` again
        before any further training.
    """
    # Do not rely on an earlier cell having called model.eval(): with the
    # model left in training mode, dropout makes predictions non-deterministic.
    model.eval()

    encoding = tokenizer(text, return_tensors='pt', truncation=True, padding=True, max_length=128)
    input_ids = encoding['input_ids'].to(device)
    attention_mask = encoding['attention_mask'].to(device)

    with torch.no_grad():
        # Keyword arguments keep the call independent of forward()'s parameter order.
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)

    prediction = torch.argmax(outputs.logits, dim=1).item()
    # Map the integer class index back to the original sentiment string.
    return label_encoder.inverse_transform([prediction])[0]

# Smoke-test the helper on a single example review.
example_review = "This movie was absolutely amazing!"
print("\nExample Prediction:", predict_sentiment(example_review))


Example Prediction: positive
