In [2]:
# Load the IMDB dataset from the CSV file
# Code explanation:
# This cell installs the required packages, imports the libraries, loads the IMDB dataset from a CSV file into a pandas DataFrame, and displays its shape, columns, and the first few rows.
# - Install and import the necessary libraries
# - Load the IMDB dataset from a CSV file into a DataFrame
# - Print a success message
# - Print the shape of the DataFrame
# - Print the column names of the DataFrame
# - Print the first few rows of the DataFrame
%pip install transformers datasets torch scikit-learn matplotlib seaborn tqdm
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from sklearn.metrics import accuracy_score, f1_score
import numpy as np
from tqdm import tqdm

# Read the IMDB reviews CSV into a DataFrame
df = pd.read_csv('imdb.csv')

# Summarise what was loaded: a status line, the frame's dimensions, its
# column names, and a preview of the leading rows (one item per output line).
print(
    "Dataset loaded successfully!",
    f"Dataset shape: {df.shape}",
    f"Columns: {df.columns.tolist()}",
    "First few rows:",
    df.head(),
    sep="\n",
)



[notice] A new release of pip is available: 25.0.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


Note: you may need to restart the kernel to use updated packages.Defaulting to user installation because normal site-packages is not writeable



  from .autonotebook import tqdm as notebook_tqdm


Dataset loaded successfully!
Dataset shape: (50000, 2)
Columns: ['review', 'sentiment']
First few rows:
                                              review sentiment
0  One of the other reviewers has mentioned that ...  positive
1  A wonderful little production. <br /><br />The...  positive
2  I thought this was a wonderful way to spend ti...  positive
3  Basically there's a family where a little boy ...  negative
4  Petter Mattei's "Love in the Time of Money" is...  positive


In [3]:
# Code explanation:
# This cell cleans the text data, converts sentiment labels to numeric values, and splits the dataset into training and testing subsets.
# - Define a function to clean text by removing HTML tags and extra whitespace.
# - Apply the cleaning function to the review column of the DataFrame.
# - Map sentiment labels to numeric values (1 for positive, 0 for negative).
# - Sample a subset of 10,000 reviews for faster training.
# - Split the subset into training and testing sets while maintaining label distribution.
# Clean and prepare the data
import re

def clean_text(text):
    """Strip HTML tags from ``text`` and collapse runs of whitespace.

    Line-break tags (``<br>``, ``<br/>``, ``<br />``, any letter case) are
    replaced by a space instead of being deleted, so that words separated
    only by a break (e.g. ``"great.<br /><br />The"``) are not fused into a
    single token. All other tags are removed outright, which keeps inline
    markup such as ``<b>bold</b>ly`` joined as ``boldly``.

    Args:
        text: The raw string to clean.

    Returns:
        The text without HTML tags, with every run of whitespace collapsed
        to a single space and no leading or trailing whitespace.
    """
    # Break tags act as word/paragraph separators: turn them into a space
    text = re.sub(r'<br\s*/?>', ' ', text, flags=re.IGNORECASE)
    # Remove every remaining HTML tag
    text = re.sub(r'<.*?>', '', text)
    # Remove extra whitespace (also trims both ends)
    text = ' '.join(text.split())
    return text

# Clean the reviews
df['review'] = df['review'].apply(clean_text)

# Convert sentiment labels to numeric.
# Series.map() yields NaN for any value that is missing from the mapping, so
# fail loudly here instead of letting NaN labels flow into the stratified
# split and the label tensor.
label_map = {'positive': 1, 'negative': 0}
df['labels'] = df['sentiment'].map(label_map)
unmapped_rows = df['labels'].isna()
if unmapped_rows.any():
    unexpected_values = df.loc[unmapped_rows, 'sentiment'].unique().tolist()
    raise ValueError(
        f"Unexpected sentiment values (cannot map to 0/1): {unexpected_values}"
    )

# Take a smaller subset for faster training (10,000 samples, capped at the
# dataset size so sampling cannot fail on a smaller CSV)
subset_size = min(10000, len(df))
df_subset = df.sample(n=subset_size, random_state=42).reset_index(drop=True)

print(f"Using subset of {len(df_subset)} samples")
print(f"Label distribution:")
print(df_subset['labels'].value_counts())

# Split into train and test
from sklearn.model_selection import train_test_split

# 80/20 split; stratify keeps the label proportions equal in both parts
train_texts, test_texts, train_labels, test_labels = train_test_split(
    df_subset['review'].tolist(),
    df_subset['labels'].tolist(),
    test_size=0.2,
    random_state=42,
    stratify=df_subset['labels']
)

print(f"Training samples: {len(train_texts)}")
print(f"Test samples: {len(test_texts)}")

Using subset of 10000 samples
Label distribution:
labels
1    5039
0    4961
Name: count, dtype: int64
Training samples: 8000
Test samples: 2000


In [4]:
#Code Explanation
#The code snippet initializes a DistilBERT model and tokenizer for binary sentiment classification and tokenizes the training and test datasets.
#- Set model name to 'distilbert-base-uncased'.
#- Load tokenizer and model for sequence classification.
#- Define a function to tokenize text data with specified maximum length.
#- Tokenize training and test data, storing input IDs, attention masks, and labels.
#- Print shapes of tokenized training and test inputs.

import os

# Unset CA bundle environment variables if set (fixes TLS CA certificate bundle error).
# These are removed from this process's environment only (os.environ).
os.environ.pop("REQUESTS_CA_BUNDLE", None)
os.environ.pop("CURL_CA_BUNDLE", None)

# Seed PyTorch's global RNG before the model is built: the classification
# head is not part of the checkpoint and is randomly initialised, so without
# a seed the baseline metrics and the fine-tuning results change on every run.
torch.manual_seed(42)

# Initialize tokenizer and model
model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Two-class head; the id/label maps match the numeric encoding used for the
# 'labels' column (0 = negative, 1 = positive)
model = AutoModelForSequenceClassification.from_pretrained(
    model_name, 
    num_labels=2,
    id2label={0: "negative", 1: "positive"},
    label2id={"negative": 0, "positive": 1}
)

print(f"Model loaded: {model_name}")
print(f"Model parameters: {model.num_parameters():,}")

# Tokenize the data
def tokenize_data(texts, labels, max_length=256):
    """Encode ``texts`` with the notebook-level ``tokenizer`` and attach labels.

    Args:
        texts: The texts to encode, passed to the tokenizer in a single call.
        labels: The labels for ``texts``; converted with ``torch.tensor``.
        max_length: Passed to the tokenizer as ``max_length`` (with
            truncation enabled).

    Returns:
        A dict with the keys 'input_ids', 'attention_mask' and 'labels'.
    """
    batch = tokenizer(
        texts,
        max_length=max_length,
        truncation=True,
        padding=True,
        return_tensors='pt',
    )

    # Keep only the two tokenizer outputs that are used later, then add labels
    tokenized = {field: batch[field] for field in ('input_ids', 'attention_mask')}
    tokenized['labels'] = torch.tensor(labels)
    return tokenized

# Encode each split; every status line is printed before its tokenization starts
print("Tokenizing training data...")
train_encodings = tokenize_data(train_texts, train_labels)

print("Tokenizing test data...")
test_encodings = tokenize_data(test_texts, test_labels)

# Report the shape of the input-id tensor of both splits
print(
    f"Training input shape: {train_encodings['input_ids'].shape}",
    f"Test input shape: {test_encodings['input_ids'].shape}",
    sep="\n",
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.bias', 'classifier.bias', 'classifier.weight', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model loaded: distilbert-base-uncased
Model parameters: 66,955,010
Tokenizing training data...
Tokenizing test data...
Training input shape: torch.Size([8000, 256])
Test input shape: torch.Size([2000, 256])


In [5]:
#This code defines a custom dataset for the IMDB movie reviews and creates data loaders for training and testing.
#- Defined IMDBDataset class inheriting from torch.utils.data.Dataset
#- Implemented __getitem__ and __len__ methods for dataset functionality
#- Created train and test datasets using IMDBDataset
#- Initialized data loaders for both datasets with specified batch sizes and shuffling options

# Create PyTorch datasets
class IMDBDataset(torch.utils.data.Dataset):
    """Dataset view over a dict of index-aligned, already tokenized fields."""

    def __init__(self, encodings):
        # Mapping of field name -> indexable container; must contain 'input_ids'
        self.encodings = encodings

    def __len__(self):
        """Return the number of examples, taken from the 'input_ids' entry."""
        input_ids = self.encodings['input_ids']
        return len(input_ids)

    def __getitem__(self, idx):
        """Return one example: every field indexed at position ``idx``."""
        example = {}
        for field, values in self.encodings.items():
            example[field] = values[idx]
        return example

# Create datasets
train_dataset = IMDBDataset(train_encodings)
test_dataset = IMDBDataset(test_encodings)

print(f"Train dataset size: {len(train_dataset)}")
print(f"Test dataset size: {len(test_dataset)}")

# Create data loaders.
# The training loader reshuffles the data; it gets its own seeded generator so
# that the shuffle order is the same each time the notebook is run from the
# top, independent of other consumers of the global RNG.
shuffle_generator = torch.Generator().manual_seed(42)
train_loader = torch.utils.data.DataLoader(
    train_dataset,
    batch_size=16,
    shuffle=True,
    generator=shuffle_generator,
)
# The test loader keeps the dataset order
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=16, shuffle=False)

print("Data loaders created successfully!")

Train dataset size: 8000
Test dataset size: 2000
Data loaders created successfully!


In [6]:
# This cell evaluates the model on the test dataset before fine-tuning by calculating accuracy, F1 score, and average loss.
# - Set model to evaluation mode
# - Initialize lists for predictions and true labels
# - Loop through data loader batches
# - Get model outputs and calculate loss
# - Store predictions and true labels
# - Calculate accuracy and F1 score
# - Return accuracy, F1 score, and average loss

# Evaluate baseline model performance before fine-tuning
def evaluate_model(model, data_loader):
    """Evaluate ``model`` on every batch of ``data_loader``.

    The model is switched to eval mode (and left in it) and all forward
    passes run under ``torch.no_grad()``. Each batch is moved to the device
    the model's parameters live on, so the function works whether the model
    is on CPU or has been moved to an accelerator.

    Args:
        model: Module that is called with ``input_ids``, ``attention_mask``
            and ``labels`` keyword arguments and whose output exposes
            ``.loss`` and ``.logits``.
        data_loader: Sized iterable of dict batches containing the keys
            'input_ids', 'attention_mask' and 'labels'.

    Returns:
        Tuple ``(accuracy, f1, avg_loss)`` where ``avg_loss`` is the mean of
        the per-batch losses.
    """
    model.eval()

    # Use the model's own device; fall back to CPU for a parameter-free model
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device('cpu')

    predictions = []
    true_labels = []
    total_loss = 0.0

    with torch.no_grad():
        for batch in tqdm(data_loader, desc='Evaluating'):
            # Inputs must be on the same device as the model weights
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=labels
            )

            total_loss += outputs.loss.item()
            # Predicted class = index of the largest logit
            preds = torch.argmax(outputs.logits, dim=-1)
            predictions.extend(preds.cpu().tolist())
            true_labels.extend(labels.cpu().tolist())

    accuracy = accuracy_score(true_labels, predictions)
    f1 = f1_score(true_labels, predictions)
    avg_loss = total_loss / len(data_loader)

    return accuracy, f1, avg_loss

print("Evaluating baseline model performance...")
baseline_accuracy, baseline_f1, baseline_loss = evaluate_model(model, test_loader)

# Report the metrics of the not-yet-fine-tuned model, one per line
print(
    "Baseline Performance (before fine-tuning):",
    f"Accuracy: {baseline_accuracy:.4f}",
    f"F1 Score: {baseline_f1:.4f}",
    f"Average Loss: {baseline_loss:.4f}",
    sep="\n",
)

Evaluating baseline model performance...


Evaluating: 100%|██████████| 125/125 [25:09<00:00, 12.08s/it]


Baseline Performance (before fine-tuning):
Accuracy: 0.5040
F1 Score: 0.6702
Average Loss: 0.6946


In [7]:
#This code snippet implements a manual training loop to fine-tune a model using a specified number of epochs and learning rate.
#- Defined a function to train the model with training and evaluation phases.
#- Set up an optimizer with weight decay.
#- Iterated over epochs to train the model, calculating and storing training loss.
#- Evaluated the model after each epoch, calculating accuracy and F1 score.
#- Printed training and evaluation metrics for each epoch.

# Fine-tune the model
def train_model(model, train_loader, test_loader, epochs=2, learning_rate=2e-5):
    """Fine-tune ``model`` with a manual training loop.

    Each epoch runs one pass over ``train_loader`` with AdamW, then calls
    ``evaluate_model`` on ``test_loader`` and prints the epoch's metrics.
    Batches are moved to the device the model's parameters live on, so the
    loop works whether the model is on CPU or has been moved to an
    accelerator.

    Args:
        model: Module that is called with ``input_ids``, ``attention_mask``
            and ``labels`` keyword arguments and whose output exposes
            ``.loss``.
        train_loader: Sized iterable of dict batches used for training.
        test_loader: Loader passed to ``evaluate_model`` after every epoch.
        epochs: Number of passes over ``train_loader``.
        learning_rate: Learning rate for AdamW.

    Returns:
        Tuple ``(train_losses, test_accuracies, test_f1_scores)``; each is a
        list with one entry per epoch.
    """

    # Set up optimizer (no learning-rate scheduler is used; the rate is constant)
    optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=0.01)

    # Use the model's own device; fall back to CPU for a parameter-free model
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device('cpu')

    train_losses = []
    test_accuracies = []
    test_f1_scores = []

    for epoch in range(epochs):
        # Training phase (evaluate_model leaves the model in eval mode, so
        # training mode is re-enabled at the start of every epoch)
        model.train()
        total_train_loss = 0

        progress_bar = tqdm(train_loader, desc=f'Epoch {epoch+1}/{epochs}')

        for batch in progress_bar:
            optimizer.zero_grad()

            # Forward pass, with inputs on the same device as the model weights
            outputs = model(
                input_ids=batch['input_ids'].to(device),
                attention_mask=batch['attention_mask'].to(device),
                labels=batch['labels'].to(device)
            )

            loss = outputs.loss
            total_train_loss += loss.item()

            # Backward pass
            loss.backward()
            optimizer.step()

            progress_bar.set_postfix({'loss': f'{loss.item():.4f}'})

        # Mean of the per-batch training losses for this epoch
        avg_train_loss = total_train_loss / len(train_loader)
        train_losses.append(avg_train_loss)

        # Evaluation phase
        test_accuracy, test_f1, test_loss = evaluate_model(model, test_loader)
        test_accuracies.append(test_accuracy)
        test_f1_scores.append(test_f1)

        print(f'Epoch {epoch+1}/{epochs}:')
        print(f'  Train Loss: {avg_train_loss:.4f}')
        print(f'  Test Loss: {test_loss:.4f}')
        print(f'  Test Accuracy: {test_accuracy:.4f}')
        print(f'  Test F1: {test_f1:.4f}')
        print('-' * 50)

    return train_losses, test_accuracies, test_f1_scores

print("Starting fine-tuning...")

# Two epochs of fine-tuning; keep the per-epoch histories for later inspection
train_losses, test_accuracies, test_f1_scores = train_model(
    model,
    train_loader,
    test_loader,
    epochs=2,
)

print("Fine-tuning completed!")

Starting fine-tuning...


Epoch 1/2:   0%|          | 0/500 [00:00<?, ?it/s]

Epoch 1/2: 100%|██████████| 500/500 [2:10:28<00:00, 15.66s/it, loss=0.2005]   
Evaluating: 100%|██████████| 125/125 [06:15<00:00,  3.01s/it]


Epoch 1/2:
  Train Loss: 0.3323
  Test Loss: 0.2741
  Test Accuracy: 0.8845
  Test F1: 0.8804
--------------------------------------------------


Epoch 2/2: 100%|██████████| 500/500 [1:17:53<00:00,  9.35s/it, loss=0.1293]
Evaluating: 100%|██████████| 125/125 [06:18<00:00,  3.03s/it]

Epoch 2/2:
  Train Loss: 0.1742
  Test Loss: 0.2803
  Test Accuracy: 0.8865
  Test F1: 0.8930
--------------------------------------------------
Fine-tuning completed!



