## <span style="color:blue;">Notebook Contents</span>

### 1. Set up
  1. **Loading packages**
  2. **Connecting with GPUs**
### 2. Data Preparation
  1. **Loading data**
  2. **Data cleaning**
  3. **Creating DataLoaders**
### 3. Transfer learning using BERT pre-trained model
  1. **Loading the model**
  2. **Model finetuning  and training**
### 4. Model evaluation
  1. **Model evaluation on test data**
  
<hr>

## 1. Set Up

### Loading packages

In [1]:
# utilities
import os
import shutil
import time
from collections import Counter

#numpy and pandas
import numpy as np
import pandas as pd

# sklearn
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, ConfusionMatrixDisplay

# pytorch packages
import torch
from torch import tensor
from torch.utils.data import Dataset, DataLoader
import transformers
from transformers import AutoModel, BertTokenizerFast
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch.optim as optim

# visualization
import matplotlib.pyplot as plt
from matplotlib import pyplot as plt
from PIL import Image
import numpy as np

# for reproducability
torch.manual_seed(42)
np.random.seed(42)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import warnings
warnings.filterwarnings("ignore")

In [3]:
print(torch.__version__)

2.5.1


### Connecting to GPU/MPS

<div style="color:red; font-size:16px; background-color:yellow;">RUN THIS BELOW BLOCK ONLY ON MACBOOK FOR A LOCAL INSTANCE</div>


In [4]:
# Run locally if Macbook has a GPU
# Is MPS even available? macOS 12.3+

print(torch.backends.mps.is_available())

# Was the current version of PyTorch built with MPS activated?
print(torch.backends.mps.is_built())

dtype = torch.float
if torch.backends.mps.is_available():
    device = torch.device('mps')
    print("Using GPU: Metal Performance Shaders (MPS)")
else:
    device = torch.device('cpu')
    print("Using CPU")

# Tensor creation
x = tensor([1.0, 2.0, 3.0], device=device, dtype=dtype)
print(f"Tensor: {x}, Device: {x.device}")

True
True
Using GPU: Metal Performance Shaders (MPS)
Tensor: tensor([1., 2., 3.], device='mps:0'), Device: mps:0


## 2. Data Preparation

### Loading the data

In [9]:
!unzip data.zip -d data

Archive:  data.zip
  inflating: data/test.csv           
  inflating: data/testdata.manual.2009.06.14.csv  
  inflating: data/train.csv          
  inflating: data/training.1600000.processed.noemoticon.csv  


In [12]:
df_train = pd.read_csv("./data/train.csv", encoding="latin1")
df_test = pd.read_csv("./data/test.csv", encoding="latin1")

### Data Cleaning

In [35]:
def clean_data(df_train, df_test):
    # Define sentiment to label mapping
    labels = {'positive': 2, 'neutral': 1, 'negative': 0}
    
    # Extract relevant columns
    train = pd.DataFrame(df_train[['text', 'sentiment']])
    test = pd.DataFrame(df_test[['text', 'sentiment']])
    
    # Print initial shapes
    print("Initial data size:")
    print("Train:", train.shape)
    print("Test :", test.shape)
    
    # Check and print rows with NaNs
    print("\nRows with NaN in train:")
    print(train[train.isna().any(axis=1)])
    
    print("\nRows with NaN in test:")
    print(test[test.isna().any(axis=1)])
    
    # Drop rows with NaN in either 'text' or 'sentiment'
    train.dropna(subset=['text', 'sentiment'], inplace=True)
    test.dropna(subset=['text', 'sentiment'], inplace=True)
    
    # Print shapes after dropping
    print("\nData size after dropping NaNs:")
    print("Train:", train.shape)
    print("Test :", test.shape)
    
    # Map sentiment labels
    train['label'] = train['sentiment'].map(labels).astype('Int64')
    test['label'] = test['sentiment'].map(labels).astype('Int64')
    
    return train, test

In [52]:
train_clean, test_clean = clean_data(df_train, df_test)

Initial data size:
Train: (27481, 2)
Test : (4815, 2)

Rows with NaN in train:
    text sentiment
314  NaN   neutral

Rows with NaN in test:
     text sentiment
3534  NaN       NaN
3535  NaN       NaN
3536  NaN       NaN
3537  NaN       NaN
3538  NaN       NaN
...   ...       ...
4810  NaN       NaN
4811  NaN       NaN
4812  NaN       NaN
4813  NaN       NaN
4814  NaN       NaN

[1281 rows x 2 columns]

Data size after dropping NaNs:
Train: (27480, 2)
Test : (3534, 2)


### Creating DataLoaders

In [37]:
class SentimentDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=128):
        self.texts = list(texts)   # Ensures compatibility even if passed as Series
        self.labels = list(labels)
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = str(self.texts[idx])
        label = self.labels[idx]

        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            return_token_type_ids=False,
            padding="max_length",
            truncation=True,
            return_attention_mask=True,
            return_tensors="pt",
        )

        return {
            "input_ids": encoding["input_ids"].squeeze(0),
            "attention_mask": encoding["attention_mask"].squeeze(0),
            "labels": torch.tensor(label, dtype=torch.long),
        }

In [39]:
# Constants
MODEL_NAME = "bert-base-uncased"  # Pre-trained model to use
MAX_LEN = 128  # Maximum sequence length
BATCH_SIZE = 16  # Batch size for training and evaluation

In [40]:
def create_data_loaders(train_data, test_data, model_name=MODEL_NAME, max_len=MAX_LEN, batch_size=BATCH_SIZE):
    # Load tokenizer
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    
    # Convert DataFrame columns to lists
    train_texts = train_data['text'].tolist()
    train_labels = train_data['label'].tolist()
    test_texts = test_data['text'].tolist()
    test_labels = test_data['label'].tolist()

    # Create Dataset instances
    train_dataset = SentimentDataset(train_texts, train_labels, tokenizer, max_length=max_len)
    test_dataset = SentimentDataset(test_texts, test_labels, tokenizer, max_length=max_len)

    # Create DataLoaders
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=batch_size)

    return train_loader, test_loader

In [53]:
train_loader, test_loader = create_data_loaders(train_clean, test_clean)

## 3. Transfer learning using BERT pre-trained model

### Loading the model

In [43]:
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=3  # 3 sentiment classes: negative, neutral, positive
)
model.to(device)

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

### Setting up hyper-parameters

In [48]:
# Hyperparameters
EPOCHS = 2  # Number of training epochs
LEARNING_RATE = 2e-5  # Learning rate for fine-tuning

### Setting up the optimizer

In [49]:
optimizer = optim.AdamW(model.parameters(), lr=LEARNING_RATE)

### Model training and fine-tuning

In [57]:
def train_model(model, train_loader, optimizer, device, epochs):   
    model.to(device)

    for epoch in range(epochs):
        model.train()
        start_time = time.time()
        total_loss = 0.0
        total_batches = 0
        all_preds = []
        all_labels = []

        for batch_idx, batch in enumerate(train_loader):
            optimizer.zero_grad()

            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)

            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=labels
            )

            loss = outputs.loss
            total_loss += loss.item()
            loss.backward()
            optimizer.step()
            total_batches += 1

            # Collect predictions and labels
            logits = outputs.logits
            preds = torch.argmax(logits, dim=1)
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

            if (batch_idx + 1) % 500 == 0:
                print(f"Epoch {epoch+1}/{epochs}, Batch {batch_idx+1}/{len(train_loader)}, Loss: {loss.item():.6f}")

        avg_loss = total_loss / total_batches
        print()
        print(f"Epoch {epoch+1}/{epochs}, Average Training Loss: {avg_loss:.6f}")
        print("-" * 50)

    
    # Overall Accuracy
    acc = accuracy_score(all_labels, all_preds)
    print(f"Overall Accuracy: {acc * 100:.2f}%")

    # Classification Report
    label_names = ["negative", "neutral", "positive"]
    print("\nClassification Report:")
    print(classification_report(all_labels, all_preds, target_names=label_names))

    total_time = time.time() - start_time
    print(f"\nTraining completed in: {total_time:.2f} seconds")

    return model
   

In [58]:
trained_model = train_model(model, train_loader, optimizer, device, EPOCHS)

Epoch 1/2, Batch 500/1718, Loss: 0.091535
Epoch 1/2, Batch 1000/1718, Loss: 0.282913
Epoch 1/2, Batch 1500/1718, Loss: 0.389963

Epoch 1/2, Average Training Loss: 0.267246
--------------------------------------------------
Epoch 2/2, Batch 500/1718, Loss: 0.139332
Epoch 2/2, Batch 1000/1718, Loss: 0.037191
Epoch 2/2, Batch 1500/1718, Loss: 0.180195

Epoch 2/2, Average Training Loss: 0.157689
--------------------------------------------------
Overall Accuracy: 94.49%

Classification Report:
              precision    recall  f1-score   support

    negative       0.95      0.95      0.95      7781
     neutral       0.93      0.94      0.94     11117
    positive       0.95      0.95      0.95      8582

    accuracy                           0.94     27480
   macro avg       0.95      0.95      0.95     27480
weighted avg       0.94      0.94      0.94     27480


Training completed in: 421.34 seconds


## 4. Model Evaluation

In [55]:
def evaluate_model(model, test_loader, device):
    model.eval()
    model.to(device)
    start_time = time.time()

    true_labels = []
    predictions = []

    with torch.no_grad():
        for batch in test_loader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"]

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            _, predicted = torch.max(outputs.logits, dim=1)

            true_labels.extend(labels.tolist())
            predictions.extend(predicted.cpu().tolist())

    # Overall Accuracy
    acc = accuracy_score(true_labels, predictions)
    print(f"Overall Evaluation Accuracy: {acc * 100:.2f}%")

    # Classification report
    label_names = ["negative", "neutral", "positive"]
    print("\n Classification Report on test data:")
    print(classification_report(true_labels, predictions, target_names=label_names))

    total_time = time.time() - start_time
    print(f"\nTraining completed in: {total_time:.2f} seconds")

In [59]:
evaluate_model(trained_model, test_loader, device)

Overall Evaluation Accuracy: 78.32%

 Classification Report on test data:
              precision    recall  f1-score   support

    negative       0.76      0.80      0.78      1001
     neutral       0.75      0.74      0.75      1430
    positive       0.85      0.82      0.83      1103

    accuracy                           0.78      3534
   macro avg       0.79      0.79      0.79      3534
weighted avg       0.78      0.78      0.78      3534


Training completed in: 14.90 seconds
