In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from torch.utils.tensorboard import SummaryWriter

from transformers import AutoTokenizer, AutoModel

  from .autonotebook import tqdm as notebook_tqdm


# 1. Dataset Preparation (10%)

In [3]:
# Define correct column names
column_names = ["sentence", "label"]

def load_dataset1(train_url, test_url):
    train_df = pd.read_csv(train_url, sep='\t', names=column_names, header=None)
    test_df = pd.read_csv(test_url, sep='\t', names=column_names, header=None)
    return train_df, test_df

# URLs for SST2 dataset
train_url = "https://raw.githubusercontent.com/clairett/pytorch-sentiment-classification/master/data/SST2/train.tsv"
test_url = "https://raw.githubusercontent.com/clairett/pytorch-sentiment-classification/master/data/SST2/test.tsv"

# Load datasets
train_df, test_df = load_dataset1(train_url, test_url)

# Display first few rows to confirm correct loading
print(train_df.head())


# Use the 'sentence' column since SST2 uses it instead of 'text'
train_texts, val_texts, train_labels, val_labels = train_test_split(
    train_df['sentence'], train_df['label'], test_size=0.2, random_state=42
)
print("\n")
print(f"Number of training samples: {len(train_texts)}")
print(f"Number of validation samples: {len(val_texts)}")
print(f"Number of testing samples: {len(test_df)}")

                                            sentence  label
0  a stirring , funny and finally transporting re...      1
1  apparently reassembled from the cutting room f...      0
2  they presume their audience wo n't sit still f...      0
3  this is a visually stunning rumination on love...      1
4  jonathan parker 's bartleby should have been t...      1


Number of training samples: 5536
Number of validation samples: 1384
Number of testing samples: 1821


# 2. Construct a Multi-Layer Perceptron (MLP) model. (20%)


In [4]:
import torch
import torch.nn as nn

In [5]:
class MLPClassifier(nn.Module):
    def __init__(self, input_size):
        super(MLPClassifier, self).__init__()
        self.model = nn.Sequential(
            nn.Linear(input_size, 512),
            nn.ReLU(),
            nn.Linear(512, 256),
            nn.ReLU(),
            nn.Linear(256, 128),
            nn.ReLU(),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Linear(64, 2)  #output with 2 labels as speciied
        )

    def forward(self, x):
        return self.model(x)

# Example input size (to be updated based on feature extraction method)
input_size = 10000  #based on the provided model architecture. 

# Initialize model
mlp_model_ = MLPClassifier(input_size)
print(mlp_model_)

# Count trainable parameters
print("\n\n############### Parameters ############### ")
total_params = sum(p.numel() for p in mlp_model_.parameters() if p.requires_grad)
print(f"Total Trainable Parameters: {total_params}")


MLPClassifier(
  (model): Sequential(
    (0): Linear(in_features=10000, out_features=512, bias=True)
    (1): ReLU()
    (2): Linear(in_features=512, out_features=256, bias=True)
    (3): ReLU()
    (4): Linear(in_features=256, out_features=128, bias=True)
    (5): ReLU()
    (6): Linear(in_features=128, out_features=64, bias=True)
    (7): ReLU()
    (8): Linear(in_features=64, out_features=2, bias=True)
  )
)


############### Parameters ############### 
Total Trainable Parameters: 5293122


# 3. Case 1: Implement Bag-of-Words (BoW)

BoW is a text representation technique where a document is converted into a vector based on word frequency, ignoring word order and semantics. Each unique word in the vocabulary becomes a feature, and its value represents the number of times it appears in the document.

In [6]:
from sklearn.feature_extraction.text import CountVectorizer

In [7]:

# Create Bag-of-Words vectorizer
vectorizer = CountVectorizer(max_features=10000)
#text to vector operator, limits vocab to keep the 10k most frequent words 

# Fit and transform the text data
X_train_bow = vectorizer.fit_transform(train_texts).toarray()
X_val_bow = vectorizer.transform(val_texts).toarray()
#vocabulary is learnt 

# Convert to PyTorch tensors
X_train_bow = torch.tensor(X_train_bow, dtype=torch.float32)
X_val_bow = torch.tensor(X_val_bow, dtype=torch.float32)
y_train = torch.tensor(train_labels.values, dtype=torch.long)
y_val = torch.tensor(val_labels.values, dtype=torch.long)
#converting np array to torch tensor 

print(f"BoW Feature Shape: {X_train_bow.shape}")
# (num_samples, vocab_size)

BoW Feature Shape: torch.Size([5536, 10000])


#  Case 2: Implement LLaMA-3.1 Embeddings

In [8]:
# !pip install -U "huggingface_hub[cli]"

In [9]:
# !huggingface-cli login

In [10]:
# Define model name
model_name = "google-bert/bert-base-uncased"

# Load tokenizer and model on CPU
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.pad_token if tokenizer.pad_token else "[PAD]"
bert_model = AutoModel.from_pretrained(model_name).to("cpu")

def get_bert_embeddings(text):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True).to("cpu")

    with torch.no_grad():
        outputs = bert_model(**inputs)

    # Mean pooling over the sequence dimension
    return outputs.last_hidden_state.mean(dim=1).cpu().numpy().flatten()

# # Function to get embeddings
# def get_bert_embeddings(texts, batch_size=8):
#     all_embeddings = []
    
#     for i in range(0, len(texts), batch_size):
#         batch_texts = texts[i : i + batch_size]
#         inputs = tokenizer(batch_texts, return_tensors="pt", truncation=True, padding=True).to("cpu")
        
#         with torch.no_grad():
#             outputs = bert_model(**inputs)
        
#         batch_embeddings = outputs.last_hidden_state.mean(dim=1).cpu().numpy()
#         all_embeddings.append(batch_embeddings)

#     return np.vstack(all_embeddings)

# Example usage
text = "Implement case 2: Construct a function to use LLaMa-3.1 embeddings."
embedding = get_bert_embeddings(text)
print("Embedding shape:", embedding.shape)  # (1, hidden_size)

Embedding shape: (768,)


# 5. Train the model with 10 epochs and create the best-performing model (checkpoint.pt) on the Dataset 1. (10%)

In [11]:
from torch.utils.data import DataLoader, TensorDataset

## BoW training

In [12]:
# Define DataLoaders
batch_size = 32
train_dataset = TensorDataset(X_train_bow, y_train)
val_dataset = TensorDataset(X_val_bow, y_val)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)


In [13]:
import os

In [14]:
# Create a directory to save models
model_dir = "saved_models_bow"
os.makedirs(model_dir, exist_ok=True)

In [15]:
input_size = 10000
mlp_model_ = MLPClassifier(input_size)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(mlp_model_.parameters(), lr=0.001)

# Training loop
num_epochs = 10
best_acc = 0.0  # Track the best validation accuracy

for epoch in range(num_epochs):
    mlp_model_.train()
    total_loss = 0

    for X_batch, y_batch in train_loader:
        optimizer.zero_grad()
        outputs = mlp_model_(X_batch)
        loss = criterion(outputs, y_batch)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    # Validation phase
    mlp_model_.eval()
    all_preds, all_labels = [], []

    with torch.no_grad():
        for X_batch, y_batch in val_loader:
            outputs = mlp_model_(X_batch)
            preds = torch.argmax(outputs, dim=1)
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(y_batch.cpu().numpy())

    val_acc = accuracy_score(all_labels, all_preds)
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {total_loss:.4f}, Val Accuracy: {val_acc:.4f}")

    # Save best model
    # if val_acc > best_acc:
    #     best_acc = val_acc
    #     torch.save(model.state_dict(), f"checkpoint_BoW_{best_acc}.pt")

    model_path = os.path.join(model_dir, f"MLP_BoW_E{epoch+1}_Acc{val_acc:.4f}.pt")

    # Save best model
    if val_acc > best_acc:
        best_acc = val_acc
        torch.save(mlp_model_.state_dict(), model_path)
        print(f"Best model saved: {model_path}")

print(f"\nBest Validation Accuracy: {best_acc:.4f}")

Epoch 1/10, Loss: 99.1350, Val Accuracy: 0.7912
Best model saved: saved_models_bow/MLP_BoW_E1_Acc0.7912.pt
Epoch 2/10, Loss: 35.9472, Val Accuracy: 0.7955
Best model saved: saved_models_bow/MLP_BoW_E2_Acc0.7955.pt
Epoch 3/10, Loss: 7.0539, Val Accuracy: 0.7847
Epoch 4/10, Loss: 1.6393, Val Accuracy: 0.7970
Best model saved: saved_models_bow/MLP_BoW_E4_Acc0.7970.pt
Epoch 5/10, Loss: 0.3940, Val Accuracy: 0.7876
Epoch 6/10, Loss: 0.2624, Val Accuracy: 0.7811
Epoch 7/10, Loss: 0.4530, Val Accuracy: 0.7999
Best model saved: saved_models_bow/MLP_BoW_E7_Acc0.7999.pt
Epoch 8/10, Loss: 0.4430, Val Accuracy: 0.8006
Best model saved: saved_models_bow/MLP_BoW_E8_Acc0.8006.pt
Epoch 9/10, Loss: 0.3663, Val Accuracy: 0.7955
Epoch 10/10, Loss: 0.0162, Val Accuracy: 0.7941

Best Validation Accuracy: 0.8006


## Bert-Base-Uncased Training 

In [16]:
model_dir_two = "saved_models_bert"
os.makedirs(model_dir_two, exist_ok=True)

In [17]:
from tqdm import tqdm

### Making Embeddings 

We are generating text embeddings using a pre-trained BERT model and preparing them for training a machine learning model. First, We load the tokenizer and model, ensuring that a valid padding token is set. Then, We define a function to convert input text into numerical embeddings by tokenizing the text, passing it through the model, and averaging the hidden states. Using `tqdm`, We apply this function to Wer training and validation text datasets while displaying a progress bar. Finally, We convert the generated embeddings and labels into PyTorch tensors and create `DataLoader` objects, which allow efficient batch processing during model training.

In [18]:
# Convert training data to embeddings with progress bar
train_embeddings = [get_bert_embeddings(text) for text in tqdm(train_texts, desc="Processing Train Embeddings")]
val_embeddings = [get_bert_embeddings(text) for text in tqdm(val_texts, desc="Processing Validation Embeddings")]

# Convert lists to NumPy arrays
train_embeddings = np.array(train_embeddings)  # Shape: (num_train_samples, 768)
val_embeddings = np.array(val_embeddings)      # Shape: (num_val_samples, 768)

# Convert labels to tensors
y_train = torch.tensor(train_labels.values, dtype=torch.long)
y_val = torch.tensor(val_labels.values, dtype=torch.long)

# Convert embeddings to PyTorch tensors
X_train = torch.tensor(train_embeddings, dtype=torch.float32)
X_val = torch.tensor(val_embeddings, dtype=torch.float32)

# Create DataLoaders
train_dataset = TensorDataset(X_train, y_train)
val_dataset = TensorDataset(X_val, y_val)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Processing Train Embeddings: 100%|██████████| 5536/5536 [02:19<00:00, 39.58it/s]
Processing Validation Embeddings: 100%|██████████| 1384/1384 [00:34<00:00, 40.08it/s]


In [19]:
# Define the folder path
save_dir = "saved_embeddings_bert"
os.makedirs(save_dir, exist_ok=True)  # Create the folder if it doesn't exist

# Save embeddings
torch.save(X_train, os.path.join(save_dir, 'X_train.pt'))
torch.save(X_val, os.path.join(save_dir, 'X_val.pt'))

In [20]:
print(train_embeddings.shape)  # Should be (num_samples, feature_dim)

(5536, 768)


In [21]:
import time 

In [22]:
# Move model to device
device = "cpu"

# Initialize model
input_size = X_train.shape[-1]  # Ensures correct shape
mlp_model_bert = MLPClassifier(input_size).to(device)

# Define loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(mlp_model_bert.parameters(), lr=0.001)

# Training loop
best_val_acc = 0.0
num_epochs = 10

for epoch in range(num_epochs):
    start_time = time.time()
    mlp_model_bert.train()
    total_loss, correct, total = 0, 0, 0
    
    for X_batch, y_batch in train_loader:
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)  # Move data to device
        optimizer.zero_grad()

        outputs = mlp_model_bert(X_batch)
        loss = criterion(outputs, y_batch)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        preds = torch.argmax(outputs, dim=1)
        correct += (preds == y_batch).sum().item()
        total += y_batch.size(0)

    train_acc = correct / total
    epoch_time = time.time() - start_time
    print(f"Epoch {epoch+1}/{num_epochs}, Time: {epoch_time:.2f}s, Loss: {total_loss:.4f}, Train Acc: {train_acc:.4f}")

    # Validation step
    mlp_model_bert.eval()
    correct, total = 0, 0
    
    with torch.no_grad():
        for X_batch, y_batch in val_loader:
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)  # Move data to device
            outputs = mlp_model_bert(X_batch)
            preds = torch.argmax(outputs, dim=1)
            correct += (preds == y_batch).sum().item()
            total += y_batch.size(0)

    val_acc = correct / total
    print(f"Validation Accuracy: {val_acc:.4f}")

    model_path = os.path.join(model_dir_two, f"MLP_BERT_E{epoch+1}_Acc{val_acc:.4f}.pt")

    # Save best model
    if val_acc > best_acc:
        best_acc = val_acc
        torch.save(mlp_model_.state_dict(), model_path)
        print(f"Best model saved: {model_path}")


print(f"Best Validation Accuracy: {best_val_acc:.4f}")

Epoch 1/10, Time: 0.19s, Loss: 73.3209, Train Acc: 0.7959
Validation Accuracy: 0.8562
Best model saved: saved_models_bert/MLP_BERT_E1_Acc0.8562.pt
Epoch 2/10, Time: 0.18s, Loss: 59.3368, Train Acc: 0.8515
Validation Accuracy: 0.8490
Epoch 3/10, Time: 0.18s, Loss: 54.9672, Train Acc: 0.8629
Validation Accuracy: 0.8490
Epoch 4/10, Time: 0.19s, Loss: 50.8761, Train Acc: 0.8712
Validation Accuracy: 0.8605
Best model saved: saved_models_bert/MLP_BERT_E4_Acc0.8605.pt
Epoch 5/10, Time: 0.17s, Loss: 46.9093, Train Acc: 0.8844
Validation Accuracy: 0.8663
Best model saved: saved_models_bert/MLP_BERT_E5_Acc0.8663.pt
Epoch 6/10, Time: 0.17s, Loss: 42.8001, Train Acc: 0.8929
Validation Accuracy: 0.8671
Best model saved: saved_models_bert/MLP_BERT_E6_Acc0.8671.pt
Epoch 7/10, Time: 0.18s, Loss: 38.4778, Train Acc: 0.9034
Validation Accuracy: 0.8598
Epoch 8/10, Time: 0.18s, Loss: 32.9764, Train Acc: 0.9207
Validation Accuracy: 0.8468
Epoch 9/10, Time: 0.18s, Loss: 30.1700, Train Acc: 0.9290
Validation

# Have not touched code below this 
Save and Load Checkpoints

In [23]:
# Save final model checkpoint
final_checkpoint = {
    'model_state_dict': model.state_dict(),
    'optimizer_state_dict': optimizer.state_dict(),
    'hyperparameters': {
        'input_size': input_size,
        'hidden_sizes': [512, 256, 128, 64],  # MLP architecture
        'output_size': 2,  # Binary classification
        'learning_rate': 0.0001,
        'epochs': epochs
    },
    'training_history': {
        'initial_task': 'SST-2',
        'transfer_task': 'IMDB',
        'sst2_performance': {
            'final_loss': loss.item(),
            'final_accuracy': accuracy
        },
        'imdb_performance': {
            'final_loss': loss.item(),  # Update with IMDB results after training
            'final_accuracy': accuracy  # Update with IMDB results
        }
    }
}

torch.save(final_checkpoint, 'mlp_text_classification_checkpoint.pth')
print("\nTraining complete! Final model checkpoint saved.")


NameError: name 'model' is not defined

# Load and Prepare the IMDB Dataset

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the IMDB dataset (Note: this dataset contains a "review" column and a "sentiment" column)
imdb_url = "https://raw.githubusercontent.com/Ankit152/IMDB-sentiment-analysis/master/IMDB-Dataset.csv"
imdb_df = pd.read_csv(imdb_url)

# Convert sentiment to numerical labels (assuming 'positive' and 'negative')
imdb_df['label'] = imdb_df['sentiment'].apply(lambda x: 1 if x.lower() == 'positive' else 0)

# Split into training and validation sets (80% training, 20% validation)
imdb_train_df, imdb_val_df = train_test_split(imdb_df, test_size=0.2, random_state=42)

print(f"IMDB training samples: {len(imdb_train_df)}")
print(f"IMDB validation samples: {len(imdb_val_df)}")


#  Feature Extraction and Tensor Conversion

In [None]:
import torch

# Extract the text and labels from the IMDB training and validation sets
imdb_train_texts = imdb_train_df['review']
imdb_train_labels = imdb_train_df['label']
imdb_val_texts = imdb_val_df['review']
imdb_val_labels = imdb_val_df['label']

# Use the same CountVectorizer (Bag-of-Words) that was fit on the SST-2 dataset
# (Assuming 'vectorizer' has been previously defined and fitted on the training texts of SST-2)
X_train_imdb = vectorizer.transform(imdb_train_texts).toarray()
X_val_imdb = vectorizer.transform(imdb_val_texts).toarray()

# Convert features and labels to PyTorch tensors
X_train_imdb = torch.tensor(X_train_imdb, dtype=torch.float32)
X_val_imdb = torch.tensor(X_val_imdb, dtype=torch.float32)
y_train_imdb = torch.tensor(imdb_train_labels.values, dtype=torch.long)
y_val_imdb = torch.tensor(imdb_val_labels.values, dtype=torch.long)

print(f"IMDB BoW Training Features Shape: {X_train_imdb.shape}")
print(f"IMDB BoW Validation Features Shape: {X_val_imdb.shape}")


# Continual Learning – Fine-tune on IMDB Dataset

In [None]:
import torch.optim as optim
import torch.nn as nn

# Define or re-use your model, criterion, and optimizer.
# (Assuming 'model' is already defined and was trained on SST-2)
# For continual learning, you might use a smaller learning rate.
transfer_lr = 0.0001
optimizer = optim.Adam(model.parameters(), lr=transfer_lr)
criterion = nn.CrossEntropyLoss()
epochs = 10  # or as required

# Fine-tune the model on the IMDB training set
for epoch in range(epochs):
    model.train()
    optimizer.zero_grad()
    outputs = model(X_train_imdb)
    loss = criterion(outputs, y_train_imdb)
    loss.backward()
    optimizer.step()
    
    print(f"IMDB Epoch {epoch+1}/{epochs}, Loss: {loss.item():.4f}")

# Evaluate on the IMDB validation set
model.eval()
with torch.no_grad():
    val_outputs = model(X_val_imdb)
    val_loss = criterion(val_outputs, y_val_imdb)
    val_preds = val_outputs.argmax(dim=1)
    accuracy = (val_preds == y_val_imdb).float().mean().item()

print(f"\nIMDB Validation Loss: {val_loss.item():.4f}, Accuracy: {accuracy:.4f}")


# Model Evaluation

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

# Predictions
model.eval()
with torch.no_grad():
    val_preds = model(X_val).argmax(dim=1)

# Compute Accuracy
accuracy = accuracy_score(y_val, val_preds)
print(f"Validation Accuracy: {accuracy:.4f}")

# Confusion Matrix
cm = confusion_matrix(y_val, val_preds)
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
plt.xlabel("Predicted Label")
plt.ylabel("True Label")
plt.title("Confusion Matrix")
plt.show()


# Visualization (TensorBoard Integration)

In [None]:
from torch.utils.tensorboard import SummaryWriter

writer = SummaryWriter("runs/text_classification")

for epoch in range(epochs):
    writer.add_scalar("Loss/train", loss.item(), epoch)
    writer.add_scalar("Loss/validation", val_loss.item(), epoch)

writer.close()
print("TensorBoard logs saved.")
