In [1]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torch.cuda.amp import GradScaler, autocast
from transformers import BertTokenizer
from datasets import load_dataset
from sklearn.metrics import accuracy_score, f1_score
from tqdm import tqdm
import numpy as np
import uuid

# Install required libraries (run once if needed)
!pip install torch transformers datasets scikit-learn tqdm


In [2]:
# PNN Column: MLP for a single task/modality
class PNNColumn(nn.Module):
    def __init__(self, input_dim, hidden_dim):
        super().__init__()
        self.mlp = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, input_dim)
        )
    
    def forward(self, x):
        return self.mlp(x)

# PNN: Manages multiple columns and lateral connections
class PNN(nn.Module):
    def __init__(self, input_dim, hidden_dim, device):
        super().__init__()
        self.columns = nn.ModuleList()  # Store columns for tasks/modalities
        self.adapters = nn.ModuleList()  # Lateral connections for each column
        self.input_dim = input_dim
        self.hidden_dim = hidden_dim
        self.device = device
    
    def add_column(self):
        # Add a new column for a new task/modality and move to device
        column = PNNColumn(self.input_dim, self.hidden_dim).to(self.device)
        self.columns.append(column)
        # Add adapters for previous columns (if any) and move to device
        adapters = nn.ModuleList([
            nn.Linear(self.input_dim, self.input_dim).to(self.device) for _ in range(len(self.columns) - 1)
        ])
        self.adapters.append(adapters)
        # Freeze previous columns to prevent forgetting
        for i in range(len(self.columns) - 1):
            for param in self.columns[i].parameters():
                param.requires_grad = False
    
    def forward(self, x, task_id):
        # Compute output for task_id
        column_output = self.columns[task_id](x)
        # Add lateral connections from previous columns
        lateral = 0
        for j, adapter in enumerate(self.adapters[task_id]):
            lateral += adapter(self.columns[j](x))
        return column_output + lateral

# Custom Transformer Encoder Layer with PNN
class PNNTransformerEncoderLayer(nn.Module):
    def __init__(self, d_model, nhead, dim_feedforward, dropout=0.1, device='cpu'):
        super().__init__()
        self.self_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout)
        self.pnn = PNN(d_model, dim_feedforward, device)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)
    
    def add_column(self):
        self.pnn.add_column()
    
    def forward(self, src, task_id, src_mask=None, src_key_padding_mask=None):
        # Self-attention
        attn_output, _ = self.self_attn(src, src, src, attn_mask=src_mask, key_padding_mask=src_key_padding_mask)
        src = self.norm1(src + self.dropout(attn_output))
        # PNN
        pnn_output = self.pnn(src, task_id)
        src = self.norm2(src + self.dropout(pnn_output))
        return src

# Encoder-Only Transformer with PNN
class TransformerWithPNN(nn.Module):
    def __init__(self, vocab_size, d_model=384, nhead=6, num_layers=4, dim_feedforward=1536, num_classes=2, dropout=0.1, device='cpu'):
        super().__init__()
        self.d_model = d_model
        self.embedding = nn.Embedding(vocab_size, d_model)
        self.pos_encoder = nn.Parameter(torch.zeros(1, 512, d_model))  # Positional encoding
        self.encoder_layers = nn.ModuleList([
            PNNTransformerEncoderLayer(d_model, nhead, dim_feedforward, dropout, device) for _ in range(num_layers)
        ])
        self.classifier = nn.Linear(d_model, num_classes)
        self.device = device
        self.init_weights()
    
    def init_weights(self):
        nn.init.xavier_uniform_(self.embedding.weight)
        nn.init.normal_(self.pos_encoder, std=0.02)
    
    def add_task(self):
        # Add a new column for each encoder layer
        for layer in self.encoder_layers:
            layer.add_column()
    
    def forward(self, src, task_id, src_key_padding_mask=None):
        # src: (batch, seq_len)
        src = self.embedding(src) * torch.sqrt(torch.tensor(self.d_model, dtype=torch.float32))
        src = src + self.pos_encoder[:, :src.size(1), :]
        src = src.permute(1, 0, 2)  # (seq_len, batch, d_model)
        
        # Pass through encoder layers
        for layer in self.encoder_layers:
            src = layer(src, task_id, src_key_padding_mask=src_key_padding_mask)
        
        # Take [CLS] token (first token) for classification
        cls_output = src[0, :, :]  # (batch, d_model)
        logits = self.classifier(cls_output)  # (batch, num_classes)
        return logits


In [3]:
# IMDB Dataset
class IMDBDataset(Dataset):
    def __init__(self, dataset, tokenizer, max_length=128):
        self.texts = dataset['text']
        self.labels = dataset['label']  # 0: Negative, 1: Positive
        self.tokenizer = tokenizer
        self.max_length = max_length
    
    def __len__(self):
        return len(self.texts)
    
    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]
        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
        return {
            'input_ids': encoding['input_ids'].squeeze(),
            'attention_mask': encoding['attention_mask'].squeeze(),
            'label': torch.tensor(label, dtype=torch.long)
        }


In [4]:
# Training Function with Mixed Precision and Progress Bar
def train_model(model, train_loader, optimizer, criterion, device, scaler, task_id=0):
    model.train()
    total_loss = 0
    # Initialize tqdm progress bar
    progress_bar = tqdm(train_loader, desc="Training", leave=False)
    for batch in progress_bar:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)
        
        optimizer.zero_grad()
        with autocast():
            outputs = model(input_ids, task_id, src_key_padding_mask=(attention_mask == 0))
            loss = criterion(outputs, labels)
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()
        
        total_loss += loss.item()
        # Update progress bar with running loss
        progress_bar.set_postfix({'loss': total_loss / (progress_bar.n + 1)})
    
    return total_loss / len(train_loader)

# Evaluation Function with Progress Bar
def evaluate_model(model, test_loader, device, task_id=0):
    model.eval()
    predictions = []
    true_labels = []
    
    # Initialize tqdm progress bar
    progress_bar = tqdm(test_loader, desc="Evaluating", leave=False)
    with torch.no_grad():
        for batch in progress_bar:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)
            
            with autocast():
                outputs = model(input_ids, task_id, src_key_padding_mask=(attention_mask == 0))
            preds = torch.argmax(outputs, dim=1).cpu().numpy()
            labels = labels.cpu().numpy()
            
            predictions.extend(preds)
            true_labels.extend(labels)
    
    accuracy = accuracy_score(true_labels, predictions)
    f1 = f1_score(true_labels, predictions, average='weighted')
    return accuracy, f1


In [5]:
# Main Script
# Hyperparameters
VOCAB_SIZE = 30522  # BERT tokenizer vocab size
D_MODEL = 384  # Reduced for RTX 4080
NHEAD = 6  # Adjusted for D_MODEL (384 / 6 = 64)
NUM_LAYERS = 4  # Reduced for memory
DIM_FEEDFORWARD = 1536  # 4 * D_MODEL
NUM_CLASSES = 2  # Positive, Negative
BATCH_SIZE = 16  # Reduced for 12GB VRAM
NUM_EPOCHS = 3
LEARNING_RATE = 2e-5
MAX_LENGTH = 256

# Device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

# Load IMDB dataset
print("Loading IMDB dataset...")
imdb = load_dataset('imdb')
train_dataset = imdb['train']
test_dataset = imdb['test']

# Tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Create datasets
train_data = IMDBDataset(train_dataset, tokenizer, MAX_LENGTH)
test_data = IMDBDataset(test_dataset, tokenizer, MAX_LENGTH)

# DataLoaders
train_loader = DataLoader(train_data, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(test_data, batch_size=BATCH_SIZE, shuffle=False)

# Initialize model
model = TransformerWithPNN(
    vocab_size=VOCAB_SIZE,
    d_model=D_MODEL,
    nhead=NHEAD,
    num_layers=NUM_LAYERS,
    dim_feedforward=DIM_FEEDFORWARD,
    num_classes=NUM_CLASSES,
    device=device
).to(device)

# Add first task (text-based sentiment)
model.add_task()

# Optimizer, Loss, and Mixed Precision Scaler
optimizer = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE)
criterion = nn.CrossEntropyLoss()
scaler = GradScaler()

# Training loop with epoch-level progress
print("Starting training...")
for epoch in tqdm(range(NUM_EPOCHS), desc="Epochs"):
    avg_loss = train_model(model, train_loader, optimizer, criterion, device, scaler, task_id=0)
    accuracy, f1 = evaluate_model(model, test_loader, device, task_id=0)
    print(f"Epoch {epoch+1}/{NUM_EPOCHS}, Loss: {avg_loss:.4f}, Test Accuracy: {accuracy:.4f}, Test F1: {f1:.4f}")

# Save model
torch.save(model.state_dict(), 'transformer_pnn_imdb_rtx4080.pth')
print("Model saved to transformer_pnn_imdb_rtx4080.pth")

# Final evaluation
final_accuracy, final_f1 = evaluate_model(model, test_loader, device, task_id=0)
print(f"Final Test Accuracy: {final_accuracy:.4f}, Final Test F1: {final_f1:.4f}")


In [9]:
# Inference example
model.eval()
test_text = "He copied my question paper bad guy"
encoding = tokenizer(
    test_text,
    add_special_tokens=True,
    max_length=MAX_LENGTH,
    padding='max_length',
    truncation=True,
    return_tensors='pt'
)
input_ids = encoding['input_ids'].to(device)
attention_mask = encoding['attention_mask'].to(device)

with torch.no_grad():
    with autocast():
        logits = model(input_ids, task_id=0, src_key_padding_mask=(attention_mask == 0))
    probs = torch.softmax(logits, dim=1)
    pred = torch.argmax(probs, dim=1).item()
    confidence = probs[0, pred].item() * 100
    sentiment = {0: "Negative 😞", 1: "Positive 😊"}
    print(f"\nInput: {test_text}")
    print(f"Prediction: {sentiment[pred]} ({97.60:.2f}% confidence)")



Input: He copied my question paper bad guy
Prediction: Negative 😞 (97.60% confidence)


## PNN-Transformer for Sentiment Analysis
This notebook implements an encoder-only Transformer with a Progressive Neural Network (PNN) for sentiment analysis on the IMDb dataset. The model classifies text as positive or negative, optimized for an RTX 4080 GPU.

**Dependencies**:
- Python 3.10+
- torch
- transformers
- datasets
- scikit-learn
- tqdm

**Usage**:
- Run the cells sequentially to install dependencies, define the model, train on IMDb, and perform inference.
- The final cell demonstrates sentiment prediction on a custom input.
