In [1]:
import torch

# Quick sanity check: print whether torch reports a usable CUDA device.
print(torch.cuda.is_available())

True


In [2]:
import torch
# Print the CUDA version string exposed by torch.version.cuda.
print(torch.version.cuda)

12.6


In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from torch.cuda.amp import GradScaler, autocast
from transformers import AutoTokenizer
from datasets import load_dataset, load_from_disk
from sklearn.metrics import accuracy_score, f1_score
from tqdm import tqdm
import os

# Report the runtime environment; GPU details are only queried when CUDA is usable.
cuda_ready = torch.cuda.is_available()
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {cuda_ready}")
if cuda_ready:
    # Release cached allocator blocks before reporting on the device.
    torch.cuda.empty_cache()
    gpu_index = 0
    print(f"GPU: {torch.cuda.get_device_name(gpu_index)}")
    total_bytes = torch.cuda.get_device_properties(gpu_index).total_memory
    print(f"VRAM: {total_bytes / 1e9:.2f} GB")

class MiniTransformerEncoder(nn.Module):
    """Small Transformer encoder that classifies a sequence from its first token.

    Token embeddings and learned position embeddings are summed, passed
    through dropout and a stack of ``nn.TransformerEncoderLayer`` blocks,
    layer-normalised, and the hidden state at position 0 is projected to
    ``num_classes`` logits.

    Args:
        vocab_size: Number of rows in the token embedding table.
        max_len: Number of rows in the position embedding table, i.e. the
            longest sequence the model can embed.
        hidden_size: Embedding width, also used as the encoder ``d_model``.
        num_layers: Number of stacked encoder layers.
        num_heads: Attention heads per encoder layer.
        ff_size: Width of each layer's feed-forward sublayer.
        dropout: Dropout probability for the embeddings and encoder layers.
        num_classes: Size of the output logit vector.
    """

    def __init__(self, vocab_size=30522, max_len=128, hidden_size=128, num_layers=2,
                 num_heads=8, ff_size=512, dropout=0.1, num_classes=2):
        super().__init__()
        self.hidden_size = hidden_size
        self.max_len = max_len
        # Attribute names below double as state_dict keys; keep them stable so
        # previously saved checkpoints continue to load.
        self.token_embedding = nn.Embedding(vocab_size, hidden_size)
        self.position_embedding = nn.Embedding(max_len, hidden_size)
        self.embedding_dropout = nn.Dropout(dropout)
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=hidden_size, nhead=num_heads, dim_feedforward=ff_size,
            dropout=dropout, batch_first=True
        )
        self.encoder = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)
        self.layernorm = nn.LayerNorm(hidden_size)
        self.classifier = nn.Linear(hidden_size, num_classes)

    def forward(self, input_ids, attention_mask=None):
        """Compute classification logits for a batch of token id sequences.

        Args:
            input_ids: Integer tensor of shape ``(batch, seq_len)``.
            attention_mask: Optional tensor of the same shape. Positions whose
                value equals 0 are treated as padding and hidden from
                attention. When omitted, every position is attended to.

        Returns:
            Tensor of shape ``(batch, num_classes)`` with unnormalised logits.

        Raises:
            ValueError: If ``seq_len`` exceeds the size of the position
                embedding table.
        """
        batch_size, seq_len = input_ids.size()

        # Fail early with a readable message: without this check an over-long
        # input surfaces as an opaque embedding index error (or a device-side
        # assert on CUDA).
        max_positions = self.position_embedding.num_embeddings
        if seq_len > max_positions:
            raise ValueError(
                f"Sequence length {seq_len} exceeds the model's maximum of "
                f"{max_positions} positions"
            )

        position_ids = torch.arange(seq_len, device=input_ids.device).expand(batch_size, seq_len)
        token_embeds = self.token_embedding(input_ids)
        position_embeds = self.position_embedding(position_ids)
        embeds = token_embeds + position_embeds
        embeds = self.embedding_dropout(embeds)

        if attention_mask is None:
            attention_mask = torch.ones_like(input_ids)
        # src_key_padding_mask expects True where a position must be ignored.
        encoder_output = self.encoder(embeds, src_key_padding_mask=(attention_mask == 0))
        encoder_output = self.layernorm(encoder_output)
        # The first position is used as the summary vector for the sequence.
        cls_output = encoder_output[:, 0, :]
        return self.classifier(cls_output)

    def count_parameters(self):
        """Return the total number of trainable parameter elements."""
        return sum(p.numel() for p in self.parameters() if p.requires_grad)

class SentimentDataset(Dataset):
    """Map-style dataset that tokenizes one text per item on demand.

    Args:
        data: Object indexable by the keys ``"text"`` and ``"label"``; each
            must yield an indexable sequence of equal length.
        tokenizer: Callable invoked as
            ``tokenizer(text, max_length=..., padding="max_length",
            truncation=True, return_tensors="pt")``. It is assumed to return a
            mapping with ``"input_ids"`` and ``"attention_mask"`` tensors of
            shape ``(1, max_len)``.
        max_len: Length every sample is padded or truncated to.
    """

    def __init__(self, data, tokenizer, max_len=128):
        self.texts = data["text"]
        self.labels = data["label"]
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of texts in the dataset."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize sample ``idx`` and return its tensors.

        Returns:
            Dict with ``"input_ids"`` and ``"attention_mask"`` (each with the
            tokenizer's leading batch dimension removed) and ``"label"`` as a
            ``torch.long`` scalar tensor.
        """
        encoding = self.tokenizer(
            self.texts[idx],
            max_length=self.max_len,
            padding="max_length",
            truncation=True,
            return_tensors="pt"
        )
        return {
            # squeeze(0) removes only the batch dimension. A bare squeeze()
            # would also collapse the sequence dimension when max_len == 1.
            "input_ids": encoding["input_ids"].squeeze(0),
            "attention_mask": encoding["attention_mask"].squeeze(0),
            "label": torch.tensor(self.labels[idx], dtype=torch.long)
        }

def train_model(model, train_loader, val_loader, device, criterion, epochs=3, lr=2e-5):
    """Train ``model`` with AdamW and print validation metrics after each epoch.

    Mixed-precision training (autocast plus gradient scaling) is enabled only
    when ``device`` is a CUDA device; on any other device the scaler and
    autocast context are created disabled and training runs in full precision.
    Validation always runs without autocast.

    Args:
        model: Module called as ``model(input_ids, attention_mask)`` that
            returns logits.
        train_loader: Iterable of batches; each batch is a mapping with
            ``"input_ids"``, ``"attention_mask"`` and ``"label"`` tensors.
        val_loader: Iterable of batches with the same structure.
        device: ``torch.device`` (or a value accepted by ``torch.device``)
            to run on.
        criterion: Loss callable invoked as ``criterion(logits, labels)``.
        epochs: Number of passes over ``train_loader``.
        lr: Learning rate passed to ``AdamW``.

    Returns:
        The same ``model`` object, moved to ``device`` and trained in place.
    """
    device = torch.device(device)
    # Gate AMP on the device actually used, not on whether CUDA is installed:
    # an enabled CUDA scaler cannot scale a loss that lives on the CPU.
    use_amp = device.type == "cuda"

    optimizer = optim.AdamW(model.parameters(), lr=lr)
    # torch.amp.* replaces the deprecated torch.cuda.amp.GradScaler/autocast.
    scaler = torch.amp.GradScaler("cuda", enabled=use_amp)
    model.to(device)

    for epoch in range(epochs):
        model.train()
        train_loss = 0
        train_bar = tqdm(train_loader, desc=f"Epoch {epoch+1} [Train]", leave=False)
        for batch in train_bar:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["label"].to(device)

            optimizer.zero_grad()
            with torch.autocast("cuda", enabled=use_amp):
                logits = model(input_ids, attention_mask)
                loss = criterion(logits, labels)

            # With the scaler disabled these calls reduce to a plain
            # backward() / optimizer.step().
            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()

            # Read the scalar once; every .item() forces a host/device sync.
            batch_loss = loss.item()
            train_loss += batch_loss
            train_bar.set_postfix({"loss": batch_loss})

        # Validation
        model.eval()
        val_preds, val_labels = [], []
        val_loss = 0
        with torch.no_grad():
            for batch in tqdm(val_loader, desc=f"Epoch {epoch+1} [Val]", leave=False):
                labels = batch["label"]
                logits = model(
                    batch["input_ids"].to(device),
                    batch["attention_mask"].to(device)
                )
                loss = criterion(logits, labels.to(device))
                val_loss += loss.item()
                val_preds.extend(torch.argmax(logits, dim=1).cpu().tolist())
                val_labels.extend(labels.cpu().tolist())

        acc = accuracy_score(val_labels, val_preds)
        f1 = f1_score(val_labels, val_preds, average="binary")
        print(f"Epoch {epoch+1}: Train Loss = {train_loss/len(train_loader):.4f}, "
              f"Val Loss = {val_loss/len(val_loader):.4f}, Val Acc = {acc:.4f}, Val F1 = {f1:.4f}")

    return model

# Save model utility function
def save_model(model, path="sentiment_model.pth"):
    """Write ``model``'s ``state_dict`` to ``path`` and print a confirmation.

    Args:
        model: Object exposing ``state_dict()``.
        path: Destination file passed to ``torch.save``.
    """
    weights = model.state_dict()
    torch.save(weights, path)
    print(f"Model saved to {path}")

# Load model utility function
def load_model(path, config, device):
    """Rebuild a ``MiniTransformerEncoder`` and restore its weights from ``path``.

    Args:
        path: File containing a ``state_dict`` written with ``torch.save``.
        config: Keyword arguments forwarded to ``MiniTransformerEncoder``;
            they must describe the architecture the checkpoint was saved from,
            otherwise ``load_state_dict`` will reject the weights.
        device: Device the tensors are mapped to and the model is moved to.

    Returns:
        The restored model, on ``device`` and switched to eval mode.
    """
    model = MiniTransformerEncoder(**config)
    # weights_only=True limits unpickling to tensors and plain containers, so
    # a tampered checkpoint cannot execute arbitrary code. It is the default
    # from PyTorch 2.6 onward; passing it explicitly protects older versions.
    state_dict = torch.load(path, map_location=device, weights_only=True)
    model.load_state_dict(state_dict)
    model.to(device)
    model.eval()
    print(f"Model loaded from {path}")
    return model

def main():
    """Train, save, reload and test the sentiment classifier on IMDB.

    Steps: pick the device, load the dataset from ``./imdb_supervised`` (or
    download and cache it there), carve an 80/20 train/validation split out of
    the training set, take 2500 shuffled test reviews, train for three epochs,
    save the weights, reload them and print test loss, accuracy and F1.
    """
    if torch.cuda.is_available():
        device = torch.device("cuda")
    else:
        device = torch.device("cpu")
    print(f"Using device: {device}")

    # Prefer the on-disk copy; fall back to downloading and caching it.
    cache_dir = "./imdb_supervised"
    try:
        corpus = load_from_disk(cache_dir)
    except FileNotFoundError:
        corpus = load_dataset("imdb")
        corpus.save_to_disk(cache_dir)

    # Fixed seeds keep the splits reproducible between runs.
    shuffled_train = corpus["train"].shuffle(seed=42)
    split = shuffled_train.train_test_split(test_size=0.2, seed=42)
    train_records = split["train"]
    val_records = split["test"]
    test_records = corpus["test"].shuffle(seed=42).select(range(2500))

    tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

    # Architecture hyper-parameters; reused verbatim when reloading the weights.
    config = dict(
        vocab_size=tokenizer.vocab_size,
        max_len=128,
        hidden_size=128,
        num_layers=2,
        num_heads=8,
        ff_size=512,
        dropout=0.1,
        num_classes=2,
    )

    criterion = nn.CrossEntropyLoss()

    def build_loader(records, shuffle=False):
        # All three loaders share batch size and worker settings.
        return DataLoader(SentimentDataset(records, tokenizer),
                          batch_size=16, shuffle=shuffle, num_workers=0)

    train_loader = build_loader(train_records, shuffle=True)
    val_loader = build_loader(val_records)
    test_loader = build_loader(test_records)

    network = MiniTransformerEncoder(**config)
    network = train_model(network, train_loader, val_loader, device, criterion, epochs=3)

    # Round-trip through disk so the test numbers reflect the saved artifact.
    checkpoint_path = "sentiment_model.pth"
    save_model(network, checkpoint_path)
    loaded_model = load_model(checkpoint_path, config, device)

    loaded_model.eval()
    predictions = []
    targets = []
    running_loss = 0
    with torch.no_grad():
        for batch in tqdm(test_loader, desc="Testing"):
            ids = batch["input_ids"].to(device)
            mask = batch["attention_mask"].to(device)
            logits = loaded_model(ids, mask)
            batch_loss = criterion(logits, batch["label"].to(device))
            running_loss += batch_loss.item()
            predictions.extend(torch.argmax(logits, dim=1).cpu().numpy())
            targets.extend(batch["label"].cpu().numpy())

    mean_loss = running_loss / len(test_loader)
    print(f"\nTest Loss: {mean_loss:.4f}")
    accuracy = accuracy_score(targets, predictions)
    print(f"Test Accuracy: {accuracy:.4f}")
    f1 = f1_score(targets, predictions, average='binary')
    print(f"Test F1: {f1:.4f}")

# Run the train/save/reload/test pipeline only when executed directly
# (not when this code is imported as a module).
if __name__ == "__main__":
    main()


PyTorch version: 2.6.0+cu126
CUDA available: True
GPU: NVIDIA GeForce RTX 4080 Laptop GPU
VRAM: 12.88 GB
Using device: cuda


  scaler = GradScaler()
  with autocast():
  output = torch._nested_tensor_from_mask(
                                                                

Epoch 1: Train Loss = 0.6961, Val Loss = 0.6820, Val Acc = 0.5752, Val F1 = 0.6538


  with autocast():
                                                                                

Epoch 2: Train Loss = 0.6707, Val Loss = 0.6404, Val Acc = 0.6244, Val F1 = 0.5770


  with autocast():
                                                                                

Epoch 3: Train Loss = 0.6387, Val Loss = 0.6192, Val Acc = 0.6538, Val F1 = 0.6790
Model saved to sentiment_model.pth
Model loaded from sentiment_model.pth


Testing: 100%|██████████| 157/157 [00:02<00:00, 53.05it/s]


Test Loss: 0.6224
Test Accuracy: 0.6480
Test F1: 0.6784





In [9]:
import torch
import torch.nn as nn
from transformers import AutoTokenizer

# === Define your model architecture ===
class MiniTransformerEncoder(nn.Module):
    """Small Transformer encoder that classifies a sequence from its first token.

    Token embeddings and learned position embeddings are summed, passed
    through dropout and a stack of ``nn.TransformerEncoderLayer`` blocks,
    layer-normalised, and the hidden state at position 0 is projected to
    ``num_classes`` logits.

    Args:
        vocab_size: Number of rows in the token embedding table.
        max_len: Number of rows in the position embedding table, i.e. the
            longest sequence the model can embed.
        hidden_size: Embedding width, also used as the encoder ``d_model``.
        num_layers: Number of stacked encoder layers.
        num_heads: Attention heads per encoder layer.
        ff_size: Width of each layer's feed-forward sublayer.
        dropout: Dropout probability for the embeddings and encoder layers.
        num_classes: Size of the output logit vector.
    """

    def __init__(self, vocab_size=30522, max_len=128, hidden_size=128, num_layers=2,
                 num_heads=8, ff_size=512, dropout=0.1, num_classes=2):
        super().__init__()
        # Attribute names double as state_dict keys; they must match the
        # names used when the checkpoint was written.
        self.token_embedding = nn.Embedding(vocab_size, hidden_size)
        self.position_embedding = nn.Embedding(max_len, hidden_size)
        self.embedding_dropout = nn.Dropout(dropout)
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=hidden_size, nhead=num_heads, dim_feedforward=ff_size,
            dropout=dropout, batch_first=True
        )
        self.encoder = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)
        self.layernorm = nn.LayerNorm(hidden_size)
        self.classifier = nn.Linear(hidden_size, num_classes)

    def forward(self, input_ids, attention_mask=None):
        """Compute classification logits for a batch of token id sequences.

        Args:
            input_ids: Integer tensor of shape ``(batch, seq_len)``.
            attention_mask: Optional tensor of the same shape. Positions whose
                value equals 0 are treated as padding and hidden from
                attention. When omitted, every position is attended to.

        Returns:
            Tensor of shape ``(batch, num_classes)`` with unnormalised logits.

        Raises:
            ValueError: If ``seq_len`` exceeds the size of the position
                embedding table.
        """
        batch_size, seq_len = input_ids.size()

        # Fail early with a readable message: without this check an over-long
        # input surfaces as an opaque embedding index error (or a device-side
        # assert on CUDA).
        max_positions = self.position_embedding.num_embeddings
        if seq_len > max_positions:
            raise ValueError(
                f"Sequence length {seq_len} exceeds the model's maximum of "
                f"{max_positions} positions"
            )

        position_ids = torch.arange(seq_len, device=input_ids.device).expand(batch_size, seq_len)
        token_embeds = self.token_embedding(input_ids)
        position_embeds = self.position_embedding(position_ids)
        embeds = token_embeds + position_embeds
        embeds = self.embedding_dropout(embeds)

        if attention_mask is None:
            attention_mask = torch.ones_like(input_ids)
        # src_key_padding_mask expects True where a position must be ignored.
        encoder_output = self.encoder(embeds, src_key_padding_mask=(attention_mask == 0))
        encoder_output = self.layernorm(encoder_output)
        # The first position is used as the summary vector for the sequence.
        cls_output = encoder_output[:, 0, :]
        return self.classifier(cls_output)

# === Load model and tokenizer ===
def predict_single_text(text, model_path=r"Path\sentiment_model.pth"):
    """Classify one text as positive or negative and print the result.

    The tokenizer and model are rebuilt and the checkpoint is reloaded on
    every call, which keeps the function self-contained.

    Args:
        text: The sentence or review to classify.
        model_path: Checkpoint file holding the trained ``state_dict``. The
            default is a placeholder path; pass the real location of
            ``sentiment_model.pth``.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Load tokenizer
    tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

    # Model config (same as used during training)
    config = {
        "vocab_size": tokenizer.vocab_size,
        "max_len": 128,
        "hidden_size": 128,
        "num_layers": 2,
        "num_heads": 8,
        "ff_size": 512,
        "dropout": 0.1,
        "num_classes": 2
    }

    # Load model. weights_only=True restricts unpickling to tensors and plain
    # containers so an untrusted checkpoint cannot run arbitrary code.
    model = MiniTransformerEncoder(**config)
    state_dict = torch.load(model_path, map_location=device, weights_only=True)
    model.load_state_dict(state_dict)
    model.to(device)
    model.eval()

    # Tokenize input, padded/truncated to the length the model was built for.
    encoded = tokenizer(
        text,
        max_length=config["max_len"],
        padding="max_length",
        truncation=True,
        return_tensors="pt"
    )

    input_ids = encoded["input_ids"].to(device)
    attention_mask = encoded["attention_mask"].to(device)

    # Predict
    with torch.no_grad():
        logits = model(input_ids, attention_mask)
        probs = torch.softmax(logits, dim=1)
        pred_class = torch.argmax(probs, dim=1).item()

    # Class index 1 is reported as positive, anything else as negative.
    sentiment = "Positive 😊" if pred_class == 1 else "Negative 😞"
    confidence = probs[0][pred_class].item() * 100

    print(f"\nInput: {text}")
    print(f"Prediction: {sentiment} ({confidence:.2f}% confidence)")

# === Test your model ===
# Demo: classify a single hard-coded sentence when this cell/script is run directly.
if __name__ == "__main__":
    test_input = "He copied my question paper bad guy"
    predict_single_text(test_input)



Input: He copied my question paper bad guy
Prediction: Negative 😞 (97.60% confidence)
