In [8]:
# Force a clean reinstall of transformers/accelerate at pinned versions.
# `%pip` (rather than `!pip`) installs into the environment of the running
# kernel; `!pip` runs whichever pip is first on the shell PATH, which may
# belong to a different Python installation.
%pip uninstall -y accelerate transformers
%pip install --no-cache-dir transformers==4.36.0 accelerate==0.26.1
%pip install --quiet torch datasets scikit-learn pandas

# Restart the kernel after installation (Jupyter/Colab) so the cells below
# import the versions installed above instead of already-loaded modules.

Found existing installation: accelerate 1.8.1
Uninstalling accelerate-1.8.1:
  Successfully uninstalled accelerate-1.8.1
Found existing installation: transformers 4.53.2
Uninstalling transformers-4.53.2:
  Successfully uninstalled transformers-4.53.2
Collecting transformers==4.36.0
  Downloading transformers-4.36.0-py3-none-any.whl.metadata (126 kB)
Collecting accelerate==0.26.1
  Downloading accelerate-0.26.1-py3-none-any.whl.metadata (18 kB)
Collecting tokenizers<0.19,>=0.14 (from transformers==4.36.0)
  Downloading tokenizers-0.15.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Downloading transformers-4.36.0-py3-none-any.whl (8.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.2/8.2 MB[0m [31m50.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading accelerate-0.26.1-py3-none-any.whl (270 kB)
Downloading tokenizers-0.15.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
# Import libraries
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import RobertaTokenizer, RobertaForSequenceClassification
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import numpy as np
from torch.optim import AdamW
from tqdm import tqdm
from torch.cuda.amp import autocast, GradScaler

# Pick the compute device: CUDA when a GPU is visible to PyTorch, else CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f"Using device: {device}")

if device.type == 'cuda':
    # Report the name and total memory (in GiB) of GPU 0.
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    gpu_total_gib = torch.cuda.get_device_properties(0).total_memory / 1024**3
    print(f"Memory: {gpu_total_gib:.2f} GB")
    # Allow TF32 math for both matmul and cuDNN kernels.
    torch.backends.cuda.matmul.allow_tf32 = True
    torch.backends.cudnn.allow_tf32 = True
    print("TF32 enabled for faster training on 3090 Ti")

# Load and prepare the dataset
import os

# Reuse the cached class-balanced subset when it is on disk; otherwise build it
# from the raw ORCAS-I file (downloading that file first if necessary).
if not os.path.exists("balanced_orcas_dataset.tsv"):
    print("Downloading and creating balanced dataset...")
    url = "https://researchdata.tuwien.ac.at/records/pp7xz-n9a06/files/ORCAS-I-18M.tsv?download=1"

    # Prefer a local copy of the raw file over a fresh download.
    if not os.path.exists("ORCAS-I-18M.tsv"):
        print("Downloading original dataset...")
        df = pd.read_csv(url, sep="\t")
        # Keep the raw download so later runs can skip the network fetch.
        df.to_csv("ORCAS-I-18M.tsv", sep="\t", index=False)
    else:
        print("Loading existing original dataset...")
        df = pd.read_csv("ORCAS-I-18M.tsv", sep="\t")

    # Keep only the query text and its level-1 label, then drop rows with
    # missing values and exact duplicates.
    df = df[["query", "level_1"]]
    df = df.dropna()
    df = df.drop_duplicates()

    # Per-class row counts before balancing.
    category_counts = df["level_1"].value_counts()
    print("\nOriginal counts:")
    print(category_counts)

    # The smallest class determines how many rows are drawn from every class.
    lowest = category_counts.min()
    print(f"\nLowest category count: {lowest}")

    # Draw `lowest` rows per class with a fixed seed, in value_counts order.
    balanced_dfs = [
        df[df["level_1"] == category].sample(n=lowest, random_state=42)
        for category in category_counts.index
    ]

    # Stack the per-class samples into one frame with a fresh index.
    balanced_df = pd.concat(balanced_dfs, ignore_index=True)

    print(f"\nBalanced dataset size: {len(balanced_df)}")
    print("Category distribution:")
    print(balanced_df["level_1"].value_counts())

    # Cache the balanced subset for subsequent runs.
    balanced_df.to_csv("balanced_orcas_dataset.tsv", sep="\t", index=False)
    print("Balanced dataset saved.")
else:
    print("Loading existing balanced dataset...")
    df = pd.read_csv("balanced_orcas_dataset.tsv", sep="\t")
    balanced_df = df
    print(f"Loaded {len(df)} balanced samples")

# Prepare for training: from here on `df` is the balanced frame.
df = balanced_df

# Map the string classes in `level_1` to integer ids stored in a `label` column.
label_encoder = LabelEncoder()
df["label"] = label_encoder.fit_transform(df["level_1"])
label_map = {
    class_name: class_id
    for class_id, class_name in enumerate(label_encoder.classes_)
}
print("\nLabel mapping:", label_map)

# Stratified 90/10 train/validation split with a fixed seed.
all_queries = df["query"].tolist()
all_labels = df["label"].tolist()
train_texts, val_texts, train_labels, val_labels = train_test_split(
    all_queries,
    all_labels,
    test_size=0.1,
    stratify=df["label"],
    random_state=42,
)

print(f"\nTraining samples: {len(train_texts)}")
print(f"Validation samples: {len(val_texts)}")

# Tokenizer matching the pretrained checkpoint used for the model.
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")

# Dataset class with up-front batch tokenization and device-resident tensors
class IntentDataset(Dataset):
    """Pre-tokenized text-classification dataset held entirely in tensors.

    All texts are tokenized once in ``__init__`` (padded/truncated to
    ``max_len``) and stored as two ``(N, max_len)`` tensors plus a label
    tensor of length ``N``. When ``device`` refers to a CUDA device the
    tensors are stored on that device; for any other device they stay on CPU.

    Args:
        texts: Sequence of strings; sliced in chunks and passed to ``tokenizer``.
        labels: Sequence of integer class ids, one per text.
        tokenizer: Callable invoked as ``tokenizer(batch, truncation=True,
            padding='max_length', max_length=max_len, return_tensors='pt')``
            and returning a mapping with ``'input_ids'`` and
            ``'attention_mask'`` tensors.
        max_len: Fixed token length of every encoded example.
        device: ``'cuda'``, ``'cuda:N'``, ``'cpu'`` or a ``torch.device``.
    """

    def __init__(self, texts, labels, tokenizer, max_len=64, device='cuda'):
        # Normalise so 'cuda', 'cuda:0' and torch.device('cuda') are all
        # recognised; a plain string comparison would miss the latter two and
        # silently leave the data on CPU.
        target_device = torch.device(device)
        on_gpu = target_device.type == 'cuda'

        self.labels = torch.tensor(labels, dtype=torch.long)
        self.max_len = max_len
        self.device = device

        # Batch tokenization for efficiency
        print("Tokenizing dataset in batches...")
        batch_size = 5000  # Process in chunks
        all_input_ids = []
        all_attention_masks = []

        for i in tqdm(range(0, len(texts), batch_size), desc="Tokenizing"):
            batch_texts = texts[i:i + batch_size]

            # Tokenize batch
            encodings = tokenizer(
                batch_texts,
                truncation=True,
                padding='max_length',
                max_length=max_len,
                return_tensors='pt'
            )

            input_ids = encodings['input_ids']
            attention_mask = encodings['attention_mask']
            # Move each chunk to the GPU right away so the per-batch fetch in
            # __getitem__ is a pure on-device index.
            if on_gpu:
                input_ids = input_ids.to(target_device)
                attention_mask = attention_mask.to(target_device)
            all_input_ids.append(input_ids)
            all_attention_masks.append(attention_mask)

        if all_input_ids:
            # Concatenate all chunks along the example axis
            self.input_ids = torch.cat(all_input_ids, dim=0)
            self.attention_masks = torch.cat(all_attention_masks, dim=0)
        else:
            # torch.cat rejects an empty list; represent an empty dataset as
            # zero-row tensors of the expected width instead of crashing.
            storage_device = target_device if on_gpu else torch.device('cpu')
            self.input_ids = torch.empty((0, max_len), dtype=torch.long, device=storage_device)
            self.attention_masks = torch.empty((0, max_len), dtype=torch.long, device=storage_device)

        # Labels live on the same device as the encodings
        if on_gpu:
            self.labels = self.labels.to(target_device)

        print(f"Dataset ready! Tensors on {device}")
        if on_gpu:
            # Size of the two encoding tensors only (labels are not counted)
            dataset_bytes = (
                self.input_ids.element_size() * self.input_ids.nelement()
                + self.attention_masks.element_size() * self.attention_masks.nelement()
            )
            print(f"GPU memory used by dataset: {dataset_bytes / 1024**3:.2f} GB")

    def __len__(self):
        """Return the number of examples."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return the encoded example at ``idx`` as a dict of tensors."""
        return {
            'input_ids': self.input_ids[idx],
            'attention_mask': self.attention_masks[idx],
            'labels': self.labels[idx]
        }

# Build both datasets; tensors are placed on the GPU when `device` is CUDA.
train_dataset = IntentDataset(train_texts, train_labels, tokenizer, device=device.type)
val_dataset = IntentDataset(val_texts, val_labels, tokenizer, device=device.type)

# Settings shared by both loaders: large batches, and no worker processes or
# pinned memory because the datasets already hold ready-made tensors.
loader_options = dict(batch_size=500, num_workers=0, pin_memory=False)

# Only the training loader reshuffles each epoch.
train_loader = DataLoader(train_dataset, shuffle=True, **loader_options)
val_loader = DataLoader(val_dataset, shuffle=False, **loader_options)

print(f"\nDataLoader info:")
print(f"Training batches per epoch: {len(train_loader)}")
print(f"Validation batches per epoch: {len(val_loader)}")
print(f"All data pre-loaded on GPU - no CPU-GPU transfer needed!")

# Initialize model
# Decide first whether a previously saved checkpoint should be reused, so that
# only one set of weights is loaded and moved to the device.
model_path = './best_intent_classifier'
skip_training = False
if os.path.exists(model_path):
    print(f"\nFound existing model at {model_path}")
    # NOTE: input() blocks an unattended "Run All" until it is answered.
    user_input = input("Do you want to load the existing model instead of training? (y/n): ")
    skip_training = user_input.strip().lower() == 'y'

if skip_training:
    # Reuse the saved checkpoint together with the tokenizer stored next to it.
    print("Loading existing model...")
    model = RobertaForSequenceClassification.from_pretrained(model_path)
    tokenizer = RobertaTokenizer.from_pretrained(model_path)
else:
    # Fresh pretrained encoder; the classification head is sized from the
    # fitted label encoder instead of a hard-coded class count.
    model = RobertaForSequenceClassification.from_pretrained(
        "roberta-base",
        num_labels=len(label_encoder.classes_)
    )
model.to(device)

# Manual training implementation (one pass over the training loader)
def train_epoch(model, data_loader, optimizer, scheduler, device, scaler):
    """Train ``model`` for one epoch and return ``(mean_loss, accuracy)``.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...,
            labels=...)`` whose output exposes ``.loss`` and ``.logits``.
        data_loader: Iterable of dicts with ``'input_ids'``,
            ``'attention_mask'`` and ``'labels'`` tensors.
        optimizer: Optimizer over ``model``'s parameters.
        scheduler: LR scheduler stepped once per batch.
        device: Device the model lives on (string or ``torch.device``).
            Batches are moved there; this is a no-op for tensors that are
            already on that device.
        scaler: Gradient scaler providing ``scale``, ``unscale_``, ``step``
            and ``update``.

    Returns:
        Tuple of mean per-batch loss and fraction of correct predictions.
        ``(0.0, 0.0)`` when ``data_loader`` yields no batches.
    """
    device = torch.device(device)
    # Mixed precision is only used on CUDA.
    use_amp = device.type == 'cuda'

    model.train()
    total_loss = 0.0
    correct_predictions = 0
    total_predictions = 0
    num_batches = 0

    progress_bar = tqdm(data_loader, desc="Training")

    for batch in progress_bar:
        input_ids = batch['input_ids'].to(device, non_blocking=True)
        attention_mask = batch['attention_mask'].to(device, non_blocking=True)
        labels = batch['labels'].to(device, non_blocking=True)

        optimizer.zero_grad()

        # Mixed precision forward pass (torch.autocast replaces the
        # deprecated torch.cuda.amp.autocast).
        with torch.autocast(device_type='cuda', enabled=use_amp):
            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=labels
            )
            loss = outputs.loss
            logits = outputs.logits

        # Running accuracy over the epoch so far
        predictions = torch.argmax(logits, dim=-1)
        correct_predictions += (predictions == labels).sum().item()
        total_predictions += labels.size(0)

        # Backward pass with gradient scaling
        scaler.scale(loss).backward()

        # Unscale before clipping so the norm threshold applies to the true
        # gradient magnitudes.
        scaler.unscale_(optimizer)
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

        scaler.step(optimizer)
        scaler.update()
        scheduler.step()  # Update learning rate once per batch

        batch_loss = loss.item()
        total_loss += batch_loss
        num_batches += 1

        # Update progress bar with running statistics
        current_lr = scheduler.get_last_lr()[0]
        progress_bar.set_postfix({
            'loss': f'{batch_loss:.4f}',
            'acc': f'{correct_predictions / total_predictions:.4f}',
            'lr': f'{current_lr:.2e}',
            'gpu_mem': f'{torch.cuda.memory_allocated() / 1024**3:.1f}GB'
        })

    # Guard against an empty loader instead of dividing by zero.
    if num_batches == 0 or total_predictions == 0:
        return 0.0, 0.0
    return total_loss / num_batches, correct_predictions / total_predictions

def evaluate(model, data_loader, device):
    """Evaluate ``model`` on ``data_loader`` and return ``(mean_loss, accuracy)``.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...,
            labels=...)`` whose output exposes ``.loss`` and ``.logits``.
        data_loader: Iterable of dicts with ``'input_ids'``,
            ``'attention_mask'`` and ``'labels'`` tensors.
        device: Device the model lives on (string or ``torch.device``).
            Batches are moved there; this is a no-op for tensors that are
            already on that device.

    Returns:
        Tuple of mean per-batch loss and fraction of correct predictions.
        ``(0.0, 0.0)`` when ``data_loader`` yields no batches.
    """
    device = torch.device(device)
    # Mixed precision is only used on CUDA.
    use_amp = device.type == 'cuda'

    model.eval()
    total_loss = 0.0
    correct_predictions = 0
    total_predictions = 0
    num_batches = 0

    with torch.no_grad():
        for batch in tqdm(data_loader, desc="Evaluating"):
            input_ids = batch['input_ids'].to(device, non_blocking=True)
            attention_mask = batch['attention_mask'].to(device, non_blocking=True)
            labels = batch['labels'].to(device, non_blocking=True)

            # torch.autocast replaces the deprecated torch.cuda.amp.autocast.
            with torch.autocast(device_type='cuda', enabled=use_amp):
                outputs = model(
                    input_ids=input_ids,
                    attention_mask=attention_mask,
                    labels=labels
                )
                loss = outputs.loss
                logits = outputs.logits

            predictions = torch.argmax(logits, dim=-1)
            correct_predictions += (predictions == labels).sum().item()
            total_predictions += labels.size(0)

            total_loss += loss.item()
            num_batches += 1

    # Guard against an empty loader instead of dividing by zero.
    if num_batches == 0 or total_predictions == 0:
        return 0.0, 0.0
    return total_loss / num_batches, correct_predictions / total_predictions

# Training configuration: schedule length, optimizer, AMP scaler, LR schedule
from transformers import get_linear_schedule_with_warmup

num_epochs = 5
# The scheduler is stepped once per batch, so its horizon is batches * epochs.
total_steps = len(train_loader) * num_epochs
warmup_steps = int(0.1 * total_steps)  # first 10% of steps ramp the LR up

# AdamW with weight decay; LR chosen higher to go with the large batch size.
optimizer = AdamW(model.parameters(), lr=1e-4, weight_decay=0.01)

# Gradient scaler used by the mixed-precision training step.
scaler = GradScaler()

# Linear warmup followed by linear decay over the remaining steps.
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=warmup_steps,
    num_training_steps=total_steps,
)

# Training loop: train for `num_epochs`, checkpointing whenever validation
# accuracy improves, or just evaluate when a saved model was loaded instead.
if not skip_training:
    print("\nStarting training...")
    print(f"Training with batch size: {train_loader.batch_size}")
    print(f"Steps per epoch: {len(train_loader)}")
    print(f"Total training steps: {len(train_loader) * num_epochs}")

    # Monitor GPU usage
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
        print(f"GPU memory before training: {torch.cuda.memory_allocated() / 1024**3:.2f} GB")

    best_val_acc = 0

    for epoch in range(num_epochs):
        print(f"\nEpoch {epoch + 1}/{num_epochs}")

        # Train
        train_loss, train_acc = train_epoch(model, train_loader, optimizer, scheduler, device, scaler)
        print(f"Training - Loss: {train_loss:.4f}, Accuracy: {train_acc:.4f}")

        if torch.cuda.is_available():
            print(f"GPU memory usage: {torch.cuda.memory_allocated() / 1024**3:.2f} GB")
            print(f"GPU utilization: Check nvidia-smi")

        # Evaluate
        val_loss, val_acc = evaluate(model, val_loader, device)
        print(f"Validation - Loss: {val_loss:.4f}, Accuracy: {val_acc:.4f}")

        # Save best model. Write to `model_path` (the same location the
        # resume check reads) so the two can never drift apart.
        if val_acc > best_val_acc:
            best_val_acc = val_acc
            model.save_pretrained(model_path)
            tokenizer.save_pretrained(model_path)
            print(f"Saved best model with validation accuracy: {val_acc:.4f}")

    # Save final model (last-epoch weights, which may differ from the best)
    model.save_pretrained('./intent_classifier_final')
    tokenizer.save_pretrained('./intent_classifier_final')
    print("\n✓ Training completed! Final model saved.")
    print(f"Best validation accuracy: {best_val_acc:.4f}")
else:
    print("\nSkipping training, using loaded model.")
    # Still evaluate the loaded model
    val_loss, val_acc = evaluate(model, val_loader, device)
    print(f"Loaded model validation - Loss: {val_loss:.4f}, Accuracy: {val_acc:.4f}")

# Test the model
def predict_intent(text, model, tokenizer, label_encoder, device, max_length=64):
    """Classify a single query and return its label and class probabilities.

    Args:
        text: One query string.
        model: Callable taking the tokenizer's tensors as keyword arguments
            and returning an object with ``.logits`` of shape
            ``(1, num_classes)``.
        tokenizer: Callable invoked with ``return_tensors="pt"``,
            ``truncation=True``, ``padding=True`` and ``max_length``.
        label_encoder: Object whose ``inverse_transform`` maps class ids back
            to label names.
        device: Device the model lives on; inputs are moved there.
        max_length: Truncation length in tokens. Defaults to 64, the default
            ``max_len`` the training dataset is tokenized with, so inference
            sees the same truncation as training.

    Returns:
        ``(predicted_label, probs)`` where ``probs`` is a 1-D numpy array of
        softmax probabilities, one entry per class.
    """
    model.eval()
    with torch.no_grad():
        inputs = tokenizer(
            text,
            return_tensors="pt",
            truncation=True,
            padding=True,
            max_length=max_length
        )
        inputs = {k: v.to(device) for k, v in inputs.items()}

        # Take row 0 explicitly rather than squeeze(), which depends on the
        # tensor's shape to drop the batch axis.
        logits = model(**inputs).logits[0]
        prediction = torch.argmax(logits, dim=-1).item()
        predicted_label = label_encoder.inverse_transform([prediction])[0]

        # Get confidence scores
        probs = torch.softmax(logits, dim=-1).cpu().numpy()

    return predicted_label, probs

# Smoke-test the classifier on a handful of hand-written queries
test_queries = [
    "github",
    "how to cook pasta",
    "buy shoes online",
    "weather forecast tomorrow",
    "login to my account",
    "facebook",
    "best laptop under $1000",
    "amazon prime membership",
]

# Banner around the section title
separator = "=" * 50
print("\n" + separator)
print("Testing model with sample queries:")
print(separator)

for query in test_queries:
    # One prediction per query, followed by the probability of every class
    predicted_label, probs = predict_intent(query, model, tokenizer, label_encoder, device)
    print(f"\nQuery: '{query}'")
    print(f"Prediction: {predicted_label}")
    print(f"Confidence scores:")
    class_probabilities = zip(label_encoder.classes_, probs)
    for label, prob in class_probabilities:
        print(f"  {label}: {prob:.3f}")

  from .autonotebook import tqdm as notebook_tqdm
  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(


Using device: cuda
GPU: NVIDIA GeForce RTX 3090 Ti
Memory: 22.49 GB
TF32 enabled for faster training on 3090 Ti
Loading existing balanced dataset...
Loaded 1311735 balanced samples

Label mapping: {'Informational': 0, 'Navigational': 1, 'Transactional': 2}





Training samples: 1180561
Validation samples: 131174
Tokenizing dataset in batches...


Tokenizing: 100%|██████████| 237/237 [00:44<00:00,  5.28it/s]


Dataset ready! Tensors on cuda
GPU memory used by dataset: 1.13 GB
Tokenizing dataset in batches...


Tokenizing: 100%|██████████| 27/27 [00:04<00:00,  6.13it/s]


Dataset ready! Tensors on cuda
GPU memory used by dataset: 0.13 GB

DataLoader info:
Training batches per epoch: 2362
Validation batches per epoch: 263
All data pre-loaded on GPU - no CPU-GPU transfer needed!


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.weight', 'classifier.out_proj.bias', 'classifier.dense.weight', 'classifier.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  scaler = GradScaler()  # For mixed precision training



Starting training...
Training with batch size: 500
Steps per epoch: 2362
Total training steps: 11810
GPU memory before training: 1.73 GB

Epoch 1/5


  with autocast():
Training:  48%|████▊     | 1142/2362 [07:49<07:39,  2.65it/s, loss=0.2760, acc=0.8093, lr=9.67e-05, gpu_mem=3.1GB]