#  RCV1 Text Classification Baseline

This script implements a strong baseline for the Reuters Corpus Volume I (RCV1) 
multi-label text classification task. The dataset contains over 800,000 news stories 
with multiple labels in three taxonomies: topics, industries, and regions.

We focus on building an efficient, high-performing baseline: the data is loaded through 
scikit-learn's `fetch_rcv1`, and the classifier is a fine-tuned Hugging Face Transformers model.



In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import sparse
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score
from sklearn.preprocessing import normalize
from sklearn.datasets import fetch_rcv1
from tqdm.auto import tqdm

# Transformer-related imports
import torch
import torch.nn as nn
from transformers import (
    AutoTokenizer, 
    AutoModelForSequenceClassification,
    Trainer, 
    TrainingArguments,
    DataCollatorWithPadding,
    get_scheduler
)
from torch.utils.data import DataLoader, Dataset
from torch.nn import BCEWithLogitsLoss
from torch.optim import AdamW
import bitsandbytes as bnb

import warnings
import os
warnings.filterwarnings('ignore')

def set_seed(seed):
    """Seed every random number generator this script can draw from.

    Seeds Python's built-in ``random`` module, NumPy's global generator and
    PyTorch (the CPU generator and, when CUDA is available, all CUDA devices).

    Args:
        seed: Integer seed applied to all generators.
    """
    # Local import: `random` is not among the file-level imports.
    import random

    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)

def setup_device():
    """Pick the compute device and report it on stdout.

    Returns:
        ``torch.device('cuda')`` when CUDA is available, otherwise
        ``torch.device('cpu')``.
    """
    # Guard clause: no CUDA means there is nothing more to report.
    if not torch.cuda.is_available():
        print('Using CPU')
        return torch.device('cpu')

    print(f'Using GPU: {torch.cuda.get_device_name(0)}')
    print(f'Number of GPUs available: {torch.cuda.device_count()}')
    return torch.device('cuda')

def load_and_explore_dataset():
    """Load RCV1 from scikit-learn and turn each feature vector into pseudo-text.

    ``fetch_rcv1`` provides a sparse feature matrix rather than raw text, so a
    synthetic document is built per row: every feature with a positive value
    becomes the token ``word_<feature index>``, repeated ``int(value * 10)``
    times (features whose scaled value truncates to 0 are dropped). Tokens are
    emitted in ascending feature-index order.

    Returns:
        A ``(texts, labels)`` tuple where ``texts`` is a 1-D NumPy object array
        of strings (one per document) and ``labels`` is the dense label matrix
        obtained from ``rcv1.target.toarray()``.
    """
    print('Loading RCV1 dataset from scikit-learn...')
    rcv1 = fetch_rcv1()

    # Read the CSR buffers directly instead of densifying every row: building
    # a full-width dense vector per document is by far the dominant cost.
    features = rcv1.data.tocsr()
    if not features.has_sorted_indices:
        # Keep tokens in ascending feature-index order regardless of storage.
        features = features.sorted_indices()
    indptr, indices, values = features.indptr, features.indices, features.data

    texts = []

    print('Converting feature vectors to text...')
    for i in tqdm(range(features.shape[0])):
        start, end = indptr[i], indptr[i + 1]

        tokens = []
        for idx, value in zip(indices[start:end], values[start:end]):
            # Scale by 10 so that small weights still yield some repetitions.
            count = int(value * 10)
            if count > 0:
                tokens.extend([f"word_{idx}"] * count)

        texts.append(' '.join(tokens))

    # dtype=object stores references to the Python strings; the default
    # fixed-width unicode dtype would pad every entry to the longest document.
    texts = np.array(texts, dtype=object)

    # Get labels
    labels = rcv1.target.toarray()

    print('\nDataset Features:')
    print(f'Number of samples: {len(texts)}')
    print(f'Number of labels: {labels.shape[1]}')

    # Show a sample
    print('\nSample document:')
    print(f'Text length: {len(texts[0])}')
    print(f'Labels: {labels[0]}')

    return texts, labels

def split_dataset(texts, labels):
    """Partition documents and labels into train, validation and test parts.

    The first 23,149 entries form the training pool (LYRL2004 split) and the
    remainder is the test set. A tenth of the training pool is then held out
    for validation using a fixed random state.

    Args:
        texts: Sequence of documents, sliceable by position.
        labels: Label rows aligned with ``texts``.

    Returns:
        ``(X_train, X_val, X_test, y_train, y_val, y_test)``.
    """
    n_train = 23149  # size of the LYRL2004 training portion

    X_test, y_test = texts[n_train:], labels[n_train:]

    # Hold out part of the training pool for validation.
    X_train, X_val, y_train, y_val = train_test_split(
        texts[:n_train],
        labels[:n_train],
        test_size=0.1,
        random_state=42,
    )

    print('Dataset splits:')
    for split_name, split in (
        ('Training', X_train),
        ('Validation', X_val),
        ('Test', X_test),
    ):
        print(f'{split_name} set: {len(split)} samples')

    return X_train, X_val, X_test, y_train, y_val, y_test

class BERTForMultiLabelClassification(nn.Module):
    """Transformer encoder with a custom MLP head for multi-label classification.

    The final-layer hidden states of the wrapped model are mean-pooled over
    the non-padding tokens, passed through dropout and a two-layer MLP, and
    returned as one logit per label.

    Args:
        num_labels: Number of output labels (size of the logit vector).
        model_name: Hugging Face model identifier to load.
    """
    def __init__(self, num_labels, model_name='microsoft/deberta-v3-large'):
        super().__init__()
        self.num_labels = num_labels
        # NOTE(review): forward() only consumes this model's hidden states;
        # the classification head that comes with it is not used.
        self.bert = AutoModelForSequenceClassification.from_pretrained(
            model_name,
            num_labels=num_labels,
            problem_type="multi_label_classification"
        )

        # Custom head applied on top of the pooled encoder output.
        self.dropout = nn.Dropout(0.2)
        self.classifier = nn.Sequential(
            nn.Linear(self.bert.config.hidden_size, 1024),
            nn.ReLU(),
            nn.Dropout(0.1),
            nn.Linear(1024, num_labels)
        )

    @staticmethod
    def _masked_mean(hidden_state, attention_mask):
        """Average token vectors over positions where ``attention_mask`` is 1."""
        if attention_mask is None:
            return torch.mean(hidden_state, dim=1)
        mask = attention_mask.unsqueeze(-1).to(hidden_state.dtype)
        summed = (hidden_state * mask).sum(dim=1)
        # clamp guards against division by zero for an all-padding row.
        token_counts = mask.sum(dim=1).clamp(min=1.0)
        return summed / token_counts

    def forward(self, input_ids, attention_mask, labels=None):
        """Compute logits and, when ``labels`` is given, the BCE-with-logits loss.

        Args:
            input_ids: Token id tensor forwarded to the wrapped model.
            attention_mask: Mask forwarded to the wrapped model and used to
                exclude padding positions from the mean pooling.
            labels: Optional multi-hot targets; cast to the logits dtype.

        Returns:
            ``{'logits': ...}``, plus a ``'loss'`` entry when ``labels`` is
            provided.
        """
        outputs = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            output_hidden_states=True
        )

        # Use the last hidden state
        last_hidden_state = outputs.hidden_states[-1]
        # Padding tokens must not contribute: inputs are padded to a fixed
        # length, so an unmasked mean would be dominated by padding vectors.
        pooled_output = self._masked_mean(last_hidden_state, attention_mask)

        # Apply additional layers
        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)

        if labels is not None:
            loss_fct = BCEWithLogitsLoss()
            # BCE needs floating-point targets of the same dtype as the logits.
            loss = loss_fct(logits, labels.to(logits.dtype))
            return {'loss': loss, 'logits': logits}

        return {'logits': logits}

class RCV1Dataset(Dataset):
    """Map-style dataset that tokenizes one document per item.

    Args:
        texts: Indexable collection of document strings.
        labels: Indexable collection of label rows aligned with ``texts``.
        tokenizer: Callable tokenizer invoked with ``max_length``,
            ``padding``, ``truncation`` and ``return_tensors`` keywords.
        max_length: Length every sample is padded/truncated to.
    """
    def __init__(self, texts, labels, tokenizer, max_length=512):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        """Return ``input_ids``, ``attention_mask`` and float ``labels`` for ``idx``."""
        # Coerce NumPy string scalars to a plain Python str for the tokenizer.
        text = str(self.texts[idx])
        labels = self.labels[idx]

        encoding = self.tokenizer(
            text,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
        return {
            # squeeze(0) drops only the batch dimension added by
            # return_tensors='pt'; a bare squeeze() would also collapse a
            # sequence dimension of size 1.
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'labels': torch.as_tensor(labels, dtype=torch.float32)
        }

def train_epoch(model, dataloader, optimizer, scheduler, criterion, device, pbar):
    """Run one optimisation pass over ``dataloader``.

    For every batch the gradients are reset, the loss is computed from the
    model's ``'logits'`` output with ``criterion``, gradients are clipped to a
    norm of 1.0, and both the optimizer and the scheduler are stepped.

    Args:
        model: Module called with ``input_ids`` and ``attention_mask``.
        dataloader: Iterable of batches with ``input_ids``,
            ``attention_mask`` and ``labels`` entries.
        optimizer: Optimizer stepped once per batch.
        scheduler: Learning-rate scheduler stepped once per batch.
        criterion: Loss callable taking ``(logits, labels)``.
        device: Device the batch tensors are moved to.
        pbar: Progress bar advanced by one per batch.

    Returns:
        Mean of the per-batch loss values.
    """
    model.train()
    batch_losses = []
    tensor_keys = ('input_ids', 'attention_mask', 'labels')

    for batch in dataloader:
        optimizer.zero_grad()

        ids, mask, targets = (batch[key].to(device) for key in tensor_keys)

        logits = model(input_ids=ids, attention_mask=mask)['logits']
        loss = criterion(logits, targets)

        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()

        step_loss = loss.item()
        batch_losses.append(step_loss)
        pbar.update(1)
        pbar.set_postfix({'loss': f'{step_loss:.4f}'})

    return sum(batch_losses) / len(dataloader)

def evaluate_bert_predictions(model, dataloader, device, pbar, threshold=0.5):
    """Score the model's multi-label predictions over ``dataloader``.

    Logits are passed through a sigmoid and a label is predicted wherever the
    resulting probability exceeds ``threshold``.

    Args:
        model: Module called with ``input_ids`` and ``attention_mask`` that
            returns a mapping with a ``'logits'`` entry.
        dataloader: Iterable of batches with ``input_ids``,
            ``attention_mask`` and ``labels`` entries.
        device: Device the input tensors are moved to.
        pbar: Progress bar advanced by one per batch.
        threshold: Probability above which a label counts as predicted.

    Returns:
        Dict mapping metric name to score (micro/macro/weighted/samples F1,
        micro precision and micro recall).

    Raises:
        ValueError: If ``dataloader`` yields no batches.
    """
    model.eval()
    label_batches = []
    prediction_batches = []

    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            probabilities = torch.sigmoid(outputs['logits']).cpu().numpy()

            label_batches.append(batch['labels'].numpy())
            prediction_batches.append(probabilities > threshold)
            pbar.update(1)

    if not label_batches:
        raise ValueError('Cannot evaluate on an empty dataloader')

    # Integer indicator matrices for both arguments of the metric functions.
    all_labels = np.concatenate(label_batches, axis=0).astype(int)
    all_predictions = np.concatenate(prediction_batches, axis=0).astype(int)

    def score(metric_fn, average):
        # zero_division=0 states explicitly what is reported when a label has
        # no positive predictions or targets.
        return metric_fn(all_labels, all_predictions, average=average, zero_division=0)

    metrics = {
        'Micro F1': score(f1_score, 'micro'),
        'Macro F1': score(f1_score, 'macro'),
        'Weighted F1': score(f1_score, 'weighted'),
        'Samples F1': score(f1_score, 'samples'),
        'Micro Precision': score(precision_score, 'micro'),
        'Micro Recall': score(recall_score, 'micro')
    }

    return metrics

def visualize_results(val_metrics, test_metrics):
    """Draw a grouped bar chart comparing validation and test scores.

    Args:
        val_metrics: Mapping of metric name to validation score.
        test_metrics: Mapping of metric name to test score.
    """
    # One row per metric, one column per dataset.
    scores = pd.DataFrame({'Validation': val_metrics, 'Test': test_metrics})
    scores = scores.reset_index()
    scores.columns = ['Metric', 'Validation', 'Test']

    # Long format: one row per (metric, dataset) pair, as seaborn expects.
    long_scores = scores.melt(
        id_vars=['Metric'],
        var_name='Dataset',
        value_name='Score',
    )

    plt.figure(figsize=(12, 6))
    sns.barplot(x='Metric', y='Score', hue='Dataset', data=long_scores)
    plt.xticks(rotation=45)
    plt.title('BERT Model Performance Metrics')
    plt.tight_layout()
    plt.show()

def main():
    """Run the full pipeline: load data, fine-tune, evaluate and plot.

    Steps: select the device and seed the RNGs, build pseudo-text documents
    from RCV1, split them, fine-tune the transformer for three epochs with a
    cosine schedule (validating after every epoch), evaluate once on the test
    set, and plot the last validation metrics against the test metrics.
    """
    # Setup device
    device = setup_device()
    set_seed(42)
    use_cuda = device.type == 'cuda'

    # Load and explore dataset (the loader prints its own progress messages)
    texts, labels = load_and_explore_dataset()

    # Split dataset
    X_train, X_val, X_test, y_train, y_val, y_test = split_dataset(texts, labels)

    # Initialize model and tokenizer
    print('Loading model and tokenizer...')
    model_name = 'microsoft/deberta-v3-large'  # Using larger model
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = BERTForMultiLabelClassification(
        num_labels=labels.shape[1],
        model_name=model_name
    )

    # Move model to device and wrap with DataParallel if multiple GPUs are available
    if torch.cuda.device_count() > 1:
        print(f"Using {torch.cuda.device_count()} GPUs!")
        model = nn.DataParallel(model)
    model = model.to(device)

    # Create datasets
    train_dataset = RCV1Dataset(X_train, y_train, tokenizer)
    val_dataset = RCV1Dataset(X_val, y_val, tokenizer)
    test_dataset = RCV1Dataset(X_test, y_test, tokenizer)

    # Create dataloaders
    batch_size = 4  # small batch; presumably chosen so the large model fits in memory
    loader_kwargs = {
        'batch_size': batch_size,
        'num_workers': 4,
        # Pinned host memory only helps host-to-GPU copies.
        'pin_memory': use_cuda,
    }
    train_dataloader = DataLoader(train_dataset, shuffle=True, **loader_kwargs)
    val_dataloader = DataLoader(val_dataset, shuffle=False, **loader_kwargs)
    test_dataloader = DataLoader(test_dataset, shuffle=False, **loader_kwargs)

    # Training settings
    num_epochs = 3
    num_training_steps = num_epochs * len(train_dataloader)
    num_warmup_steps = num_training_steps // 10

    optimizer_kwargs = {'lr': 2e-5, 'weight_decay': 0.01, 'betas': (0.9, 0.999)}
    if use_cuda:
        # Use 8-bit AdamW optimizer for better memory efficiency
        optimizer = bnb.optim.AdamW8bit(model.parameters(), **optimizer_kwargs)
    else:
        # The 8-bit optimizer targets GPU training; use regular AdamW on CPU.
        optimizer = AdamW(model.parameters(), **optimizer_kwargs)

    scheduler = get_scheduler(
        "cosine",
        optimizer=optimizer,
        num_warmup_steps=num_warmup_steps,
        num_training_steps=num_training_steps
    )

    criterion = BCEWithLogitsLoss()

    # One progress bar spans all training and validation steps.
    total_steps = num_epochs * (len(train_dataloader) + len(val_dataloader))

    # Train BERT model
    print('Training model...')
    with tqdm(total=total_steps, desc='Training Progress') as pbar:
        for epoch in range(num_epochs):
            print(f'\nEpoch {epoch + 1}/{num_epochs}')
            train_loss = train_epoch(
                model, train_dataloader, optimizer, scheduler, criterion, device, pbar
            )
            print(f'Average training loss: {train_loss:.4f}')

            # Evaluate on validation set
            val_metrics = evaluate_bert_predictions(model, val_dataloader, device, pbar)
            print('\nValidation Metrics:')
            for metric_name, value in val_metrics.items():
                print(f'{metric_name}: {value:.4f}')

    # Evaluate on test set with its own progress bar: the training bar's total
    # does not account for test batches.
    print('\nEvaluating on test set...')
    with tqdm(total=len(test_dataloader), desc='Test Evaluation') as test_pbar:
        test_metrics = evaluate_bert_predictions(model, test_dataloader, device, test_pbar)
    print('\nTest Metrics:')
    for metric_name, value in test_metrics.items():
        print(f'{metric_name}: {value:.4f}')

    # Visualize results
    visualize_results(val_metrics, test_metrics)

# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main() 