# RCV1 Text Classification Baseline

This notebook implements a strong baseline for the Reuters Corpus Volume I (RCV1) multi-label text classification task. The dataset contains over 800,000 news stories with multiple labels in three taxonomies: topics, industries, and regions.

We'll focus on creating an efficient and high-performing baseline using modern approaches in scikit-learn.

## 1. Import Required Libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import sparse
from sklearn.datasets import fetch_rcv1
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score
from sklearn.preprocessing import normalize
from tqdm.notebook import tqdm

# Transformer-related imports
import torch
from transformers import AutoTokenizer, AutoModel
from torch.utils.data import DataLoader, Dataset
from torch import nn
from torch.nn import BCEWithLogitsLoss
from torch.optim import AdamW
from torch.cuda import is_available

import warnings
warnings.filterwarnings('ignore')

# Reproducibility: pin the NumPy and PyTorch random number generators
np.random.seed(42)
torch.manual_seed(42)

# Choose the compute device; when CUDA is usable, seed every GPU generator too
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(42)
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f'Using device: {device}')

## 2. Load and Explore Dataset

In [None]:
# Load the dataset (see the scikit-learn `fetch_rcv1` docs for download/caching behaviour)
print('Loading RCV1 dataset...')
rcv1 = fetch_rcv1()

# Print basic information
print('\nDataset Features:')
print(f'Number of samples: {rcv1.data.shape[0]}')
print(f'Number of features: {rcv1.data.shape[1]}')
print(f'Number of topics: {rcv1.target.shape[1]}')
print(f'Data type: {type(rcv1.data)}')
print(f'Target type: {type(rcv1.target)}')

# Show sparsity.
# nnz / (rows * cols) is the *density* (share of stored non-zero entries);
# sparsity is its complement. The previous output printed the density under
# the label "sparsity", so both figures are now reported explicitly.
print('\nSparsity statistics:')
data_density = rcv1.data.nnz / (rcv1.data.shape[0] * rcv1.data.shape[1])
target_density = rcv1.target.nnz / (rcv1.target.shape[0] * rcv1.target.shape[1])
print(f'Data sparsity: {1 - data_density:.4%} (density: {data_density:.4%})')
print(f'Target sparsity: {1 - target_density:.4%} (density: {target_density:.4%})')

## 3. Preprocess Data

The RCV1 features returned by `fetch_rcv1` are already cosine-normalized log TF-IDF vectors; we re-apply L2 normalization anyway as a cheap safeguard so that every document vector is guaranteed to have unit length.

In [None]:
# L2-normalise every document vector (copy=False asks scikit-learn not to
# duplicate the matrix), then make sure the result is stored as CSR
X = normalize(rcv1.data, norm='l2', copy=False).tocsr()

# Label indicator matrix, likewise held in CSR form
y = rcv1.target.tocsr()

In [None]:
class RCV1Dataset(Dataset):
    """Map-style dataset that tokenizes one document per item.

    Args:
        texts: Indexable collection of strings that supports ``len()``.
        labels: Optional per-document label rows, indexed by the same
            positions as ``texts``. Dense rows (lists / NumPy arrays) and
            rows of a SciPy sparse matrix are both accepted.
        tokenizer: Callable invoked as ``tokenizer(text, max_length=...,
            padding='max_length', truncation=True, return_tensors='pt')``
            that returns a mapping of tensors with a leading batch axis.
        max_length: Length passed to the tokenizer for padding/truncation.
    """

    def __init__(self, texts, labels=None, tokenizer=None, max_length=512):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of documents."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the encoded document at ``idx`` as a dict of tensors.

        The dict holds every entry produced by the tokenizer (batch axis
        removed) plus a float32 ``'labels'`` vector when labels were given.
        """
        text = self.texts[idx]

        # Tokenize text
        encoding = self.tokenizer(
            text,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        # Remove batch dimension added by tokenizer
        item = {
            key: val.squeeze(0) for key, val in encoding.items()
        }

        if self.labels is not None:
            row = self.labels[idx]
            # A row sliced from a sparse matrix cannot be turned into a
            # tensor directly; densify and flatten it to a 1-D vector first.
            if sparse.issparse(row):
                row = row.toarray().ravel()
            # BCEWithLogitsLoss needs float targets, so cast explicitly
            # instead of relying on the legacy FloatTensor constructor.
            item['labels'] = torch.as_tensor(np.asarray(row), dtype=torch.float32)

        return item

## 4. Split Dataset

We'll use the official chronological split provided by LYRL2004:

In [None]:
# LYRL2004 split: the first 23,149 documents form the training portion,
# everything after them is the test portion
train_size = 23149
X_train, X_test = X[:train_size], X[train_size:]
y_train, y_test = y[:train_size], y[train_size:]

# Hold out 10% of the training portion for validation (fixed seed)
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=0.1, random_state=42
)

# Report how many documents ended up in each split
print('Dataset splits:')
print(f'Training set: {X_train.shape[0]} samples')
print(f'Validation set: {X_val.shape[0]} samples')
print(f'Test set: {X_test.shape[0]} samples')

## 5. Define Baseline Model

For our baseline, we'll use LogisticRegression with the following optimizations:
1. L2 regularization for better generalization
2. 'saga' solver for efficient optimization
3. Class weights to handle imbalance
4. MultiOutputClassifier for parallel training on multiple labels

In [None]:
# Initialize the base classifier (one binary logistic regression per label)
base_clf = LogisticRegression(
    C=4.0,  # C is the inverse regularization strength: larger = weaker penalty
    solver='saga',  # Efficient solver for large-scale data
    penalty='l2',
    # Most labels are rare; 'balanced' re-weights the positive/negative class
    # of each label by inverse frequency, as the design notes above call for.
    class_weight='balanced',
    max_iter=100,
    random_state=42
)

# Wrap it in MultiOutputClassifier for multi-label classification.
# Parallelism is configured here only: the wrapper already fans the per-label
# fits out over all cores, so the inner estimator no longer requests its own
# n_jobs=-1 on top of that.
model = MultiOutputClassifier(base_clf, n_jobs=-1)

In [None]:
class BERTForMultiLabelClassification(nn.Module):
    """Pretrained encoder followed by dropout and a linear multi-label head.

    Args:
        num_labels: Width of the output layer (one logit per label).
        model_name: Identifier handed to ``AutoModel.from_pretrained``.
    """

    def __init__(self, num_labels, model_name='distilbert-base-uncased'):
        super().__init__()
        self.num_labels = num_labels
        self.bert = AutoModel.from_pretrained(model_name)
        self.dropout = nn.Dropout(0.1)
        hidden_size = self.bert.config.hidden_size
        self.classifier = nn.Linear(hidden_size, num_labels)

    def forward(self, input_ids, attention_mask):
        """Return raw (pre-sigmoid) logits, one row per input sequence."""
        encoder_out = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        # First element of the encoder output, sliced at sequence position 0
        # (the slot the original comment identifies as the CLS token)
        token_states = encoder_out[0]
        cls_vector = token_states[:, 0]
        return self.classifier(self.dropout(cls_vector))

## 6. Train Model

We'll train the model and monitor the progress:

In [None]:
print('Training the model...')
# The estimator is fitted on a dense label matrix, so densify the sparse targets first
y_train_dense = y_train.toarray()
model.fit(X_train, y_train_dense)

In [None]:
def train_epoch(model, dataloader, optimizer, criterion, device):
    """Run one optimisation pass over ``dataloader`` and return the mean batch loss.

    Args:
        model: Network called as ``model(input_ids=..., attention_mask=...)``.
        dataloader: Iterable of batches; each batch is a mapping with
            ``'input_ids'``, ``'attention_mask'`` and ``'labels'`` entries
            that support ``.to(device)``.
        optimizer: Optimiser whose gradients are reset and which is stepped
            once per batch.
        criterion: Callable ``criterion(outputs, labels)`` returning a loss
            that supports ``.backward()`` and ``.item()``.
        device: Device the batch tensors are moved to.

    Returns:
        Mean of the per-batch losses, or ``0.0`` if no batch was produced.
    """
    model.train()
    total_loss = 0.0
    num_batches = 0
    progress_bar = tqdm(dataloader, desc='Training')

    for batch in progress_bar:
        optimizer.zero_grad()

        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        loss = criterion(outputs, labels)

        loss.backward()
        optimizer.step()

        batch_loss = loss.item()
        total_loss += batch_loss
        num_batches += 1
        progress_bar.set_postfix({'loss': batch_loss})

    # Batches are counted as they are consumed rather than via len(dataloader):
    # that also works for loaders without __len__ and avoids a
    # ZeroDivisionError when the loader yields nothing.
    if num_batches == 0:
        return 0.0
    return total_loss / num_batches

# Initialize tokenizer and model
# NOTE: this rebinds `model`, replacing the scikit-learn baseline defined earlier.
print('Loading DistilBERT tokenizer and model...')
tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')
model = BERTForMultiLabelClassification(num_labels=y_train.shape[1]).to(device)

# Create datasets and dataloaders
batch_size = 16

# Only the vectorised features are available here, not the raw stories, so each
# document is rendered as a space-separated list of its non-zero feature ids.
# The previous code looked those ids up in `rcv1.target_names`, which holds
# label names (one per column of y) rather than feature names; feature ids
# beyond the number of labels raised IndexError.
train_dataset = RCV1Dataset(
    texts=[' '.join(str(i) for i in doc.nonzero()[1]) for doc in X_train],
    labels=y_train.toarray(),
    tokenizer=tokenizer
)

val_dataset = RCV1Dataset(
    texts=[' '.join(str(i) for i in doc.nonzero()[1]) for doc in X_val],
    labels=y_val.toarray(),
    tokenizer=tokenizer
)

# Shuffle only the training batches; validation order stays fixed
train_dataloader = DataLoader(
    train_dataset,
    batch_size=batch_size,
    shuffle=True
)

val_dataloader = DataLoader(
    val_dataset,
    batch_size=batch_size,
    shuffle=False
)

# Training settings
num_epochs = 3
optimizer = AdamW(model.parameters(), lr=2e-5)
# Sigmoid + binary cross-entropy applied independently to every label logit
criterion = BCEWithLogitsLoss()

# Training loop
print('Training BERT model...')
for epoch in range(num_epochs):
    print(f'\nEpoch {epoch + 1}/{num_epochs}')
    train_loss = train_epoch(model, train_dataloader, optimizer, criterion, device)
    print(f'Average training loss: {train_loss:.4f}')

## 7. Evaluate Model

We'll evaluate the model using metrics appropriate for multi-label classification:

In [None]:
def evaluate_predictions(y_true, y_pred, dataset_name):
    """Score multi-label predictions, print a short report and return the scores.

    Args:
        y_true: Ground-truth label indicator matrix (dense or SciPy sparse).
        y_pred: Predicted label indicator matrix.
        dataset_name: Name shown in the report header.

    Returns:
        Dict mapping each metric name to its score.
    """
    # The scorers below are given a dense ground-truth matrix
    y_true = y_true.toarray() if sparse.issparse(y_true) else y_true

    # (report name, scoring function, averaging mode) in report order
    scorers = (
        ('Micro F1', f1_score, 'micro'),
        ('Macro F1', f1_score, 'macro'),
        ('Weighted F1', f1_score, 'weighted'),
        ('Samples F1', f1_score, 'samples'),
        ('Micro Precision', precision_score, 'micro'),
        ('Micro Recall', recall_score, 'micro'),
    )
    metrics = {
        label: scorer(y_true, y_pred, average=mode)
        for label, scorer, mode in scorers
    }

    print(f'\nMetrics for {dataset_name}:')
    for metric_name, value in metrics.items():
        print(f'{metric_name}: {value:.4f}')

    return metrics

# This cell scores the scikit-learn baseline on the TF-IDF features.
# The DistilBERT training cell rebinds `model` to a torch module, which has no
# `.predict`; fail with an actionable message instead of an opaque
# AttributeError when the notebook is executed top to bottom.
if not hasattr(model, 'predict'):
    raise RuntimeError(
        "`model` is not the scikit-learn baseline (it was rebound to the "
        "DistilBERT network). Re-run the baseline definition and training "
        "cells before evaluating on the TF-IDF features."
    )

# Evaluate on validation set
val_pred = model.predict(X_val)
val_metrics = evaluate_predictions(y_val, val_pred, 'Validation Set')

# Evaluate on test set
test_pred = model.predict(X_test)
test_metrics = evaluate_predictions(y_test, test_pred, 'Test Set')

## 8. Visualize Results

Let's create a visualization of our metrics:

In [None]:
# Build a table with one row per metric and one column per dataset
metrics_df = (
    pd.DataFrame({'Validation': val_metrics, 'Test': test_metrics})
    .rename_axis('Metric')
    .reset_index()
)

# Reshape to long form: one row per (Metric, Dataset) pair
melted_df = metrics_df.melt(id_vars=['Metric'], var_name='Dataset', value_name='Score')

# Grouped bar chart comparing validation and test scores
plt.figure(figsize=(12, 6))
sns.barplot(data=melted_df, x='Metric', y='Score', hue='Dataset')
plt.xticks(rotation=45)
plt.title('Model Performance Metrics')
plt.tight_layout()
plt.show()

## Conclusion

This improved implementation uses DistilBERT, a lightweight and efficient transformer model, for multi-label text classification on the RCV1 dataset. The implementation includes:

1. Efficient batching and data loading with PyTorch
2. Modern transformer architecture (DistilBERT) fine-tuning
3. Proper handling of multi-label classification using BCE loss
4. GPU acceleration when available

Key advantages of this approach:
1. Better semantic understanding of text through pre-trained language model
2. Ability to handle out-of-vocabulary words
3. Context-aware representations
4. State-of-the-art performance








To further improve the results, you could:

1. Try different transformer models (BERT, RoBERTa, XLNet)
2. Implement gradient accumulation for larger batch sizes
3. Use learning rate scheduling
4. Experiment with different pooling strategies
5. Add regularization techniques
6. Use cross-validation for more robust evaluation


















































    # BERT evaluation cells, restored to execution order from a line-reversed paste
def evaluate_bert_predictions(model, dataloader, device):
    """Run ``model`` over ``dataloader`` and score its thresholded predictions.

    Args:
        model: Network called as ``model(input_ids=..., attention_mask=...)``
            that returns one logit per label.
        dataloader: Iterable of batches, each a mapping with ``'input_ids'``,
            ``'attention_mask'`` and ``'labels'`` tensors.
        device: Device the input tensors are moved to.

    Returns:
        Dict mapping each metric name to its score.
    """
    model.eval()
    all_predictions = []
    all_labels = []

    # Inference only: no gradients are needed
    with torch.no_grad():
        for batch in tqdm(dataloader, desc='Evaluating'):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].numpy()

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            # A label is predicted when its sigmoid probability exceeds 0.5
            predictions = torch.sigmoid(outputs).cpu().numpy() > 0.5

            all_predictions.extend(predictions)
            all_labels.extend(labels)

    all_predictions = np.array(all_predictions)
    all_labels = np.array(all_labels)

    metrics = {
        'Micro F1': f1_score(all_labels, all_predictions, average='micro'),
        'Macro F1': f1_score(all_labels, all_predictions, average='macro'),
        'Weighted F1': f1_score(all_labels, all_predictions, average='weighted'),
        'Samples F1': f1_score(all_labels, all_predictions, average='samples'),
        'Micro Precision': precision_score(all_labels, all_predictions, average='micro'),
        'Micro Recall': recall_score(all_labels, all_predictions, average='micro')
    }

    return metrics

# Evaluate on validation set
print('\nEvaluating BERT model...')
val_metrics = evaluate_bert_predictions(model, val_dataloader, device)
print('\nValidation Metrics:')
for metric_name, value in val_metrics.items():
    print(f'{metric_name}: {value:.4f}')

# Create test dataset and dataloader.
# Documents are rendered as space-separated non-zero feature ids; the pasted
# code indexed `rcv1.target_names` (label names) with feature-column ids,
# which raises IndexError for ids beyond the number of labels.
test_dataset = RCV1Dataset(
    texts=[' '.join(str(i) for i in doc.nonzero()[1]) for doc in X_test],
    labels=y_test.toarray(),
    tokenizer=tokenizer
)

test_dataloader = DataLoader(
    test_dataset,
    batch_size=batch_size,
    shuffle=False
)

# Evaluate on test set
test_metrics = evaluate_bert_predictions(model, test_dataloader, device)
print('\nTest Metrics:')
for metric_name, value in test_metrics.items():
    print(f'{metric_name}: {value:.4f}')

# Create a comparison plot
metrics_df = pd.DataFrame({
    'Validation': val_metrics,
    'Test': test_metrics
}).reset_index()
metrics_df.columns = ['Metric', 'Validation', 'Test']

# Melt the dataframe for easier plotting
melted_df = pd.melt(metrics_df, id_vars=['Metric'], var_name='Dataset', value_name='Score')

# Create the plot
plt.figure(figsize=(12, 6))
sns.barplot(data=melted_df, x='Metric', y='Score', hue='Dataset')
plt.xticks(rotation=45)
plt.title('BERT Model Performance Metrics')
plt.tight_layout()
plt.show()