In [13]:
import os
import numpy as np
import pandas as pd

from datasets import load_dataset, DatasetDict
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding,
    pipeline
)
import evaluate
import torch


In [14]:
# Load AG News dataset (train & test splits)
dataset = load_dataset("ag_news")

# Convert train split to pandas DataFrame
train_df = pd.DataFrame(dataset["train"])
test_df = pd.DataFrame(dataset["test"])

# Save to CSV files
train_df.to_csv("ag_news_train.csv", index=False)
test_df.to_csv("ag_news_test.csv", index=False)

print("Training data shape:", train_df.shape)
print("Testing data shape:", test_df.shape)

# Show first few rows
train_df.head()


Training data shape: (120000, 2)
Testing data shape: (7600, 2)


Unnamed: 0,text,label
0,Wall St. Bears Claw Back Into the Black (Reute...,2
1,Carlyle Looks Toward Commercial Aerospace (Reu...,2
2,Oil and Economy Cloud Stocks' Outlook (Reuters...,2
3,Iraq Halts Oil Exports from Main Southern Pipe...,2
4,"Oil prices soar to all-time record, posing new...",2


In [15]:
# Load BERT tokenizer
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# Tokenization function
def tokenize_function(example):
    return tokenizer(example["text"], truncation=True, padding="max_length", max_length=128)

# Apply tokenization on train & test datasets
tokenized_datasets = dataset.map(tokenize_function, batched=True)

# Keep only useful columns
tokenized_datasets = tokenized_datasets.remove_columns(["text"])
tokenized_datasets = tokenized_datasets.rename_column("label", "labels")
tokenized_datasets.set_format("torch")

tokenized_datasets


DatasetDict({
    train: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 120000
    })
    test: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 7600
    })
})

In [16]:
# Manual training approach - bypassing Trainer issues
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from torch.optim import AdamW
from tqdm.auto import tqdm
import os

# Check and set up GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Load BERT model for sequence classification (4 classes in AG News)
model = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-uncased", 
    num_labels=4,
    attention_probs_dropout_prob=0.1,
    hidden_dropout_prob=0.1
)

# Move model to GPU if available
model = model.to(device)

# Set up optimizer and scheduler
optimizer = AdamW(model.parameters(), lr=2e-5, weight_decay=0.01)

# Create output directory
os.makedirs("./bert-news-classifier", exist_ok=True)
os.makedirs("./logs", exist_ok=True)

print("Model and optimizer setup complete!")
print(f"Model device: {next(model.parameters()).device}")
print(f"Number of parameters: {sum(p.numel() for p in model.parameters()):,}")


Using device: cuda


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model and optimizer setup complete!
Model device: cuda:0
Number of parameters: 109,485,316


In [17]:
! pip install -U transformers




In [18]:

# Core imports
import torch
import torch.nn.functional as F
from torch.utils.data import DataLoader
from sklearn.metrics import accuracy_score, f1_score
from tqdm.auto import tqdm

# Mixed precision for GPU efficiency
from torch.cuda.amp import GradScaler, autocast

# Device setup
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device:", device)
if device.type == "cuda":
    print("GPU:", torch.cuda.get_device_name(0))


Device: cuda
GPU: NVIDIA GeForce RTX 4050 Laptop GPU


In [19]:
# Shuffle + prepare datasets
train_dataset = tokenized_datasets["train"].shuffle(seed=42)   # ~120k samples
eval_dataset = tokenized_datasets["test"]                      # ~7.6k samples

# Use smaller batch size to save GPU
batch_size = 4  

train_dataloader = DataLoader(
    train_dataset, batch_size=batch_size, shuffle=True,
    num_workers=2, pin_memory=True, persistent_workers=True
)
eval_dataloader = DataLoader(
    eval_dataset, batch_size=batch_size, shuffle=False,
    num_workers=2, pin_memory=True, persistent_workers=True
)

print(f"Training samples: {len(train_dataset)}")
print(f"Evaluation samples: {len(eval_dataset)}")
print(f"Training batches: {len(train_dataloader)}")
print(f"Evaluation batches: {len(eval_dataloader)}")


Training samples: 120000
Evaluation samples: 7600
Training batches: 30000
Evaluation batches: 1900


In [20]:
# Move model to device
model.to(device)

# Optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)

# Mixed precision scaler
scaler = GradScaler()


  scaler = GradScaler()


In [None]:
num_epochs = 2
gradient_accumulation_steps = 4  # simulate larger batch size

total_steps = 0
best_eval_loss = float("inf")

print("Starting training...")

for epoch in range(num_epochs):
    print(f"\nEpoch {epoch + 1}/{num_epochs}")
    
    # Training phase
    model.train()
    total_loss = 0
    
    progress_bar = tqdm(train_dataloader, desc=f"Training Epoch {epoch + 1}")
    
    for step, batch in enumerate(progress_bar):
        batch = {k: v.to(device) for k, v in batch.items()}
        
        with autocast():
            outputs = model(**batch)
            loss = outputs.loss / gradient_accumulation_steps  # scale loss
        
        scaler.scale(loss).backward()
        
        if (step + 1) % gradient_accumulation_steps == 0:
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad()
        
        total_loss += loss.item()
        total_steps += 1
        
        progress_bar.set_postfix({'loss': f'{loss.item():.4f}'})
    
    avg_train_loss = total_loss / len(train_dataloader)
    print(f"Average training loss: {avg_train_loss:.4f}")


Starting training...

Epoch 1/2


Training Epoch 1:   0%|          | 0/30000 [00:00<?, ?it/s]

  with autocast():


In [10]:
import pickle

# Create a dictionary with all important objects
model_artifacts = {
    'model_state_dict': model.state_dict(),
    'optimizer_state_dict': optimizer.state_dict(),
    'tokenizer': tokenizer,
    'dataset': dataset,
    'tokenized_datasets': tokenized_datasets,
    'device': str(device),
    'num_epochs': num_epochs,
    'batch_size': batch_size,
    'best_eval_loss': best_eval_loss,
    'model_config': model.config,
    'train_df': train_df,
    'test_df': test_df
}

# Save to pickle file
pickle_filename = 'bert_news_classifier.pkl'
with open(pickle_filename, 'wb') as f:
    pickle.dump(model_artifacts, f)

print(f"✓ Successfully saved model artifacts to '{pickle_filename}'")
print(f"File size: {os.path.getsize(pickle_filename) / (1024**2):.2f} MB")
print("\nSaved objects:")
for key in model_artifacts.keys():
    print(f"  - {key}")


✓ Successfully saved model artifacts to 'bert_news_classifier.pkl'
File size: 1283.89 MB

Saved objects:
  - model_state_dict
  - optimizer_state_dict
  - tokenizer
  - dataset
  - tokenized_datasets
  - device
  - num_epochs
  - batch_size
  - best_eval_loss
  - model_config
  - train_df
  - test_df


In [11]:
import pickle
import torch
from transformers import AutoModelForSequenceClassification

# Load the pickle file
pickle_path = r'C:\Users\Haram\Downloads\bert_news_classifier.pkl'

print("Loading pickle file...")
with open(pickle_path, 'rb') as f:
    loaded_artifacts = pickle.load(f)

print("✓ Pickle file loaded successfully!\n")

# Display what was loaded
print("Loaded objects:")
for key in loaded_artifacts.keys():
    print(f"  - {key}")

# Recreate the model
print("\n" + "="*60)
print("Recreating model from saved state...")
loaded_model = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-uncased", 
    num_labels=4
)
loaded_model.load_state_dict(loaded_artifacts['model_state_dict'])
loaded_model.eval()

# Move to appropriate device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
loaded_model.to(device)
print(f"✓ Model loaded and moved to {device}")

# Get tokenizer from pickle
loaded_tokenizer = loaded_artifacts['tokenizer']

# AG News label mapping
label_map = {
    0: "World",
    1: "Sports", 
    2: "Business",
    3: "Sci/Tech"
}

print("\n" + "="*60)
print("TESTING THE MODEL")
print("="*60)

# Test samples - one from each category
test_samples = [
    "Apple unveils new iPhone with advanced AI capabilities and improved camera system.",
    "Manchester United defeats Barcelona 3-1 in Champions League final match.",
    "Stock market reaches all-time high as tech companies report strong earnings.",
    "NASA discovers evidence of water on Mars, raising hopes for future colonization."
]

for i, text in enumerate(test_samples, 1):
    print(f"\n--- Test Sample {i} ---")
    print(f"Input: {text[:80]}...")
    
    # Tokenize input
    inputs = loaded_tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=128)
    inputs = {k: v.to(device) for k, v in inputs.items()}
    
    # Get prediction
    with torch.no_grad():
        outputs = loaded_model(**inputs)
        logits = outputs.logits
        probabilities = torch.nn.functional.softmax(logits, dim=-1)
        predicted_class = torch.argmax(logits, dim=-1).item()
        confidence = probabilities[0][predicted_class].item()
    
    print(f"Predicted Category: {label_map[predicted_class]}")
    print(f"Confidence: {confidence:.2%}")
    print(f"All probabilities: {dict(zip(label_map.values(), probabilities[0].cpu().numpy()))}")

print("\n" + "="*60)
print("✓ Testing completed successfully!")
print("="*60)


Loading pickle file...
✓ Pickle file loaded successfully!

Loaded objects:
  - model_state_dict
  - optimizer_state_dict
  - tokenizer
  - dataset
  - tokenized_datasets
  - device
  - num_epochs
  - batch_size
  - best_eval_loss
  - model_config
  - train_df
  - test_df

Recreating model from saved state...


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


✓ Model loaded and moved to cuda

TESTING THE MODEL

--- Test Sample 1 ---
Input: Apple unveils new iPhone with advanced AI capabilities and improved camera syste...
Predicted Category: Sci/Tech
Confidence: 98.83%
All probabilities: {'World': 0.0013335935, 'Sports': 0.00017650673, 'Business': 0.010206983, 'Sci/Tech': 0.988283}

--- Test Sample 2 ---
Input: Manchester United defeats Barcelona 3-1 in Champions League final match....
Predicted Category: Sports
Confidence: 51.10%
All probabilities: {'World': 0.48177612, 'Sports': 0.5110066, 'Business': 0.004407131, 'Sci/Tech': 0.0028101597}

--- Test Sample 3 ---
Input: Stock market reaches all-time high as tech companies report strong earnings....
Predicted Category: Sci/Tech
Confidence: 66.36%
All probabilities: {'World': 0.029234715, 'Sports': 0.00078330067, 'Business': 0.30638018, 'Sci/Tech': 0.66360176}

--- Test Sample 4 ---
Input: NASA discovers evidence of water on Mars, raising hopes for future colonization....
Predicted Category: