## 1. Setup and Imports

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, classification_report, confusion_matrix

import torch
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW
from transformers import (
    DistilBertTokenizer,
    DistilBertForSequenceClassification,
    get_linear_schedule_with_warmup
)
from tqdm.auto import tqdm

import warnings
warnings.filterwarnings('ignore')

# Seed NumPy and PyTorch with one shared value so stochastic steps repeat
# from run to run.
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)

# Pick the compute device: CUDA when PyTorch can see a GPU, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f"Using device: {device}")
if device.type == 'cuda':
    print(f"GPU: {torch.cuda.get_device_name(0)}")

  from .autonotebook import tqdm as notebook_tqdm

A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.2.0 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "c:\ProgramData\miniconda3\Lib\site-packages\ipykernel_launcher.py", line 18, in <module>
    app.launch_new_instance()
  File "C:\Users\madha\AppData\Roaming\Python\Python312\site-packages\traitlets\config\application.py", line 1075, in launch_instance
    app.start()
  File "c:\ProgramData\miniconda3\Lib\site-packages\ipykernel\kernelapp.py", line 739, in start


ImportError: 
A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.2.0 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.




A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.2.0 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "c:\ProgramData\miniconda3\Lib\site-packages\ipykernel_launcher.py", line 18, in <module>
    app.launch_new_instance()
  File "C:\Users\madha\AppData\Roaming\Python\Python312\site-packages\traitlets\config\application.py", line 1075, in launch_instance
    app.start()
  File "c:\ProgramData\miniconda3\Lib\site-packages\ipykernel\kernelapp.py", line 739, in start
    self.io_loop.start()
  File "C:\Users\madha\Ap

ImportError: 
A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.2.0 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.



ModuleNotFoundError: Could not import module 'DistilBertForSequenceClassification'. Are this object's requirements defined correctly?

## 2. Load and Explore Data

In [None]:
# Read the train and test splits from CSV files in the working directory
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

# Report (rows, columns) for each split
for split_name, frame in (("Training", train_df), ("Test", test_df)):
    print(f"{split_name} data shape:", frame.shape)

# Leave the head of the training frame as the cell's rich output
print("\nFirst few rows:")
train_df.head()

In [None]:
# Quick structural audit of the training frame: null counts per column,
# the column names, and the dtype of each column.
report_sections = (
    ("Missing values in training data:", train_df.isnull().sum()),
    ("\nColumn names:", train_df.columns.tolist()),
    ("\nData types:", train_df.dtypes),
)
for heading, content in report_sections:
    print(heading)
    print(content)

In [None]:
# Readable names for the dataset's 1-based class indices
class_names = {1: 'World', 2: 'Sports', 3: 'Business', 4: 'Sci/Tech'}

# Bar chart: number of training rows per class, ordered by class index
plt.figure(figsize=(10, 5))
class_counts = train_df['Class Index'].value_counts().sort_index()
class_counts.plot(kind='bar')
plt.title('Class Distribution in Training Data')
plt.xlabel('Class')
plt.ylabel('Count')
tick_labels = [class_names[class_idx] for class_idx in range(1, 5)]
plt.xticks(range(4), tick_labels, rotation=45)
plt.tight_layout()
plt.show()

# Same information as text: absolute count and share of the training set
print("\nClass distribution:")
n_rows = len(train_df)
for idx, name in class_names.items():
    count = (train_df['Class Index'] == idx).sum()
    pct = count / n_rows * 100
    print(f"{name}: {count} ({pct:.2f}%)")

In [None]:
# Character count of "Title Description" per row (missing fields count as empty)
combined_text = train_df['Title'].fillna('') + ' ' + train_df['Description'].fillna('')
train_df['text_length'] = combined_text.str.len()

plt.figure(figsize=(12, 4))

# Left panel: histogram of lengths
plt.subplot(1, 2, 1)
plt.hist(train_df['text_length'], bins=50, edgecolor='black')
plt.title('Text Length Distribution')
plt.xlabel('Character Count')
plt.ylabel('Frequency')

# Right panel: box plot of the same lengths
plt.subplot(1, 2, 2)
plt.boxplot(train_df['text_length'])
plt.title('Text Length Box Plot')
plt.ylabel('Character Count')

plt.tight_layout()
plt.show()

# Summary statistics for the length column
print(f"\nText length statistics:")
length_summary = train_df['text_length'].describe()
print(length_summary)

## 3. Data Preprocessing

In [None]:
# Combine Title and Description into a single text field
def prepare_data(df):
    """Build the model-ready frame with a ``text`` and a 0-based ``label`` column.

    Missing ``Title`` / ``Description`` values are treated as empty strings,
    the two fields are joined with a single space, and ``Class Index`` is
    shifted down by one. The input frame is left unmodified.

    Args:
        df: DataFrame with ``Title``, ``Description`` and ``Class Index`` columns.

    Returns:
        A new DataFrame containing exactly the columns ``['text', 'label']``.
    """
    titles = df['Title'].fillna('')
    descriptions = df['Description'].fillna('')

    # assign() returns a new frame, so the caller's data is never touched
    prepared = df.assign(
        text=titles + ' ' + descriptions,
        label=df['Class Index'] - 1,
    )
    return prepared[['text', 'label']]

# Apply the same preprocessing to both splits
train_data, test_data = prepare_data(train_df), prepare_data(test_df)

print("Preprocessed training data:")
print(train_data.head())

# Labels were shifted to be 0-based; show the resulting range
label_min = train_data['label'].min()
label_max = train_data['label'].max()
print(f"\nLabel range: {label_min} to {label_max}")

In [None]:
# Hold out 10% of the training rows for validation. The split is stratified
# on the label column and seeded with SEED.
all_texts = train_data['text'].tolist()
all_labels = train_data['label'].tolist()
train_texts, val_texts, train_labels, val_labels = train_test_split(
    all_texts,
    all_labels,
    test_size=0.1,
    random_state=SEED,
    stratify=train_data['label'],
)

split_sizes = (
    ("Training", len(train_texts)),
    ("Validation", len(val_texts)),
    ("Test", len(test_data)),
)
for split_name, n_samples in split_sizes:
    print(f"{split_name} samples: {n_samples}")

## 4. Create Dataset and DataLoader

In [None]:
# Pretrained checkpoint name used for the tokenizer
MODEL_NAME = 'distilbert-base-uncased'

# Tokenisation / batching parameters
MAX_LENGTH = 128  # Most news snippets are short
BATCH_SIZE = 32

# Load the tokenizer that matches the checkpoint
tokenizer = DistilBertTokenizer.from_pretrained(MODEL_NAME)

settings_summary = (
    ("Tokenizer loaded", MODEL_NAME),
    ("Max sequence length", MAX_LENGTH),
    ("Batch size", BATCH_SIZE),
)
for description, value in settings_summary:
    print(f"{description}: {value}")

In [None]:
class NewsDataset(Dataset):
    """Map-style dataset that tokenizes one text per lookup.

    Args:
        texts: indexable collection of texts (each item is passed through ``str``).
        labels: indexable collection of class ids, aligned with ``texts``.
        tokenizer: callable invoked with the text and the tokenization
            keyword arguments; its result is indexed by ``'input_ids'`` and
            ``'attention_mask'``.
        max_length: length every sequence is padded / truncated to.
    """

    def __init__(self, texts, labels, tokenizer, max_length):
        self.texts, self.labels = texts, labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Number of samples, taken from the length of ``texts``."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return a dict of ``input_ids``, ``attention_mask`` and ``label`` tensors."""
        raw_text = str(self.texts[idx])
        target = self.labels[idx]

        tokenizer_options = {
            'add_special_tokens': True,
            'max_length': self.max_length,
            'padding': 'max_length',
            'truncation': True,
            'return_attention_mask': True,
            'return_tensors': 'pt',
        }
        encoded = self.tokenizer(raw_text, **tokenizer_options)

        # The tokenizer output is flattened to 1-D so the DataLoader can stack samples
        sample = {
            field: encoded[field].flatten()
            for field in ('input_ids', 'attention_mask')
        }
        sample['label'] = torch.tensor(target, dtype=torch.long)
        return sample

# Wrap each split in a NewsDataset (tokenization happens lazily per item)
train_dataset = NewsDataset(train_texts, train_labels, tokenizer, MAX_LENGTH)
val_dataset = NewsDataset(val_texts, val_labels, tokenizer, MAX_LENGTH)
test_texts = test_data['text'].tolist()
test_labels = test_data['label'].tolist()
test_dataset = NewsDataset(test_texts, test_labels, tokenizer, MAX_LENGTH)

# Only the training loader shuffles; validation and test keep dataset order
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE)

loaders_by_split = (
    ("training", train_loader),
    ("validation", val_loader),
    ("test", test_loader),
)
for split_name, loader in loaders_by_split:
    print(f"Created {len(loader)} {split_name} batches")

In [None]:
# Pull one batch from the training loader as a sanity check on tensor shapes
sample_batch = next(iter(train_loader))

print("Sample batch:")
batch_fields = (
    ("Input IDs", 'input_ids'),
    ("Attention mask", 'attention_mask'),
    ("Labels", 'label'),
)
for title, key in batch_fields:
    print(f"{title} shape: {sample_batch[key].shape}")

# Decode the first sequence back to text to eyeball the tokenization
first_decoded = tokenizer.decode(sample_batch['input_ids'][0])
print(f"\nFirst input decoded: {first_decoded}")

## 5. Initialize Model

In [None]:
# Number of target categories for the classification head
NUM_CLASSES = 4  # World, Sports, Business, Sci/Tech

# Load the pretrained checkpoint with a NUM_CLASSES-way head and move it to the device
model = DistilBertForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=NUM_CLASSES,
)
model = model.to(device)

# Count all parameters and the subset that will receive gradients
n_params = 0
n_trainable = 0
for param in model.parameters():
    n_params += param.numel()
    if param.requires_grad:
        n_trainable += param.numel()

print(f"Model loaded: {MODEL_NAME}")
print(f"Number of classes: {NUM_CLASSES}")
print(f"\nModel parameters: {n_params:,}")
print(f"Trainable parameters: {n_trainable:,}")

## 6. Training Configuration

In [None]:
# Optimisation hyperparameters
EPOCHS = 3
LEARNING_RATE = 2e-5
WARMUP_STEPS = 500

# The schedule spans every batch of every epoch
total_steps = len(train_loader) * EPOCHS

# AdamW over all model parameters
optimizer = AdamW(model.parameters(), lr=LEARNING_RATE, eps=1e-8)

# Linear warmup for WARMUP_STEPS steps, then linear decay until total_steps
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=WARMUP_STEPS,
    num_training_steps=total_steps,
)

config_summary = (
    ("Epochs", EPOCHS),
    ("Learning rate", LEARNING_RATE),
    ("Warmup steps", WARMUP_STEPS),
    ("Total training steps", total_steps),
)
for setting, value in config_summary:
    print(f"{setting}: {value}")

## 7. Training and Evaluation Functions

In [None]:
def train_epoch(model, data_loader, optimizer, scheduler, device):
    """Run one optimisation pass over ``data_loader`` and report loss/accuracy.

    Each batch is expected to be a dict with ``input_ids``, ``attention_mask``
    and ``label`` tensors. The model is called with the labels so that it
    returns its own loss; gradients are clipped to a max norm of 1.0 and the
    optimizer and scheduler are each stepped once per batch.

    Args:
        model: model whose call returns an object with ``loss`` and ``logits``.
        data_loader: iterable of batch dicts.
        optimizer: optimizer updated once per batch.
        scheduler: learning-rate scheduler stepped once per batch.
        device: device the batch tensors are moved to.

    Returns:
        dict with:
            ``loss``: mean of the per-batch losses (NaN if there were no batches).
            ``accuracy``: 0-dim float64 tensor, correct / total samples
                (0 when the loader yielded no samples).
    """
    model.train()
    losses = []
    # Start from a 0-dim tensor rather than the int 0 so the accumulator is a
    # tensor even when the loader is empty (``.double()`` below needs one).
    correct_predictions = torch.zeros((), dtype=torch.long, device=device)
    total_predictions = 0

    progress_bar = tqdm(data_loader, desc='Training')

    for batch in progress_bar:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

        # Forward pass (passing labels makes the model compute the loss)
        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels
        )

        loss = outputs.loss
        logits = outputs.logits

        # Backward pass: clear old grads, backprop, clip, then update
        optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        scheduler.step()

        # Running accuracy over the batches seen so far
        preds = torch.argmax(logits, dim=1)
        correct_predictions += torch.sum(preds == labels)
        total_predictions += labels.size(0)
        losses.append(loss.item())

        # Show the current batch loss and the running accuracy
        progress_bar.set_postfix({
            'loss': f'{loss.item():.4f}',
            'acc': f'{correct_predictions/total_predictions:.4f}'
        })

    return {
        'loss': np.mean(losses) if losses else float('nan'),
        # max(..., 1) avoids a division by zero for an empty loader
        'accuracy': correct_predictions.double() / max(total_predictions, 1)
    }

def eval_model(model, data_loader, device):
    """Evaluate ``model`` on ``data_loader`` without updating any weights.

    Each batch is expected to be a dict with ``input_ids``, ``attention_mask``
    and ``label`` tensors. The model is put in eval mode and run under
    ``torch.no_grad()``.

    Args:
        model: model whose call returns an object with ``loss`` and ``logits``.
        data_loader: iterable of batch dicts.
        device: device the batch tensors are moved to.

    Returns:
        dict with:
            ``loss``: mean of the per-batch losses (NaN if there were no batches).
            ``accuracy``: 0-dim float64 tensor, correct / total samples
                (0 when the loader yielded no samples).
            ``predictions``: list of predicted class ids, in loader order.
            ``labels``: list of true class ids, in loader order.
    """
    model.eval()
    losses = []
    # Start from a 0-dim tensor rather than the int 0 so the accumulator is a
    # tensor even when the loader is empty (``.double()`` below needs one).
    correct_predictions = torch.zeros((), dtype=torch.long, device=device)
    total_predictions = 0
    all_preds = []
    all_labels = []

    with torch.no_grad():
        for batch in tqdm(data_loader, desc='Evaluating'):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=labels
            )

            loss = outputs.loss
            logits = outputs.logits

            preds = torch.argmax(logits, dim=1)
            correct_predictions += torch.sum(preds == labels)
            total_predictions += labels.size(0)
            losses.append(loss.item())

            # Move to CPU so the collected values can be used by NumPy/sklearn
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    return {
        'loss': np.mean(losses) if losses else float('nan'),
        # max(..., 1) avoids a division by zero for an empty loader
        'accuracy': correct_predictions.double() / max(total_predictions, 1),
        'predictions': all_preds,
        'labels': all_labels
    }

# Visible confirmation that this cell ran and its helper functions now exist
print("Training and evaluation functions defined.")

## 8. Train the Model

In [None]:
# Per-epoch metrics, collected as plain Python numbers for plotting later
history = {
    'train_loss': [],
    'train_acc': [],
    'val_loss': [],
    'val_acc': []
}

# Best validation accuracy seen so far, kept as a plain float rather than a
# device tensor so comparisons and formatting do not depend on the device
best_accuracy = 0

for epoch in range(EPOCHS):
    print(f"\n{'='*50}")
    print(f"Epoch {epoch + 1}/{EPOCHS}")
    print(f"{'='*50}")

    # Train
    train_metrics = train_epoch(model, train_loader, optimizer, scheduler, device)
    print(f"\nTraining - Loss: {train_metrics['loss']:.4f}, Accuracy: {train_metrics['accuracy']:.4f}")

    # Validate
    val_metrics = eval_model(model, val_loader, device)
    print(f"Validation - Loss: {val_metrics['loss']:.4f}, Accuracy: {val_metrics['accuracy']:.4f}")

    # Save history (accuracies arrive as 0-dim tensors; .item() makes them floats)
    val_accuracy = val_metrics['accuracy'].item()
    history['train_loss'].append(train_metrics['loss'])
    history['train_acc'].append(train_metrics['accuracy'].item())
    history['val_loss'].append(val_metrics['loss'])
    history['val_acc'].append(val_accuracy)

    # Checkpoint the weights whenever validation accuracy improves
    if val_accuracy > best_accuracy:
        best_accuracy = val_accuracy
        torch.save(model.state_dict(), 'best_model.pt')
        print(f"✓ Saved best model with validation accuracy: {best_accuracy:.4f}")

print("\n" + "="*50)
print("Training Complete!")
print(f"Best Validation Accuracy: {best_accuracy:.4f}")
print("="*50)

## 9. Visualize Training History

In [None]:
# Two side-by-side panels: loss on the left, accuracy on the right.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 5))

# Each entry: (axes, history key for training, history key for validation, metric name)
panels = (
    (ax1, 'train_loss', 'val_loss', 'Loss'),
    (ax2, 'train_acc', 'val_acc', 'Accuracy'),
)
for ax, train_key, val_key, metric in panels:
    ax.plot(history[train_key], label=f'Training {metric}', marker='o')
    ax.plot(history[val_key], label=f'Validation {metric}', marker='s')
    ax.set_xlabel('Epoch')
    ax.set_ylabel(metric)
    ax.set_title(f'Training and Validation {metric}')
    ax.legend()
    ax.grid(True, alpha=0.3)

plt.tight_layout()
# Write the figure to disk before showing it
plt.savefig('training_history.png', dpi=300, bbox_inches='tight')
plt.show()

print("Training history saved as 'training_history.png'")

## 10. Evaluate on Test Set

In [None]:
# Restore the best checkpoint written during training. map_location remaps
# the saved tensors onto the current device, so a checkpoint saved on a GPU
# still loads in a CPU-only kernel.
best_state = torch.load('best_model.pt', map_location=device)
model.load_state_dict(best_state)
print("Loaded best model for testing")

# Evaluate on test set
test_metrics = eval_model(model, test_loader, device)

print(f"\n{'='*50}")
print("TEST SET RESULTS")
print(f"{'='*50}")
print(f"Test Loss: {test_metrics['loss']:.4f}")
print(f"Test Accuracy: {test_metrics['accuracy']:.4f}")
print(f"{'='*50}")

In [None]:
# Detailed classification report
from sklearn.metrics import classification_report

# Category names ordered by 0-based label id
class_names_list = ['World', 'Sports', 'Business', 'Sci/Tech']

print("\nDetailed Classification Report:")
print("="*70)

# Per-class precision / recall / F1 on the test predictions, 4 decimal places
report_text = classification_report(
    test_metrics['labels'],
    test_metrics['predictions'],
    target_names=class_names_list,
    digits=4,
)
print(report_text)

In [None]:
# Confusion matrix of true labels against predictions on the test set
cm = confusion_matrix(test_metrics['labels'], test_metrics['predictions'])

plt.figure(figsize=(10, 8))

# Annotated heatmap with integer counts and category names on both axes
heatmap_options = dict(
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_names_list,
    yticklabels=class_names_list,
)
sns.heatmap(cm, **heatmap_options)

plt.title('Confusion Matrix - Test Set', fontsize=14, pad=20)
plt.ylabel('True Label', fontsize=12)
plt.xlabel('Predicted Label', fontsize=12)
plt.tight_layout()
# Write the figure to disk before showing it
plt.savefig('confusion_matrix.png', dpi=300, bbox_inches='tight')
plt.show()

print("Confusion matrix saved as 'confusion_matrix.png'")

In [None]:
# Per-class accuracy: of the test samples whose true label is class i, the
# fraction that were predicted as class i.
labels_arr = np.asarray(test_metrics['labels'])
preds_arr = np.asarray(test_metrics['predictions'])

print("\nPer-Class Accuracy:")
print("="*40)
for i, class_name in enumerate(class_names_list):
    class_mask = labels_arr == i
    n_total = int(class_mask.sum())
    # Count matches directly: reconstructing the count as int(accuracy * n)
    # truncates a float product and can come out one too low.
    n_correct = int((preds_arr[class_mask] == i).sum())
    # A class with no test samples has no defined accuracy; report NaN
    accuracy = n_correct / n_total if n_total else float('nan')
    print(f"{class_name:12s}: {accuracy:.4f} ({n_correct}/{n_total} correct)")

## 11. Test Predictions on Sample Data

In [None]:
def predict_text(text, model, tokenizer, device, max_length=128, class_names=None):
    """Predict the class of a single text.

    Args:
        text: the text to classify.
        model: model whose call returns an object with a ``logits`` attribute
            holding one row of scores per input text.
        tokenizer: callable that turns ``text`` into ``input_ids`` and
            ``attention_mask`` tensors.
        device: device the encoded tensors are moved to.
        max_length: length the sequence is padded / truncated to.
        class_names: sequence of names indexed by class id. Defaults to the
            notebook-level ``class_names_list`` when omitted.

    Returns:
        dict with:
            ``class``: name of the highest-probability class.
            ``confidence``: softmax probability of that class.
            ``probabilities``: mapping of class name -> softmax probability.
    """
    if class_names is None:
        # Backward-compatible default: the global list defined in the notebook
        class_names = class_names_list

    model.eval()

    encoding = tokenizer(
        text,
        add_special_tokens=True,
        max_length=max_length,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt'
    )

    input_ids = encoding['input_ids'].to(device)
    attention_mask = encoding['attention_mask'].to(device)

    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        # Only one text was encoded, so take the first (only) row
        probabilities = torch.softmax(outputs.logits, dim=1)[0]

    predicted_idx = int(torch.argmax(probabilities))
    class_probs = probabilities.tolist()

    return {
        'class': class_names[predicted_idx],
        'confidence': class_probs[predicted_idx],
        # Pair names with probabilities instead of assuming exactly 4 classes
        'probabilities': dict(zip(class_names, class_probs))
    }

# Hand-written headlines used as a quick smoke test of the classifier
sample_texts = [
    "Apple announces new iPhone with revolutionary features and improved camera",
    "Scientists discover new planet in distant solar system using advanced telescope",
    "Stock market reaches all-time high as tech companies report strong earnings",
    "Tennis champion wins Grand Slam tournament in thrilling five-set match"
]

print("\nSample Predictions:")
print("="*80)
for i, text in enumerate(sample_texts, 1):
    result = predict_text(text, model, tokenizer, device, MAX_LENGTH)
    # One "name: prob" entry per class, joined onto a single line
    prob_summary = ', '.join(
        f'{k}: {v:.3f}' for k, v in result['probabilities'].items()
    )
    print(f"\n{i}. Text: {text}")
    print(f"   Predicted: {result['class']} (confidence: {result['confidence']:.4f})")
    print(f"   All probabilities: {prob_summary}")

## 12. Save Model and Tokenizer

In [None]:
import os

# Directory that receives both the model and the tokenizer files
MODEL_PATH = './news_classifier_model'

# Model first, then tokenizer, into the same directory
for artifact in (model, tokenizer):
    artifact.save_pretrained(MODEL_PATH)

print(f"Model and tokenizer saved to: {MODEL_PATH}")

# List what ended up in the output directory
print("\nFiles saved:")
for file in os.listdir(MODEL_PATH):
    print(f"  - {file}")

## 13. Load Model for Inference (Example)

In [None]:
# Example: How to load the saved model later
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification

# Rebuild the tokenizer and model from the saved directory, then move the
# model to the active device.
loaded_tokenizer = DistilBertTokenizer.from_pretrained(MODEL_PATH)
loaded_model = DistilBertForSequenceClassification.from_pretrained(MODEL_PATH)
loaded_model = loaded_model.to(device)

print("Model loaded successfully!")

# Run one prediction through the reloaded model and tokenizer
test_text = "Breaking news: Major breakthrough in renewable energy technology announced"
result = predict_text(
    test_text,
    loaded_model,
    loaded_tokenizer,
    device,
    MAX_LENGTH,
)

print(f"\nTest prediction with loaded model:")
print(f"Text: {test_text}")
print(f"Predicted: {result['class']} (confidence: {result['confidence']:.4f})")

## 14. Integration Code for Your API

In [None]:
# Example code to integrate into your classifier.py
#
# The snippet is stored as a plain string and only printed below; nothing in
# it is executed by this notebook.
# NOTE(review): the snippet uses `Dict` and `random` without importing them,
# so the target module must already provide both - confirm before pasting.
# NOTE(review): the snippet's bare `except:` hides the reason a model load
# failed, and `Dict[str, any]` annotates with the builtin `any`, presumably
# meant as `typing.Any` - verify when integrating.
integration_code = '''
# Add to services/classifier.py

import torch
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification

# Load model once at module level
MODEL_PATH = "./news_classifier_model"
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

try:
    classifier_model = DistilBertForSequenceClassification.from_pretrained(MODEL_PATH)
    classifier_tokenizer = DistilBertTokenizer.from_pretrained(MODEL_PATH)
    classifier_model = classifier_model.to(device)
    classifier_model.eval()
    MODEL_LOADED = True
except:
    MODEL_LOADED = False
    print("Warning: News classifier model not found. Using fallback.")

def classify_misinformation(text: str) -> Dict[str, any]:
    """
    Classify text for misinformation detection using DistilBERT
    """
    if not MODEL_LOADED:
        # Fallback to random (as before)
        return {
            "is_misinformation": random.choice([True, False]),
            "confidence": round(random.uniform(0.5, 0.95), 2)
        }
    
    # Tokenize
    encoding = classifier_tokenizer(
        text,
        add_special_tokens=True,
        max_length=128,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt'
    )
    
    input_ids = encoding['input_ids'].to(device)
    attention_mask = encoding['attention_mask'].to(device)
    
    # Predict
    with torch.no_grad():
        outputs = classifier_model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        probabilities = torch.softmax(logits, dim=1)
        prediction = torch.argmax(probabilities, dim=1)
        confidence = probabilities[0][prediction].item()
    
    class_names = ['World', 'Sports', 'Business', 'Sci/Tech']
    predicted_class = class_names[prediction.item()]
    
    # For now, we're just doing news classification
    # You can extend this to detect misinformation based on the class
    return {
        "is_misinformation": False,  # TODO: Add misinformation detection logic
        "confidence": confidence,
        "news_category": predicted_class
    }
'''

# Print the snippet under a header so it can be copied from the cell output
print("Integration code for classifier.py:")
print("="*80)
print(integration_code)

## Summary

### Model Performance
- **Architecture**: DistilBERT (66M parameters)
- **Task**: 4-class news classification (World, Sports, Business, Sci/Tech)
- **Training Data**: 108k samples (90% of AG News train set)
- **Validation Data**: 12k samples (10% of AG News train set)
- **Test Data**: 7.6k samples

### Next Steps
1. ✅ Fine-tuned DistilBERT for news classification
2. 🔄 **TODO**: Create misinformation detection dataset
3. 🔄 **TODO**: Fine-tune model specifically for fake news detection
4. 🔄 **TODO**: Integrate model into FastAPI backend
5. 🔄 **TODO**: Add model caching and optimization for production

### Files Created
- `best_model.pt` - Best model checkpoint
- `news_classifier_model/` - Complete model for deployment
- `training_history.png` - Training visualization
- `confusion_matrix.png` - Model performance visualization