In [None]:
import pandas as pd
import numpy as np
import torch
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
from datasets import Dataset
import re
from tqdm import tqdm

# Pin the NumPy and PyTorch random number generators to a fixed seed so
# repeated runs draw the same random numbers.
for _seed_rng in (np.random.seed, torch.manual_seed):
    _seed_rng(42)

# Report whether PyTorch can see a CUDA device.
_device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f"Using device: {_device_name}")

## 1. Load and Explore Datasets

In [None]:
def _preview_frame(title, frame, trailing_blank=True):
    """Print a banner, the shape, the column names and the first three rows."""
    print(f"=== {title} ===")
    print(f"Shape: {frame.shape}")
    print(f"Columns: {frame.columns.tolist()}")
    print(frame.head(3))
    if trailing_blank:
        print()


# Load AG News dataset
df_ag_news = pd.read_csv('train.csv')
_preview_frame("AG News Dataset", df_ag_news)

# Load ABC News dataset
df_abc_news = pd.read_csv('abcnews-date-text.csv')
_preview_frame("ABC News Dataset", df_abc_news)

# Load Twitter dataset (negative data)
# The file has no header row, so the column names are supplied here.
# Columns: sentiment, id, date, query, user, text
tweet_columns = ['sentiment', 'id', 'date', 'query', 'user', 'text']
df_tweets = pd.read_csv(
    'negative_data.csv',
    encoding='latin-1',
    header=None,
    names=tweet_columns,
)
_preview_frame("Twitter Dataset", df_tweets, trailing_blank=False)

## 2. Data Preprocessing

### Strategy:
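- **AG News**: Combine Title + Description for richer context, then clean the text
- **ABC News**: Use `headline_text` (cleaned, then down-sampled to 120k headlines)
- **Tweets**: Keep only the `text` column (cleaned, then down-sampled to 240k tweets)
- **Labels**: 0 = News, 1 = Not News (tweets)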

In [None]:
# Pre-compiled patterns used by clean_text.
# A URL must start at a word boundary with an explicit scheme ("http://",
# "https://") or a "www." prefix, so ordinary words such as "awwww" survive.
_URL_PATTERN = re.compile(r'\b(?:https?://|www\.)\S+', flags=re.IGNORECASE)
# A mention is an "@" that does not directly follow a word character, which
# leaves e-mail addresses such as "bob@example.com" untouched.
_MENTION_PATTERN = re.compile(r'(?<!\w)@\w+')
_HASHTAG_PATTERN = re.compile(r'#(\w+)')
_WHITESPACE_PATTERN = re.compile(r'\s+')


def clean_text(text):
    """Clean and normalize text.

    Steps, in order: drop URLs, drop @mentions, strip the '#' from hashtags
    (keeping the word), collapse runs of whitespace and trim the ends.

    Args:
        text: Any scalar value. Missing values (None / NaN, as judged by
            ``pd.isna``) yield an empty string; everything else is converted
            with ``str()`` first.

    Returns:
        The cleaned string (possibly empty).
    """
    if pd.isna(text):
        return ""

    text = str(text)

    # Remove URLs (only real URLs, not every token that contains "www"/"http")
    text = _URL_PATTERN.sub('', text)

    # Remove mentions; drop the '#' of hashtags but keep the word itself
    text = _MENTION_PATTERN.sub('', text)
    text = _HASHTAG_PATTERN.sub(r'\1', text)

    # Remove extra whitespace
    return _WHITESPACE_PATTERN.sub(' ', text).strip()

# Smoke-test clean_text on a tweet-like string that contains a mention,
# a URL, hashtags and irregular spacing.
test_text = "@user Check this out! https://example.com #breaking #news   extra  spaces"
print("Testing clean_text function:")
print(f"Original: {test_text}")
cleaned_preview = clean_text(test_text)
print(f"Cleaned: {cleaned_preview}")

In [None]:
# Process AG News: join each article's Title and Description (missing values
# become empty strings), clean the result and label every row 0 (News).
print("Processing AG News...")
ag_title = df_ag_news['Title'].fillna('')
ag_description = df_ag_news['Description'].fillna('')
df_ag_news = df_ag_news.assign(
    text=(ag_title + ' ' + ag_description).apply(clean_text),
    label=0,  # News
)[['text', 'label']]

# Keep only rows whose cleaned text has at least 10 characters
# (shorter ones are likely corrupted)
has_enough_text = df_ag_news['text'].str.len() >= 10
df_ag_news = df_ag_news[has_enough_text]

print(f"AG News processed: {len(df_ag_news)} samples")
print(df_ag_news.head(2))

In [None]:
# Process ABC News: clean each headline and label it 0 (News)
print("\nProcessing ABC News...")
df_abc_news['text'] = df_abc_news['headline_text'].apply(clean_text)
df_abc_news['label'] = 0  # News
df_abc_news = df_abc_news[['text', 'label']]

# Filter out very short headlines
df_abc_news = df_abc_news[df_abc_news['text'].str.len() >= 10]

# Sample to balance with AG News (we have 1.2M ABC news, let's take 120k).
# DataFrame.sample raises ValueError when n exceeds the number of rows
# (sampling without replacement), so cap the request at what is available
# instead of crashing on a smaller input file.
abc_target_size = 120000
abc_sample_size = min(abc_target_size, len(df_abc_news))
if abc_sample_size < abc_target_size:
    print(f"Warning: only {len(df_abc_news)} ABC headlines available; "
          f"using all of them instead of {abc_target_size}")
df_abc_news_sampled = df_abc_news.sample(n=abc_sample_size, random_state=42)

print(f"ABC News processed: {len(df_abc_news_sampled)} samples (sampled from {len(df_abc_news)})")
print(df_abc_news_sampled.head(2))

In [None]:
# Process Tweets: clean the text column and label every row 1 (Not News)
print("\nProcessing Tweets...")
df_tweets['text'] = df_tweets['text'].apply(clean_text)
df_tweets['label'] = 1  # Not News
df_tweets = df_tweets[['text', 'label']]

# Filter out very short tweets
df_tweets = df_tweets[df_tweets['text'].str.len() >= 10]

# Sample to balance with news data (take ~240k tweets to match total news).
# DataFrame.sample raises ValueError when n exceeds the number of rows
# (sampling without replacement), so cap the request at what is available
# instead of crashing on a smaller input file.
tweet_target_size = 240000
tweet_sample_size = min(tweet_target_size, len(df_tweets))
if tweet_sample_size < tweet_target_size:
    print(f"Warning: only {len(df_tweets)} tweets available; "
          f"using all of them instead of {tweet_target_size}")
df_tweets_sampled = df_tweets.sample(n=tweet_sample_size, random_state=42)

print(f"Tweets processed: {len(df_tweets_sampled)} samples (sampled from {len(df_tweets)})")
print(df_tweets_sampled.head(2))

## 3. Combine All Datasets

In [None]:
# Stack the two news sources and the tweet sample into a single frame, then
# shuffle the rows with a fixed seed and renumber the index.
print("\nCombining datasets...")
frames_to_merge = [df_ag_news, df_abc_news_sampled, df_tweets_sampled]
df_combined = (
    pd.concat(frames_to_merge, ignore_index=True)
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

print(f"\nCombined dataset shape: {df_combined.shape}")
print("\nClass distribution:")
print(df_combined['label'].value_counts())
print("\nPercentages:")
print(df_combined['label'].value_counts(normalize=True) * 100)

# Show the first three rows of each class
for banner, class_id in (
    ("\n=== Sample News (label=0) ===", 0),
    ("\n=== Sample Not News/Tweets (label=1) ===", 1),
):
    print(banner)
    print(df_combined[df_combined['label'] == class_id].head(3))

## 4. Create Train/Validation/Test Splits

In [None]:
# Split: 80% train, 10% validation, 10% test (both splits stratified by label)
all_texts = df_combined['text'].values
all_labels = df_combined['label'].values

# First hold out 20% of the rows ...
train_texts, temp_texts, train_labels, temp_labels = train_test_split(
    all_texts,
    all_labels,
    test_size=0.2,
    random_state=42,
    stratify=all_labels,
)

# ... then cut that hold-out pool in half: validation and test.
val_texts, test_texts, val_labels, test_labels = train_test_split(
    temp_texts,
    temp_labels,
    test_size=0.5,
    random_state=42,
    stratify=temp_labels,
)

for split_name, split_texts in (
    ("Train", train_texts),
    ("Validation", val_texts),
    ("Test", test_texts),
):
    print(f"{split_name} set: {len(split_texts)} samples")

# Per-class row counts of each split
print(f"\nTrain class distribution: {np.bincount(train_labels)}")
print(f"Val class distribution: {np.bincount(val_labels)}")
print(f"Test class distribution: {np.bincount(test_labels)}")

## 5. Tokenization with DistilBERT

In [None]:
# Fetch the tokenizer for the 'distilbert-base-uncased' checkpoint
print("Loading DistilBERT tokenizer...")
tokenizer = DistilBertTokenizer.from_pretrained(
    'distilbert-base-uncased',
)

# Tokenization function
def tokenize_function(texts):
    """Tokenize `texts` with the notebook-level `tokenizer`.

    Each sequence is truncated or padded to exactly 128 tokens and the
    encoding is returned as PyTorch tensors.
    """
    encode_options = dict(
        padding='max_length',
        truncation=True,
        max_length=128,  # News headlines are typically short
        return_tensors='pt',
    )
    return tokenizer(texts, **encode_options)

print("Tokenizing datasets...")
print("This may take a few minutes...")


def _build_dataset(texts, labels):
    """Wrap parallel arrays of texts and labels in a `Dataset`."""
    return Dataset.from_dict({
        'text': texts.tolist(),
        'label': labels.tolist(),
    })


# Create datasets (one per split)
train_dataset = _build_dataset(train_texts, train_labels)
val_dataset = _build_dataset(val_texts, val_labels)
test_dataset = _build_dataset(test_texts, test_labels)

# Tokenize
def tokenize_batch(examples):
    """Encode the 'text' entries of a batch with the notebook-level tokenizer.

    Sequences are truncated or padded to exactly 128 tokens.
    """
    batch_texts = examples['text']
    return tokenizer(
        batch_texts,
        max_length=128,
        truncation=True,
        padding='max_length',
    )

# Run tokenize_batch over every split, in batches
train_dataset, val_dataset, test_dataset = [
    split.map(tokenize_batch, batched=True)
    for split in (train_dataset, val_dataset, test_dataset)
]

# Set format for PyTorch: only these three columns are returned, as tensors
for tokenized_split in (train_dataset, val_dataset, test_dataset):
    tokenized_split.set_format('torch', columns=['input_ids', 'attention_mask', 'label'])

print("Tokenization complete!")

## 6. Initialize DistilBERT Model

In [None]:
# Load pre-trained DistilBERT with a two-label classification head
print("Loading DistilBERT model...")
model = DistilBertForSequenceClassification.from_pretrained(
    'distilbert-base-uncased',
    num_labels=2,  # Binary classification: News vs Not News
)

# Use the GPU when PyTorch reports one, otherwise stay on the CPU
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
model.to(device)

print(f"Model loaded and moved to {device}")
parameter_count = sum(weights.numel() for weights in model.parameters())
print(f"Model parameters: {parameter_count:,}")

## 7. Training Configuration

In [None]:
# Define metrics
def compute_metrics(eval_pred):
    """Compute accuracy, precision, recall and F1 for one evaluation pass.

    `eval_pred` unpacks into (predictions, labels); the predicted class is the
    argmax of `predictions` along axis 1. Precision, recall and F1 are computed
    with `average='binary'`.

    Returns:
        dict with the keys 'accuracy', 'precision', 'recall' and 'f1'.
    """
    scores, true_labels = eval_pred
    predicted_labels = np.argmax(scores, axis=1)

    prf = precision_recall_fscore_support(
        true_labels, predicted_labels, average='binary'
    )

    return {
        'accuracy': accuracy_score(true_labels, predicted_labels),
        'precision': prf[0],
        'recall': prf[1],
        'f1': prf[2],
    }

# Training arguments, collected in one mapping and handed to TrainingArguments
training_config = dict(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=64,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=500,
    # Evaluate and checkpoint on the same 1000-step cadence
    eval_strategy='steps',
    eval_steps=1000,
    save_strategy='steps',
    save_steps=1000,
    # Reload the checkpoint with the highest F1 once training ends
    load_best_model_at_end=True,
    metric_for_best_model='f1',
    greater_is_better=True,
    save_total_limit=2,
    fp16=torch.cuda.is_available(),  # Use mixed precision if GPU available
)
training_args = TrainingArguments(**training_config)

print("Training configuration:")
for caption, setting in (
    ("Epochs", training_args.num_train_epochs),
    ("Train batch size", training_args.per_device_train_batch_size),
    ("Eval batch size", training_args.per_device_eval_batch_size),
    ("Mixed precision (fp16)", training_args.fp16),
):
    print(f"  {caption}: {setting}")

## 8. Train the Model

In [None]:
# Initialize Trainer with the model, the training/validation splits and the
# metric callback
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)

banner_rule = "=" * 80

# Start training
print("\n" + banner_rule)
print("STARTING TRAINING")
print(banner_rule + "\n")

train_result = trainer.train()

# Summarise the run using the metrics reported by trainer.train()
run_metrics = train_result.metrics
print("\n" + banner_rule)
print("TRAINING COMPLETE")
print(banner_rule)
print(f"Training time: {run_metrics['train_runtime']:.2f} seconds")
print(f"Training samples/second: {run_metrics['train_samples_per_second']:.2f}")

## 9. Evaluate on Test Set

In [None]:
# Evaluate on test set with a single inference pass.
# trainer.predict returns the raw logits, the label ids and the metrics, so
# there is no need to run trainer.evaluate and then predict again over the
# same data. metric_key_prefix='eval' keeps the familiar 'eval_*' metric names.
print("\nEvaluating on test set...")
predictions = trainer.predict(test_dataset, metric_key_prefix='eval')
test_results = predictions.metrics

print("\n" + "="*80)
print("TEST SET RESULTS")
print("="*80)
for key, value in test_results.items():
    # Metrics are normally numeric; print anything else unformatted rather
    # than failing on the ':.4f' format spec.
    if isinstance(value, (int, float, np.integer, np.floating)):
        print(f"{key}: {value:.4f}")
    else:
        print(f"{key}: {value}")

# Predicted class = argmax over the logits
pred_labels = np.argmax(predictions.predictions, axis=1)

# Confusion matrix. The true labels come from the same prediction pass so
# they are guaranteed to line up with pred_labels; labels=[0, 1] pins the
# row/column order to the tick labels below (0 = News, 1 = Not News).
cm = confusion_matrix(predictions.label_ids, pred_labels, labels=[0, 1])

fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=['News', 'Not News'],
            yticklabels=['News', 'Not News'],
            ax=ax)
ax.set_title('Confusion Matrix - News vs Not News Classifier')
ax.set_ylabel('True Label')
ax.set_xlabel('Predicted Label')
fig.tight_layout()
fig.savefig('confusion_matrix.png', dpi=300, bbox_inches='tight')
plt.show()

print("\nConfusion Matrix saved as 'confusion_matrix.png'")

## 10. Save the Model

In [None]:
# Save the fine-tuned model
model_save_path = './news_classifier_model'
print(f"\nSaving model to {model_save_path}...")

# Model and tokenizer go to the same directory so both can be reloaded from it
trainer.save_model(model_save_path)
tokenizer.save_pretrained(model_save_path)

print("Model and tokenizer saved successfully!")
reload_hint = (
    "\nTo load the model later:\n"
    f"  tokenizer = DistilBertTokenizer.from_pretrained('{model_save_path}')\n"
    f"  model = DistilBertForSequenceClassification.from_pretrained('{model_save_path}')"
)
print(reload_hint)

## 11. Test with Real Examples

In [None]:
def predict_text(text, model, tokenizer, device, preprocess=None):
    """Predict if text is news or not news.

    All training text in this notebook is normalised with `clean_text`
    (URLs and @mentions removed, '#' stripped, whitespace collapsed) before
    tokenization. The same normalisation is applied here by default so that
    inference sees text in the same form as training did.

    Args:
        text: Raw string to classify.
        model: Sequence-classification model; called with the tokenizer
            output and expected to return an object with a `.logits` tensor.
        tokenizer: Callable that encodes `text` into a mapping of tensors.
        device: Device the encoded tensors are moved to before the forward
            pass (should be the device the model lives on).
        preprocess: Optional callable applied to `text` before tokenization.
            Defaults to `clean_text`; pass e.g. `lambda s: s` to classify the
            text unchanged.

    Returns:
        Tuple `(label, confidence)`: `label` is 'News' for class 0 and
        'Not News' otherwise; `confidence` is the softmax probability of the
        predicted class.
    """
    if preprocess is None:
        # Resolved at call time so the training-time cleaner is the default
        preprocess = clean_text
    text = preprocess(text)

    model.eval()

    # Tokenize with the same padding/truncation settings used for training
    inputs = tokenizer(
        text,
        padding='max_length',
        truncation=True,
        max_length=128,
        return_tensors='pt'
    )

    # Move to device
    inputs = {k: v.to(device) for k, v in inputs.items()}

    # Predict without tracking gradients
    with torch.no_grad():
        outputs = model(**inputs)
        probabilities = torch.softmax(outputs.logits, dim=1)
        prediction = torch.argmax(probabilities, dim=1).item()
        confidence = probabilities[0][prediction].item()

    label = 'News' if prediction == 0 else 'Not News'
    return label, confidence

# Test examples: a mix of headline-style and tweet-style strings.
# The emoji in the fourth example was previously stored as mojibake
# (UTF-8 bytes decoded as cp1252); it is restored to the intended characters.
test_examples = [
    "NASA announces new Mars rover discovery",
    "just had the best coffee ever!! #blessed #love",
    "Breaking: Stock market hits record high amid economic recovery",
    "@friend lol that's so funny 😂😂😂 can't stop laughing",
    "Government announces new climate change policy",
    "OMG!!! You won't BELIEVE what happened today!!!",
    "Federal Reserve raises interest rates by 0.5%",
    "feeling sad today :( need a hug"
]

print("\n" + "="*80)
print("TESTING WITH REAL EXAMPLES")
print("="*80 + "\n")

# Classify each example and show the predicted label with its confidence
for example in test_examples:
    label, confidence = predict_text(example, model, tokenizer, device)
    print(f"Text: {example}")
    print(f"Prediction: {label} (confidence: {confidence:.2%})")
    print("-" * 80 + "\n")