# Phase 2: Model Training Notebook

This notebook implements Phase 2 of the CheckDi project - training a WangchanBERTa model for Thai fake news detection.

In [None]:
# Import required libraries
import json
import logging
import os
import sys

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import torch
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from torch.utils.data import Dataset
from tqdm import tqdm
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding
)

# Add src to path
sys.path.append('../')

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Set style
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")
%matplotlib inline

In [None]:
# Pick the compute device: CUDA when torch can see a GPU, otherwise the CPU
cuda_ready = torch.cuda.is_available()
device = torch.device('cuda' if cuda_ready else 'cpu')
print(f"Using device: {device}")

if not cuda_ready:
    print("Note: Training on CPU will be slower. Consider using GPU for better performance.")
else:
    # Report the name and total memory (in GB) of GPU index 0
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB")

In [None]:
# Load prepared data
print("Loading prepared data...")

# Single place to change where the preprocessed artifacts are read from
PROCESSED_DIR = '../data/processed'

try:
    # Load splits
    # NOTE(security): allow_pickle=True (and joblib.load below) unpickles the
    # file contents, which can execute arbitrary code. Only load artifacts
    # produced by this project's own data preparation step.
    X_train = np.load(f'{PROCESSED_DIR}/X_train.npy', allow_pickle=True)
    X_test = np.load(f'{PROCESSED_DIR}/X_test.npy', allow_pickle=True)
    y_train = np.load(f'{PROCESSED_DIR}/y_train.npy', allow_pickle=True)
    y_test = np.load(f'{PROCESSED_DIR}/y_test.npy', allow_pickle=True)

    # Load label encoder
    label_encoder = joblib.load(f'{PROCESSED_DIR}/label_encoder.pkl')

    print(f"✓ Training samples: {len(X_train)}")
    print(f"✓ Test samples: {len(X_test)}")
    print(f"✓ Label classes: {label_encoder.classes_}")

except FileNotFoundError as e:
    print(f"Error: {e}")
    print("Please run the data preparation notebook first.")
    raise

# Fail fast on misaligned splits instead of hitting an IndexError mid-training
if len(X_train) != len(y_train) or len(X_test) != len(y_test):
    raise ValueError(
        "Feature/label length mismatch: "
        f"train {len(X_train)} vs {len(y_train)}, test {len(X_test)} vs {len(y_test)}"
    )

In [None]:
# Model configuration: every tunable value used by the cells below
MODEL_NAME = "airesearch/wangchanberta-base-att-spm-uncased"  # checkpoint passed to from_pretrained
OUTPUT_DIR = "../models/wangchanberta-finetuned-afnc"  # where checkpoints and artifacts are written
MAX_LENGTH = 256  # tokenizer truncation / padding length
BATCH_SIZE = 16  # Adjust based on GPU memory
LEARNING_RATE = 2e-5
NUM_EPOCHS = 3
WARMUP_STEPS = 100

# Echo the key settings so they are recorded in the notebook output
for summary_line in (
    f"Model: {MODEL_NAME}",
    f"Max sequence length: {MAX_LENGTH}",
    f"Batch size: {BATCH_SIZE}",
    f"Learning rate: {LEARNING_RATE}",
    f"Epochs: {NUM_EPOCHS}",
):
    print(summary_line)

In [None]:
# Load tokenizer and model
print("Loading tokenizer and model...")

# Seed torch before the model is built so that any weights that are freshly
# initialised rather than loaded from the checkpoint (typically the
# classification head) come out the same on every run.
SEED = 42
torch.manual_seed(SEED)

# Store human-readable class names in the model config so the saved model is
# self-describing instead of emitting generic LABEL_0 / LABEL_1 names.
id2label = {idx: str(name) for idx, name in enumerate(label_encoder.classes_)}
label2id = {name: idx for idx, name in id2label.items()}

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=len(label_encoder.classes_),
    id2label=id2label,
    label2id=label2id,
    problem_type="single_label_classification"
)

# Move model to device
model.to(device)

print(f"✓ Model loaded with {model.num_labels} output labels")
print(f"✓ Model moved to {device}")

In [None]:
# Create custom dataset class
class NewsDataset(Dataset):
    """Torch dataset that tokenizes one text per item for sequence classification.

    Args:
        texts: Indexable, sized collection of raw texts; each item is passed
            through ``str()`` before tokenization.
        labels: Indexable, sized collection of class ids aligned with ``texts``.
        tokenizer: Callable invoked as ``tokenizer(text, truncation=...,
            padding=..., max_length=..., return_tensors='pt')`` and returning a
            mapping with ``'input_ids'`` and ``'attention_mask'`` tensors.
        max_length: Maximum sequence length forwarded to the tokenizer.
        padding: Padding strategy forwarded to the tokenizer. Defaults to
            ``'max_length'`` (the original behaviour). Pass ``False`` to leave
            items unpadded and let a padding data collator pad each batch.

    Raises:
        ValueError: If ``texts`` and ``labels`` differ in length.
    """

    def __init__(self, texts, labels, tokenizer, max_length, padding='max_length'):
        # Catch misaligned inputs here rather than as an IndexError mid-epoch
        if len(texts) != len(labels):
            raise ValueError(
                f"texts and labels must have the same length, got {len(texts)} and {len(labels)}"
            )
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.padding = padding

    def __len__(self):
        """Return the number of samples."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize sample ``idx`` and return 1-D tensors plus its label.

        Returns:
            dict with ``'input_ids'`` and ``'attention_mask'`` (flattened to one
            dimension) and ``'labels'`` (scalar ``torch.long`` tensor).
        """
        text = str(self.texts[idx])
        label = self.labels[idx]

        encoding = self.tokenizer(
            text,
            truncation=True,
            padding=self.padding,
            max_length=self.max_length,
            return_tensors='pt'
        )

        # return_tensors='pt' yields a leading batch axis of size 1; flatten it
        # away so a DataLoader / collator can stack items into a batch.
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

print("Dataset class defined")

In [None]:
# Create datasets
print("Creating datasets...")

# Wrap each (texts, labels) split in a NewsDataset sharing tokenizer and length
train_dataset, test_dataset = (
    NewsDataset(split_texts, split_labels, tokenizer, MAX_LENGTH)
    for split_texts, split_labels in ((X_train, y_train), (X_test, y_test))
)

print(f"✓ Training dataset: {len(train_dataset)} samples")
print(f"✓ Test dataset: {len(test_dataset)} samples")

# Smoke test: fetch the first training item and show its shape and label
sample = train_dataset[0]
print(f"✓ Sample input shape: {sample['input_ids'].shape}")
print(f"✓ Sample label: {sample['labels']} ({label_encoder.classes_[sample['labels']]})")

In [None]:
# Define evaluation metrics
def compute_metrics(eval_pred):
    """Compute accuracy from a ``(predictions, labels)`` pair.

    Args:
        eval_pred: Two-item iterable; the first item holds per-class scores
            with classes on axis 1, the second the reference label ids.

    Returns:
        dict with a single ``'accuracy'`` entry.
    """
    class_scores, references = eval_pred
    # Highest-scoring class per row is the predicted label id
    predicted_ids = np.argmax(class_scores, axis=1)
    return {'accuracy': accuracy_score(references, predicted_ids)}

print("Evaluation metrics defined")

In [None]:
# Set up training arguments
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    num_train_epochs=NUM_EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    warmup_steps=WARMUP_STEPS,
    weight_decay=0.01,
    learning_rate=LEARNING_RATE,
    logging_dir=f'{OUTPUT_DIR}/logs',
    logging_steps=10,
    # NOTE(review): recent transformers releases rename this argument to
    # `eval_strategy`; switch if the installed version warns about or rejects it.
    evaluation_strategy="steps",
    eval_steps=50,
    save_strategy="steps",
    # With load_best_model_at_end, save_steps must be a round multiple of
    # eval_steps so every saved checkpoint has a metric to be ranked by.
    save_steps=100,
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    greater_is_better=True,
    # The string "none" disables wandb/tensorboard/etc. Passing Python None
    # does NOT: in transformers 4.x it falls back to the default, which reports
    # to every installed integration.
    report_to="none",
    push_to_hub=False
)

print("Training arguments configured")
print(f"Output directory: {OUTPUT_DIR}")

In [None]:
# Create trainer
# The collator pads each batch via the tokenizer when batches are assembled
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# NOTE(review): the test split is used as eval_dataset, and training_args picks
# the best checkpoint by eval accuracy - so the test set influences model
# selection. Consider a separate validation split for an unbiased test score.
trainer_kwargs = {
    'model': model,
    'args': training_args,
    'train_dataset': train_dataset,
    'eval_dataset': test_dataset,
    'tokenizer': tokenizer,
    'data_collator': data_collator,
    'compute_metrics': compute_metrics,
}
trainer = Trainer(**trainer_kwargs)

print("Trainer created successfully")

In [None]:
# Train the model
print("Starting model training...")
print("Note: This may take several minutes to hours depending on your hardware.")

# Run the full fine-tuning loop; the result carries the loss and timing metrics
train_result = trainer.train()

# Summary banner followed by the headline training statistics
banner = "="*50
print("\n" + banner)
print("TRAINING COMPLETED")
print(banner)
print(f"Training loss: {train_result.training_loss:.4f}")
print(f"Training runtime: {train_result.metrics['train_runtime']:.2f} seconds")
print(f"Training samples per second: {train_result.metrics['train_samples_per_second']:.2f}")

In [None]:
# Evaluate the model
print("Evaluating model on test set...")

# Evaluates on the eval_dataset the Trainer was constructed with
eval_result = trainer.evaluate()

# Summary banner followed by the metrics reported by the Trainer
banner = "="*50
print("\n" + banner)
print("EVALUATION RESULTS")
print(banner)
print(f"Test accuracy: {eval_result['eval_accuracy']:.4f}")
print(f"Test loss: {eval_result['eval_loss']:.4f}")

In [None]:
# Generate detailed predictions and metrics
print("Generating detailed evaluation metrics...")

# Get predictions: argmax over the class axis turns scores into label ids
predictions = trainer.predict(test_dataset)
y_pred = np.argmax(predictions.predictions, axis=1)
y_true = predictions.label_ids

# Classification report
# Coerce class names to plain str: they are used as display names and as the
# keys of the report dict, so non-string classes would otherwise break lookups.
target_names = [str(name) for name in label_encoder.classes_]
report = classification_report(y_true, y_pred, target_names=target_names, output_dict=True)

print("\nClassification Report:")
print(classification_report(y_true, y_pred, target_names=target_names))

# Confusion matrix (rows: true label, columns: predicted label)
cm = confusion_matrix(y_true, y_pred)
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', 
            xticklabels=target_names, yticklabels=target_names)
plt.title('Confusion Matrix')
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.tight_layout()
# savefig does not create missing directories; make sure the target exists
os.makedirs('../data/visualization', exist_ok=True)
plt.savefig('../data/visualization/confusion_matrix.png', dpi=300, bbox_inches='tight')
plt.show()

# Save detailed results
results = {
    'accuracy': report['accuracy'],
    'macro_avg': report['macro avg'],
    'weighted_avg': report['weighted avg'],
    'per_class': {name: report[name] for name in target_names}
}

# Save results to file
# OUTPUT_DIR may not exist yet (e.g. if no checkpoint has been written so far),
# so create it before opening the file.
os.makedirs(OUTPUT_DIR, exist_ok=True)
with open(f'{OUTPUT_DIR}/evaluation_results.json', 'w') as f:
    json.dump(results, f, indent=2)

print(f"\n✓ Results saved to {OUTPUT_DIR}/evaluation_results.json")

In [None]:
# Save the final model and tokenizer
print("Saving final model and tokenizer...")

# Write the model through the Trainer, then the tokenizer into the same folder
trainer.save_model(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)

# Metadata describing this run: model id, label set, score and split sizes
class_names = label_encoder.classes_
model_info = {
    'model_name': MODEL_NAME,
    'max_length': MAX_LENGTH,
    'num_labels': len(class_names),
    'label_classes': class_names.tolist(),
    'final_accuracy': eval_result['eval_accuracy'],
    'training_samples': len(X_train),
    'test_samples': len(X_test)
}

with open(f'{OUTPUT_DIR}/model_info.json', 'w') as info_file:
    json.dump(model_info, info_file, indent=2)

# Keep the fitted label encoder next to the model for decoding predictions
joblib.dump(label_encoder, f'{OUTPUT_DIR}/label_encoder.pkl')

print(f"✓ Model saved to {OUTPUT_DIR}")
print(f"✓ Model info saved to {OUTPUT_DIR}/model_info.json")
print(f"✓ Label encoder saved to {OUTPUT_DIR}/label_encoder.pkl")

In [None]:
# Test the saved model with sample predictions
print("Testing saved model with sample predictions...")

# Reload from disk so this cell exercises the saved artifacts, not the
# in-memory training objects
saved_model = AutoModelForSequenceClassification.from_pretrained(OUTPUT_DIR)
saved_tokenizer = AutoTokenizer.from_pretrained(OUTPUT_DIR)
saved_model.to(device)
saved_model.eval()

# Test samples (mix of real and fake news headlines in Thai)
test_samples = [
    "รัฐบาลเปิดเผยแผนพัฒนาเศรษฐกิจในปีหน้า",  # Real news
    "พบยารักษาโรคเบาหวานใหม่ที่มีประสิทธิภาพสูง",  # Real news
    "วิทยาศาสตร์ใหม่พบว่ากินใบย่านางช่วยลดน้ำหนักได้ภายใน 1 สัปดาห์",  # Fake news
    "พบว่าน้ำมันมะพร้าวสามารถรักษาโรคมะเร็งได้ 100%"  # Fake news
]

print("\nSample Predictions:")
print("-" * 80)

for i, text in enumerate(test_samples, 1):
    # Tokenize a single text and move every tensor to the model's device
    inputs = saved_tokenizer(
        text, 
        return_tensors="pt", 
        max_length=MAX_LENGTH, 
        truncation=True, 
        padding=True
    )
    inputs = {k: v.to(device) for k, v in inputs.items()}
    
    # Predict without tracking gradients
    with torch.no_grad():
        outputs = saved_model(**inputs)
        probabilities = torch.nn.functional.softmax(outputs.logits, dim=-1)
        predicted_class = torch.argmax(probabilities, dim=-1).item()
        confidence = probabilities[0][predicted_class].item()
    
    predicted_label = label_encoder.classes_[predicted_class]

    # Pair each probability with its class name from the label encoder rather
    # than hard-coding which index means "real" or "fake": the index order is
    # whatever the encoder assigned, and there may be more than two classes.
    per_class = ", ".join(
        f"{class_name}={class_prob:.3f}"
        for class_name, class_prob in zip(label_encoder.classes_, probabilities[0].tolist())
    )
    
    print(f"{i}. Text: {text[:50]}...")
    print(f"   Prediction: {predicted_label} (Confidence: {confidence:.3f})")
    print(f"   Probabilities: {per_class}")
    print()

print("✓ Model testing completed successfully!")

## Phase 2 Training Summary

This notebook has completed Phase 2 of the CheckDi project:

1. **Model Setup**: Loaded WangchanBERTa for Thai text classification
2. **Data Preparation**: Created PyTorch datasets from preprocessed data
3. **Training**: Fine-tuned the model on the AFNC fake news dataset
4. **Evaluation**: Generated comprehensive evaluation metrics
5. **Model Saving**: Saved the trained model, tokenizer, and metadata
6. **Testing**: Validated the saved model with sample predictions

### Next Steps

- The trained model is now ready for use in the Streamlit application
- The predictor module will load this saved model for real-time predictions
- Consider further fine-tuning with additional data or different hyperparameters

### Files Generated

- `models/wangchanberta-finetuned-afnc/`: Complete trained model
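- `models/wangchanberta-finetuned-afnc/model_info.json`: Model metadata
- `models/wangchanberta-finetuned-afnc/label_encoder.pkl`: Fitted label encoder for decoding predictions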
- `models/wangchanberta-finetuned-afnc/evaluation_results.json`: Detailed evaluation metrics
- `data/visualization/confusion_matrix.png`: Confusion matrix visualization