<a href="https://colab.research.google.com/github/DaraRahma536/Finalterm_DL/blob/main/Task1_AGNews.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **FINE-TUNING HUGGINGFACE MODELS (AGNews)**

## **1. Setup dan Instalasi**

In [None]:
# 1. Setup dan Instalasi
!pip install transformers datasets torch scikit-learn pandas numpy matplotlib seaborn evaluate

import torch
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score
from sklearn.model_selection import train_test_split
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding
)
from datasets import Dataset, load_dataset, load_metric
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

## **2. Load Dataset**

In [None]:
# 2. Load the AG News dataset from the Hugging Face Hub
print("Loading AG News dataset...")
dataset = load_dataset("sh0416/ag_news")

# Summarise what was loaded: the split layout first, then the size of each split
print("\nDataset structure:", dataset, sep="\n")
print(f"Train samples: {len(dataset['train'])}")
print(f"Test samples: {len(dataset['test'])}")

# Show the first training record so the available columns are visible
print("\nSample data:", dataset['train'][0], sep="\n")

# Check class distribution
def check_class_distribution(dataset_split, split_name):
    """Print the number and percentage of samples per class for one split.

    Args:
        dataset_split: Object whose ``'label'`` entry is the sequence of
            class labels for the split.
        split_name: Name of the split, used in the printed heading.
    """
    labels = dataset_split['label']
    class_ids, class_counts = np.unique(labels, return_counts=True)
    print(f"\n{split_name} class distribution:")
    for position in range(len(class_ids)):
        label, count = class_ids[position], class_counts[position]
        print(f"  Class {label}: {count} samples ({count/len(labels)*100:.1f}%)")

# Report the class balance of both splits, training first
for split_key, split_title in (('train', 'Training'), ('test', 'Test')):
    check_class_distribution(dataset[split_key], split_title)

## **3. Preprocessing Data**

In [None]:
# 3. Preprocessing and tokenization
MODEL_NAME = "bert-base-uncased"  # Can be swapped for "distilbert-base-uncased" or "huawei-noah/TinyBERT_General_4L_312D"
MAX_LENGTH = 256  # Inputs longer than this many tokens are truncated
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# AG News mirrors do not agree on a schema: some expose a single `text`
# column, others separate `title` / `description` columns.
# NOTE(review): the `sh0416/ag_news` card appears to use the latter - confirm.
# Use whichever of these columns the loaded dataset actually has.
TEXT_COLUMNS = [
    column for column in ("text", "title", "description")
    if column in dataset["train"].column_names
]
if not TEXT_COLUMNS:
    raise ValueError(
        f"No text column found; available columns: {dataset['train'].column_names}"
    )

# The classification head needs class ids in [0, num_labels). Some mirrors
# number the classes from 1, so shift by the smallest id seen in the training
# split (this is 0, i.e. a no-op, when the labels are already 0-based).
LABEL_OFFSET = min(dataset["train"].unique("label"))


def preprocess_function(examples):
    """Tokenize one batch of examples and attach 0-based ``labels``.

    Args:
        examples: Batch mapping column name -> list of values, as passed by
            ``Dataset.map(..., batched=True)``. Must contain ``'label'`` and
            every column listed in ``TEXT_COLUMNS``.

    Returns:
        dict with the tokenizer outputs plus a ``'labels'`` list holding the
        class ids shifted by ``LABEL_OFFSET``.
    """
    # Join the available text columns of each example into a single string;
    # empty / missing parts are skipped so no stray separators are produced.
    text_parts = zip(*(examples[column] for column in TEXT_COLUMNS))
    texts = [" ".join(part for part in parts if part) for parts in text_parts]

    # No padding here: DataCollatorWithPadding pads every training batch
    # dynamically, so padding at map time would only bloat the stored dataset.
    features = dict(tokenizer(texts, truncation=True, max_length=MAX_LENGTH))
    features["labels"] = [label - LABEL_OFFSET for label in examples["label"]]
    return features


print("\nTokenizing dataset...")
# Drop the raw text and the original label column; the model only consumes
# the tokenizer outputs and the shifted `labels` column produced above.
tokenized_datasets = dataset.map(
    preprocess_function,
    batched=True,
    remove_columns=TEXT_COLUMNS + ["label"],
)

# 4. Split dataset (hold out 10% of the training split for validation)
train_val = tokenized_datasets['train'].train_test_split(test_size=0.1, seed=42)
tokenized_datasets['train'] = train_val['train']
tokenized_datasets['val'] = train_val['test']

## **4. Load Model dan Training**

In [None]:
# 5. Load Model
print(f"\nLoading model: {MODEL_NAME}")
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=4  # AG News has 4 classes
)
model.to(device)

# 6. Training Arguments
# The `evaluation_strategy` keyword was renamed to `eval_strategy`; recent
# transformers releases reject the old spelling with a TypeError, while older
# releases only know the old one. Pick whichever the installed version declares.
_eval_strategy_kwarg = (
    "eval_strategy"
    if "eval_strategy" in TrainingArguments.__dataclass_fields__
    else "evaluation_strategy"
)

training_args = TrainingArguments(
    output_dir="./results_agnews",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    # Evaluation and checkpointing both run once per epoch; they must use the
    # same schedule for `load_best_model_at_end` to work.
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    logging_dir="./logs",
    logging_steps=50,
    report_to="none",
    save_total_limit=2,
    push_to_hub=False,
    **{_eval_strategy_kwarg: "epoch"},
)

# 7. Metrics Function
def compute_metrics(eval_pred):
    """Compute accuracy and weighted F1 for one evaluation pass.

    Args:
        eval_pred: Pair ``(predictions, labels)``; the predictions are reduced
            to class ids with an argmax over axis 1.

    Returns:
        dict with the keys ``"accuracy"`` and ``"f1"``.
    """
    scores, references = eval_pred
    predicted_ids = np.argmax(scores, axis=1)

    return {
        "accuracy": accuracy_score(references, predicted_ids),
        "f1": f1_score(references, predicted_ids, average='weighted'),
    }

# 8. Data Collator (pads each batch to the length of its longest sequence)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# 9. Initialize Trainer
# Newer transformers releases take the tokenizer through `processing_class`
# and deprecate the `tokenizer` keyword; older releases only accept
# `tokenizer`. Choose the keyword from the installed Trainer signature.
import inspect

_tokenizer_kwarg = (
    "processing_class"
    if "processing_class" in inspect.signature(Trainer.__init__).parameters
    else "tokenizer"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["val"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    **{_tokenizer_kwarg: tokenizer},
)

# 10. Train Model
print("\nTraining model...")
train_result = trainer.train()

## **5. Evaluasi**

In [None]:
# 11. Evaluate on Validation Set
print("\nEvaluating model on validation set...")
eval_result = trainer.evaluate()
print("\nValidation results:")
for key, value in eval_result.items():
    print(f"  {key}: {value:.4f}")

# 12. Test on Test Set
print("\nTesting on test set...")
test_results = trainer.predict(tokenized_datasets['test'])
test_metrics = test_results.metrics

# `trainer.predict()` prefixes its metric keys with "test_" by default, so an
# exclusion list spelled with "eval_" never matches anything. Match on the
# suffix instead so loss / timing / throughput entries are hidden regardless
# of the prefix, leaving only the quality metrics from `compute_metrics`.
_bookkeeping_suffixes = ('_loss', '_runtime', '_samples_per_second', '_steps_per_second')

print("\nTest set metrics:")
for key, value in test_metrics.items():
    if not key.endswith(_bookkeeping_suffixes):
        print(f"  {key}: {value:.4f}")

In [1]:
# 13. Save the fine-tuned model and its tokenizer into the same directory
print("\nSaving model...")
for save_to in (trainer.save_model, tokenizer.save_pretrained):
    save_to("./saved_model_agnews")

# 14. Visualization
def plot_confusion_matrix(y_true, y_pred, labels, title='Confusion Matrix'):
    """Draw, save and display a confusion-matrix heatmap.

    Args:
        y_true: True class labels.
        y_pred: Predicted class labels.
        labels: Tick labels used on both heatmap axes.
        title: Title placed above the heatmap.

    The figure is written to 'confusion_matrix_agnews.png' before it is shown.
    """
    matrix = confusion_matrix(y_true, y_pred)

    fig, ax = plt.subplots(figsize=(10, 8))
    sns.heatmap(
        matrix,
        annot=True,
        fmt='d',
        cmap='Blues',
        xticklabels=labels,
        yticklabels=labels,
        ax=ax,
    )
    ax.set_title(title)
    ax.set_ylabel('True Label')
    ax.set_xlabel('Predicted Label')

    fig.tight_layout()
    fig.savefig('confusion_matrix_agnews.png')
    plt.show()

def plot_training_history(trainer_state):
    """Plot the logged loss curves and validation accuracy side by side.

    Args:
        trainer_state: Object exposing ``log_history``, a list of per-step
            log records. Nothing is drawn when the list is empty.

    The figure is written to 'training_history.png' before it is shown.
    """
    if not trainer_state.log_history:
        return

    history = pd.DataFrame(trainer_state.log_history)
    fig, (loss_ax, acc_ax) = plt.subplots(1, 2, figsize=(12, 4))

    # Left panel: training and validation loss, each drawn only if it was logged
    loss_curves = (('loss', 'Training Loss'), ('eval_loss', 'Validation Loss'))
    for column, curve_label in loss_curves:
        if column in history.columns:
            logged = history[history[column].notna()]
            loss_ax.plot(logged['step'], logged[column], label=curve_label)
    loss_ax.set_title('Training and Validation Loss')
    loss_ax.set_xlabel('Steps')
    loss_ax.set_ylabel('Loss')
    loss_ax.legend()
    loss_ax.grid(True, alpha=0.3)

    # Right panel: validation accuracy
    if 'eval_accuracy' in history.columns:
        logged = history[history['eval_accuracy'].notna()]
        acc_ax.plot(logged['step'], logged['eval_accuracy'], label='Validation Accuracy', color='green')
    acc_ax.set_title('Validation Accuracy')
    acc_ax.set_xlabel('Steps')
    acc_ax.set_ylabel('Accuracy')
    acc_ax.legend()
    acc_ax.grid(True, alpha=0.3)

    fig.tight_layout()
    fig.savefig('training_history.png')
    plt.show()

# Reduce the test-set model outputs to one predicted class id per example
predictions = np.argmax(test_results.predictions, axis=1)
labels = test_results.label_ids

# AG News class names, in class-id order
class_names = ["World", "Sports", "Business", "Sci/Tech"]

# Per-class precision / recall / F1 on the test set
print("\nDetailed Classification Report:")
report = classification_report(labels, predictions, target_names=class_names, digits=4)
print(report)

# Figures: test-set confusion matrix, then the curves logged during training
plot_confusion_matrix(labels, predictions, class_names, 'AG News Classification - Confusion Matrix')
plot_training_history(trainer.state)

# 15. Inference Function
def predict_news_category(text, model, tokenizer, device, class_names):
    """Classify a single text and report the class probabilities.

    Args:
        text: Text to classify.
        model: Classifier called with the tokenized inputs as keyword
            arguments; its output must expose ``.logits``.
        tokenizer: Callable that turns ``text`` into a mapping of tensors.
        device: Device the input tensors are moved to before the forward pass.
        class_names: Class names indexed by class id.

    Returns:
        dict with the keys ``text`` (input, cut to 100 characters plus "..."
        when longer), ``predicted_class``, ``predicted_label``, ``confidence``,
        ``probabilities`` and ``top3_predictions`` (``(name, probability)``
        pairs, most likely first).
    """
    encoded = tokenizer(
        text,
        truncation=True,
        padding=True,
        max_length=256,
        return_tensors="pt"
    )

    model_inputs = {}
    for field, tensor in encoded.items():
        model_inputs[field] = tensor.to(device)

    # Inference only: switch to eval mode and skip gradient tracking
    model.eval()
    with torch.no_grad():
        logits = model(**model_inputs).logits

    class_probs = torch.softmax(logits, dim=-1)
    predicted_class = class_probs.argmax(dim=1).item()
    probabilities = class_probs[0].cpu().numpy()

    # Three most likely classes, highest probability first
    ranked_ids = np.argsort(probabilities)[::-1][:3]
    top3_predictions = []
    for class_id in ranked_ids:
        top3_predictions.append((class_names[class_id], probabilities[class_id]))

    if len(text) > 100:
        preview = text[:100] + "..."
    else:
        preview = text

    return {
        "text": preview,
        "predicted_class": predicted_class,
        "predicted_label": class_names[predicted_class],
        "confidence": probabilities[predicted_class],
        "probabilities": probabilities,
        "top3_predictions": top3_predictions
    }

# Sample headlines to run through the fine-tuned classifier
test_samples = [
    "Apple announces new iPhone with advanced AI features and improved battery life",
    "Manchester United wins championship after dramatic final match",
    "Federal Reserve raises interest rates to combat inflation",
    "Scientists discover new exoplanet that could support life"
]

print("\nTest Predictions:")
print("=" * 80)
for sample in test_samples:
    result = predict_news_category(sample, model, tokenizer, device, class_names)

    # Build the whole report for this sample, then emit it in one go
    report_lines = [
        f"\nText: {result['text']}",
        f"Predicted: {result['predicted_label']} (Class {result['predicted_class']})",
        f"Confidence: {result['confidence']:.2%}",
        "Top 3 predictions:",
    ]
    report_lines.extend(
        f"  - {label}: {prob:.2%}" for label, prob in result['top3_predictions']
    )
    print("\n".join(report_lines))

Collecting evaluate
  Downloading evaluate-0.4.6-py3-none-any.whl.metadata (9.5 kB)
Downloading evaluate-0.4.6-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.6


KeyboardInterrupt: 