# Siren - Baseline Model Training (DistilBERT)

This notebook trains the baseline phishing detection model with DistilBERT on Google Colab.

**Steps:**
1.  Run the first code cell to install the necessary libraries.
2.  Run the second code cell and upload the `dummy_data.csv` file when prompted.
3.  Run the final cell to start the training process.

In [None]:
# 1. Install Dependencies
# The leading `!` runs the line as a shell command; `-q` keeps pip's output quiet.
!pip install -q transformers pandas torch

In [None]:
# 2. Upload Data
from google.colab import files

print('Please upload the dummy_data.csv file.')
uploaded = files.upload()

# Stop here if nothing was uploaded; otherwise DATA_PATH would never be
# assigned and the training cell would fail later with a NameError.
if not uploaded:
    raise RuntimeError(
        'No file was uploaded. Re-run this cell and select dummy_data.csv.'
    )

# Report every uploaded file.
for fn, content in uploaded.items():
    print(f'User uploaded file "{fn}" with length {len(content)} bytes')

# The last uploaded file is the one the training cell reads.
DATA_PATH = fn

In [None]:
# 3. Training Script
import pandas as pd
import torch
from torch.optim import AdamW
from torch.utils.data import Dataset, DataLoader
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification

# Configuration
# Pretrained checkpoint name used to load both the tokenizer and the classifier.
MODEL_NAME = 'distilbert-base-uncased'
# Number of full passes over the dataset in the training loop.
NUM_EPOCHS = 3
# Number of examples per DataLoader batch.
BATCH_SIZE = 2
# Learning rate handed to the optimizer.
LEARNING_RATE = 5e-5

# Custom Dataset Class
class PhishingDataset(Dataset):
    """Map-style dataset that tokenizes one text/label pair per item.

    Args:
        texts: Indexable collection of input texts; each entry is converted
            with ``str()`` before tokenization.
        labels: Indexable collection of integer class labels, aligned with
            ``texts``.
        tokenizer: Callable tokenizer (e.g. a Hugging Face tokenizer) that
            accepts the keyword arguments used in ``__getitem__``.
        max_len: Length every example is padded or truncated to.
    """

    def __init__(self, texts, labels, tokenizer, max_len=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of examples."""
        return len(self.texts)

    def __getitem__(self, item):
        """Tokenize the example at index ``item``.

        Returns:
            A dict with the raw ``text``, the flattened ``input_ids`` and
            ``attention_mask`` tensors, and ``labels`` as a ``torch.long``
            scalar tensor.
        """
        text = str(self.texts[item])
        label = self.labels[item]

        # Call the tokenizer directly: `encode_plus` is deprecated in favour
        # of `__call__`, which takes the same arguments.
        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt',
            truncation=True,
        )

        return {
            'text': text,
            # The tokenizer returns a leading batch axis for a single text;
            # flatten so the DataLoader can stack examples itself.
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long),
        }

# Main Training Function
def train_model():
    """Fine-tune DistilBERT as a two-class classifier on the uploaded CSV.

    Reads the CSV at ``DATA_PATH``, which must have ``text`` and ``label``
    columns, fine-tunes ``MODEL_NAME`` for ``NUM_EPOCHS`` epochs with batches
    of ``BATCH_SIZE`` examples, and prints the loss of every batch.

    Returns:
        The fine-tuned ``DistilBertForSequenceClassification`` model, on the
        device used for training and switched to evaluation mode.

    Raises:
        ValueError: If the CSV lacks a ``text`` or ``label`` column, or
            contains no rows.
    """
    print("--- Starting Baseline Model Training ---")

    tokenizer = DistilBertTokenizer.from_pretrained(MODEL_NAME)
    model = DistilBertForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2)

    df = pd.read_csv(DATA_PATH)
    print(f"Loaded {len(df)} records from {DATA_PATH}")

    # Fail with a clear message instead of an AttributeError on a bad CSV.
    missing_columns = {"text", "label"} - set(df.columns)
    if missing_columns:
        raise ValueError(
            f"{DATA_PATH} is missing required column(s): {sorted(missing_columns)}"
        )
    if df.empty:
        raise ValueError(f"{DATA_PATH} contains no rows to train on")

    dataset = PhishingDataset(
        texts=df["text"].to_numpy(),
        labels=df["label"].to_numpy(),
        tokenizer=tokenizer
    )
    # Shuffle every epoch so batches are not tied to the CSV's row order.
    dataloader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True)

    # Use PyTorch's AdamW; the copy that used to ship with transformers is
    # deprecated and missing from recent releases.
    optimizer = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    print(f"Using device: {device}")
    print("--- Setup complete. Starting training loop ---")

    model.train()
    for epoch in range(NUM_EPOCHS):
        print(f'\nEpoch {epoch + 1}/{NUM_EPOCHS}')
        for batch in dataloader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)

            # Clear gradients left over from the previous step.
            optimizer.zero_grad()

            # Passing `labels` makes the model compute the loss itself.
            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=labels
            )
            loss = outputs.loss

            loss.backward()
            optimizer.step()

            print(f"  - Batch processed. Loss: {loss.item():.4f}")

    print("\n--- Baseline model training finished ---")

    # Hand the model back ready for inference rather than discarding it.
    model.eval()
    return model

# Run training and keep the fine-tuned model for use in later cells
baseline_model = train_model()