# Siren: Baseline Model Training, Quantization, and ONNX Export

This notebook covers the complete pipeline:
1.  **Setup**: Install dependencies and upload data.
2.  **Training**: Fine-tune a DistilBERT sequence classifier on the uploaded CSV (`url` text column, `is_phishing` label column).
3.  **Quantization**: Apply dynamic quantization to reduce model size and speed up inference.
4.  **ONNX Export**: Convert the quantized model to the ONNX format for cross-platform use.
5.  **Verification**: Load the ONNX model and test it to ensure the export was successful.
6.  **Download**: Download the exported ONNX model file from the Colab runtime.

## 1. Setup

In [None]:
# Install necessary libraries
!pip install transformers pandas torch
!pip install onnx onnxruntime

In [None]:
# Choose dummy_data.csv in the upload dialog; the training cell reads it by that name
from google.colab import files

uploaded = files.upload()

# Echo every received file together with its length as a quick sanity check
for fn in uploaded:
    print(f'User uploaded file "{fn}" with length {len(uploaded[fn])} bytes')

## 2. Model Training

In [None]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
from torch.optim import AdamW
from sklearn.model_selection import train_test_split

def train_baseline_model(file_path, epochs=3, batch_size=8, learning_rate=5e-5, max_length=512):
    """Fine-tune DistilBERT as a two-class phishing-URL classifier.

    Reads a CSV with a ``url`` text column and an ``is_phishing`` label column,
    holds out 20% of the rows for validation, fine-tunes
    ``distilbert-base-uncased`` on the remaining rows and prints the mean
    training loss and the validation accuracy after every epoch.

    Args:
        file_path: Path of the CSV file to train on.
        epochs: Number of passes over the training split.
        batch_size: Mini-batch size used for training and validation.
        learning_rate: Learning rate handed to AdamW.
        max_length: Token limit passed to the tokenizer; longer inputs are truncated.

    Returns:
        Tuple ``(model, tokenizer)``. The model is returned in eval mode on the
        device it was trained on (CUDA when available, otherwise CPU).
    """
    # Load Data
    df = pd.read_csv(file_path)
    train_texts, val_texts, train_labels, val_labels = train_test_split(
        df['url'], df['is_phishing'], test_size=0.2, random_state=42
    )

    # Tokenizer
    tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

    class PhishingDataset(Dataset):
        """Pairs the tokenizer encodings with their integer class labels."""

        def __init__(self, encodings, labels):
            self.encodings = encodings
            self.labels = labels

        def __getitem__(self, idx):
            item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
            # Explicit long dtype: the classification loss expects integer class ids
            item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
            return item

        def __len__(self):
            return len(self.labels)

    train_encodings = tokenizer(list(train_texts), truncation=True, padding=True, max_length=max_length)
    val_encodings = tokenizer(list(val_texts), truncation=True, padding=True, max_length=max_length)

    # int(...) normalizes bool / numpy scalar labels to plain Python ints
    train_dataset = PhishingDataset(train_encodings, [int(label) for label in train_labels])
    val_dataset = PhishingDataset(val_encodings, [int(label) for label in val_labels])

    # Model
    model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=2)
    device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
    model.to(device)

    # Training
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size)
    optimizer = AdamW(model.parameters(), lr=learning_rate)

    for epoch in range(epochs):
        # Re-enable dropout: the validation pass below switches the model to eval mode
        model.train()
        total_loss = 0.0
        for batch in train_loader:
            optimizer.zero_grad()
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        # Average over all batches instead of reporting only the last batch's loss
        mean_loss = total_loss / len(train_loader)

        # Score the held-out split so the reported numbers reflect generalization
        model.eval()
        correct = 0
        with torch.no_grad():
            for batch in val_loader:
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                labels = batch['labels'].to(device)
                logits = model(input_ids, attention_mask=attention_mask).logits
                correct += (logits.argmax(dim=-1) == labels).sum().item()
        val_accuracy = correct / len(val_dataset)

        print(f'Epoch {epoch+1} | Loss: {mean_loss:.4f} | Val accuracy: {val_accuracy:.4f}')

    # Hand the model back in inference mode (dropout off), also when epochs == 0
    model.eval()
    print('Finished Training')
    return model, tokenizer

# Fine-tune on the uploaded CSV; the model and tokenizer are reused by the cells below
trained_model, tokenizer = train_baseline_model(file_path='dummy_data.csv')

## 3. Quantization

In [None]:
# Move model to CPU for quantization
trained_model.to('cpu')
# Switch to inference mode first so the quantized copy does not inherit
# training mode (active dropout) from the fine-tuning loop.
trained_model.eval()

# Apply dynamic quantization: nn.Linear layers are swapped for int8 versions
quantized_model = torch.quantization.quantize_dynamic(
    trained_model, {torch.nn.Linear}, dtype=torch.qint8
)

print('Model successfully quantized.')

## 4. ONNX Export

In [None]:
import torch

# One tokenized sample drives the export; its concrete sizes are placeholders
# because the batch and sequence axes are declared dynamic below.
# NOTE(review): torch.onnx.export may not support the operators produced by
# dynamic quantization — confirm this cell runs on the installed torch version.
dummy_input = tokenizer('this is a sample url', return_tensors='pt')
input_ids, attention_mask = dummy_input['input_ids'], dummy_input['attention_mask']

onnx_model_path = 'siren_model.onnx'

# Write the graph together with its parameters to onnx_model_path
torch.onnx.export(
    quantized_model,
    (input_ids, attention_mask),
    onnx_model_path,
    input_names=['input_ids', 'attention_mask'],
    output_names=['output'],
    dynamic_axes={
        'input_ids': {0: 'batch_size', 1: 'sequence'},
        'attention_mask': {0: 'batch_size', 1: 'sequence'},
        'output': {0: 'batch_size'},
    },
    opset_version=11,
    export_params=True,
    do_constant_folding=True,
)

print(f'Model exported to {onnx_model_path}')

## 5. Verification

In [None]:
import onnxruntime
import numpy as np

# Create an ONNX runtime session. The provider is named explicitly so the call
# also works on onnxruntime builds that ship more than one execution provider.
ort_session = onnxruntime.InferenceSession(
    onnx_model_path, providers=['CPUExecutionProvider']
)

# Prepare the dummy input in the format ONNX runtime expects (numpy arrays)
ort_inputs = {
    'input_ids': input_ids.numpy(),
    'attention_mask': attention_mask.numpy()
}

# Run inference
ort_outs = ort_session.run(None, ort_inputs)
onnx_logits = ort_outs[0]

# Actually check the result before claiming success: one row of logits per
# input sequence and one column per class (the model was built with num_labels=2).
expected_shape = (input_ids.shape[0], 2)
assert onnx_logits.shape == expected_shape, (
    f'Unexpected ONNX output shape {onnx_logits.shape}, expected {expected_shape}'
)
assert np.isfinite(onnx_logits).all(), 'ONNX model produced non-finite logits'

# Informational comparison against the PyTorch model on the same input.
# eval() disables dropout so the PyTorch side is deterministic.
quantized_model.eval()
with torch.no_grad():
    torch_logits = quantized_model(input_ids, attention_mask=attention_mask).logits.numpy()
max_abs_diff = np.abs(onnx_logits - torch_logits).max()

print('ONNX model loaded and verified successfully!')
print('Output shape:', onnx_logits.shape)
print('Output logits:', onnx_logits)
print('Max abs difference vs. PyTorch logits:', max_abs_diff)

## 6. Download the ONNX Model

In [None]:
# Same import as in the upload cell, repeated so this cell does not depend on it
from google.colab import files

# Send the exported ONNX file (path set in the export cell) to the user's machine
files.download(onnx_model_path)