{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Siren: Baseline Model Training, Quantization, and ONNX Export\n",
    "\n",
    "This notebook covers the complete pipeline, using the Hugging Face Optimum library for a robust export.\n",
    "1.  **Setup**: Install dependencies (including Optimum) and upload the training CSV.\n",
    "2.  **Training**: Fine-tune a DistilBERT model on the dataset.\n",
    "3.  **ONNX Export & Quantization**: Export the model to ONNX, then apply dynamic quantization to the exported model.\n",
    "4.  **Verification**: Load the quantized ONNX model and run a sample input through it.\n",
    "5.  **Download**: Download the final quantized model."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 1. Setup"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Uninstall potentially conflicting libraries first.\n",
    "# %pip (rather than !pip) installs into the environment of the running kernel.\n",
    "%pip uninstall -y torchvision torchaudio\n",
    "\n",
    "# Install and upgrade the libraries the notebook imports, including Optimum.\n",
    "# scikit-learn is listed because the training cell imports train_test_split from it.\n",
    "%pip install --upgrade transformers pandas torch scikit-learn\n",
    "%pip install --upgrade onnx onnxruntime\n",
    "# Quote the extras spec so the shell cannot interpret the brackets as a glob pattern.\n",
    "%pip install --upgrade \"optimum[onnxruntime]\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Ask the user to upload the dummy_data.csv file\n",
    "from google.colab import files\n",
    "\n",
    "uploaded = files.upload()\n",
    "\n",
    "# Report every received file together with its length\n",
    "for fn in uploaded:\n",
    "    print(f'User uploaded file \"{fn}\" with length {len(uploaded[fn])} bytes')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 2. Model Training"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import torch\n",
    "from torch.utils.data import Dataset, DataLoader\n",
    "from transformers import DistilBertTokenizer, DistilBertForSequenceClassification\n",
    "from torch.optim import AdamW\n",
    "from sklearn.model_selection import train_test_split\n",
    "\n",
    "def train_baseline_model(file_path, text_column='text', label_column='label',\n",
    "                         epochs=3, batch_size=8, learning_rate=5e-5, seed=42):\n",
    "    \"\"\"Fine-tune DistilBERT as a two-class text classifier on a CSV file.\n",
    "\n",
    "    Args:\n",
    "        file_path: Path to a CSV holding the text and label columns.\n",
    "        text_column: Name of the column with the input text.\n",
    "        label_column: Name of the column with the integer class label.\n",
    "        epochs: Number of passes over the training split.\n",
    "        batch_size: Mini-batch size for training and validation.\n",
    "        learning_rate: Learning rate passed to AdamW.\n",
    "        seed: Seed for the train/validation split and for torch's RNG.\n",
    "\n",
    "    Returns:\n",
    "        (model, tokenizer): the fine-tuned model, left in eval mode, and the\n",
    "        tokenizer that encoded the data.\n",
    "\n",
    "    Raises:\n",
    "        KeyError: If text_column or label_column is not in the CSV.\n",
    "    \"\"\"\n",
    "    # Seed torch so weight initialisation and batch shuffling repeat across runs\n",
    "    torch.manual_seed(seed)\n",
    "\n",
    "    # Load data and fail early, with a clear message, on a wrong schema\n",
    "    df = pd.read_csv(file_path)\n",
    "    missing = [col for col in (text_column, label_column) if col not in df.columns]\n",
    "    if missing:\n",
    "        raise KeyError(f'{file_path} is missing required column(s): {missing}')\n",
    "    # Rows without text or label cannot be tokenized or used as targets\n",
    "    df = df.dropna(subset=[text_column, label_column])\n",
    "\n",
    "    train_texts, val_texts, train_labels, val_labels = train_test_split(\n",
    "        df[text_column].astype(str).tolist(),\n",
    "        df[label_column].astype(int).tolist(),\n",
    "        test_size=0.2,\n",
    "        random_state=seed,\n",
    "    )\n",
    "\n",
    "    # Tokenizer\n",
    "    tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')\n",
    "\n",
    "    class PhishingDataset(Dataset):\n",
    "        \"\"\"Serves tokenizer encodings and labels as per-sample tensor dicts.\"\"\"\n",
    "\n",
    "        def __init__(self, encodings, labels):\n",
    "            self.encodings = encodings\n",
    "            self.labels = labels\n",
    "\n",
    "        def __getitem__(self, idx):\n",
    "            item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}\n",
    "            item['labels'] = torch.tensor(self.labels[idx])\n",
    "            return item\n",
    "\n",
    "        def __len__(self):\n",
    "            return len(self.labels)\n",
    "\n",
    "    def build_dataset(texts, labels):\n",
    "        # Tokenize one split and pair the encodings with its labels\n",
    "        encodings = tokenizer(texts, truncation=True, padding=True, max_length=512)\n",
    "        return PhishingDataset(encodings, labels)\n",
    "\n",
    "    train_dataset = build_dataset(train_texts, train_labels)\n",
    "    val_dataset = build_dataset(val_texts, val_labels)\n",
    "\n",
    "    # Model\n",
    "    model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=2)\n",
    "    device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')\n",
    "    model.to(device)\n",
    "\n",
    "    # Training\n",
    "    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)\n",
    "    val_loader = DataLoader(val_dataset, batch_size=batch_size)\n",
    "    optimizer = AdamW(model.parameters(), lr=learning_rate)\n",
    "\n",
    "    for epoch in range(epochs):\n",
    "        # Training pass; accumulate the loss so the epoch mean is reported,\n",
    "        # not just the value of the final batch\n",
    "        model.train()\n",
    "        running_loss = 0.0\n",
    "        for batch in train_loader:\n",
    "            optimizer.zero_grad()\n",
    "            input_ids = batch['input_ids'].to(device)\n",
    "            attention_mask = batch['attention_mask'].to(device)\n",
    "            labels = batch['labels'].to(device)\n",
    "            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)\n",
    "            loss = outputs.loss\n",
    "            loss.backward()\n",
    "            optimizer.step()\n",
    "            running_loss += loss.item()\n",
    "        mean_loss = running_loss / max(len(train_loader), 1)\n",
    "\n",
    "        # Validation pass on the held-out split\n",
    "        model.eval()\n",
    "        correct = 0\n",
    "        with torch.no_grad():\n",
    "            for batch in val_loader:\n",
    "                input_ids = batch['input_ids'].to(device)\n",
    "                attention_mask = batch['attention_mask'].to(device)\n",
    "                labels = batch['labels'].to(device)\n",
    "                logits = model(input_ids, attention_mask=attention_mask).logits\n",
    "                correct += (logits.argmax(dim=-1) == labels).sum().item()\n",
    "        val_accuracy = correct / max(len(val_dataset), 1)\n",
    "\n",
    "        print(f'Epoch {epoch+1} | Loss: {mean_loss:.4f} | Val accuracy: {val_accuracy:.3f}')\n",
    "\n",
    "    # Return the model in eval mode so callers get deterministic inference\n",
    "    model.eval()\n",
    "    print('Finished Training')\n",
    "    return model, tokenizer\n",
    "\n",
    "# Run training\n",
    "trained_model, tokenizer = train_baseline_model('dummy_data.csv')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 3. ONNX Export & Quantization (The Correct Way)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "from optimum.onnxruntime import ORTQuantizer, ORTModelForSequenceClassification\n",
    "from optimum.onnxruntime.configuration import AutoQuantizationConfig\n",
    "\n",
    "# Step 1: write the fine-tuned model and tokenizer to a working folder\n",
    "output_dir = \"./siren_model_files\"\n",
    "trained_model.save_pretrained(output_dir)\n",
    "tokenizer.save_pretrained(output_dir)\n",
    "print(f\"Model dan tokenizer sementara disimpan di {output_dir}\")\n",
    "\n",
    "# Step 2: reload through ORTModelForSequenceClassification, which handles the ONNX conversion\n",
    "onnx_model = ORTModelForSequenceClassification.from_pretrained(\n",
    "    output_dir,\n",
    "    export=True,\n",
    ")\n",
    "print(f\"Model berhasil diekspor ke format ONNX.\")\n",
    "\n",
    "# Step 3: quantization settings (AVX2 for CPU, dynamic rather than static)\n",
    "dqconfig = AutoQuantizationConfig.avx2(\n",
    "    is_static=False,\n",
    "    per_channel=False,\n",
    ")\n",
    "\n",
    "# Step 4: create a quantizer for the exported ONNX model\n",
    "quantizer = ORTQuantizer.from_pretrained(onnx_model)\n",
    "\n",
    "# Step 5: quantize and save the result into a new DIRECTORY\n",
    "quantized_model_dir = os.path.join(output_dir, \"quantized_model\")\n",
    "quantizer.quantize(\n",
    "    save_dir=quantized_model_dir,\n",
    "    quantization_config=dqconfig,\n",
    ")\n",
    "print(f\"Model ONNX berhasil di-quantize dan disimpan di direktori {quantized_model_dir}\")\n",
    "\n",
    "# Step 6: point at the model file inside that directory\n",
    "# (the default file name is 'model_quantized.onnx')\n",
    "onnx_model_path = os.path.join(quantized_model_dir, \"model_quantized.onnx\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 4. Verification"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import onnxruntime\n",
    "import numpy as np\n",
    "\n",
    "# Build a verification input with the existing tokenizer;\n",
    "# return_tensors='np' yields the NumPy arrays that onnxruntime needs\n",
    "verify_input = tokenizer(\"this is a sample url for verification\", return_tensors=\"np\")\n",
    "\n",
    "# Create the ONNX Runtime session. The provider is named explicitly because\n",
    "# onnxruntime builds with extra execution providers require it to be set.\n",
    "ort_session = onnxruntime.InferenceSession(onnx_model_path, providers=['CPUExecutionProvider'])\n",
    "\n",
    "# Pass only the inputs the exported graph declares, then run inference\n",
    "expected_inputs = {node.name for node in ort_session.get_inputs()}\n",
    "ort_inputs = {name: value for name, value in verify_input.items() if name in expected_inputs}\n",
    "ort_outs = ort_session.run(None, ort_inputs)\n",
    "\n",
    "# Check the result before reporting success:\n",
    "# one input sentence and num_labels=2 must give finite logits of shape (1, 2)\n",
    "logits = ort_outs[0]\n",
    "assert logits.shape == (1, 2), f'unexpected output shape {logits.shape}'\n",
    "assert np.isfinite(logits).all(), 'quantized model produced non-finite logits'\n",
    "\n",
    "print('ONNX model loaded and verified successfully!')\n",
    "print('Output shape:', ort_outs[0].shape)\n",
    "print('Output logits:', ort_outs[0])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 5. Download the ONNX Model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from google.colab import files\n",
    "\n",
    "# Download the quantized ONNX model file\n",
    "files.download(onnx_model_path)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}

## 1. Setup

In [None]:
# Install necessary libraries
!pip install transformers pandas torch
!pip install onnx onnxruntime

In [None]:
# Ask the user to upload the dummy_data.csv file
from google.colab import files

uploaded = files.upload()

# Report every received file together with its length
for fn in uploaded:
    print(f'User uploaded file "{fn}" with length {len(uploaded[fn])} bytes')

## 2. Model Training

In [None]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
from torch.optim import AdamW
from sklearn.model_selection import train_test_split

def train_baseline_model(file_path, text_column='text', label_column='label',
                         epochs=3, batch_size=8, learning_rate=5e-5, seed=42):
    """Fine-tune DistilBERT as a two-class text classifier on a CSV file.

    Args:
        file_path: Path to a CSV holding the text and label columns.
        text_column: Name of the column with the input text.
        label_column: Name of the column with the integer class label.
        epochs: Number of passes over the training split.
        batch_size: Mini-batch size for training and validation.
        learning_rate: Learning rate passed to AdamW.
        seed: Seed for the train/validation split and for torch's RNG.

    Returns:
        (model, tokenizer): the fine-tuned model, left in eval mode, and the
        tokenizer that encoded the data.

    Raises:
        KeyError: If text_column or label_column is not in the CSV.
    """
    # Seed torch so weight initialisation and batch shuffling repeat across runs
    torch.manual_seed(seed)

    # Load data and fail early, with a clear message, on a wrong schema
    df = pd.read_csv(file_path)
    missing = [col for col in (text_column, label_column) if col not in df.columns]
    if missing:
        raise KeyError(f'{file_path} is missing required column(s): {missing}')
    # Rows without text or label cannot be tokenized or used as targets
    df = df.dropna(subset=[text_column, label_column])

    train_texts, val_texts, train_labels, val_labels = train_test_split(
        df[text_column].astype(str).tolist(),
        df[label_column].astype(int).tolist(),
        test_size=0.2,
        random_state=seed,
    )

    # Tokenizer
    tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

    class PhishingDataset(Dataset):
        """Serves tokenizer encodings and labels as per-sample tensor dicts."""

        def __init__(self, encodings, labels):
            self.encodings = encodings
            self.labels = labels

        def __getitem__(self, idx):
            item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
            item['labels'] = torch.tensor(self.labels[idx])
            return item

        def __len__(self):
            return len(self.labels)

    def build_dataset(texts, labels):
        # Tokenize one split and pair the encodings with its labels
        encodings = tokenizer(texts, truncation=True, padding=True, max_length=512)
        return PhishingDataset(encodings, labels)

    train_dataset = build_dataset(train_texts, train_labels)
    val_dataset = build_dataset(val_texts, val_labels)

    # Model
    model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=2)
    device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
    model.to(device)

    # Training
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size)
    optimizer = AdamW(model.parameters(), lr=learning_rate)

    for epoch in range(epochs):
        # Training pass; accumulate the loss so the epoch mean is reported,
        # not just the value of the final batch
        model.train()
        running_loss = 0.0
        for batch in train_loader:
            optimizer.zero_grad()
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            loss.backward()
            optimizer.step()
            running_loss += loss.item()
        mean_loss = running_loss / max(len(train_loader), 1)

        # Validation pass on the held-out split
        model.eval()
        correct = 0
        with torch.no_grad():
            for batch in val_loader:
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                labels = batch['labels'].to(device)
                logits = model(input_ids, attention_mask=attention_mask).logits
                correct += (logits.argmax(dim=-1) == labels).sum().item()
        val_accuracy = correct / max(len(val_dataset), 1)

        print(f'Epoch {epoch+1} | Loss: {mean_loss:.4f} | Val accuracy: {val_accuracy:.3f}')

    # Return the model in eval mode so callers get deterministic inference
    model.eval()
    print('Finished Training')
    return model, tokenizer

# Run training
# The returned model and tokenizer are stored in globals that later cells read.
trained_model, tokenizer = train_baseline_model('dummy_data.csv')

## 3. Quantization

In [None]:
# Quantization is done on the CPU copy of the model
trained_model.to('cpu')

# Dynamically quantize only the Linear layers, storing their weights as int8
quantized_model = torch.quantization.quantize_dynamic(
    model=trained_model,
    qconfig_spec={torch.nn.Linear},
    dtype=torch.qint8,
)

print('Model successfully quantized.')
# To compare the two, print both models:
# print(trained_model)
# print(quantized_model)

## 4. ONNX Export

In [None]:
import torch
from onnxruntime.quantization import QuantType, quantize_dynamic

# Prepare a dummy input for the exporter
dummy_input = tokenizer('this is a sample url', return_tensors='pt')
input_ids = dummy_input['input_ids']
attention_mask = dummy_input['attention_mask']

onnx_model_path = 'siren_model.onnx'
# Intermediate float export; the quantized graph is written to onnx_model_path
float_onnx_path = 'siren_model_fp32.onnx'

# torch.onnx.export does not support the packed int8 Linear modules produced by
# torch.quantization.quantize_dynamic, so the float model is exported here and
# the resulting ONNX graph is quantized with ONNX Runtime instead.
trained_model.to('cpu')
trained_model.eval()  # disable dropout so the traced graph is deterministic

# Export the model
torch.onnx.export(
    trained_model,
    (input_ids, attention_mask),
    float_onnx_path,
    export_params=True,
    # NOTE(review): raised from 11; recent transformers releases may route
    # attention through scaled_dot_product_attention, which the exporter only
    # supports from opset 14 -- confirm against the installed versions.
    opset_version=14,
    do_constant_folding=True,
    input_names=['input_ids', 'attention_mask'],
    output_names=['output'],
    dynamic_axes={'input_ids': {0: 'batch_size', 1: 'sequence'},
                  'attention_mask': {0: 'batch_size', 1: 'sequence'},
                  'output': {0: 'batch_size'}}
)

# Dynamic int8 weight quantization of the exported graph
quantize_dynamic(float_onnx_path, onnx_model_path, weight_type=QuantType.QInt8)

print(f'Model exported to {onnx_model_path}')

## 5. Verification

In [None]:
import onnxruntime
import numpy as np

# Create an ONNX runtime session. The provider is named explicitly because
# onnxruntime builds with extra execution providers require it to be set.
ort_session = onnxruntime.InferenceSession(onnx_model_path, providers=['CPUExecutionProvider'])

# Prepare the dummy input in the format ONNX runtime expects (numpy arrays)
ort_inputs = {
    'input_ids': input_ids.numpy(),
    'attention_mask': attention_mask.numpy()
}

# Run inference
ort_outs = ort_session.run(None, ort_inputs)

# Check the result before reporting success:
# one input sentence and num_labels=2 must give finite logits of shape (1, 2)
logits = ort_outs[0]
assert logits.shape == (1, 2), f'unexpected output shape {logits.shape}'
assert np.isfinite(logits).all(), 'exported model produced non-finite logits'

print('ONNX model loaded and verified successfully!')
print('Output shape:', ort_outs[0].shape)
print('Output logits:', ort_outs[0])

## 6. Download the ONNX Model

In [None]:
# google.colab's files module provides the download call used below
from google.colab import files

# Download the ONNX model file that onnx_model_path points to
files.download(onnx_model_path)