#Step 0: Installation of Packages


In [None]:
!pip uninstall -y wandb     # avoid experiment tracking
!pip install transformers[torch] -q
!pip install dataset -q
!pip install evaluate -q
!pip install evaluate -q


Found existing installation: wandb 0.19.8
Uninstalling wandb-0.19.8:
  Successfully uninstalled wandb-0.19.8
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m14.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m10.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m11.5 MB/s[0m eta [36m0:00:00[0m
[2K   [91m━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m177.2/664.8 MB[0m [31m49.4 MB/s[0m eta [36m0:00:10[0m

#Step 1: Obtain the dataset (bgspaditya/byt-malicious-url-treatment) | 載入惡意網址分類資料

In [None]:
from datasets import load_dataset

# Fetch the malicious-URL dataset from the Hugging Face Hub (no split given,
# so every available split is loaded).
ds = load_dataset(path="bgspaditya/byt-malicious-url-treatment")

In [None]:
from datasets import Dataset, DatasetDict, load_dataset

# Load the train and test portions separately using split-slicing syntax:
# "train[100000:]" keeps the train rows from index 100000 onward and
# "test[:100000]" keeps the first 100000 test rows.
train_dataset = load_dataset("bgspaditya/byt-malicious-url-treatment", split="train[100000:]")
test_dataset = load_dataset("bgspaditya/byt-malicious-url-treatment", split="test[:100000]")

# Bundle both splits into one DatasetDict so later cells can address them
# as dataset["train"] and dataset["test"].
dataset = DatasetDict({
    "train": train_dataset,
    "test": test_dataset,
})

# The braces are required: without them the f-string prints the literal text
# "type(dataset)" instead of the actual type.
print(f"data type = {type(dataset)}")

In [None]:
# Dataset structure: the DatasetDict built above is the cell's last
# expression, so the notebook displays its representation.
dataset

#Step 2: Create the model and tokenizer objects | 建立模型與 tokenizer

In [None]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# Pre-trained DistilBERT checkpoint loaded as a sequence-classification model;
# num_labels=4 sets the number of output classes.
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert/distilbert-base-uncased",
    num_labels=4,
)

# Tokenizer loaded from the same checkpoint as the model.
tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-uncased")

#Step 3: Generate Dataset for Fine-tuning | 建立微調



In [None]:
# Sanity check: show the container type of `dataset` before mapping over it.
type(dataset)

In [None]:
from transformers import AutoTokenizer
# Every URL is wrapped in the instruction prompt
# 'Please categorize the url into the following type: Benign, defacement, malware, phishing'
# before tokenization, and its label is taken from the "type_code" column.
def tokenize_function(examples):
    """Tokenize a batch of URLs and attach their labels.

    Args:
        examples: Batch mapping that provides a "url" sequence and a
            "type_code" sequence (it is passed to ``dataset.map`` with
            ``batched=True``).

    Returns:
        The output of the module-level ``tokenizer`` for the prompt-wrapped
        URLs (padded/truncated to 256 tokens), with an extra "labels" entry
        holding ``examples["type_code"]``.
    """
    prompts = []
    for url in examples["url"]:
        prompts.append(
            f'Please categorize the url into the following type: Benign, defacement, malware, phishing. URL:{url}'
        )

    encoded = tokenizer(
        prompts,
        truncation=True,
        max_length=256,
        padding="max_length",
    )
    # The label column is added under the name "labels".
    encoded["labels"] = examples["type_code"]
    return encoded

# Apply the tokenization function to every split, one batch at a time.
tokenized_datasets = dataset.map(function=tokenize_function, batched=True)



In [None]:
# Training subset: shuffle with a fixed seed, then keep the first 15,000
# records (i.e. a reproducible random sample, not the original top rows).
small_train_dataset = (
    tokenized_datasets["train"]
    .shuffle(seed=42)
    .select(range(15000))
)
# Evaluation subset: 10,000 records drawn the same way from the test split.
small_eval_dataset = (
    tokenized_datasets["test"]
    .shuffle(seed=42)
    .select(range(10000))
)


In [None]:
# Inspect one tokenized training record (index 5) to check the mapped columns.
print(small_train_dataset[5])

#Step 4a: Finetune the pre-trained model

In [None]:
import os

import numpy as np
from evaluate import load
from transformers import Trainer, TrainingArguments

# Environment flag asking the training stack not to use Weights & Biases.
os.environ["WANDB_DISABLED"] = "true"

# Accuracy metric, loaded through the `evaluate` library.
metric = load("accuracy")

# Metric callback handed to the Trainer.

def compute_metrics(eval_pred):
    """Score one evaluation pass with the module-level accuracy ``metric``.

    Args:
        eval_pred: Object that unpacks into ``(logits, labels)``.

    Returns:
        The result of ``metric.compute`` for the predicted class ids
        (arg-max over the last axis of the logits) against ``labels``.
    """
    raw_scores, gold = eval_pred
    predicted_ids = np.argmax(raw_scores, axis=-1)
    return metric.compute(references=gold, predictions=predicted_ids)

# Training configuration.
training_args = TrainingArguments(
    output_dir="test_trainer",   # directory the Trainer writes its outputs to
    num_train_epochs=4,
    eval_strategy="epoch",       # run evaluation at the end of every epoch
    weight_decay=0.1,
    # Disable every logging integration explicitly instead of relying only on
    # the deprecated WANDB_DISABLED environment variable.
    report_to="none",
)

# Trainer wiring: model, configuration, the sampled train/eval subsets and
# the accuracy callback.
trainer = Trainer(
    args=training_args,
    model=model,
    compute_metrics=compute_metrics,
    eval_dataset=small_eval_dataset,
    train_dataset=small_train_dataset,
)

# Start fine-tuning with the Trainer configured above; the returned value is
# displayed as the cell output.
trainer.train()

#Step 4b: Evaluate the finetuned model

In [None]:
# Evaluate the fine-tuned model on the eval_dataset given to the Trainer
# (small_eval_dataset); the result is displayed as the cell output.
trainer.evaluate()

#Step 4c: Save the finetuned model

In [None]:
# Write the fine-tuned model and the tokenizer files into the same directory
# so the folder can be reloaded on its own later.
trainer.save_model(output_dir='./fine_tuned_model')
tokenizer.save_pretrained(save_directory="./fine_tuned_model")

In [None]:
# Archive the saved model directory (recursively) for download or sharing.
!zip -r fine_tuned_model.zip fine_tuned_model

  adding: fine_tuned_model/ (stored 0%)
  adding: fine_tuned_model/config.json (deflated 50%)
  adding: fine_tuned_model/special_tokens_map.json (deflated 42%)
  adding: fine_tuned_model/tokenizer.json (deflated 71%)
  adding: fine_tuned_model/vocab.txt (deflated 53%)
  adding: fine_tuned_model/training_args.bin (deflated 51%)
  adding: fine_tuned_model/tokenizer_config.json (deflated 75%)
  adding: fine_tuned_model/model.safetensors (deflated 8%)


In [None]:
!unzip "/content/fine_tuned_model.zip" -d "/content/fine_tuned_model/"

Archive:  /content/fine_tuned_model.zip
   creating: /content/fine_tuned_model/fine_tuned_model/
  inflating: /content/fine_tuned_model/fine_tuned_model/config.json  
  inflating: /content/fine_tuned_model/fine_tuned_model/special_tokens_map.json  
  inflating: /content/fine_tuned_model/fine_tuned_model/tokenizer.json  
  inflating: /content/fine_tuned_model/fine_tuned_model/vocab.txt  
  inflating: /content/fine_tuned_model/fine_tuned_model/training_args.bin  
  inflating: /content/fine_tuned_model/fine_tuned_model/tokenizer_config.json  
  inflating: /content/fine_tuned_model/fine_tuned_model/model.safetensors  


#Step 5: Test the saved model

In [None]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline

# Reload the fine-tuned tokenizer and model from the extracted directory.
model_path = "./fine_tuned_model/fine_tuned_model"
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForSequenceClassification.from_pretrained(model_path)

# Text-classification pipeline built from the reloaded model and tokenizer.
classifier = pipeline(
    task="text-classification",
    model=model,
    tokenizer=tokenizer,
)

# Sample inputs for a quick manual check of the classifier.
test_domains = [
    # Safe website (benign)
    "pornhub.com/video?c=b",
    # Suspicious website (possibly malicious)
    "http://bon-dom.com/poleznyie-stati/index.html",
    # Looks like Apple, but is actually suspicious
    "apple-verfiy.com",
    # Suspicious domain
    "meetyourneighbour.ca",
]

# Map the numeric label id produced by the model to a class name.
# The position in the tuple is the label id (0, 1, 2, 3).
label_map = dict(
    enumerate(
        (
            "Benign",      # safe website
            "defacement",  # defaced web page
            "malware",     # malicious software
            "phishing",    # phishing
        )
    )
)

# Run inference and pretty-print the results.
#
# The model was fine-tuned on prompt-wrapped URLs (see tokenize_function), so
# each test input is wrapped in the same prompt here. Passing the bare URL
# would give the model text in a different format from its training data.
PROMPT_TEMPLATE = (
    "Please categorize the url into the following type: "
    "Benign, defacement, malware, phishing. URL:{url}"
)

print("=== Inference Results ===\n")
for domain in test_domains:
    # The pipeline returns a list of results; take the first entry.
    result = classifier(PROMPT_TEMPLATE.format(url=domain))[0]
    print(result)
    # Parse the label: strip the "LABEL_" prefix and convert the rest to an int.
    label_int = int(result['label'].replace("LABEL_", ""))

    # Look up the class name; fall back to "Unknown" for unmapped ids.
    label = label_map.get(label_int, "Unknown")

    # Score reported by the pipeline for this prediction.
    score = result['score']

    print(f"Domain: {domain:<35} → Prediction: {label:<20} | Score: {score:.4f}")





#Upload | 上傳

In [None]:
# Authenticate with the Hugging Face Hub before uploading; the command prompts
# for an access token (input is hidden).
!huggingface-cli login


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    To log in, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Enter your token (input will not be visible): 

In [None]:
# Target repository on the Hugging Face Hub.
repo_name = "Eason918/malicious-url-detector"

# Upload the model first, then the tokenizer, to the same repository.
model.push_to_hub(repo_id=repo_name)
tokenizer.push_to_hub(repo_id=repo_name)