In [1]:
!pip install -U "transformers>=4.40.0" "datasets>=2.18.0" "accelerate>=0.30.0"



In [2]:
# Cell 1: imports & basic setup

import json
import random
from pathlib import Path
from urllib.parse import urlparse

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    accuracy_score,
    precision_recall_fscore_support,
    classification_report,
)

import torch
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
)
from datasets import Dataset

# Input files, given relative to the working directory (edit to relocate).
DATA_PATH = Path("data.json")
BLACKLIST_PATH = Path("blacklisted_domains.json")

# One fixed seed for Python's, NumPy's and PyTorch's random number generators.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

# Use the GPU when PyTorch can see one (seeding its generators too), else the CPU.
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(SEED)
    DEVICE = torch.device("cuda")
else:
    DEVICE = torch.device("cpu")

DEVICE


device(type='cpu')

In [3]:
# Cell 2: load sessions, URLs, and blacklist

# Sessions and candidate URLs are stored together in one JSON document.
with DATA_PATH.open("r", encoding="utf-8") as f:
    raw_data = json.load(f)

# Session records: the pair builder reads "session_id" and "description".
sessions = raw_data["sessions"]
# URL records: the pair builder reads "id", "url", "session_fit" and "title".
urls = raw_data["urls"]

print(f"#sessions: {len(sessions)}, #urls: {len(urls)}")

# The blacklist file keeps its domain list under the "domains" key; it is held
# as a set because it is only ever used for membership tests.
with BLACKLIST_PATH.open("r", encoding="utf-8") as f:
    blacklist_data = json.load(f)

blacklisted_domains = set(blacklist_data["domains"])
print(f"#blacklisted domains: {len(blacklisted_domains)}")


#sessions: 5, #urls: 60
#blacklisted domains: 244


In [4]:
# Cell 3: build (session, url) pairs with binary labels

examples = []

def compute_label(session_id: int, session_fit: str) -> int:
    """
    Decide whether a URL is allowed (1) or blocked (0) for a given session.

    session_fit encodes which sessions the URL fits:
      - "*"                     -> allowed in every session
      - "x"                     -> blocked in every session
      - a session number ("1".."5") -> allowed only in that session

    Returns 1 if this URL should be allowed for this session_id, else 0.

    Raises ValueError when session_fit is none of the above, including values
    that int() cannot take at all (e.g. None).
    """
    if session_fit == "*":
        return 1
    if session_fit == "x":
        return 0
    # Anything else must name one specific session.
    try:
        fit_id = int(session_fit)
    except (TypeError, ValueError) as exc:
        # TypeError covers non-string junk such as None; chaining keeps the
        # original conversion error visible in the traceback.
        raise ValueError(f"Unexpected session_fit value: {session_fit}") from exc
    return 1 if session_id == fit_id else 0

# Cross every session with every URL: one labelled record per (session, url)
# pair, with sessions in the outer position so records stay grouped by session.
examples.extend(
    {
        "session_id": session["session_id"],
        "session_text": session["description"],
        "url_id": page["id"],
        "url": page["url"],
        "title": page["title"],
        "label": compute_label(session["session_id"], page["session_fit"]),
    }
    for session in sessions
    for page in urls
)

len(examples)


300

In [5]:
# Cell 4: DataFrame + label distribution

# Tabulate the (session, url) pairs and check how skewed the labels are.
df = pd.DataFrame(examples)
print(df.head())

# Counts per label value, ordered 0 then 1.
label_counts = df["label"].value_counts().sort_index()
print("\nLabel distribution (0=Block, 1=Allow):", label_counts, sep="\n")

# Share of pairs labelled 1 ("allow") among all pairs.
print("\nProportion of 'allow' examples:", label_counts.loc[1] / df.shape[0])


   session_id                                       session_text  url_id  \
0           1  I have to implement Google Login and send our ...       1   
1           1  I have to implement Google Login and send our ...       2   
2           1  I have to implement Google Login and send our ...       3   
3           1  I have to implement Google Login and send our ...       4   
4           1  I have to implement Google Login and send our ...       5   

                               url             title  label  
0         https://www.youtube.com/           YouTube      1  
1       https://www.wikipedia.org/         Wikipedia      1  
2          https://www.google.com/            Google      1  
3             https://chatgpt.com/           ChatGPT      0  
4  https://store.steampowered.com/  Welcome to Steam      0  

Label distribution (0=Block, 1=Allow):
label
0    256
1     44
Name: count, dtype: int64

Proportion of 'allow' examples: 0.14666666666666667


In [6]:
# Cell 5: train / val / test split (70 / 10 / 20)

# Stage 1: hold out 30% of the pairs; the remaining 70% is the training set.
# Stratifying on the label keeps the allow/block ratio the same on both sides.
train_df, temp_df = train_test_split(
    df, test_size=0.30, stratify=df["label"], random_state=SEED
)

# Stage 2: split the 30% holdout 1:2 into validation and test, i.e. 10% / 20%
# of the full data (0.3 * 2/3 = 0.2), again stratified on the label.
val_df, test_df = train_test_split(
    temp_df, test_size=2 / 3, stratify=temp_df["label"], random_state=SEED
)

def describe_split(name, split_df):
    """
    Print the size and label balance of one data split.

    name:     text shown at the start of the line (e.g. "Train").
    split_df: DataFrame with a "label" column (1 = allow, 0 = block).

    A label that does not occur in the split counts as 0 instead of raising
    KeyError, and an empty split reports allow%=nan instead of dividing by zero.
    """
    counts = split_df["label"].value_counts().sort_index()
    total = len(split_df)
    # Plain Python ints keep the printed dict readable; NumPy scalars may
    # otherwise show up as "np.int64(...)".
    plain_counts = {label: int(count) for label, count in counts.items()}
    allow_share = plain_counts.get(1, 0) / total if total else float("nan")
    print(f"{name}: n={total}, label counts={plain_counts}, allow%={allow_share:.3f}")

# Report the three splits in pipeline order.
for split_name, split_frame in (("Train", train_df), ("Val", val_df), ("Test", test_df)):
    describe_split(split_name, split_frame)

Train: n=210, label counts={0: np.int64(179), 1: np.int64(31)}, allow%=0.148
Val: n=30, label counts={0: np.int64(26), 1: np.int64(4)}, allow%=0.133
Test: n=60, label counts={0: np.int64(51), 1: np.int64(9)}, allow%=0.150


In [7]:
# Cell 6: baseline using domain blacklist only

def extract_domain(url: str) -> str:
    """
    Extract a normalized hostname from a URL.

    Normalization: lower-case, without userinfo or port, without a trailing dot
    and without a leading "www." (e.g. 'https://www.YouTube.com:443/x' ->
    'youtube.com').  URLs written without a scheme ('youtube.com/x') are handled
    as well.  Returns '' when no hostname can be found.
    """
    url = url.strip()
    # urlparse only recognises a network location after "//", so a bare
    # "example.com/path" needs it prepended.
    if "://" not in url and not url.startswith("//"):
        url = "//" + url
    # .hostname (unlike .netloc) drops "user:pass@" and ":port" and lower-cases.
    host = (urlparse(url).hostname or "").rstrip(".")
    if host.startswith("www."):
        host = host[4:]
    return host

def is_blacklisted(url: str, blacklisted: set) -> bool:
    """
    Return True if the URL's host, or any parent domain of it, is in the blacklist.

    For 'a.b.example.com' the candidates are 'a.b.example.com', 'b.example.com'
    and 'example.com'; the bare last label ('com') is never tried on its own.
    A single-label host such as 'localhost' is checked as-is.  A URL with no
    recognisable host is never blacklisted.
    """
    host = extract_domain(url)
    if not host:
        return False
    parts = host.split(".")

    # Always test the full host first, then keep dropping the left-most label,
    # stopping before only the last label would remain.
    n_candidates = max(1, len(parts) - 1)
    return any(".".join(parts[i:]) in blacklisted for i in range(n_candidates))

def baseline_predict(url: str, blacklisted: set) -> int:
    """
    Context-free baseline driven purely by the domain blacklist.

    Returns 0 (block) when the URL is blacklisted and 1 (allow) otherwise;
    neither the session description nor the page title is consulted.
    """
    if is_blacklisted(url, blacklisted):
        return 0
    return 1

In [8]:
# Cell 7: build input text and convert to HF Dataset

def build_input_text(row):
    """
    Build the single text string fed to the classifier for one example.

    row: mapping-like object (e.g. a DataFrame row) with "session_text" and
         "title" entries.
    Returns the session description and the page title joined by a literal
    " [SEP] " marker.
    """
    session_part = row["session_text"]
    page_part = row["title"]
    return f"Session: {session_part} [SEP] Page: {page_part}"

# Add the model input text to every split.  Each frame is rebuilt with .assign()
# rather than written to in place, so re-running the cell is harmless and no
# column is assigned into a frame that was derived from another frame.
train_df = train_df.assign(text=train_df.apply(build_input_text, axis=1))
val_df = val_df.assign(text=val_df.apply(build_input_text, axis=1))
test_df = test_df.assign(text=test_df.apply(build_input_text, axis=1))

# Quick sanity check.  Printed explicitly: a bare expression in the middle of a
# cell is evaluated but never displayed.
print(train_df[["session_id", "url", "title", "text"]].head())

# Convert to HuggingFace Datasets, keeping only the columns the model needs.
# The index is reset so each Dataset is built from a clean 0..n-1 row index.
train_dataset = Dataset.from_pandas(train_df[["text", "label"]].reset_index(drop=True))
val_dataset   = Dataset.from_pandas(val_df[["text", "label"]].reset_index(drop=True))
test_dataset  = Dataset.from_pandas(test_df[["text", "label"]].reset_index(drop=True))

train_dataset[0]


{'text': 'Session: I am currently studying for Cloud Computing finals. We learnt about Kubernetes, and CI/CD with GitHub Actions plus Terraform IaC. While studying, I also have to implement toy examples from the lecture for further understanding. [SEP] Page: Health Problems Caused by Secondhand Smoke - CDC',
 'label': 0}

In [9]:
# Cell 8: tokenizer and tokenized datasets

# Identifier of the pretrained checkpoint handed to from_pretrained() below.
MODEL_NAME = "distilbert-base-uncased"

# Tokenizer for MODEL_NAME; tokenize_batch() below calls it on each batch of texts.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

def tokenize_batch(batch):
    """
    Tokenize the "text" entries of one batch with the notebook-level tokenizer.

    The tokenizer is asked to truncate and to pad to max_length, with
    max_length=128.  Returns whatever the tokenizer call returns.
    """
    encode_options = {
        "padding": "max_length",
        "truncation": True,
        "max_length": 128,
    }
    return tokenizer(batch["text"], **encode_options)

# Tokenize the three splits with the same batch function, in train/val/test order.
tokenized_train, tokenized_val, tokenized_test = (
    split.map(tokenize_batch, batched=True)
    for split in (train_dataset, val_dataset, test_dataset)
)

# Expose only the model inputs and the label, as PyTorch tensors.
for dset in (tokenized_train, tokenized_val, tokenized_test):
    dset.set_format(
        type="torch",
        columns=["input_ids", "attention_mask", "label"],
    )

tokenized_train[0]


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Map:   0%|          | 0/210 [00:00<?, ? examples/s]

Map:   0%|          | 0/30 [00:00<?, ? examples/s]

Map:   0%|          | 0/60 [00:00<?, ? examples/s]

{'label': tensor(0),
 'input_ids': tensor([  101,  5219,  1024,  1045,  2572,  2747,  5702,  2005,  6112,  9798,
          4399,  1012,  2057, 20215,  2055, 13970,  5677,  7159,  2229,  1010,
          1998, 25022,  1013,  3729,  2007, 21025,  2705, 12083,  4506,  4606,
         14403, 14192, 24264,  2278,  1012,  2096,  5702,  1010,  1045,  2036,
          2031,  2000, 10408,  9121,  4973,  2013,  1996,  8835,  2005,  2582,
          4824,  1012,   102,  3931,  1024,  2740,  3471,  3303,  2011,  2117,
         11774,  5610,  1011, 26629,   102,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,   

In [10]:
# Cell 9: model + DataLoaders + optimizer (no Trainer)

from torch.utils.data import DataLoader

num_labels = 2  # binary task: 0 = block, 1 = allow

# MODEL_NAME is defined once, in the tokenizer cell.  Reusing it here instead of
# redefining it guarantees the model and the tokenizer come from the same
# checkpoint even if the name is edited later.
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=num_labels,
)
model.to(DEVICE)

# DataLoaders from the tokenized datasets.  Only the training loader shuffles;
# validation and test keep dataset order.
train_loader = DataLoader(tokenized_train, batch_size=8, shuffle=True)
val_loader   = DataLoader(tokenized_val,   batch_size=16, shuffle=False)
test_loader  = DataLoader(tokenized_test,  batch_size=16, shuffle=False)

# Optimizer over all model parameters.
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5, weight_decay=0.01)

print("Using device:", DEVICE)
print("Train batches:", len(train_loader),
      "Val batches:", len(val_loader),
      "Test batches:", len(test_loader))


model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Using device: cpu
Train batches: 27 Val batches: 2 Test batches: 4


In [11]:
# Cell 10: training loop with early stopping based on validation loss

def train_one_epoch(loader):
    """
    Run one optimisation pass over `loader` with the notebook-level model and
    optimizer, and return the mean training loss.

    Each batch must provide "input_ids", "attention_mask" and "label" tensors.
    Every batch loss is weighted by the number of rows in that batch, so the
    returned value is a per-example average over the whole epoch.
    """
    model.train()
    loss_sum, rows_seen = 0.0, 0

    for batch in loader:
        # Move the three tensors the model needs onto the training device.
        tensors = {
            key: batch[key].to(DEVICE)
            for key in ("input_ids", "attention_mask", "label")
        }

        optimizer.zero_grad()
        step_loss = model(
            input_ids=tensors["input_ids"],
            attention_mask=tensors["attention_mask"],
            labels=tensors["label"],
        ).loss
        step_loss.backward()
        optimizer.step()

        n_rows = tensors["input_ids"].size(0)
        loss_sum += step_loss.item() * n_rows
        rows_seen += n_rows

    return loss_sum / rows_seen


def evaluate_loss(loader):
    """
    Return the mean loss of the notebook-level model over `loader`, computed in
    eval mode with gradient tracking disabled.

    Each batch must provide "input_ids", "attention_mask" and "label" tensors.
    Every batch loss is weighted by the number of rows in that batch, so the
    returned value is a per-example average.
    """
    model.eval()
    loss_sum, rows_seen = 0.0, 0

    with torch.no_grad():
        for batch in loader:
            tensors = {
                key: batch[key].to(DEVICE)
                for key in ("input_ids", "attention_mask", "label")
            }
            batch_loss = model(
                input_ids=tensors["input_ids"],
                attention_mask=tensors["attention_mask"],
                labels=tensors["label"],
            ).loss

            n_rows = tensors["input_ids"].size(0)
            loss_sum += batch_loss.item() * n_rows
            rows_seen += n_rows

    return loss_sum / rows_seen


# Training driver: train for up to max_epochs, keep a copy of the weights from
# the epoch with the lowest validation loss, and stop early once validation loss
# has failed to improve for `patience` consecutive epochs.
max_epochs = 15      # upper bound on epochs
patience   = 3       # stop if val loss doesn't improve for 3 epochs
best_val_loss = float("inf")   # lowest validation loss seen so far
epochs_without_improve = 0     # consecutive epochs without a new best
best_state_dict = None         # CPU copy of the weights at the best epoch

for epoch in range(1, max_epochs + 1):
    train_loss = train_one_epoch(train_loader)
    val_loss   = evaluate_loss(val_loader)

    print(f"Epoch {epoch}: train_loss={train_loss:.4f}, val_loss={val_loss:.4f}")

    # An epoch only counts as an improvement if it beats the best loss by more
    # than 1e-4.
    if val_loss < best_val_loss - 1e-4:  # small margin to avoid noise
        best_val_loss = val_loss
        epochs_without_improve = 0
        # Snapshot the weights on the CPU; clone() detaches the copy from the
        # live parameters, which later epochs keep updating.
        best_state_dict = {k: v.cpu().clone() for k, v in model.state_dict().items()}
    else:
        epochs_without_improve += 1
        if epochs_without_improve >= patience:
            print(f"Early stopping triggered after {epoch} epochs.")
            break

# Load best weights back into the model (skipped if no epoch ever improved on
# the initial infinite loss, in which case best_state_dict is still None).
if best_state_dict is not None:
    model.load_state_dict(best_state_dict)
    model.to(DEVICE)
    print("Loaded best model state with val_loss =", best_val_loss)


Epoch 1: train_loss=0.4418, val_loss=0.4208
Epoch 2: train_loss=0.4234, val_loss=0.3514
Epoch 3: train_loss=0.3511, val_loss=0.1893
Epoch 4: train_loss=0.3014, val_loss=0.3595
Epoch 5: train_loss=0.1999, val_loss=0.2429
Epoch 6: train_loss=0.1113, val_loss=0.2893
Early stopping triggered after 6 epochs.
Loaded best model state with val_loss = 0.18925433059533436


In [12]:
# Cell 11: TP / TN / FP / FN for model and baseline on TEST

@torch.no_grad()
def get_confusion_counts(loader):
    model.eval()
    all_labels = []
    all_preds = []

    for batch in loader:
        input_ids      = batch["input_ids"].to(DEVICE)
        attention_mask = batch["attention_mask"].to(DEVICE)
        labels         = batch["label"].to(DEVICE)

        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
        )
        logits = outputs.logits
        preds = torch.argmax(logits, dim=-1)

        all_labels.append(labels.cpu().numpy())
        all_preds.append(preds.cpu().numpy())

    y_true = np.concatenate(all_labels)
    y_pred = np.concatenate(all_preds)

    # positive class = 1 ("allow")
    tp = int(((y_pred == 1) & (y_true == 1)).sum())
    tn = int(((y_pred == 0) & (y_true == 0)).sum())
    fp = int(((y_pred == 1) & (y_true == 0)).sum())
    fn = int(((y_pred == 0) & (y_true == 1)).sum())

    return tp, tn, fp, fn, y_true, y_pred


# Transformer predictions and confusion counts on the test loader.
tp_m, tn_m, fp_m, fn_m, y_true, y_model = get_confusion_counts(test_loader)

print("=== Transformer model confusion counts on TEST (positive = allow) ===")
print(f"TP: {tp_m}\nTN: {tn_m}\nFP: {fp_m}\nFN: {fn_m}")

# Baseline predictions for the same TEST rows, taken in test_df order.
y_true_base = test_df["label"].to_numpy()
y_base = np.array(
    [baseline_predict(link, blacklisted_domains) for link in test_df["url"]]
)

# Same convention as for the model: positive class = 1 ("allow").
tp_b = int(np.sum((y_base == 1) & (y_true_base == 1)))
tn_b = int(np.sum((y_base == 0) & (y_true_base == 0)))
fp_b = int(np.sum((y_base == 1) & (y_true_base == 0)))
fn_b = int(np.sum((y_base == 0) & (y_true_base == 1)))

print("\n=== Baseline confusion counts on TEST (positive = allow) ===")
print(f"TP: {tp_b}\nTN: {tn_b}\nFP: {fp_b}\nFN: {fn_b}")


=== Transformer model confusion counts on TEST (positive = allow) ===
TP: 4
TN: 49
FP: 2
FN: 5

=== Baseline confusion counts on TEST (positive = allow) ===
TP: 7
TN: 8
FP: 43
FN: 2


In [13]:
# Cell 12: qualitative examples / error analysis

# Attach predictions back to original test_df (align by index)
# Copy of the test rows renumbered 0..n-1, with the true labels and both
# predictors' outputs attached position by position.
test_df_with_preds = test_df.reset_index(drop=True).assign(
    true_label=y_true,
    baseline_pred=y_base,
    model_pred=y_model,
)

def show_examples(df, n=8, condition=None, title="Examples"):
    """
    Print up to `n` randomly chosen rows of `df` in a readable multi-line form.

    df:        frame with session_id, session_text, url, title, true_label,
               baseline_pred and model_pred columns.
    n:         maximum number of rows to print.
    condition: optional boolean mask; when given, only matching rows are eligible.
    title:     heading printed above the examples.

    Rows are drawn with random_state=SEED.  When no row is eligible, a short
    notice is printed instead.
    """
    rule = "=" * 20
    print("\n" + rule, title, rule)

    pool = df[condition] if condition is not None else df
    pool_size = len(pool)
    if pool_size == 0:
        print("No examples match this condition.")
        return

    chosen = pool.sample(n=min(n, pool_size), random_state=SEED)
    for idx, rec in chosen.iterrows():
        print(
            "\n".join(
                [
                    f"\n--- Example {idx} ---",
                    f"Session {rec['session_id']}: {rec['session_text']}",
                    f"URL:   {rec['url']}",
                    f"Title: {rec['title']}",
                    f"True label:       {rec['true_label']}  (1=allow, 0=block)",
                    f"Baseline predict: {rec['baseline_pred']}",
                    f"Model predict:    {rec['model_pred']}",
                ]
            )
        )

# Row-wise correctness of each predictor against the true label.
model_right = test_df_with_preds["model_pred"] == test_df_with_preds["true_label"]
baseline_right = test_df_with_preds["baseline_pred"] == test_df_with_preds["true_label"]

# The three disagreement patterns examined below.
cond_model_better = model_right & ~baseline_right       # model correct, baseline wrong
cond_baseline_better = baseline_right & ~model_right    # baseline correct, model wrong
cond_both_wrong = ~model_right & ~baseline_right        # neither predictor correct

show_examples(test_df_with_preds, n=5, condition=cond_model_better,
              title="Model correct, baseline wrong")

show_examples(test_df_with_preds, n=5, condition=cond_baseline_better,
              title="Baseline correct, model wrong")

show_examples(test_df_with_preds, n=5, condition=cond_both_wrong,
              title="Both model and baseline wrong")




--- Example 52 ---
Session 1: I have to implement Google Login and send our calendar schedule to Google Calendar or receive schedule from Google Calendar.
URL:   https://nlpinkorean.github.io/illustrated-transformer/
Title: The Illustrated Transformer - NLP in Korean
True label:       0  (1=allow, 0=block)
Baseline predict: 1
Model predict:    0

--- Example 35 ---
Session 2: I am currently studying for Deep Neural Network finals. The course covers mathematical foundations and newer models such as Attention, GAN, LSTM, RLHF, and seq2seq, but does not focus on practical implementation.
URL:   https://platform.openai.com/docs/concepts
Title: Model concepts - OpenAI API
True label:       0  (1=allow, 0=block)
Baseline predict: 1
Model predict:    0

--- Example 36 ---
Session 1: I have to implement Google Login and send our calendar schedule to Google Calendar or receive schedule from Google Calendar.
URL:   https://www.aeccglobal.com/advice/10-best-ai-tools-for-students
Title: 10 Best 