In [None]:
!pip install -q transformers datasets accelerate scikit-learn

In [None]:
import os

from google.colab import drive

# Mount Google Drive at /content/drive; every project path below lives under it.
drive.mount('/content/drive')

# Project layout on Drive: raw data and the exported classifier.
PROJECT_DIR = "/content/drive/MyDrive/voice_phishing_project"
DATA_DIR    = f"{PROJECT_DIR}/data"
MODEL_OUT   = f"{PROJECT_DIR}/models/phishing_classifier"

# Create the directories up front; exist_ok makes re-running this cell harmless.
for directory in (DATA_DIR, MODEL_OUT):
    os.makedirs(directory, exist_ok=True)

for caption, location in (("PROJECT_DIR:", PROJECT_DIR), ("MODEL_OUT:", MODEL_OUT)):
    print(caption, location)


In [None]:
import shutil

# Report capacity of the filesystem backing /content/drive in whole gibibytes.
# The path only exists once drive.mount() has been run.
BYTES_PER_GIB = 1024**3

usage = shutil.disk_usage("/content/drive")
total, used, free = usage  # keep the individual names available to later cells

print(f"Total: {usage.total // BYTES_PER_GIB} GB")
print(f"Used:  {usage.used // BYTES_PER_GIB} GB")
print(f"Free:  {usage.free // BYTES_PER_GIB} GB")


In [None]:
import pandas as pd
from pathlib import Path

# Location of the training CSV inside the project's data directory.
csv_path = Path(DATA_DIR) / "phishing_data.csv"

# Tiny hand-written seed set: label 1 = phishing, label 0 = normal.
phishing_texts = [
    "지금 검찰청입니다. 계좌로 바로 송금하지 않으면 체포됩니다.",
    "은행 보안팀입니다. 보안카드 번호 전부 말씀해 주세요.",
    "해외에서 결제가 발생했습니다. 링크를 눌러 카드 정보를 입력하세요.",
]
normal_texts = [
    "택배 도착했습니다. 문 앞에 두고 갈까요?",
    "오늘 저녁 7시에 통화 가능해요?",
    "사기 예방센터입니다. 절대 계좌 비밀번호를 알려주지 마세요.",
]

# (text, label) pairs, phishing rows first, then normal rows.
data = [(text, 1) for text in phishing_texts] + [(text, 0) for text in normal_texts]

df = pd.DataFrame(data, columns=["text", "label"])

# Write without the pandas index so the file has exactly the two columns.
df.to_csv(csv_path, index=False)
df.head()


In [None]:
from datasets import ClassLabel, Features, Value, load_dataset

# Schema used while parsing the CSV: free text plus an integer label.
features = Features({
    "text": Value("string"),
    "label": Value("int64"),
})

dataset = load_dataset(
    "csv",
    data_files=str(csv_path),
    features=features,
)

# load_dataset puts a single CSV file into the "train" split.
dataset = dataset["train"]

# `stratify_by_column` only accepts ClassLabel columns; with a plain int64
# column train_test_split raises ValueError. Cast first (0 = normal, 1 = phishing).
dataset = dataset.cast_column("label", ClassLabel(names=["normal", "phishing"]))

# Fixed seed so the train/validation partition is identical on every re-run.
dataset = dataset.train_test_split(
    test_size=0.2,
    stratify_by_column="label",
    seed=42,
)

train_ds = dataset["train"]
val_ds = dataset["test"]

train_ds[0], val_ds[0]


In [None]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# Pretrained checkpoint to fine-tune; you can change this later.
MODEL_NAME = "beomi/KcELECTRA-base"

# Sequence-classification head with two outputs: 0 = normal, 1 = phishing.
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2)

# Tokenizer loaded from the same checkpoint name as the model.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)


In [None]:
def tokenize_batch(batch):
    """Tokenize the ``text`` field of a batch with the module-level tokenizer.

    Every sequence is truncated and padded to exactly 128 tokens.

    Args:
        batch: Mapping with a ``"text"`` entry, as passed by ``Dataset.map``
            when ``batched=True``.

    Returns:
        Whatever the tokenizer call returns for the batch's texts.
    """
    encode_options = dict(
        padding="max_length",
        truncation=True,
        max_length=128,
    )
    texts = batch["text"]
    return tokenizer(texts, **encode_options)

# Tokenize each split in batches, then drop the raw text column:
# it is not needed for training any more.
train_tokenized = train_ds.map(tokenize_batch, batched=True).remove_columns(["text"])
val_tokenized   = val_ds.map(tokenize_batch, batched=True).remove_columns(["text"])

# Switch both splits (in place) to the "torch" output format.
for tokenized_split in (train_tokenized, val_tokenized):
    tokenized_split.set_format("torch")

train_tokenized[0]


In [None]:
from transformers import TrainingArguments, Trainer
import numpy as np
from sklearn.metrics import accuracy_score, f1_score

def compute_metrics(pred):
    """Compute accuracy and F1 from an evaluation result.

    Args:
        pred: An object that unpacks into ``(logits, labels)``.

    Returns:
        Dict with ``"accuracy"`` and ``"f1"`` entries.
    """
    logits, labels = pred
    # The predicted class is the index of the largest logit along the last axis.
    predicted = np.argmax(logits, axis=-1)
    return {
        "accuracy": accuracy_score(labels, predicted),
        "f1": f1_score(labels, predicted),
    }

# Run directory for this training job, kept under the project folder.
RUN_DIR = f"{PROJECT_DIR}/runs/phishing_clf"

training_args = TrainingArguments(
    output_dir=RUN_DIR,
    # Optimisation settings
    num_train_epochs=5,
    learning_rate=2e-5,
    weight_decay=0.01,
    # Batch sizes per device
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # Logging
    logging_steps=10,
    report_to="none",   # no W&B, no other loggers
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_tokenized,
    eval_dataset=val_tokenized,
    compute_metrics=compute_metrics,
    processing_class=tokenizer,   # passed as processing_class: no more tokenizer warning
)

# Fine-tune the model on the tokenized training split.
trainer.train()


In [None]:
import torch

# Class index -> human-readable name; list position is the class index.
label_names = dict(enumerate(["normal", "phishing"]))

@torch.no_grad()
def predict(text: str):
    """Classify a single utterance as normal or phishing.

    Uses the module-level ``tokenizer``, ``model`` and ``label_names``.

    Args:
        text: The utterance to classify; truncated to 128 tokens.

    Returns:
        Dict with the input ``text``, the softmax probabilities
        ``normal_prob`` and ``phishing_prob``, the integer ``pred_label``
        and its name ``pred_name``.
    """
    inputs = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        padding=True,
        max_length=128,
    )
    # Move inputs to the same device as the model
    inputs = {k: v.to(model.device) for k, v in inputs.items()}

    # The model may still be in training mode (e.g. right after fine-tuning),
    # in which case dropout would make the probabilities stochastic. Run the
    # forward pass in eval mode and restore the previous mode afterwards.
    was_training = model.training
    model.eval()
    try:
        outputs = model(**inputs)
    finally:
        model.train(was_training)

    # Softmax over the class axis; [0] selects the single input in the batch.
    probs = torch.softmax(outputs.logits, dim=-1)[0].cpu().numpy()
    pred_label = int(probs.argmax())
    return {
        "text": text,
        "normal_prob": float(probs[0]),
        "phishing_prob": float(probs[1]),
        "pred_label": pred_label,
        "pred_name": label_names[pred_label],
    }

# Smoke-test the classifier on one sentence; the trailing expression
# displays the returned dict as the cell output.
sample_text = "지금 계좌로 송금하지 않으면 법적 조치하겠습니다."
test = predict(sample_text)
test

In [None]:
# Evaluate with the trainer (it was given the validation split as eval_dataset).
metrics = trainer.evaluate()
print(metrics)

# Export the model first, then the tokenizer, into the same output directory.
for save_artifact in (trainer.save_model, tokenizer.save_pretrained):
    save_artifact(MODEL_OUT)

print("Saved model to:", MODEL_OUT)
