### Konkatenacja datasetów
[Pierwszy dataset](https://www.kaggle.com/datasets/shashwatwork/web-page-phishing-detection-dataset?resource=download)

[Drugi dataset](https://archive.ics.uci.edu/dataset/967/phiusiil+phishing+url+dataset)

In [1]:
import gdown
import os
import zipfile
import pandas as pd
from typing import Dict, List, Tuple, Any

In [2]:
def unzip_file(file_path: str) -> str:
    with zipfile.ZipFile(file_path, 'r') as zip_ref:
        zip_ref.extractall()
        return zip_ref.namelist()[0]

In [3]:
def clean_up(file_paths: List[str]) -> None:
    for file_path in file_paths:
        os.remove(file_path)

In [4]:
def prepare_phish_dataset(
        target_file_path: str,
        gdown_link: str,
        columns: List[str] = ["url", "status"],
        columns_mapping: Dict[str, str] = {},
        labels_mapping: Dict[str, str] = {}
) -> None:
    gdown.download(gdown_link, output="zipped_dataset", quiet=True)
    dataset_path: str = unzip_file("zipped_dataset")
    df = pd.read_csv(dataset_path)

    if columns_mapping:
        df = df[columns].rename(columns=columns_mapping)
    if labels_mapping:
        df["label"] = df["label"].map(labels_mapping)

    if os.path.exists(target_file_path):
        df.to_csv(target_file_path, mode='a', index=False, header=False)
    else:
        df.to_csv(target_file_path, index=False)

    clean_up(["zipped_dataset", dataset_path])

In [5]:
target_file_path: str = "phish_dataset.csv"
gdown_prefix: str = "https://drive.google.com/uc?id="

In [6]:
prepare_phish_dataset(
    target_file_path,
    gdown_link=f"{gdown_prefix}1cZIVx5vabrMZnTvSo1L05gBYtJgGeE8U",
    columns=['url', 'status'],
    columns_mapping={'status': 'label'},
    labels_mapping={'legitimate': 0, 'phishing': 1}
)

In [7]:
prepare_phish_dataset(
    target_file_path,
    gdown_link=f"{gdown_prefix}1Qr2c3ax7ugQC4hvw-Au9vcl4cOEhJE_k",
    columns=['URL', 'label'],
    columns_mapping={'URL': 'url', 'label': 'label'}
)

In [8]:
df = pd.read_csv(target_file_path)
df

Unnamed: 0,url,label
0,http://www.crestonwood.com/router.php,0
1,http://shadetreetechnology.com/V4/validation/a...,1
2,https://support-appleld.com.secureupdate.duila...,1
3,http://rgipt.ac.in,0
4,http://www.iracing.com/tracks/gateway-motorspo...,0
...,...,...
247220,https://www.skincareliving.com,1
247221,https://www.winchester.gov.uk,1
247222,https://www.nononsensedesign.be,1
247223,https://patient-cell-40f5.updatedlogmylogin.wo...,0


In [9]:
! pip install -q pytorch-lightning==2.1.0

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/774.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m774.6/774.6 kB[0m [31m46.1 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/926.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m926.4/926.4 kB[0m [31m58.4 MB/s[0m eta [36m0:00:00[0m
[?25h

In [10]:
import os
import pandas as pd
from torch.utils.data import Dataset, DataLoader
import torch
import pytorch_lightning as pl

In [13]:
class PhishingDataModule(pl.LightningDataModule):
    def __init__(self: "PhishingDataModule", dataset: Dataset, batch_size: int = 32) -> None:
        super().__init__()
        self.dataset = dataset
        self.batch_size = batch_size


    def split_data(self: "PhishingDataModule", train_size: float, val_size: float) -> None:
        total_size = len(self.dataset)
        train_size = int(train_size * total_size)
        val_size = int(val_size * total_size)
        test_size = total_size - train_size - val_size
        self.train_dataset, self.val_dataset, self.test_dataset = torch.utils.data.random_split(
            self.dataset, [train_size, val_size, test_size]
        )


    def setup(self: "PhishingDataModule", train_size: float = 0.7, val_size: float = 0.1, stage=None) -> None:
        self.split_data(train_size, val_size)


    def train_dataloader(self: "PhishingDataModule") -> DataLoader:
        return DataLoader(self.train_dataset, batch_size=self.batch_size, shuffle=True, num_workers=1)


    def val_dataloader(self: "PhishingDataModule") -> DataLoader:
        return DataLoader(self.val_dataset, batch_size=self.batch_size, num_workers=1)


    def test_dataloader(self: "PhishingDataModule") -> DataLoader:
        return DataLoader(self.test_dataset, batch_size=self.batch_size, num_workers=1)

In [42]:
class PhishingEmbeddingDataset(Dataset):
    def __init__(self: "PhishingEmbeddingDataset", csv_file_path: str, max_length: int = 200) -> None:
        self.data: pd.DataFrame = pd.read_csv(csv_file_path)
        self.max_length: int = max_length
        self.alphabet: List[str] = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789-._~:/?#[]@!$&'()*+,;") + [' ']
        self.char_to_idx: Dict[str, int] = {char: idx for idx, char in enumerate(self.alphabet, start=1)}  # Start indexing from 1
        self.pad_idx = 0  # Padding index for unused positions

    def parse_url(self: "PhishingEmbeddingDataset", url: str) -> str:
        parsed_url = url.split("://")[-1]
        if "/" in parsed_url:
            parsed_url = parsed_url.split("/")[0]
        return parsed_url

    def pad_or_trim(self: "PhishingEmbeddingDataset", url: str) -> List[int]:
        url_indices = [self.char_to_idx.get(char, self.pad_idx) for char in url]
        if len(url_indices) > self.max_length:
            url_indices = url_indices[:self.max_length]
        else:
            url_indices.extend([self.pad_idx] * (self.max_length - len(url_indices)))
        return url_indices

    def __len__(self: "PhishingEmbeddingDataset") -> int:
        return len(self.data)

    def __getitem__(self: "PhishingEmbeddingDataset", idx: int) -> Tuple[torch.Tensor, int]:
        url = self.data.iloc[idx, 0]
        label = self.data.iloc[idx, 1]

        parse_url = self.parse_url(url)
        encoded_url = self.pad_or_trim(parse_url)

        return torch.tensor(encoded_url, dtype=torch.long), label


    def get_stats(self: "PhishingEmbeddingDataset") -> Dict[str, Any]:
        labels = self.data["label"].unique()
        stats = [{
            "Label": label,
            "Count": len(self.data[self.data["label"] == label]),
            "Percent": round(len(self.data[self.data["label"] == label]) / len(self.data) * 100, 2)
        } for label in labels]
        return stats


    def print_stats(self: "PhishingEmbeddingDataset") -> None:
        stats = self.get_stats()
        for stat in stats:
            print(f"Label: {stat['Label']}, Count: {stat['Count']}, Percent: {stat['Percent']}%")

In [43]:
dataset = PhishingEmbeddingDataset(csv_file_path=target_file_path)
dataset.print_stats()

Label: 0, Count: 106660, Percent: 43.14%
Label: 1, Count: 140565, Percent: 56.86%


In [44]:
data_module = PhishingDataModule(dataset=dataset, batch_size=32)
data_module.setup(train_size=0.7, val_size=0.1)

In [45]:
import torch
import torch.nn as nn
import torch.nn.functional as F

In [68]:
class PhisherEmbeddingModel(nn.Module):
    def __init__(self, vocab_size: int, embedding_dim: int, out_features: int = 2):
        super().__init__()
        self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dim, padding_idx=0)
        self.conv1 = nn.Conv2d(in_channels=1, out_channels=6, kernel_size=(5, 1))
        self.conv2 = nn.Conv2d(in_channels=6, out_channels=12, kernel_size=(5, 1))
        self.fc1 = nn.Linear(in_features=12 * 47 * 100, out_features=120)
        self.fc2 = nn.Linear(in_features=120, out_features=60)
        self.out = nn.Linear(in_features=60, out_features=out_features)


    def forward(self, x):
        x = self.embedding(x)
        x = x.unsqueeze(1)

        x = self.conv1(x)
        x = F.relu(x)
        x = F.max_pool2d(x, kernel_size=(2, 1), stride=(2, 1))  # Adjust pooling

        x = self.conv2(x)
        x = F.relu(x)
        x = F.max_pool2d(x, kernel_size=(2, 1), stride=(2, 1))  # Adjust pooling

        x = x.reshape(-1, 12 * 47 * 100)
        x = self.fc1(x)
        x = F.relu(x)
        x = self.fc2(x)
        x = F.relu(x)
        x = self.out(x)
        return x


In [62]:
import torchmetrics
import torch
import torch.nn.functional as F
import pytorch_lightning as pl



class PhisherhModule(pl.LightningModule):
    def __init__(self, model, optimizer, num_classes, learning_rate=1e-3):
        super().__init__()
        self.save_hyperparameters()
        self.learning_rate = learning_rate
        self.num_classes = num_classes
        self.model = model
        self.optimizer = optimizer

        self.accuracy = torchmetrics.Accuracy(task="multiclass", num_classes=self.num_classes)
        self.f1 = torchmetrics.F1Score(task="multiclass", num_classes=self.num_classes)
        self.precision = torchmetrics.Precision(task="multiclass", num_classes=self.num_classes)
        self.recall = torchmetrics.Recall(task="multiclass", num_classes=self.num_classes)


    def forward(self, x):
        return self.model(x)


    def compute_loss(self, x, y):
        return F.cross_entropy(x, y)


    def common_step(self, batch, batch_idx):
        x, y = batch
        outputs = self(x)
        loss = self.compute_loss(outputs,y)
        return loss, outputs, y


    def common_test_valid_step(self, batch, batch_idx):
        loss, outputs, y = self.common_step(batch, batch_idx)
        preds = torch.argmax(outputs, dim=1)

        acc = self.accuracy(preds, y)
        f1 = self.f1(preds, y)
        precision = self.precision(preds, y)
        recall = self.recall(preds, y)
        return loss, acc, f1, precision, recall


    def training_step(self, batch, batch_idx):
        loss, acc, _, __, ___ = self.common_test_valid_step(batch, batch_idx)
        self.log('train_loss', loss, on_step=True, on_epoch=True, logger=True)
        self.log('train_acc', acc, on_step=True, on_epoch=True, logger=True)
        return loss


    def validation_step(self, batch, batch_idx):
        loss, acc, f1, precision, recall = self.common_test_valid_step(batch, batch_idx)
        self.log('val_loss', loss, prog_bar=True)
        self.log('val_acc', acc, prog_bar=True)
        self.log('val_f1', f1, prog_bar=True)
        self.log('val_precision', precision, prog_bar=True)
        self.log('val_recall', recall, prog_bar=True)
        return loss


    def test_step(self, batch, batch_idx):
        loss, acc, f1, precision, recall = self.common_test_valid_step(batch, batch_idx)
        self.log('test_loss', loss, prog_bar=True)
        self.log('test_acc', acc, prog_bar=True)
        self.log('test_f1', f1, prog_bar=True)
        self.log('test_precision', precision, prog_bar=True)
        self.log('test_recall', recall, prog_bar=True)
        return loss


    def configure_optimizers(self):
        return self.optimizer

In [86]:
model = PhisherEmbeddingModel(vocab_size=84, embedding_dim=100) # 84 length of alphabet
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
module = PhisherhModule(model, optimizer, num_classes=2)

In [None]:
from pytorch_lightning.callbacks import Callback
from pytorch_lightning.callbacks.early_stopping import EarlyStopping
from pytorch_lightning.callbacks import ModelCheckpoint
from pytorch_lightning.loggers import WandbLogger
import wandb

In [None]:
class PhishingPredictionLogger(Callback):
    def __init__(self, val_samples, num_sumples=32):
        super().__init__()
        self.val_samples = val_samples
        self.val_url, self.val_labels = val_samples
    
    def on_validation_epoch_end(self, trainer, pl_module):
        # Bring the tensors to CPU
        val_url = self.val_url.to(device=pl_module.device)
        val_labels = self.val_labels.to(device=pl_module.device)
        # Get model prediction
        logits = pl_module(val_url)
        preds = torch.argmax(logits, -1)

        trainer.logger.experiment.log({
            "examples": [
                wandb.Table(columns=["Url", "Prediction", "Label"], 
                            data=[(x.cpu().numpy(), pred.item(), label.item()) 
                                  for x, pred, label in zip(val_url[:self.num_samples], 
                                                            preds[:self.num_samples], 
                                                            val_labels[:self.num_samples])])
            ]
        })

In [None]:
!wandb login --relogin

In [None]:
early_stop_callback = EarlyStopping(
   monitor='val_loss',
   patience=3,
   verbose=False,
   mode='min'
)

MODEL_CKPT_PATH = 'model/'
MODEL_CKPT = 'model-{epoch:02d}-{val_loss:.2f}'

checkpoint_callback = ModelCheckpoint(
    monitor='val_loss',
    dirpath=MODEL_CKPT_PATH,
    filename=MODEL_CKPT,
    save_top_k=3,
    mode='min')

wandb_logger = WandbLogger(project='phisher', job_type='train')

In [None]:
trainer = pl.Trainer(check_val_every_n_epoch = 2, num_sanity_val_steps=0, accelerator="auto", max_epochs=50, devices=1, logger=wandb_logger, callbacks=[early_stop_callback, checkpoint_callback])
trainer.fit(model=module, datamodule=data_module)

INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
/usr/local/lib/python3.10/dist-packages/pytorch_lightning/callbacks/model_checkpoint.py:630: Checkpoint directory model/ exists and is not empty.
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.callbacks.model_summary:
  | Name      | Type                  | Params
----------------------------------------------------
0 | model     | PhisherEmbeddingModel | 6.8 M 
1 | accuracy  | MulticlassAccuracy    | 0     
2 | f1        | MulticlassF1Score     | 0     
3 | precision | MulticlassPrecision   | 0     
4 | recall    | MulticlassRecall      | 0     
----------------------------------------------------
6.8 M     T

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

In [88]:
trainer.validate(datamodule=data_module)

INFO:pytorch_lightning.utilities.rank_zero:Restoring states from the checkpoint path at model/model-epoch=01-val_loss=0.33.ckpt
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.utilities.rank_zero:Loaded model weights from the checkpoint at model/model-epoch=01-val_loss=0.33.ckpt


Validation: |          | 0/? [00:00<?, ?it/s]

[{'val_loss': 0.30742770433425903,
  'val_acc': 0.8805112838745117,
  'val_f1': 0.8805112838745117,
  'val_precision': 0.8805112838745117,
  'val_recall': 0.8805112838745117}]

In [89]:
trainer.test(datamodule=data_module)


INFO:pytorch_lightning.utilities.rank_zero:Restoring states from the checkpoint path at model/model-epoch=01-val_loss=0.33.ckpt
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.utilities.rank_zero:Loaded model weights from the checkpoint at model/model-epoch=01-val_loss=0.33.ckpt


Testing: |          | 0/? [00:00<?, ?it/s]

[{'test_loss': 0.3020949065685272,
  'test_acc': 0.8820734024047852,
  'test_f1': 0.8820734024047852,
  'test_precision': 0.8820734024047852,
  'test_recall': 0.8820734024047852}]