### Konkatenacja datasetów
[Pierwszy dataset](https://www.kaggle.com/datasets/shashwatwork/web-page-phishing-detection-dataset?resource=download)

[Drugi dataset](https://archive.ics.uci.edu/dataset/967/phiusiil+phishing+url+dataset)

In [1]:
import gdown
import os
import zipfile
import pandas as pd
from typing import Dict, List, Tuple, Any



In [2]:
def unzip_file(file_path: str) -> str:
    with zipfile.ZipFile(file_path, 'r') as zip_ref:
        zip_ref.extractall()
        return zip_ref.namelist()[0]

In [3]:
def clean_up(file_paths: List[str]) -> None:
    for file_path in file_paths:
        os.remove(file_path)

In [9]:
def prepare_phish_dataset(
        target_file_path: str,
        gdown_link: str,
        columns: List[str] = ["url", "status"],
        columns_mapping: Dict[str, str] = {},
        labels_mapping: Dict[str, str] = {}
) -> None:
    gdown.download(gdown_link, output="zipped_dataset", quiet=True)
    dataset_path: str = unzip_file("zipped_dataset")
    df = pd.read_csv(dataset_path)

    if columns_mapping:
        df = df[columns].rename(columns=columns_mapping)
    if labels_mapping:
        df["label"] = df["label"].map(labels_mapping)

    if os.path.exists(target_file_path):
        df.to_csv(target_file_path, mode='a', index=False, header=False)
    else:
        df.to_csv(target_file_path, index=False)

    clean_up(["zipped_dataset", dataset_path])

In [10]:
target_file_path: str = "phish_dataset.csv"
gdown_prefix: str = "https://drive.google.com/uc?id="

In [11]:
prepare_phish_dataset(
    target_file_path,
    gdown_link=f"{gdown_prefix}1cZIVx5vabrMZnTvSo1L05gBYtJgGeE8U",
    columns=['url', 'status'],
    columns_mapping={'status': 'label'},
    labels_mapping={'legitimate': 0, 'phishing': 1}
)

In [12]:
prepare_phish_dataset(
    target_file_path,
    gdown_link=f"{gdown_prefix}1Qr2c3ax7ugQC4hvw-Au9vcl4cOEhJE_k",
    columns=['URL', 'label'],
    columns_mapping={'URL': 'url', 'label': 'label'}
)

In [13]:
df = pd.read_csv(target_file_path)
df

Unnamed: 0,url,label
0,http://www.crestonwood.com/router.php,0
1,http://shadetreetechnology.com/V4/validation/a...,1
2,https://support-appleld.com.secureupdate.duila...,1
3,http://rgipt.ac.in,0
4,http://www.iracing.com/tracks/gateway-motorspo...,0
...,...,...
247220,https://www.skincareliving.com,1
247221,https://www.winchester.gov.uk,1
247222,https://www.nononsensedesign.be,1
247223,https://patient-cell-40f5.updatedlogmylogin.wo...,0


In [14]:
! pip install -q pytorch-lightning==2.1.0

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/774.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m768.0/774.6 kB[0m [31m35.4 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m774.6/774.6 kB[0m [31m21.0 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/926.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m926.4/926.4 kB[0m [31m54.9 MB/s[0m eta [36m0:00:00[0m
[?25h

In [15]:
import os
import pandas as pd
from torch.utils.data import Dataset, DataLoader
import torch
import pytorch_lightning as pl

#### Data Module - One Hot

In [23]:
class PhishingOneHotDataset(Dataset):
    def __init__(self: "PhishingOneHotDataset", csv_file_path: str, max_length: int = 200) -> None:
        self.data: pd.DataFrame = pd.read_csv(csv_file_path)
        self.max_length: int = max_length
        self.alphabet: List[str] = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789-._~:/?#[]@!$&'()*+,;") + [' ']
        self.char_to_idx: Dict[str, int] = {char: idx for idx, char in enumerate(self.alphabet)}


    def parse_url(self: "PhishingOneHotDataset", url: str) -> str:
        parsed_url = url.split("://")[-1]
        if "/" in parsed_url:
            parsed_url = parsed_url.split("/")[0]
        return parsed_url


    def pad_or_trim(self: "PhishingOneHotDataset", url: str) -> str:
        if len(url) > self.max_length:
            url = url[:self.max_length]
        else:
            url = url = url.ljust(self.max_length, ' ')
        return url


    def encode_url(self: "PhishingOneHotDataset", url: str) -> torch.Tensor:
        url = self.pad_or_trim(url)
        matrix = torch.zeros(self.max_length, len(self.alphabet))
        for i, char in enumerate(url):
            if char in self.char_to_idx:
                matrix[i, self.char_to_idx[char]] = 1
        return matrix


    def __len__(self: "PhishingOneHotDataset") -> int:
        return len(self.data)


    def __getitem__(self: "PhishingOneHotDataset", idx: int) -> Tuple[torch.Tensor, int]:
        url = self.data.iloc[idx, 0]
        label = self.data.iloc[idx, 1]

        parse_url = self.parse_url(url)
        url = self.encode_url(url)

        return url, label


    def get_stats(self: "PhishingOneHotDataset") -> Dict[str, Any]:
        labels = self.data["label"].unique()
        stats = [{
            "Label": label,
            "Count": len(self.data[self.data["label"] == label]),
            "Percent": round(len(self.data[self.data["label"] == label]) / len(self.data) * 100, 2)
        } for label in labels]
        return stats


    def print_stats(self: "PhishingOneHotDataset") -> None:
        stats = self.get_stats()
        for stat in stats:
            print(f"Label: {stat['Label']}, Count: {stat['Count']}, Percent: {stat['Percent']}%")

In [None]:
class PhishingDataModule(pl.LightningDataModule):
    def __init__(self: "PhishingDataModule", dataset: Dataset, batch_size: int = 32) -> None:
        super().__init__()
        self.dataset = dataset
        self.batch_size = batch_size


    def split_data(self: "PhishingDataModule", train_size: float, val_size: float) -> None:
        total_size = len(self.dataset)
        train_size = int(train_size * total_size)
        val_size = int(val_size * total_size)
        test_size = total_size - train_size - val_size
        self.train_dataset, self.val_dataset, self.test_dataset = torch.utils.data.random_split(
            self.dataset, [train_size, val_size, test_size]
        )


    def setup(self: "PhishingDataModule", train_size: float = 0.7, val_size: float = 0.1, stage=None) -> None:
        self.split_data(train_size, val_size)


    def train_dataloader(self: "PhishingDataModule") -> DataLoader:
        return DataLoader(self.train_dataset, batch_size=self.batch_size, shuffle=True, num_workers=1)


    def val_dataloader(self: "PhishingDataModule") -> DataLoader:
        return DataLoader(self.val_dataset, batch_size=self.batch_size, num_workers=1)


    def test_dataloader(self: "PhishingDataModule") -> DataLoader:
        return DataLoader(self.test_dataset, batch_size=self.batch_size, num_workers=1)

In [34]:
dataset = PhishingOneHotDataset(csv_file_path=target_file_path)
dataset.print_stats()

Label: 0, Count: 106660, Percent: 43.14%
Label: 1, Count: 140565, Percent: 56.86%


In [35]:
data_module = PhishingDataModule(dataset=dataset, batch_size=32)
data_module.setup(train_size=0.7, val_size=0.1)

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F


# class PhisherOneHotModel(nn.Module):
#     def __init__(self: "PhisherOneHotModel", out_features: int = 1) -> None:
#         super().__init__()

#         self.conv1 = nn.Conv2d(in_channels=1, out_channels=6, kernel_size=(5, 5))
#         self.conv2 = nn.Conv2d(in_channels=6, out_channels=12, kernel_size=(5, 5))

#         self.fc1 = nn.Linear(in_features=12 * 47 * 18, out_features=120)
#         self.fc2 = nn.Linear(in_features=120, out_features=60)
#         self.out = nn.Linear(in_features=60, out_features=out_features)


#     def forward(self: "PhisherOneHotModel", x: torch.Tensor) -> torch.Tensor:
#         x = x.unsqueeze(1)
#         x = self.conv1(x)
#         x = F.relu(x)
#         x = F.max_pool2d(x, kernel_size=2, stride=2)

#         x = self.conv2(x)
#         x = F.relu(x)
#         x = F.max_pool2d(x, kernel_size=2, stride=2)

#         # Flatten
#         x = x.reshape(-1, 12 * 47 * 18)
#         x = self.fc1(x)
#         x = F.relu(x)
#         x = self.fc2(x)
#         x = F.relu(x)
#         x = self.out(x)
#         return x


In [None]:
import torchmetrics
import torch
import torch.nn.functional as F
import pytorch_lightning as pl



# class PhisherhModule(pl.LightningModule):
#     def __init__(self, model, optimizer, num_classes, learning_rate=1e-3):
#         super().__init__()
#         self.save_hyperparameters()
#         self.learning_rate = learning_rate
#         self.num_classes = num_classes
#         self.model = model
#         self.optimizer = optimizer

#         self.accuracy = torchmetrics.Accuracy(task="binary")
#         self.f1 = torchmetrics.F1Score(task="binary")
#         self.precision = torchmetrics.Precision(task="binary")
#         self.recall = torchmetrics.Recall(task="binary")
#         self.auroc = torchmetrics.classification.AUROC(task="binary")


#     def forward(self, x):
#         return self.model(x)


#     def compute_loss(self, x, y):
#         x = x.squeeze(dim=1) if x.ndim == 2 and x.shape[1] == 1 else x
#         return F.binary_cross_entropy_with_logits(x, y.float())


#     def common_step(self, batch, batch_idx):
#         x, y = batch
#         outputs = self(x)
#         loss = self.compute_loss(outputs,y)
#         return loss, outputs, y


#     def common_test_valid_step(self, batch, batch_idx):
#         loss, outputs, y = self.common_step(batch, batch_idx)
#         probs = torch.sigmoid(outputs.squeeze(1))
#         preds = (probs >= 0.5).int()

#         acc = self.accuracy(preds, y)
#         f1 = self.f1(preds, y)
#         precision = self.precision(preds, y)
#         recall = self.recall(preds, y)
#         auc = self.auroc(probs, y)
#         return loss, acc, f1, precision, recall, auc


#     def training_step(self, batch, batch_idx):
#         loss, acc, _, __, ___, auc = self.common_test_valid_step(batch, batch_idx)
#         self.log('train_loss', loss, on_step=True, on_epoch=True, logger=True, prog_bar=True)
#         self.log('train_acc', acc, on_step=True, on_epoch=True, logger=True, prog_bar=True)
#         self.log('train_auc', auc, on_step=True, on_epoch=False, logger=True)
#         return loss


#     def validation_step(self, batch, batch_idx):
#         loss, acc, f1, precision, recall, auc = self.common_test_valid_step(batch, batch_idx)
#         self.log('val_loss', loss)
#         self.log('val_acc', acc)
#         self.log('val_f1', f1)
#         self.log('val_precision', precision)
#         self.log('val_recall', recall)
#         self.log('val_auc', auc)
#         return loss


#     def test_step(self, batch, batch_idx):
#         loss, acc, f1, precision, recall, auc = self.common_test_valid_step(batch, batch_idx)
#         self.log('test_loss', loss)
#         self.log('test_acc', acc)
#         self.log('test_f1', f1)
#         self.log('test_precision', precision)
#         self.log('test_recall', recall)
#         self.log('test_auc', auc)
#         return loss


#     def configure_optimizers(self):
#         return self.optimizer

In [None]:
oneHotModelLink = ''
gdown.download(oneHotModelLink, output="one-hot-model.pt", quiet=True)

In [None]:
module = torch.load("one-hot-model.pt")

/usr/local/lib/python3.10/dist-packages/pytorch_lightning/utilities/parsing.py:198: Attribute 'model' is an instance of `nn.Module` and is already saved during checkpointing. It is recommended to ignore them using `self.save_hyperparameters(ignore=['model'])`.


In [None]:
trainer = pl.Trainer(accelerator="auto")
trainer.test(model=module, datamodule=data_module)

INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.callbacks.model_summary:
  | Name      | Type                | Params
--------------------------------------------------
0 | model     | PhisherOneHotModel  | 1.2 M 
1 | accuracy  | MulticlassAccuracy  | 0     
2 | f1        | MulticlassF1Score   | 0     
3 | precision | MulticlassPrecision | 0     
4 | recall    | MulticlassRecall    | 0     
--------------------------------------------------
1.2 M     Trainable params
0         Non-trainable params
1.2 M     Total params
4.911     Total estimated model params size (MB)
/usr/local/lib/python3.10/dist-packages/pyt

Training: |          | 0/? [00:00<?, ?it/s]

/usr/local/lib/python3.10/dist-packages/pytorch_lightning/trainer/connectors/data_connector.py:441: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=1` in the `DataLoader` to improve performance.


Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

/usr/local/lib/python3.10/dist-packages/pytorch_lightning/trainer/call.py:54: Detected KeyboardInterrupt, attempting graceful shutdown...
