# **Phishing Detection by Analysis on Raw URLs using CNN**

**Step 1: Download Dataset**  

- Download and move the dataset to the data directory 
- And then load the dataset using pandas

**Step 2: Preprocessing the Dataset**
- Remove any duplicates
- Encode URLs as character sequences

**Step 3: Feature Engineering**
- Tokenize the URL at character level
- Pad/truncate to a fixed length (say 200 chars)
- One-hot or integer encode sequences

**Step 4: CNN Model training**
- Create necessary files - model.py, train.py, evaluate.py
- Train the CNN model with the preprocessed Dataset
- Evaluate the model

**Step 5: Experiment Tracking**
```
pip install mlflow  
```
- Model parameters
- Accuracy
- Loss curves

**Step 6: Deployment**
```
pip install fastapi uvicorn
```
which will create app.py  
Then:
```
uvicorn api.app:app --reload
```

**Step 7: Demo UI**
```
pip install gradio
```
or
```
pip install streamlit
```

**Step 8: Create Docker and CI/CD**
- Add Docker
- Add CI/CD: Github Actions
- Host demo on Hugging Face Spaces or Render

In [2]:
import pandas as pd
import numpy as np

In [18]:
df = pd.read_csv('../data/Phishing_URL_Dataset.csv')

In [19]:
df.head()

Unnamed: 0,FILENAME,URL,URLLength,Domain,DomainLength,IsDomainIP,TLD,URLSimilarityIndex,CharContinuationRate,TLDLegitimateProb,...,Pay,Crypto,HasCopyrightInfo,NoOfImage,NoOfCSS,NoOfJS,NoOfSelfRef,NoOfEmptyRef,NoOfExternalRef,label
0,521848.txt,https://www.southbankmosaics.com,31,www.southbankmosaics.com,24,0,com,100.0,1.0,0.522907,...,0,0,1,34,20,28,119,0,124,1
1,31372.txt,https://www.uni-mainz.de,23,www.uni-mainz.de,16,0,de,100.0,0.666667,0.03265,...,0,0,1,50,9,8,39,0,217,1
2,597387.txt,https://www.voicefmradio.co.uk,29,www.voicefmradio.co.uk,22,0,uk,100.0,0.866667,0.028555,...,0,0,1,10,2,7,42,2,5,1
3,554095.txt,https://www.sfnmjournal.com,26,www.sfnmjournal.com,19,0,com,100.0,1.0,0.522907,...,1,1,1,3,27,15,22,1,31,1
4,151578.txt,https://www.rewildingargentina.org,33,www.rewildingargentina.org,26,0,org,100.0,1.0,0.079963,...,1,0,1,244,15,34,72,1,85,1


In [20]:
print(df['label'].value_counts())

1    134850
0    100945
Name: label, dtype: int64


In [3]:
df = pd.read_excel('../data/data_bal - 20000.xlsx')

In [4]:
df.head()

Unnamed: 0,label,URL
0,1,http://dbs.vote-friend.com/sg?ref=anything
1,0,https://www.reynoldstransfer.com/versa-lift-fo...
2,1,https://www.halisupportservice.com/Login.php
3,0,https://www.signets.com.br/wp-includes/wlwmani...
4,1,https://docs.google.com/document/d/e/2PACX-1vT...


In [5]:
df.to_csv('data_bal-20000.csv')

In [6]:
df = pd.read_csv('../data/malicious_phish.csv')

In [7]:
df.head()

Unnamed: 0,URL,label
0,br-icloud.com.br,phishing
1,mp3raid.com/music/krizz_kaliko.html,benign
2,bopsecrets.org/rexroth/cr/1.htm,benign
3,http://www.garage-pirenne.be/index.php?option=...,defacement
4,http://adventure-nicaragua.net/index.php?optio...,defacement


In [10]:
df['label'] = df['label'].replace({'defacement': 1, 'phishing': 1, 'benign': 0, 'malware': 1})

In [11]:
print(df['label'].value_counts())

0    428103
1    223088
Name: label, dtype: int64


In [12]:
df.to_csv('../data/malicious_phish.csv')

In [13]:
df = pd.read_csv('../data/phishing_site_urls.csv')

In [14]:
df.head()


Unnamed: 0,URL,label
0,nobell.it/70ffb52d079109dca5664cce6f317373782/...,bad
1,www.dghjdgf.com/paypal.co.uk/cycgi-bin/webscrc...,bad
2,serviciosbys.com/paypal.cgi.bin.get-into.herf....,bad
3,mail.printakid.com/www.online.americanexpress....,bad
4,thewhiskeydregs.com/wp-content/themes/widescre...,bad


In [15]:
df['label'] = df['label'].replace({'good': 0,'bad': 1})

In [16]:
print(df['label'].value_counts())

0    392924
1    156422
Name: label, dtype: int64


In [17]:
df.to_csv('../data/phishing_site_urls.csv', index=False)

In [5]:
vocab_set = set()

dataset_paths = ['../data/data_bal-20000.csv',
                 '../data/malicious_phish.csv',
                 '../data/malicious_phish_filtered.csv', 
                 '../data/phishing_site_urls.csv',
                 '../data/Phishing_URL_Dataset.csv'
                 ]


In [7]:
for path in dataset_paths:
    df = pd.read_csv(path)
    urls = df['URL'].astype(str)
    vocab_set.update(''.join(urls))

In [8]:
vocab = sorted(vocab_set)

print(f"Character Vocabulary: {vocab}")

Character Vocabulary: ['\x01', '\x02', '\x03', '\x04', '\x05', '\x06', '\x07', '\x08', '\t', '\n', '\x0b', '\x0c', '\r', '\x0e', '\x0f', '\x10', '\x11', '\x12', '\x13', '\x14', '\x15', '\x16', '\x17', '\x18', '\x19', '\x1a', '\x1b', '\x1c', '\x1d', '\x1e', '\x1f', ' ', '!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '<', '=', '>', '?', '@', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '[', '\\', ']', '^', '_', '`', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '{', '|', '}', '~', '\x7f', '\x80', '\x81', '\x82', '\x83', '\x84', '\x85', '\x86', '\x87', '\x88', '\x89', '\x8a', '\x8b', '\x8c', '\x8d', '\x8e', '\x8f', '\x90', '\x91', '\x92', '\x93', '\x94', '\x95', '\x96', '\x97', '\x98', '\x99', '\x9a', '\x9b', '\x9c', '\x9d', '\x9e', '\x

In [16]:
import re
from sklearn.model_selection import train_test_split


In [17]:
df = pd.read_csv('../data/data_bal-20000_1.csv')

In [18]:
def clean_url(url):
    url = url.lower()
    url = re.sub(r'https?://', '', url)
    url = re.sub(r'www\.', '', url)
    return url.strip()

In [19]:
df = df[['URL', 'label']]
df['clean_url'] = df['URL'].apply(clean_url)

In [15]:
df.shape

(20000, 3)

In [20]:
def build_vocab(urls):
    all_text = ''.join(urls)
    vocab = sorted(set(all_text))
    char2idx = {ch: idx + 1 for idx, ch in enumerate(vocab)}
    idx2char = {idx: ch for ch, idx in char2idx.items()}
    return char2idx, idx2char

def encode_url(url, char2idx, maxlen=200):
    encoded = [char2idx.get(c, 0) for c in url[:maxlen]]
    if len(encoded) < maxlen:
        encoded += [0] * (maxlen - len(encoded))
    return encoded

In [23]:
maxlen = 200
test_size = 0.2

char2idx, idx2char = build_vocab(df['clean_url'].tolist())

df['encoded_url'] = df['clean_url'].apply(lambda x: encode_url(x, char2idx, maxlen))

X_train, X_val, y_train, y_val = train_test_split(
    df['encoded_url'].tolist(), df['label'].tolist(), test_size=test_size, random_state=42
)

In [24]:
len(X_train)

16000

In [26]:
import torch.nn as nn

class PhishModel(nn.Module):
    def __init__(self, vocab_size, embed_dim=256, num_classes=2, maxlen=200, lstm_hidden=256, lstm_layers=2):
        super(PhishModel, self).__init__()
        
        self.embedding = nn.Embedding(vocab_size + 1, embed_dim, padding_idx=0)

        self.conv_layers = nn.Sequential(
            nn.Conv1d(in_channels=embed_dim, out_channels=256, kernel_size=5, padding=2),
            nn.ReLU(),
            nn.MaxPool1d(kernel_size=2),

            nn.Conv1d(256, 256, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool1d(kernel_size=2)
        )

        self.lstm = nn.LSTM(input_size=256, hidden_size=lstm_hidden, num_layers=lstm_layers, batch_first=True, bidirectional=True)
        
        self.fc_layers = nn.Sequential(
            nn.Linear(2 * lstm_hidden, 512),
            nn.ReLU(),
            nn.Dropout(0.4),
            nn.Linear(512, num_classes)
        )
        
    def forward(self, x):
        print("=" * 50)
        print("RAW INPUT TENSOR (first 1 sample):\n", x[0])  # Show first sample in batch
        print("Input Shape:", x.shape)  # (batch, seq_len)
        
        x = self.embedding(x)
        print("After Embedding (batch, seq_len, embed_dim):", x.shape)

        x = x.permute(0, 2, 1)
        print("After Permute for Conv1D (batch, embed_dim, seq_len):", x.shape)

        x = self.conv_layers(x)
        print("After Conv Layers:", x.shape)

        x = x.permute(0, 2, 1)
        print("After Permute back (batch, seq_len, channels):", x.shape)

        lstm_out, _ = self.lstm(x)
        print("LSTM Output (batch, seq_len, hidden_size*2 if bidirectional):", lstm_out.shape)

        x = lstm_out[:, -1, :]  # take last time step
        print("LSTM Final Timestep Output (batch, hidden_dim):", x.shape)

        x = self.fc_layers(x)
        print("Final Output (batch, num_classes):", x.shape)
        print("=" * 50)

        return x


In [None]:
import torch

In [27]:

model = PhishModel(vocab_size=300)

In [28]:
dummy_input = torch.randint(0, 200, (32, 200))

In [29]:
output = model(dummy_input)

RAW INPUT TENSOR (first 1 sample):
 tensor([178,  31,  40,   5, 176, 124, 141,  71, 159, 177, 176, 106,  65,  42,
        150,  91,  50,  16,  17, 120,  14, 160,  78, 146, 187,  81,  37, 137,
         58,  15,  14,  54, 130,   7,  68,  81, 179, 176,  58, 176,  22, 141,
        131, 140, 115,  86, 152, 193, 114, 112, 113,  68,  82,  20,  72,  10,
         79,  78, 129, 182,  39, 143, 123,  11,  68,  27, 109, 155,  48,  47,
        172,  96, 141,  51,   5,  80,  26, 160,  39,  78, 133,  92, 170, 132,
         40,  33,  85,  26, 120, 152,  80,   8, 142,  93, 171,  98, 146,  35,
         59, 144,  43, 154, 185,  39, 128, 135, 157,  35,  43,  52,   9, 118,
        198,  73,  84,  20, 100, 152,  24,  69, 194,  99,  92,  70, 127,  73,
        108,  96, 155,  32,  56, 132,  83,   8,  56, 198,  70, 172,  49,  32,
         37,  16, 162, 132,  53,  98,  35, 148, 196,  25, 104,  66, 101, 136,
         18, 123, 135, 171,  84,  41, 181, 140, 145, 188, 166,  80,  34, 140,
        194, 120, 148,  18, 