In [1]:
import pandas as pd
import numpy as np
import re
from urllib.parse import urlparse
from sklearn.preprocessing import StandardScaler

In [2]:
# Location of the phishing-URL dataset, relative to the notebook's working directory.
file_path = "dataset/phishing_url_website.csv"

# Read the whole CSV into `df`; this frame is the source for everything that follows.
df = pd.read_csv(filepath_or_buffer=file_path)

In [None]:
# Working frame for feature engineering, kept separate from the raw `df`.
# Exact duplicate rows are removed while the copy is made: an in-place
# `df.drop_duplicates(...)` executed afterwards only mutates `df` and would
# never reach this frame, leaving the duplicates in the engineered features.
# `.copy()` makes the result an independent object so later column
# assignments do not raise pandas' SettingWithCopyWarning.
df_features = df.drop_duplicates().copy()

In [3]:
# Remove exact duplicate rows from `df`, keeping the first occurrence of each.
# The call mutates `df` in place; frames that were copied from `df` beforehand
# are separate objects and are not touched by it.
df.drop_duplicates(keep="first", inplace=True)

In [None]:
# Invert the binary target: rows whose label equals 0 become 1, every other value becomes 0.
# NOTE: the column is rewritten from its own current values, so executing this
# cell a second time flips a 0/1 label column back to its previous encoding.
df_features['label'] = (df_features['label'] == 0).astype('int64')

In [4]:
# Replace every missing value in the working frame with 0, in place.
# NOTE(review): this applies to all columns, so a missing entry in a text column
# also becomes the integer 0; string operations such as len() or .lower() would
# fail on such a value — confirm the text columns have no gaps.
df_features.fillna(value=0, inplace=True)

In [5]:
# Columns that are passed to the scaler as one group.
numerical_features = [
    'URLSimilarityIndex',
    'NoOfOtherSpecialCharsInURL',
    'SpacialCharRatioInURL',
    'LineOfCode',
    'DomainTitleMatchScore',
    'URLTitleMatchScore',
    'NoOfImage',
    'NoOfJS',
    'NoOfSelfRef',
]

In [6]:
# Standardizer (zero mean, unit variance per column); the arguments spell out
# scikit-learn's defaults so the configuration is visible at a glance.
scaler = StandardScaler(copy=True, with_mean=True, with_std=True)

In [7]:
# Fit the scaler on the selected numeric columns and overwrite those columns
# with their standardized values.
# NOTE(review): the scaler is fitted on every row of df_features; if a
# train/test split is made later, fitting on the training rows only avoids
# leaking test statistics into the features.
_scaled_values = scaler.fit_transform(df_features[numerical_features])
df_features[numerical_features] = _scaled_values

In [8]:
# Feature Extraction

# One IPv4 octet: 0-255, leading zeros tolerated (e.g. "001").
_IPV4_OCTET = r'(?:25[0-5]|2[0-4]\d|[01]?\d\d?)'
# Four dot-separated octets that are not part of a longer run of digits, so
# strings such as "1234.5.6.7890" are not mistaken for an address.
_IPV4_PATTERN = re.compile(
    r'(?<!\d)(?:' + _IPV4_OCTET + r'\.){3}' + _IPV4_OCTET + r'(?!\d)'
)


def has_ip_address(url):
    """Return 1 if ``url`` contains an IPv4 address literal, otherwise 0.

    An address is four dot-separated decimal octets, each in the range 0-255,
    that are not embedded in a longer run of digits. The match may occur
    anywhere in the string (host, path or query).

    Parameters
    ----------
    url : object
        The URL text to inspect. Values that are not ``str`` (for example a
        numeric placeholder written in place of a missing URL) are treated as
        containing no address instead of raising ``TypeError``.

    Returns
    -------
    int
        1 when an IPv4 literal is found, 0 otherwise.
    """
    if not isinstance(url, str):
        return 0
    return 1 if _IPV4_PATTERN.search(url) else 0

# 1 when has_ip_address() reports an IP address in the URL, otherwise 0.
df_features['HasIPAddress'] = df_features['URL'].map(has_ip_address)

# URL length in characters (longer URLs can indicate phishing).
df_features['URLLength'] = df_features['URL'].map(len)

# Presence of suspicious keywords: 1 when the lower-cased URL contains any of
# the keywords below as a substring (so "southbankmosaics" matches "bank"),
# otherwise 0.
suspicious_keywords = ['login', 'secure', 'bank', 'update', 'verify', 'account', 'password']
# The assignment target must be the DataFrame column; a bare
# ['SuspiciousKeyword'] on the left-hand side is a list literal and a SyntaxError.
df_features['SuspiciousKeyword'] = (
    df_features['URL']
    .apply(lambda x: any(word in x.lower() for word in suspicious_keywords))
    .astype(int)
)

def _count_digits(url):
    """Return the number of characters in ``url`` for which str.isdigit() is true."""
    return sum(1 for ch in url if ch.isdigit())


def _count_hyphens(url):
    """Return the number of '-' characters in ``url``."""
    return url.count('-')


# Count of digits in URL (phishing URLs often have numbers to mimic legitimacy)
df_features['DigitCount'] = df_features['URL'].map(_count_digits)

# Count of hyphens ('-') in URL (often used to create lookalike domains)
df_features['HyphenCount'] = df_features['URL'].map(_count_hyphens)

def _count_host_dots(domain):
    """Return how many '.' characters the network-location part of ``domain`` contains."""
    return urlparse(f"https://{domain}").netloc.count('.')


# Number of '.' separators in the host name (phishing URLs tend to have more
# subdomains). The separator in front of the TLD is counted too, so
# "www.example.com" yields 2.
df_features['SubdomainCount'] = df_features['Domain'].map(_count_host_dots)

# TLDs treated here as commonly associated with phishing.
phishing_tlds = ['tk', 'ml', 'cf', 'ga', 'gq']
# 1 when the row's TLD, compared case-insensitively, is in the list above; otherwise 0.
df_features['PhishingTLD'] = df_features['TLD'].map(
    lambda tld: int(tld.lower() in phishing_tlds)
)

# Ratio of digits to total length of the URL (DigitCount / URLLength).
_digit_counts = df_features['DigitCount']
_url_lengths = df_features['URLLength']
df_features['DigitToLengthRatio'] = _digit_counts / _url_lengths

# Special-character ratio: a straight copy of the existing SpacialCharRatioInURL column.
# NOTE(review): that column is one of the columns standardized by the scaler
# earlier in the notebook, so once that cell has run these values are z-scores
# rather than raw ratios — confirm this is intended.
df_features['SpecialCharRatio'] = df_features['SpacialCharRatioInURL']

# Min-max normalized URL similarity: (value - min) / (max - min), giving values in [0, 1].
_similarity = df_features['URLSimilarityIndex']
_similarity_min, _similarity_max = _similarity.min(), _similarity.max()
df_features['NormalizedURLSimilarity'] = (
    (_similarity - _similarity_min) / (_similarity_max - _similarity_min)
)



In [9]:
# Preview the first five rows of the engineered feature frame.
df_features.head(n=5)

Unnamed: 0,URL,Domain,TLD,URLSimilarityIndex,NoOfOtherSpecialCharsInURL,SpacialCharRatioInURL,IsHTTPS,LineOfCode,Title,DomainTitleMatchScore,...,HasIPAddress,URLLength,SuspiciousKeyword,DigitCount,HyphenCount,SubdomainCount,PhishingTLD,DigitToLengthRatio,SpecialCharRatio,NormalizedURLSimilarity
0,https://www.southbankmosaics.com,www.southbankmosaics.com,com,0.719359,-0.59903,-0.983406,1,-0.225603,à¸‚à¹ˆà¸²à¸§à¸ªà¸” à¸‚à¹ˆà¸²à¸§à¸§à¸±à¸™à¸™à¸µ...,-1.042697,...,0,32,1,0,0,2,0,0.0,-0.983406,1.0
1,https://www.uni-mainz.de,www.uni-mainz.de,de,0.719359,0.061922,0.912555,1,-0.177871,johannes gutenberg-universitÃ¤t mainz,0.075897,...,0,24,0,0,1,2,0,0.0,0.912555,1.0
2,https://www.voicefmradio.co.uk,www.voicefmradio.co.uk,uk,0.719359,0.061922,0.292058,1,-0.297997,voice fm southampton,-0.103078,...,0,30,0,0,0,3,0,0.0,0.292058,1.0
3,https://www.globalreporting.org,www.globalreporting.org,org,0.719359,-0.59903,-0.948934,1,0.293088,gri - home,-1.042697,...,0,31,0,0,0,2,0,0.0,-0.948934,1.0
4,https://www.nerdscandy.com,www.nerdscandy.com,com,0.719359,-0.59903,-0.70763,1,-0.260607,nerds candy,0.970771,...,0,26,0,0,0,2,0,0.0,-0.70763,1.0
