In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


In [2]:
import kagglehub
import os
# Download latest version
folder_path = kagglehub.dataset_download("cheedcheed/top1m")

print("Path to dataset files:", folder_path)
csv_files = [f for f in os.listdir(folder_path) if f.lower().endswith('.csv')]

if not csv_files:
    raise FileNotFoundError("No CSV files found in the folder!")

# read the first CSV file
file_path = os.path.join(folder_path, csv_files[0])
alexa_top_1m_domain = pd.read_csv(file_path,header=None,names=['rank', 'domain'])
alexa_domains_set = set(alexa_top_1m_domain['domain'].apply(str.lower))

Path to dataset files: C:\Users\rrpra\.cache\kagglehub\datasets\cheedcheed\top1m\versions\1


In [3]:
from datasets import load_dataset
import pandas as pd

# Load the dataset from Hugging Face Hub
train_dataset = load_dataset("kmack/Phishing_urls", split="train")
test_dataset = load_dataset("kmack/Phishing_urls", split="test")
valid_dataset = load_dataset("kmack/Phishing_urls", split="valid")

# Convert to pandas DataFrame
train_df = train_dataset.to_pandas()
test_df = test_dataset.to_pandas()
valid_df = valid_dataset.to_pandas()

print(train_df.head())


                                                text  label
0             xenophongroup.com/montjoie/compgns.htm      0
1    www.azzali.eu/&usg=AOvVaw2phVSb_ENMrkATGNx5LQ0l      1
2                     guildmusic.edu.au/js/index.htm      1
3  memo.unexpectedrunner.com/ezxgytw4et\nholotili...      1
4  en.wikipedia.org/wiki/Category:American_televi...      0


In [4]:
import tldextract
df = pd.concat((train_df, valid_df, test_df))
def extrtact_tld(url):

    ext = tldextract.extract(url)
    # ext = ExtractResult(subdomain, domain, suffix)
    tld = ext.suffix    # "com", "co.uk", "org"
    return tld if tld else 'unknown'

df['tld'] = df['text'].apply(extrtact_tld)
df.head()

Unnamed: 0,text,label,tld
0,xenophongroup.com/montjoie/compgns.htm,0,com
1,www.azzali.eu/&usg=AOvVaw2phVSb_ENMrkATGNx5LQ0l,1,eu
2,guildmusic.edu.au/js/index.htm,1,edu.au
3,memo.unexpectedrunner.com/ezxgytw4et\nholotili...,1,com
4,en.wikipedia.org/wiki/Category:American_televi...,0,org


In [5]:
# Count phishing and legitimate occurrences for each TLD
tld_stats = df.groupby(['tld', 'label']).size().unstack(fill_value=0)
tld_stats.columns = ['legit_count', 'phish_count']

# Add totals and phishing rate
tld_stats['total'] = tld_stats['legit_count'] + tld_stats['phish_count']
tld_stats['phish_ratio'] = tld_stats['phish_count'] / tld_stats['total']
tld_stats = tld_stats.sort_values('phish_ratio', ascending=False)

print(tld_stats)

                    legit_count  phish_count  total  phish_ratio
tld                                                             
Tokyo                         0            1      1          1.0
go.gov.br                     0            3      3          1.0
goog                          0           13     13          1.0
gop.pk                        0            1      1          1.0
gov.la                        0            3      3          1.0
...                         ...          ...    ...          ...
kagoshima.jp                  1            0      1          0.0
kanagawa.jp                   1            0      1          0.0
kasaoka.okayama.jp            1            0      1          0.0
katowice.pl                   1            0      1          0.0
ye                            1            0      1          0.0

[1437 rows x 4 columns]


In [6]:
import pandas as pd
import re
import math
from urllib.parse import urlparse
import tldextract
from rapidfuzz import process, fuzz
# --- Helper function: Shannon entropy ---
def safe_parse(url: str):
    """Safely parse URLs, adding http:// if missing and handling bad IPv6 parts."""
    if not isinstance(url, str) or not url.strip():
        return urlparse("http://")  

    # Ensure scheme exists
    if not re.match(r'^[a-zA-Z]+://', url):
        url = 'http://' + url

    # Clean invalid brackets that trigger IPv6 errors
    url = re.sub(r'\[.*?\]', '', url)

    try:
        return urlparse(url)
    except ValueError:
        # fallback: strip more aggressively if still malformed
        url = re.sub(r'[^a-zA-Z0-9:/._\-?&=]', '', url)
        return urlparse(url)
def calculate_entropy(string):
    """Measures randomness of characters in the URL."""
    if not string:
        return 0
    freq = {char: string.count(char) for char in set(string)}
    entropy = -sum((count / len(string)) * math.log2(count / len(string)) for count in freq.values())
    return entropy

# --- Main feature extraction function ---
def extract_features(url):
    features = {}
    if not re.match(r'^[hH]+[tT]+[tT]+[pP]+[sS]+://', url):
        url = 'http://' + url
    parsed = safe_parse(url)
    
    # 1️⃣ Basic structural features
    features['url_length'] = len(url)
    features['hostname_length'] = len(parsed.netloc)
    features['path_length'] = len(parsed.path)
    features['num_dots'] = url.count('.')
    features['num_hyphens'] = url.count('-')
    features['num_digits'] = sum(c.isdigit() for c in url)
    features['num_letters'] = sum(c.isalpha() for c in url)
    features['num_params'] = url.count('?')
    features['num_equals'] = url.count('=')
    features['num_slashes'] = url.count('/')
    features['num_at'] = url.count('@')

    # 2️⃣ Lexical / composition cues
    features['has_https'] = 1 if url.lower().startswith('https') else 0
    features['has_ip'] = 1 if re.search(r'(\d{1,3}\.){3}\d{1,3}', parsed.netloc) else 0
    features['has_subdomain'] = 1 if parsed.netloc.count('.') > 1 else 0
    features['has_suspicious_words'] = 1 if re.search(r'(login|secure|verify|update|free|bank|click)', url.lower()) else 0

    # 3️⃣ Domain / TLD features
    extracted = tldextract.extract(url)
    main_domain = f"{extracted.domain}.{extracted.suffix}"
    if ':' in main_domain:  # remove port
        main_domain = main_domain.split(':')[0]
    features['domain_length'] = len(main_domain)
    features['in_alexa_top1m'] = 1 if main_domain in alexa_domains_set else 0
    '''
    if features['in_alexa_top1m'] == 0 and main_domain:  # only check if domain not in top1M
        # find closest match in Alexa domains
        best_match, score, _ = process.extractOne(main_domain, alexa_domains_set, scorer=fuzz.ratio)
        features['closest_alexa_domain'] = best_match
        features['closest_alexa_score'] = score  # 0-100
    else:
        features['closest_alexa_score'] = 1000  # high score to show that it is original url
    '''
    ext = tldextract.extract(url)
    tld = ext.suffix    # "com", "co.uk", "org"
    features['tld'] = tld if tld else 'unknown'
    features['tld_phish_ratio'] = tld_stats['phish_ratio'][features['tld']]  if tld_stats['phish_ratio'][features['tld']]  else 0.5
    features['tld_total_frequency'] = tld_stats['total'][features['tld']] if tld_stats['total'][features['tld']] else 1

    # 4️⃣ Ratios
    features['digit_ratio'] = features['num_digits'] / (features['url_length'] + 1e-5)
    features['special_char_ratio'] = (features['num_hyphens'] + features['num_dots'] + features['num_slashes']) / (features['url_length'] + 1e-5)

    # 5️⃣ Entropy (measures randomness / obfuscation)
    features['url_entropy'] = calculate_entropy(url)

    # 6️⃣ Misplacement indicators
    # '@' symbol used to hide real domain (like "http://evil.com@legit.com")
    features['at_in_domain'] = 1 if '@' in parsed.netloc else 0
    
    # Double slashes '//' appearing after path (used to trick users)
    features['double_slash_in_path'] = 1 if re.search(r'/.+//', parsed.path) else 0

    return features

In [7]:
from bs4 import XMLParsedAsHTMLWarning
import warnings
warnings.filterwarnings("ignore", category=XMLParsedAsHTMLWarning)


In [8]:
import re
import requests
from bs4 import BeautifulSoup
from datetime import datetime
import socket

def extract_additional_features(url):
    features = {}
    features['url'] = url # to merger with original dataframe using map
    # --- HTML content (download page) ---
    try:
        response = requests.get(url, timeout=5)
        html = response.text
        soup = BeautifulSoup(html, 'html.parser')
    except:
        html = ''
        soup = None
        return features

    # 1. ServerFormHandler
    forms = soup.find_all('form') if soup else []
    features['ServerFormHandler'] = 1
    for form in forms:
        action = form.get('action')
        if action and (action.startswith('mailto:') or action.strip() == "" or not url.split('/')[2] in action):
            features['ServerFormHandler'] = 0  # suspicious
            break

    # 2. InfoEmail
    features['InfoEmail'] = 1 if re.search(r'mailto:', html) else 0

    # 4. WebsiteForwarding
    try:
        r = requests.get(url, timeout=5, allow_redirects=True)
        features['WebsiteForwarding'] = 1 if len(r.history) <= 2 else 0
    except:
        features['WebsiteForwarding'] = 0


    # 6. DisableRightClick
    features['DisableRightClick'] = 1 if re.search(r'oncontextmenu\s*=\s*"return false"', html, re.IGNORECASE) else 0

    # 7. UsingPopupWindow
    features['UsingPopupWindow'] = 1 if re.search(r'window\.open', html, re.IGNORECASE) else 0

    # 8. IframeRedirection
    features['IframeRedirection'] = 1 if '<iframe' in html.lower() else 0



    # 10. DNSRecording
    try:
        socket.gethostbyname(url.split('/')[2])
        features['DNSRecording'] = 1
    except:
        features['DNSRecording'] = 0


    

    # 14. LinksPointingToPage
    features['LinksPointingToPage'] = len(re.findall(r'<a href=', html)) if html else 0

    return features


In [9]:
from tqdm import tqdm
tqdm.pandas()
df = train_df.loc[:10000,['text']]
df['url'] = df['text']
df.drop('text',axis=1,inplace=True)
df = df.assign(**df.url.progress_apply(lambda url : pd.Series(extract_features(url))))
df.head()

100%|██████████| 10001/10001 [00:02<00:00, 4628.21it/s]


Unnamed: 0,url,url_length,hostname_length,path_length,num_dots,num_hyphens,num_digits,num_letters,num_params,num_equals,...,domain_length,in_alexa_top1m,tld,tld_phish_ratio,tld_total_frequency,digit_ratio,special_char_ratio,url_entropy,at_in_domain,double_slash_in_path
0,xenophongroup.com/montjoie/compgns.htm,45,17,21,2,0,0,38,0,0,...,17,0,com,0.396303,373979,0.0,0.133333,3.882823,0,0
1,www.azzali.eu/&usg=AOvVaw2phVSb_ENMrkATGNx5LQ0l,54,13,34,2,0,3,42,0,1,...,9,0,eu,0.520258,1703,0.055556,0.092593,5.060262,0,0
2,guildmusic.edu.au/js/index.htm,37,17,13,3,0,0,29,0,0,...,17,0,edu.au,0.450633,395,0.0,0.189189,4.046763,0,0
3,memo.unexpectedrunner.com/ezxgytw4et\nholotili...,99,25,67,5,0,1,86,0,0,...,20,0,com,0.396303,373979,0.010101,0.10101,4.377841,0,0
4,en.wikipedia.org/wiki/Category:American_televi...,63,16,40,2,0,0,53,0,0,...,13,1,org,0.375147,59502,0.0,0.095238,4.40492,0,0


In [10]:

from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm

urls = df['url'].tolist()
results = []

with ThreadPoolExecutor(max_workers=1000) as executor:  # try 20–50 threads
    futures = {executor.submit(extract_additional_features, url): url for url in urls}
    for future in tqdm(as_completed(futures), total=len(urls), desc="Extracting features"):
        try:
            results.append(future.result())
        except Exception as e:
            results.append({})

Extracting features: 100%|██████████| 10001/10001 [00:18<00:00, 533.32it/s] 


In [11]:
features_df = pd.DataFrame(results)
features_df = features_df.drop_duplicates(subset='url').set_index('url')

df = df.assign(**df.url.progress_apply(lambda url : features_df.loc[url]))
df.head()

100%|██████████| 10001/10001 [00:00<00:00, 11116.56it/s]


Unnamed: 0,url,url_length,hostname_length,path_length,num_dots,num_hyphens,num_digits,num_letters,num_params,num_equals,...,at_in_domain,double_slash_in_path,ServerFormHandler,InfoEmail,WebsiteForwarding,DisableRightClick,UsingPopupWindow,IframeRedirection,DNSRecording,LinksPointingToPage
0,xenophongroup.com/montjoie/compgns.htm,45,17,21,2,0,0,38,0,0,...,0,0,,,,,,,,
1,www.azzali.eu/&usg=AOvVaw2phVSb_ENMrkATGNx5LQ0l,54,13,34,2,0,3,42,0,1,...,0,0,,,,,,,,
2,guildmusic.edu.au/js/index.htm,37,17,13,3,0,0,29,0,0,...,0,0,,,,,,,,
3,memo.unexpectedrunner.com/ezxgytw4et\nholotili...,99,25,67,5,0,1,86,0,0,...,0,0,,,,,,,,
4,en.wikipedia.org/wiki/Category:American_televi...,63,16,40,2,0,0,53,0,0,...,0,0,,,,,,,,


In [12]:
import pandas as pd

def df_summarizer(df: pd.DataFrame, max_unique_values: int = 10):
    """
    Summarize a pandas DataFrame with info about:
    - Shape
    - Column types
    - Missing values
    - Unique values
    - Basic statistics for numeric columns
    - Sample values for categorical columns
    
    Args:
        df (pd.DataFrame): DataFrame to summarize
        max_unique_values (int): Max number of unique values to display for categorical columns
    
    Returns:
        pd.DataFrame: Summary table
    """
    summary = []
    for col in df.columns:
        col_type = df[col].dtype
        n_missing = df[col].isna().sum()
        n_unique = df[col].nunique(dropna=False)
        unique_vals = df[col].dropna().unique()
        
        # For categorical columns, show sample of unique values
        if col_type == object or n_unique <= max_unique_values:
            sample_values = unique_vals[:max_unique_values]
            sample_str = ', '.join(map(str, sample_values))
            if n_unique > max_unique_values:
                sample_str += ", ..."
        else:
            sample_str = ""
        
        # Basic statistics for numeric columns
        if pd.api.types.is_numeric_dtype(df[col]):
            stats = df[col].describe().to_dict()
        else:
            stats = {}
        
        summary.append({
            "column": col,
            "dtype": col_type,
            "n_missing": n_missing,
            "missing_pct": n_missing / len(df) * 100,
            "n_unique": n_unique,
            "sample_values": sample_str,
            "stats": stats
        })
    
    return pd.DataFrame(summary)

# ===================== Example usage =====================
# df_summary = df_summarizer(df)
# pd.set_option('display.max_colwidth', None)
# print(df_summary)


In [13]:
df_summary = df_summarizer(df)
pd.set_option('display.max_colwidth', None)
print(df_summary)

                  column    dtype  n_missing  missing_pct  n_unique  \
0                    url   object          0     0.000000      9992   
1             url_length    int64          0     0.000000       297   
2        hostname_length    int64          0     0.000000       114   
3            path_length    int64          0     0.000000       187   
4               num_dots    int64          0     0.000000        20   
5            num_hyphens    int64          0     0.000000        26   
6             num_digits    int64          0     0.000000       127   
7            num_letters    int64          0     0.000000       237   
8             num_params    int64          0     0.000000         5   
9             num_equals    int64          0     0.000000        16   
10           num_slashes    int64          0     0.000000        21   
11                num_at    int64          0     0.000000         3   
12             has_https    int64          0     0.000000         2   
13    

In [14]:

from tqdm import tqdm
tqdm.pandas()
all_df = [train_df, test_df,valid_df]
for i, df in enumerate(all_df):
    df['url'] = df['text']
    df.drop('text',axis=1,inplace=True)
    all_df[i] = df.assign(**df.url.progress_apply(lambda url : pd.Series(extract_features(url))))
train_df, test_df,valid_df = all_df

100%|██████████| 567056/567056 [02:02<00:00, 4610.77it/s]
100%|██████████| 70882/70882 [00:13<00:00, 5121.64it/s]
100%|██████████| 70882/70882 [00:16<00:00, 4259.59it/s]


In [None]:
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor, as_completed

tqdm.pandas()
all_df =  [train_df, test_df,valid_df] 
for i, df in enumerate(all_df):
    urls = df['url'].tolist()
    results = []

    with ThreadPoolExecutor(max_workers=500) as executor:  # try 20–50 threads
        futures = {executor.submit(extract_additional_features, url): url for url in urls}
        for future in tqdm(as_completed(futures), total=len(urls), desc="Extracting features"):
            try:
                results.append(future.result())
            except Exception as e:
                results.append({})

    features_df = pd.DataFrame(results)
    features_df = features_df.drop_duplicates(subset='url').set_index('url')
    df = df.assign(**df.url.progress_apply(lambda url : features_df.loc[url]))
    all_df[i] = df
valid_df = all_df
train_df, test_df,valid_df = all_df

Extracting features: 100%|█████████▉| 567054/567056 [18:55<00:00,  8.19it/s]    

In [None]:
def dataframe_summary(df):
    summary = pd.DataFrame({
        "DataType": df.dtypes,
        "Non-Null Count": df.count(),
        "Missing Values": df.isnull().sum(),
        "Unique Values": df.nunique(),
        "First Value": df.iloc[0],
        "Sample Values": df.apply(lambda x: x.unique()[:5])  # first 5 unique values
    })
    return summary

# Example usage
summary_table = dataframe_summary(train_df)
print(summary_table)

                     DataType  Non-Null Count  Missing Values  Unique Values  \
text                   object          567056               0         535838   
label                   int64          567056               0              2   
tld                    object          567056               0           1341   
tld_phish_ratio       float64          567056               0            348   
tld_total_frequency     int64          567056               0            252   
url_length              int64          567056               0            704   
hostname_length         int64          567056               0            243   
path_length             int64          567056               0            405   
num_dots                int64          567056               0             37   
num_hyphens             int64          567056               0             47   
num_digits              int64          567056               0            269   
num_letters             int64          5

In [None]:
train_df.to_csv(r'Dataset\url_train_df.csv',index=False)
test_df.to_csv(r'Dataset\url_test_df.csv',index=False)
valid_df.to_csv(r'Dataset\url_validation_df.csv',index=False)