In [2]:
# ============================================
# FEATURE ENGINEERING - URL FEATURE EXTRACTION WITH TF-IDF
# ============================================

import os
import re
import math
from urllib.parse import urlparse
from collections import Counter
import numpy as np
import pandas as pd
from datasets import load_dataset
from tqdm import tqdm
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack, csr_matrix
import pickle

# Point the Hugging Face cache at the E: drive.
# NOTE(review): `datasets` is imported above this line; if the library reads
# HF_HOME at import time this assignment comes too late - confirm. The
# load_dataset calls in this notebook pass cache_dir explicitly either way.
os.environ['HF_HOME'] = 'E:/.cache/huggingface'

# Notebook banner: rule, title, rule (one line each).
print("=" * 60, "PHISHING DETECTION - FEATURE ENGINEERING", "=" * 60, sep="\n")

PHISHING DETECTION - FEATURE ENGINEERING


In [3]:
# ============================================
# URL FEATURE EXTRACTION FUNCTIONS
# ============================================

# TLDs flagged by the 'suspicious_tld' feature.
_SUSPICIOUS_TLDS = frozenset({
    'tk', 'ml', 'ga', 'cf', 'gq',  # Free Freenom TLDs
    'zip', 'review', 'country', 'kim', 'science',
    'work', 'party', 'gdn', 'link',
})

# Domains of free hosting platforms flagged by the 'free_hosting' feature.
_FREE_HOSTING_DOMAINS = (
    'github.io', 'gitlab.io', 'bitbucket.io',
    'netlify.app', 'vercel.app', 'web.app',
    'firebaseapp.com', 'herokuapp.com',
    'wordpress.com', 'wixsite.com', 'weebly.com',
    'blogspot.com', 'tumblr.com',
    'pages.dev', 'azurewebsites.net',
    '000webhostapp.com', 'freehosting.com',
    'googlepages.com', 'gitbook.io',
)

# Dotted-quad pattern used by the 'has_ip_address' feature.
_IPV4_PATTERN = re.compile(r'\b\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}\b')

# Feature names in output order. The order is the column order of the
# generated CSV files, so it must not change.
_URL_FEATURE_NAMES = (
    'url_length', 'domain_length', 'path_length', 'tld_length',
    'num_dots', 'num_hyphens', 'num_underscores', 'num_slashes',
    'num_question', 'num_ampersand', 'num_at',
    'has_ip_address', 'is_https', 'num_subdomains', 'has_port',
    'num_digits_domain', 'suspicious_tld', 'free_hosting',
    'url_entropy', 'has_www',
)


def _shannon_entropy(text):
    """Return the Shannon entropy (bits per character) of text; 0 if empty."""
    if not text:
        return 0
    length = len(text)
    entropy = 0
    # Entropy = -sum(p(x) * log2(p(x))) over the distinct characters.
    for count in Counter(text).values():
        probability = count / length
        entropy -= probability * math.log2(probability)
    return entropy


def _has_explicit_port(netloc):
    """Return 1 if the authority part of a URL carries a ':port', else 0."""
    # Drop any 'user:pass@' prefix so a password colon is not taken for a port.
    host_port = netloc.rpartition('@')[2]
    if host_port.startswith('['):
        # Bracketed IPv6 literal: only a colon after ']' introduces a port.
        return 1 if ']:' in host_port else 0
    return 1 if ':' in host_port else 0


def _uses_free_hosting(host):
    """Return 1 if host is, or is a subdomain of, a free hosting domain."""
    # Match on label boundaries; a plain substring test would also flag
    # unrelated hosts such as 'myweb.application.com' (contains 'web.app').
    for hosting_domain in _FREE_HOSTING_DOMAINS:
        if host == hosting_domain or host.endswith('.' + hosting_domain):
            return 1
    return 0


def extract_url_features(url):
    """
    Extract 20 numerical features from a URL.

    WHAT: Takes a URL string and returns a dict of numeric features
    WHY: Machine learning models need numbers, not text
    HOW: Parse the URL and count/check various patterns

    Host-based features (domain/TLD length, subdomains, digits, IP, www,
    suspicious TLD, free hosting) are computed from the hostname only, i.e.
    with any 'user:pass@' prefix and ':port' suffix removed and lowercased.

    Args:
        url: The URL to analyse. Expected to be a str.

    Returns:
        dict mapping the 20 feature names (always in the same order) to
        int/float values. If the URL cannot be processed (malformed URL or
        non-string input) a message is printed and every feature is 0.
    """
    features = {}

    try:
        # Reject None/NaN/bytes up front so they take the zero-feature
        # fallback below instead of failing somewhere in the middle.
        if not isinstance(url, str):
            raise TypeError(f"expected a URL string, got {type(url).__name__}")

        # urlparse breaks the URL into scheme, netloc, path, ...
        parsed = urlparse(url)
        scheme = parsed.scheme          # http or https (already lowercased)
        netloc = parsed.netloc          # [user[:pass]@]host[:port]
        path = parsed.path              # /path/to/page
        host = parsed.hostname or ''    # lowercased host without userinfo/port
        domain_parts = host.split('.')
        tld = domain_parts[-1]

        # ========================================
        # CATEGORY 1: BASIC LENGTH FEATURES (4)
        # ========================================

        # Feature 1: total number of characters in the URL.
        features['url_length'] = len(url)

        # Feature 2: length of "domain.tld" (last two host labels), or of the
        # whole host when it has a single label (e.g. "localhost").
        if len(domain_parts) >= 2:
            features['domain_length'] = len(domain_parts[-2] + '.' + tld)
        else:
            features['domain_length'] = len(host)

        # Feature 3: length of the path after the domain ("/login" = 6).
        features['path_length'] = len(path)

        # Feature 4: length of the TLD (.com = 3, .online = 6).
        features['tld_length'] = len(tld)

        # ========================================
        # CATEGORY 2: CHARACTER COUNT FEATURES (7)
        # ========================================

        # Features 5-11: raw character counts over the whole URL.
        features['num_dots'] = url.count('.')
        features['num_hyphens'] = url.count('-')
        features['num_underscores'] = url.count('_')
        features['num_slashes'] = url.count('/')
        features['num_question'] = url.count('?')
        features['num_ampersand'] = url.count('&')
        # '@' is a major red flag: "http://paypal.com@attacker.com" goes to
        # attacker.com because everything before '@' is userinfo.
        features['num_at'] = url.count('@')

        # ========================================
        # CATEGORY 3: SUSPICIOUS PATTERNS (5)
        # ========================================

        # Feature 12: host contains a dotted-quad IPv4 address.
        features['has_ip_address'] = 1 if _IPV4_PATTERN.search(host) else 0

        # Feature 13: URL uses the https scheme.
        features['is_https'] = 1 if scheme == 'https' else 0

        # Feature 14: host labels beyond "domain.tld"
        # ("a.b.c.example.com" = 3 subdomains).
        features['num_subdomains'] = max(0, len(domain_parts) - 2)

        # Feature 15: URL specifies an explicit port ("example.com:8080").
        features['has_port'] = _has_explicit_port(netloc)

        # Feature 16: digits in the host name ("paypa1.com" = 1). Port and
        # userinfo digits are not counted.
        features['num_digits_domain'] = sum(c.isdigit() for c in host)

        # ========================================
        # CATEGORY 4: SPECIAL CHECKS (4)
        # ========================================

        # Feature 17: TLD is on the phishing-prone list.
        features['suspicious_tld'] = 1 if tld in _SUSPICIOUS_TLDS else 0

        # Feature 18: host belongs to a free hosting platform.
        features['free_hosting'] = _uses_free_hosting(host)

        # Feature 19: Shannon entropy of the full URL; random-looking strings
        # (asdk2j3k.com) score higher than real words (amazon.com).
        features['url_entropy'] = _shannon_entropy(url)

        # Feature 20: host starts with "www." (weak signal on its own).
        features['has_www'] = 1 if host.startswith('www.') else 0

    except Exception as e:
        # Malformed or non-string input: report it and return zeros for all
        # features. str(url) keeps the report itself from failing when url is
        # not sliceable (e.g. None or NaN).
        print(f"Error parsing URL: {str(url)[:50]}... Error: {str(e)}")
        features = dict.fromkeys(_URL_FEATURE_NAMES, 0)

    return features

print("✅ URL feature extraction functions defined!")

✅ URL feature extraction functions defined!


In [5]:
# ============================================
# LOAD DATASET
# ============================================

print("\nLoading PhreshPhish dataset...")
dataset = load_dataset("phreshphish/phreshphish", cache_dir='E:/.cache/huggingface')

print(f"Train: {len(dataset['train']):,} samples")
print(f"Test: {len(dataset['test']):,} samples")

# ============================================
# FIT TF-IDF VECTORIZER ON TRAINING URLS
# ============================================

print("\nFitting TF-IDF vectorizer on training URLs...")

# Character n-grams (3 to 5 chars) over the lowercased URL, keeping at most
# 500 terms that occur in at least 5 URLs and in at most 90% of them.
tfidf_vectorizer = TfidfVectorizer(
    analyzer='char',
    ngram_range=(3, 5),
    max_features=500,
    min_df=5,
    max_df=0.9,
    lowercase=True
)

# Read the whole 'url' column in one call instead of one row lookup per URL.
# list() materialises it whether the library returns a list or a column view.
# The vectorizer is fitted on the train split only; the test split is only
# ever transformed.
train_urls = list(dataset['train']['url'])
tfidf_vectorizer.fit(train_urls)

print(f"TF-IDF vocabulary size: {len(tfidf_vectorizer.vocabulary_)}")

# Save vectorizer for later use (the same fitted vocabulary must be applied
# wherever these features are recomputed).
vectorizer_path = '../../../models/url-detection/tfidf_vectorizer.pkl'
os.makedirs(os.path.dirname(vectorizer_path), exist_ok=True)
with open(vectorizer_path, 'wb') as f:
    pickle.dump(tfidf_vectorizer, f)
print(f"TF-IDF vectorizer saved to {vectorizer_path}")

# ============================================
# BATCH PROCESSING FUNCTION
# ============================================

def process_batch_with_tfidf(batch_data, vectorizer):
    """
    Build the combined feature table for one batch of URLs.

    Args:
        batch_data: Mapping with 'url' and 'label' entries of equal length.
        vectorizer: Fitted vectorizer whose transform() yields the TF-IDF
            matrix for a sequence of URLs.

    Returns:
        DataFrame holding the hand-crafted URL feature columns, followed by
        the TF-IDF columns named 'tfidf_0', 'tfidf_1', ..., followed by
        'label'.
    """
    frame = pd.DataFrame(batch_data)
    urls = frame['url']

    # Hand-crafted features: one dict per URL, turned into a table.
    handcrafted = pd.DataFrame([extract_url_features(u) for u in urls])

    # Character n-gram TF-IDF features (sparse).
    ngram_matrix = vectorizer.transform(urls)

    # Stack both feature groups side by side in sparse form.
    stacked = hstack([csr_matrix(handcrafted.values), ngram_matrix])

    # Column names: hand-crafted names first, then positional TF-IDF names.
    column_names = handcrafted.columns.tolist()
    column_names += [f'tfidf_{k}' for k in range(ngram_matrix.shape[1])]

    # Densify for CSV export and attach the target column.
    result = pd.DataFrame(stacked.toarray(), columns=column_names)
    result['label'] = frame['label'].values
    return result

# ============================================
# PROCESS TRAIN SET
# ============================================

BATCH_SIZE = 10000
TRAIN_OUTPUT_FILE = '../../../data/processed/url-detection/phishing_features_train.csv'
TEST_OUTPUT_FILE = '../../../data/processed/url-detection/phishing_features_test.csv'


def _write_tfidf_split(split, output_file, vectorizer, batch_size, desc):
    """
    Extract manual + TF-IDF features for one dataset split and write a CSV.

    The split is processed batch_size rows at a time; the first batch creates
    the file with a header, later batches are appended without one.

    Args:
        split: Dataset split supporting len(), select() and column access.
        output_file: Path of the CSV file to (over)write.
        vectorizer: Fitted TF-IDF vectorizer passed to process_batch_with_tfidf.
        batch_size: Number of rows per batch.
        desc: Label shown on the progress bar.

    Returns:
        Number of columns written per row (0 if the split is empty).
    """
    # Create the output directory up front so the first to_csv cannot fail on
    # a missing folder (mirrors what is done for the vectorizer path).
    output_dir = os.path.dirname(output_file)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    n_rows = len(split)
    n_batches = (n_rows + batch_size - 1) // batch_size  # ceiling division
    n_columns = 0

    for i in tqdm(range(n_batches), desc=desc):
        start_idx = i * batch_size
        end_idx = min(start_idx + batch_size, n_rows)

        batch = split.select(range(start_idx, end_idx))
        batch_dict = {
            'url': batch['url'],
            'label': batch['label']
        }

        processed_batch = process_batch_with_tfidf(batch_dict, vectorizer)

        is_first = i == 0
        processed_batch.to_csv(
            output_file,
            index=False,
            mode='w' if is_first else 'a',
            header=is_first,
        )
        n_columns = processed_batch.shape[1]

    return n_columns


# ============================================
# PROCESS TRAIN SET
# ============================================

print("\n" + "="*60)
print("PROCESSING TRAIN SET")
print("="*60)

train_columns = _write_tfidf_split(
    dataset['train'], TRAIN_OUTPUT_FILE, tfidf_vectorizer,
    BATCH_SIZE, "Processing train batches",
)

print(f"Train set saved to {TRAIN_OUTPUT_FILE}")

# ============================================
# PROCESS TEST SET
# ============================================

print("\n" + "="*60)
print("PROCESSING TEST SET")
print("="*60)

test_columns = _write_tfidf_split(
    dataset['test'], TEST_OUTPUT_FILE, tfidf_vectorizer,
    BATCH_SIZE, "Processing test batches",
)

print(f"Test set saved to {TEST_OUTPUT_FILE}")

# ============================================
# SUMMARY
# ============================================

# Report the column counts that were actually written rather than assumed
# ones: max_features is only an upper bound on the TF-IDF vocabulary size.
n_tfidf = len(tfidf_vectorizer.vocabulary_)
n_manual = train_columns - n_tfidf - 1

print("\n" + "="*60)
print("FEATURE ENGINEERING COMPLETE")
print("="*60)
print(f"Train file: {TRAIN_OUTPUT_FILE}")
print(f"  Rows: {len(dataset['train']):,}")
print(f"  Columns: {train_columns} ({n_manual} manual + {n_tfidf} TF-IDF + 1 label)")
print(f"\nTest file: {TEST_OUTPUT_FILE}")
print(f"  Rows: {len(dataset['test']):,}")
print(f"  Columns: {test_columns}")
print(f"\nTF-IDF vectorizer: {vectorizer_path}")
print("\nReady for model training")


Loading PhreshPhish dataset...
Train: 371,941 samples
Test: 36,787 samples

Fitting TF-IDF vectorizer on training URLs...
TF-IDF vocabulary size: 500
TF-IDF vectorizer saved to ../../../models/url-detection/tfidf_vectorizer.pkl

PROCESSING TRAIN SET


Processing train batches: 100%|██████████| 38/38 [02:11<00:00,  3.47s/it]


Train set saved to ../../../data/processed/url-detection/phishing_features_train.csv

PROCESSING TEST SET


Processing test batches: 100%|██████████| 4/4 [00:13<00:00,  3.37s/it]

Test set saved to ../../../data/processed/url-detection/phishing_features_test.csv

FEATURE ENGINEERING COMPLETE
Train file: ../../../data/processed/url-detection/phishing_features_train.csv
  Rows: 371,941
  Columns: 521 (20 manual + 500 TF-IDF + 1 label)

Test file: ../../../data/processed/url-detection/phishing_features_test.csv
  Rows: 36,787
  Columns: 521

TF-IDF vectorizer: ../../../models/url-detection/tfidf_vectorizer.pkl

Ready for model training





In [4]:
# ============================================
# BATCH PROCESSING FUNCTION
# ============================================

def process_batch(batch_data):
    """
    Process a batch of data and extract the manual URL features.

    WHAT: Takes a batch of rows, extracts features, returns DataFrame
    WHY: The dataset is processed in batches rather than all at once
    HOW: One extract_url_features() call per URL in the batch

    Input: Dictionary with 'url' and 'label' entries of equal length
    Output: DataFrame with one column per URL feature plus a final
            'label' column, one row per input URL
    """

    # Convert batch to DataFrame for easier manipulation
    df = pd.DataFrame(batch_data)

    # Progress message. tqdm.write is used instead of print so the message
    # does not break up the progress bar of the loop calling this function.
    tqdm.write("   Extracting URL features...")

    # Apply feature extraction to every URL
    url_features_list = [extract_url_features(url) for url in df['url']]
    final_df = pd.DataFrame(url_features_list)

    # Attach the target column by position (not by index alignment), the same
    # way process_batch_with_tfidf does, so rows can never be mismatched.
    final_df['label'] = df['label'].values

    return final_df

print("✅ Batch processing function defined!")

✅ Batch processing function defined!


In [5]:
# ============================================
# MAIN FEATURE ENGINEERING PROCESS
# ============================================

# NOTE(review): this cell writes manual-features-only CSVs to the SAME paths
# as the TF-IDF cell in this notebook. Whichever cell runs last determines the
# file contents (and therefore the column count) - confirm which one is meant
# to be kept before relying on "Run All".

print("\n" + "="*60)
print("STARTING FEATURE EXTRACTION")
print("="*60)

# Load dataset
print("\n📥 Loading PhreshPhish dataset...")
dataset = load_dataset("phreshphish/phreshphish", cache_dir='E:/.cache/huggingface')

print(f"✅ Dataset loaded!")
print(f"   Train: {len(dataset['train']):,} samples")
print(f"   Test: {len(dataset['test']):,} samples")

# Define batch size
BATCH_SIZE = 10000  # Process 10K rows at a time

# Output CSV file paths (SEPARATE FILES!)
TRAIN_OUTPUT_FILE = '../../../data/processed/url-detection/phishing_features_train.csv'
TEST_OUTPUT_FILE = '../../../data/processed/url-detection/phishing_features_test.csv'


def _write_manual_split(split, output_file, batch_size, desc):
    """
    Extract the manual URL features for one dataset split and write a CSV.

    The split is processed batch_size rows at a time; the first batch creates
    the file with a header, later batches are appended without one.

    Args:
        split: Dataset split supporting len(), select() and column access.
        output_file: Path of the CSV file to (over)write.
        batch_size: Number of rows per batch.
        desc: Label shown on the progress bar.

    Returns:
        Number of columns written per row (0 if the split is empty).
    """
    # Make sure the target folder exists before the first write.
    output_dir = os.path.dirname(output_file)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    n_rows = len(split)
    n_batches = (n_rows + batch_size - 1) // batch_size  # ceiling division
    n_columns = 0

    for i in tqdm(range(n_batches), desc=desc):
        # Get batch indices
        start_idx = i * batch_size
        end_idx = min(start_idx + batch_size, n_rows)

        # Select batch and convert to dict (columns as keys)
        batch = split.select(range(start_idx, end_idx))
        batch_dict = {
            'url': batch['url'],
            'label': batch['label']
        }

        # Process batch
        processed_batch = process_batch(batch_dict)

        # Save incrementally: first batch creates the CSV with a header,
        # subsequent batches are appended without one.
        is_first = i == 0
        processed_batch.to_csv(
            output_file,
            index=False,
            mode='w' if is_first else 'a',
            header=is_first,
        )
        n_columns = processed_batch.shape[1]

    return n_columns


# ============================================
# PROCESS TRAIN SET
# ============================================

print("\n" + "="*60)
print("PROCESSING TRAIN SET")
print("="*60)

train_columns = _write_manual_split(
    dataset['train'], TRAIN_OUTPUT_FILE, BATCH_SIZE, "Processing train batches"
)

print(f"✅ Train set processed and saved to {TRAIN_OUTPUT_FILE}!")

# ============================================
# PROCESS TEST SET
# ============================================

print("\n" + "="*60)
print("PROCESSING TEST SET")
print("="*60)

test_columns = _write_manual_split(
    dataset['test'], TEST_OUTPUT_FILE, BATCH_SIZE, "Processing test batches"
)

print(f"✅ Test set processed and saved to {TEST_OUTPUT_FILE}!")

# ============================================
# SUMMARY
# ============================================

# Column counts are taken from what was actually written, not assumed.
print("\n" + "="*60)
print("FEATURE ENGINEERING COMPLETE!")
print("="*60)
print(f"📁 Train file: {TRAIN_OUTPUT_FILE}")
print(f"   - Rows: {len(dataset['train']):,}")
print(f"   - Columns: {train_columns}")
print(f"\n📁 Test file: {TEST_OUTPUT_FILE}")
print(f"   - Rows: {len(dataset['test']):,}")
print(f"   - Columns: {test_columns}")
print(f"\n✅ Ready for model training!")


STARTING FEATURE EXTRACTION

📥 Loading PhreshPhish dataset...
✅ Dataset loaded!
   Train: 371,941 samples
   Test: 36,787 samples

PROCESSING TRAIN SET


Processing train batches:   0%|          | 0/38 [00:00<?, ?it/s]

   Extracting URL features...


Processing train batches:   3%|▎         | 1/38 [00:00<00:28,  1.30it/s]

   Extracting URL features...


Processing train batches:   5%|▌         | 2/38 [00:01<00:25,  1.42it/s]

   Extracting URL features...


Processing train batches:   8%|▊         | 3/38 [00:02<00:23,  1.49it/s]

   Extracting URL features...


Processing train batches:  11%|█         | 4/38 [00:02<00:22,  1.54it/s]

   Extracting URL features...


Processing train batches:  13%|█▎        | 5/38 [00:03<00:21,  1.54it/s]

   Extracting URL features...


Processing train batches:  16%|█▌        | 6/38 [00:03<00:20,  1.57it/s]

   Extracting URL features...


Processing train batches:  18%|█▊        | 7/38 [00:04<00:19,  1.62it/s]

   Extracting URL features...


Processing train batches:  21%|██        | 8/38 [00:05<00:18,  1.64it/s]

   Extracting URL features...


Processing train batches:  24%|██▎       | 9/38 [00:05<00:17,  1.65it/s]

   Extracting URL features...


Processing train batches:  26%|██▋       | 10/38 [00:06<00:17,  1.62it/s]

   Extracting URL features...


Processing train batches:  29%|██▉       | 11/38 [00:06<00:16,  1.62it/s]

   Extracting URL features...


Processing train batches:  32%|███▏      | 12/38 [00:07<00:16,  1.61it/s]

   Extracting URL features...


Processing train batches:  34%|███▍      | 13/38 [00:08<00:15,  1.59it/s]

   Extracting URL features...


Processing train batches:  37%|███▋      | 14/38 [00:08<00:14,  1.60it/s]

   Extracting URL features...


Processing train batches:  39%|███▉      | 15/38 [00:09<00:14,  1.57it/s]

   Extracting URL features...


Processing train batches:  42%|████▏     | 16/38 [00:10<00:14,  1.47it/s]

   Extracting URL features...


Processing train batches:  45%|████▍     | 17/38 [00:11<00:14,  1.40it/s]

   Extracting URL features...


Processing train batches:  47%|████▋     | 18/38 [00:11<00:14,  1.42it/s]

   Extracting URL features...


Processing train batches:  50%|█████     | 19/38 [00:12<00:13,  1.42it/s]

   Extracting URL features...


Processing train batches:  53%|█████▎    | 20/38 [00:13<00:12,  1.41it/s]

   Extracting URL features...


Processing train batches:  55%|█████▌    | 21/38 [00:13<00:11,  1.47it/s]

   Extracting URL features...


Processing train batches:  58%|█████▊    | 22/38 [00:14<00:10,  1.46it/s]

   Extracting URL features...


Processing train batches:  61%|██████    | 23/38 [00:15<00:10,  1.48it/s]

   Extracting URL features...


Processing train batches:  63%|██████▎   | 24/38 [00:15<00:09,  1.51it/s]

   Extracting URL features...


Processing train batches:  66%|██████▌   | 25/38 [00:16<00:08,  1.53it/s]

   Extracting URL features...


Processing train batches:  68%|██████▊   | 26/38 [00:17<00:07,  1.55it/s]

   Extracting URL features...


Processing train batches:  71%|███████   | 27/38 [00:17<00:07,  1.57it/s]

   Extracting URL features...


Processing train batches:  74%|███████▎  | 28/38 [00:18<00:06,  1.59it/s]

   Extracting URL features...


Processing train batches:  76%|███████▋  | 29/38 [00:18<00:05,  1.59it/s]

   Extracting URL features...


Processing train batches:  79%|███████▉  | 30/38 [00:19<00:04,  1.62it/s]

   Extracting URL features...


Processing train batches:  82%|████████▏ | 31/38 [00:20<00:04,  1.63it/s]

   Extracting URL features...


Processing train batches:  84%|████████▍ | 32/38 [00:20<00:03,  1.63it/s]

   Extracting URL features...


Processing train batches:  87%|████████▋ | 33/38 [00:21<00:03,  1.63it/s]

   Extracting URL features...


Processing train batches:  89%|████████▉ | 34/38 [00:21<00:02,  1.64it/s]

   Extracting URL features...


Processing train batches:  92%|█████████▏| 35/38 [00:22<00:01,  1.65it/s]

   Extracting URL features...


Processing train batches:  95%|█████████▍| 36/38 [00:23<00:01,  1.65it/s]

   Extracting URL features...


Processing train batches: 100%|██████████| 38/38 [00:23<00:00,  1.59it/s]


   Extracting URL features...
✅ Train set processed and saved to ../../../data/processed/url-detection/phishing_features_train.csv!

PROCESSING TEST SET


Processing test batches:   0%|          | 0/4 [00:00<?, ?it/s]

   Extracting URL features...


Processing test batches:  25%|██▌       | 1/4 [00:00<00:01,  1.69it/s]

   Extracting URL features...


Processing test batches:  50%|█████     | 2/4 [00:01<00:01,  1.66it/s]

   Extracting URL features...


Processing test batches: 100%|██████████| 4/4 [00:02<00:00,  1.80it/s]

   Extracting URL features...
✅ Test set processed and saved to ../../../data/processed/url-detection/phishing_features_test.csv!

FEATURE ENGINEERING COMPLETE!
📁 Train file: ../../../data/processed/url-detection/phishing_features_train.csv
   - Rows: 371,941
   - Columns: 21

📁 Test file: ../../../data/processed/url-detection/phishing_features_test.csv
   - Rows: 36,787
   - Columns: 21

✅ Ready for model training!



