# Feature Engineering for Malicious URL Detection

In [5]:
import pandas as pd
import numpy as np
from urllib.parse import urlparse
import re
import ipaddress

In [6]:
# Load cleaned data
df = pd.read_csv('../cleaned_urls.csv')
print("Data shape:", df.shape)
print(df.head())

Data shape: (491876, 2)
                                                 url  label
0   https://www.wrotniak.net/photo/stylus/index.html      0
1  https://www.rottentomatoes.com/m/1036052-come_...      0
2  https://www.losangeles.craigslist.org/sgv/pts/...      0
3  https://www.moviestrailer.org/daydreamer-movie...      0
4                 https://www.tehraninteractive.com/      0


In [7]:
# Feature extraction functions
def extract_features(url):
    features = {}
    
    # Basic length features
    features['url_length'] = len(url)
    
    # Count digits
    features['num_digits'] = sum(c.isdigit() for c in url)
    
    # Count special characters
    special_chars = ['@', '?', '-', '_', '.', '/', '=', '&', '%', '+', '$', '#', '!', '*', '(', ')', '[', ']', '{', '}', '|', '\\', ':', ';', '"', "'", '<', '>', ',']
    features['num_special'] = sum(url.count(char) for char in special_chars)
    
    # Has IP
    try:
        parsed = urlparse(url)
        domain = parsed.netloc
        ipaddress.ip_address(domain)
        features['has_ip'] = 1
    except:
        features['has_ip'] = 0
    
    # Path length
    features['path_length'] = len(parsed.path)
    
    # Domain length
    features['domain_length'] = len(domain)
    
    # Number of subdomains
    features['num_subdomains'] = domain.count('.') - 1 if domain else 0
    
    # Has suspicious words
    suspicious_words = ['login', 'verify', 'secure', 'account', 'update', 'bank', 'paypal', 'free', 'win', 'password']
    features['has_suspicious_words'] = int(any(word in url.lower() for word in suspicious_words))
    
    # Entropy (measure of randomness)
    def entropy(s):
        from collections import Counter
        p, lns = Counter(s), float(len(s))
        return -sum(count/lns * np.log2(count/lns) for count in p.values())
    features['entropy'] = entropy(url)
    
    return features

# Apply to dataframe
features_df = df['url'].apply(extract_features).apply(pd.Series)
df_features = pd.concat([df, features_df], axis=1)

print("Features shape:", df_features.shape)
print(df_features.head())

Features shape: (491876, 11)
                                                 url  label  url_length  \
0   https://www.wrotniak.net/photo/stylus/index.html      0        48.0   
1  https://www.rottentomatoes.com/m/1036052-come_...      0        54.0   
2  https://www.losangeles.craigslist.org/sgv/pts/...      0        61.0   
3  https://www.moviestrailer.org/daydreamer-movie...      0        59.0   
4                 https://www.tehraninteractive.com/      0        34.0   

   num_digits  num_special  has_ip  path_length  domain_length  \
0         0.0          9.0     0.0         24.0           16.0   
1         7.0         11.0     0.0         24.0           22.0   
2        10.0         10.0     0.0         24.0           29.0   
3         0.0          9.0     0.0         30.0           21.0   
4         0.0          6.0     0.0          1.0           25.0   

   num_subdomains  has_suspicious_words   entropy  
0             1.0                   0.0  4.105055  
1             1.0  

In [8]:
# Save features
df_features.to_csv('../url_features.csv', index=False)
print("Features saved.")

Features saved.


In [None]:
# Load cleaned data
df = pd.read_csv('../cleaned_urls.csv')
print("Data shape:", df.shape)
print("Label distribution:", df['label'].value_counts())