# M1 - URL Length & Hostname Length Analysis

**Student ID**: IT24103625  
**Focus**: Extract URL and hostname length features  
**Visualization**: Boxplot showing URL length distribution differences between Good/Bad URLs

## Overview

1. Extract URL length and hostname length features
2. Analyze the relationship between URL/hostname length and phishing labels
3. Create visualizations to show the discriminative power of these features
4. Save the processed data for use in the next module (M2)

In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from urllib.parse import urlparse
import os
import warnings

# Set up plotting
# Start from matplotlib's built-in "default" style so figures do not inherit
# a style applied earlier in the session.
plt.style.use('default')
# Use seaborn's "husl" palette as the default color palette.
sns.set_palette("husl")
# Suppress all warnings to keep the cell output uncluttered.
# NOTE(review): this blanket filter also hides deprecation warnings from the
# plotting/data libraries — consider narrowing it to specific categories.
warnings.filterwarnings('ignore')

# Confirmation message so the reader can see the setup cell ran
print("Libraries imported successfully!")

In [None]:
# Read the raw phishing URL dataset into a DataFrame
data_path = '../data/raw/phishing_site_urls.csv'
df = pd.read_csv(data_path)

# Report the basic structure of what was loaded
print("Dataset loaded successfully!")
print(f"Shape: {df.shape}")
print(f"Columns: {list(df.columns)}")

# Show how many rows carry each label
print("\nLabel distribution:")
label_counts = df['Label'].value_counts()
print(label_counts)

# Preview the first rows of the data
print("\nFirst few URLs:")
print(df.head())

## Feature Engineering: URL and Hostname Length

Extract the URL length and hostname length features. These are fundamental lexical features for phishing detection, because phishing URLs often use longer, more complex structures to deceive users.

In [None]:
def extract_length_features(url):
    """
    Extract URL length and hostname length features.

    The hostname is taken from the network location (``netloc``) reported by
    ``urlparse``. URLs without a ``//`` authority marker (for example
    ``example.com/login``) have an empty ``netloc``; for those the hostname
    is the text before the first ``/``, ``?`` or ``#``.

    Args:
        url: The URL to measure. Non-string values (e.g. NaN produced by a
            missing CSV cell, or None) are treated as an empty URL.

    Returns: tuple (url_length, hostname_length)
        ``hostname_length`` is 0 when the URL cannot be parsed.
    """
    # Missing values must not abort a DataFrame-wide ``apply``: they have no
    # length and cannot be sliced for an error message either.
    if not isinstance(url, str):
        return 0, 0

    # Calculate URL length
    url_length = len(url)

    # Parse URL to extract hostname
    try:
        netloc = urlparse(url).netloc
    except ValueError as e:
        # urlparse raises ValueError for malformed authorities
        # (e.g. unmatched square brackets).
        print(f"Error processing URL: {url[:50]}... - {e}")
        return url_length, 0

    if netloc:
        hostname = netloc
    else:
        # Scheme-less URL: cut at every delimiter that ends the authority,
        # so a query string or fragment is not counted as hostname.
        hostname = url
        for delimiter in '/?#':
            hostname = hostname.split(delimiter, 1)[0]

    return url_length, len(hostname)

# Sanity-check the extractor on a few representative URLs:
# a short legitimate site, a long phishing-style domain, a repository path,
# and an IP-address host with a query string.
test_urls = [
    'https://www.google.com',
    'http://very-long-suspicious-phishing-domain-name.fake-paypal-security-update.com/login/verify/account/details',
    'https://github.com/user/repo',
    'http://192.168.1.1/admin/login.php?redirect=dashboard&token=abc123',
]

print("Testing length extraction function:")
for sample_url in test_urls:
    lengths = extract_length_features(sample_url)
    # Show at most 60 characters of the URL, left-aligned, next to its lengths
    shown = sample_url[:60]
    print(f"{shown:<60} -> URL: {lengths[0]:3d}, Hostname: {lengths[1]:2d}")

In [None]:
# Compute both length features for every URL in the dataset
print("Extracting URL and hostname length features...")

# Each element of length_features is a (url_length, hostname_length) tuple
length_features = df['URL'].apply(extract_length_features)
df['url_length'] = [url_len for url_len, _ in length_features]
df['hostname_length'] = [host_len for _, host_len in length_features]

print("Feature extraction completed!")

# Show the first rows next to their newly extracted features
print("\nSample of extracted features:")
preview_columns = ['URL', 'Label', 'url_length', 'hostname_length']
sample_df = df[preview_columns].head(10)
print(sample_df.to_string(index=False))

# Min / max / mean for each new feature
print("\nFeature Statistics:")
for title, column in (('URL Length', 'url_length'), ('Hostname Length', 'hostname_length')):
    values = df[column]
    print(f"{title} - Min: {values.min()}, Max: {values.max()}, Mean: {values.mean():.2f}")

## Data Analysis & Statistical Comparison

Analyze the relationship between URL/hostname length and phishing labels.

In [None]:
# Per-label summary statistics for both length features
length_columns = ['url_length', 'hostname_length']
summary_stats = ['mean', 'std', 'min', 'max', 'median']
length_analysis = df.groupby('Label')[length_columns].agg(summary_stats).round(2)

print("Length Analysis by Label:")
print(length_analysis)

# Split the rows by label so the two classes can be compared directly
good_urls = df[df['Label'] == 'good']
bad_urls = df[df['Label'] == 'bad']

# Positive differences mean bad URLs are longer on average
url_len_diff = bad_urls['url_length'].mean() - good_urls['url_length'].mean()
hostname_len_diff = bad_urls['hostname_length'].mean() - good_urls['hostname_length'].mean()

print("\nMean Differences (Bad - Good):")
print(f"URL Length: {url_len_diff:.2f} characters")
print(f"Hostname Length: {hostname_len_diff:.2f} characters")

# Interquartile boundaries (25th / 75th percentile) per label and feature
print("\nPercentile Analysis:")
for label in ['good', 'bad']:
    subset = df[df['Label'] == label]
    print(f"\n{label.upper()} URLs:")
    for title, column in (('URL Length', 'url_length'), ('Hostname Length', 'hostname_length')):
        lower_q = subset[column].quantile(0.25)
        upper_q = subset[column].quantile(0.75)
        print(f"  {title} - 25th: {lower_q:.0f}, 75th: {upper_q:.0f}")

## Visualization: Required Boxplot

Create the main visualization: a boxplot showing how the URL length distribution differs between good and bad URLs, with a companion boxplot for hostname length.

In [None]:
# Create the main visualization: Boxplot for URL length distribution (Required visualization)
colors = ['#2E8B57', '#DC143C']  # Green for good, red for bad


def _draw_length_boxplot(ax, data, column, ylabel, title, note_color, **title_kwargs):
    """
    Draw a Good-vs-Bad boxplot of one length feature on the given axes.

    Args:
        ax: Matplotlib axes to draw on.
        data: DataFrame holding a 'Label' column ('good'/'bad') and `column`.
        column: Name of the numeric feature column to plot.
        ylabel: Text for the y-axis label.
        title: Text for the axes title.
        note_color: Face color of the annotation box that shows the means.
        **title_kwargs: Extra keyword arguments forwarded to ax.set_title.

    Returns: tuple (box, good_mean, bad_mean)
        `box` is the dict of artists returned by ax.boxplot.
    """
    # Filter each class once and reuse it for both the plot and the means
    good_values = data.loc[data['Label'] == 'good', column]
    bad_values = data.loc[data['Label'] == 'bad', column]

    box = ax.boxplot([good_values, bad_values],
                     patch_artist=True, notch=True, showmeans=True)
    # Name the ticks after plotting rather than via boxplot(labels=...):
    # that keyword is deprecated since Matplotlib 3.9 (renamed tick_labels),
    # whereas set_xticklabels works on both older and newer releases.
    ax.set_xticklabels(['Good URLs', 'Bad URLs'])

    # Customize boxplot colors
    for patch, color in zip(box['boxes'], colors):
        patch.set_facecolor(color)
        patch.set_alpha(0.7)

    ax.set_ylabel(ylabel, fontsize=12, fontweight='bold')
    ax.set_title(title, fontsize=14, fontweight='bold', **title_kwargs)
    ax.grid(axis='y', alpha=0.3)

    # Add statistical annotations (axes coordinates: top-left corner)
    good_avg = good_values.mean()
    bad_avg = bad_values.mean()
    ax.text(0.02, 0.98, f'Good Mean: {good_avg:.1f}\nBad Mean: {bad_avg:.1f}\nDifference: {bad_avg-good_avg:.1f}',
            transform=ax.transAxes, verticalalignment='top',
            bbox=dict(boxstyle='round', facecolor=note_color, alpha=0.8))

    return box, good_avg, bad_avg


fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))

# Boxplot 1: URL Length Distribution by Label (Required visualization)
box1, good_mean, bad_mean = _draw_length_boxplot(
    ax1, df, 'url_length',
    'URL Length (characters)',
    'URL Length Distribution by Label\n(M1 Required Visualization)',
    'wheat',
    pad=20,
)

# Boxplot 2: Hostname Length Distribution by Label
box2, good_host_mean, bad_host_mean = _draw_length_boxplot(
    ax2, df, 'hostname_length',
    'Hostname Length (characters)',
    'Hostname Length Distribution by Label',
    'lightblue',
)

plt.suptitle('M1: URL Length & Hostname Length Analysis', fontsize=16, fontweight='bold', y=1.02)
plt.tight_layout()
plt.show()

# Print key insights
print("\nKey Insights from M1 Analysis:")
print(f"{'='*50}")
print("1. URL Length Analysis:")
print(f"   - Good URLs average: {good_mean:.1f} characters")
print(f"   - Bad URLs average: {bad_mean:.1f} characters")
print(f"   - Difference: {bad_mean-good_mean:.1f} characters ({((bad_mean-good_mean)/good_mean*100):+.1f}%)")

print("\n2. Hostname Length Analysis:")
print(f"   - Good URLs average: {good_host_mean:.1f} characters")
print(f"   - Bad URLs average: {bad_host_mean:.1f} characters")
print(f"   - Difference: {bad_host_mean-good_host_mean:.1f} characters ({((bad_host_mean-good_host_mean)/good_host_mean*100):+.1f}%)")

# Compare the class means to state whether the expected phishing pattern holds
if bad_mean > good_mean:
    print("\nBad URLs are longer on average - confirms phishing pattern!")
else:
    print("\nGood URLs are longer on average - unexpected pattern!")

## Feature Summary & Export

Create a summary of extracted features and save the data for M2 (Character Counts Analysis).

In [None]:
# Build a one-row-per-feature summary table
feature_columns = ['url_length', 'hostname_length']
feature_summary = pd.DataFrame({
    'Feature': feature_columns,
    'Description': [
        'Total character count in the complete URL',
        'Character count in the hostname/domain portion only',
    ],
    'Type': ['Continuous'] * len(feature_columns),
    'Min_Value': [df[name].min() for name in feature_columns],
    'Max_Value': [df[name].max() for name in feature_columns],
    'Mean_Value': [df[name].mean() for name in feature_columns],
})

print("M1 Feature Engineering Summary:")
print("=" * 60)
print(feature_summary.to_string(index=False))

# Report the structure of the enriched dataset
print(f"\nFinal dataset shape: {df.shape}")
print("New features added: url_length, hostname_length")
print(f"\nDataset columns: {list(df.columns)}")

# Make sure the output folder exists and derive both output file paths
output_dir = '../results/outputs'
os.makedirs(output_dir, exist_ok=True)
output_path = os.path.join(output_dir, 'm1_url_length_features.csv')
summary_path = os.path.join(output_dir, 'm1_feature_summary.csv')

# Persist the enriched dataset for M2 (Character Counts Analysis)
df.to_csv(output_path, index=False)
print(f"\nDataset with M1 features saved to: {output_path}")
print("Ready for M2 (Character Counts Analysis)")

# Persist the feature summary table
feature_summary.to_csv(summary_path, index=False)
print(f"Feature summary saved to: {summary_path}")

print("\nM1 Analysis Complete: URL Length & Hostname Length features successfully extracted")