# M4 - Subdomain & Path Depth Analysis

**Student ID**: IT24103016  
**Focus**: Count subdomains and directory depth in URL paths  
**Visualization**: Violin plot comparing subdomain count distributions  
**Input**: M3 output (IP Detection features)  
**Output**: M4 features for M5 (Scaling)

## Overview

1. Extract subdomain count and path depth features, plus supporting structure features (query parameter count, fragment flag, URL segment count)
2. Analyze URL structure complexity patterns
3. Create violin plot visualization for subdomain distribution
4. Save processed data for M5 (Normalization & Scaling)

In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from urllib.parse import urlparse
import os
import warnings

# Set up plotting
# Apply matplotlib's 'default' style, then select seaborn's "husl" palette.
# NOTE(review): the order looks deliberate - applying a style after choosing
# the palette would presumably reset the colour cycle; confirm before reordering.
plt.style.use('default')
sns.set_palette("husl")
# Suppress every Python warning for the rest of the session. This also hides
# deprecation notices from pandas/matplotlib, so remove it when debugging.
warnings.filterwarnings('ignore')

print("Libraries imported successfully!")

In [None]:
# Load the input table: the M3 (IP Detection Analysis) output when it exists,
# otherwise the raw URL dataset so the notebook can still run on its own.
m3_output_path = '../results/outputs/m3_ip_features.csv'

if not os.path.exists(m3_output_path):
    # No upstream output yet: start from the raw URL/Label file.
    print("⚠️ M3 output not found, loading raw data...")
    df = pd.read_csv('../data/raw/phishing_site_urls.csv')
    print(f"Loaded raw data: {df.shape}")
else:
    print("📂 Loading M3 output (IP Detection features)...")
    df = pd.read_csv(m3_output_path)
    print(f"✅ Loaded M3 data: {df.shape}")
    existing_columns = df.columns.tolist()
    print(f"Existing features: {existing_columns}")

# Quick sanity report on whichever table was loaded.
label_counts = df['Label'].value_counts()
print(f"\nDataset shape: {df.shape}")
print(f"Label distribution:\n{label_counts}")
print(f"\nSample URLs:")
print(df[['URL', 'Label']].head())

## Feature Engineering: Subdomain and Path Depth Analysis

Extract subdomain count and path depth, together with query parameter count, a fragment flag, and total URL segment count, to capture URL structure complexity.

In [None]:
def extract_structure_features(url):
    """
    Extract structural complexity features from a single URL.

    Scheme-less inputs such as ``www.example.com/login`` are re-parsed as
    network-path references so the host is not mistaken for the first path
    segment. Userinfo (``user:pass@``) and the port are excluded from the
    hostname before its dots are counted.

    Args:
        url: URL string, with or without a scheme.

    Returns:
        tuple: ``(subdomain_count, path_depth, query_params_count,
        has_fragment, url_segments)``.
          - subdomain_count: dots in the hostname minus one (0 for hostnames
            made only of digits and dots, i.e. IPv4-style addresses).
          - path_depth: number of non-empty path components.
          - query_params_count: number of '&'-separated query items.
          - has_fragment: 1 if the URL has a '#fragment', else 0.
          - url_segments: non-empty pieces of the raw URL split on '/'.
        If the input is not a string or cannot be parsed, a message is
        printed and ``(0, 0, 0, 0, 0)`` is returned.
    """
    try:
        if not isinstance(url, str):
            raise TypeError(f"expected str, got {type(url).__name__}")

        parsed = urlparse(url)
        if not parsed.netloc:
            # urlparse only recognises a host after '//'. Without it the host
            # lands in .path (or .scheme) and would inflate path_depth, so
            # parse again as a network-path reference.
            parsed = urlparse('//' + url)

        # .hostname drops any userinfo and port for us; None when no host.
        hostname = parsed.hostname or ''

        # Subdomains = dots beyond the one separating domain and TLD.
        dot_count = hostname.count('.')
        if dot_count == 0 or hostname.replace('.', '').isdigit():
            # Bare names and dotted-digit (IP) hosts have no subdomains.
            subdomain_count = 0
        else:
            subdomain_count = dot_count - 1

        # Directory levels: non-empty components of the path.
        path_depth = sum(1 for part in parsed.path.split('/') if part)

        # Query parameters: '&'-separated items, 0 when there is no query.
        query = parsed.query
        query_params_count = query.count('&') + 1 if query else 0

        # Binary flag for text after '#'.
        has_fragment = 1 if parsed.fragment else 0

        # Total non-empty pieces of the raw URL split on '/'
        # (includes the 'scheme:' piece when a scheme is present).
        url_segments = sum(1 for part in url.split('/') if part)

        return subdomain_count, path_depth, query_params_count, has_fragment, url_segments

    except Exception as e:
        # str() keeps this handler safe for non-string inputs such as NaN,
        # which cannot be sliced directly.
        print(f"Error processing URL: {str(url)[:50]}... - {e}")
        return 0, 0, 0, 0, 0

# Smoke-test the extractor on a handful of hand-picked URLs
test_urls = [
    'https://www.google.com',
    'https://mail.google.com/mail/u/0/inbox',
    'http://suspicious.sub.domain.phishing-site.com/login/verify/account/update.php?id=123&token=abc',
    'https://github.com/user/repository/issues/123',
    'http://192.168.1.1/admin',
    'https://very.long.subdomain.chain.example.com/deep/path/structure/file.html#section1'
]

# (column title, left-aligned width) for the printed table
table_layout = [
    ('URL', 60),
    ('Subdomains', 10),
    ('Path Depth', 10),
    ('Query Params', 12),
    ('Fragment', 8),
    ('Segments', 8),
]
column_widths = [width for _, width in table_layout]

print("Testing structure extraction function:")
print(" ".join(f"{title:<{width}}" for title, width in table_layout))
print("-" * 110)
for candidate in test_urls:
    # URL is truncated to 58 chars so it always fits its 60-wide column
    cells = [candidate[:58], *extract_structure_features(candidate)]
    print(" ".join(f"{cell:<{width}}" for cell, width in zip(cells, column_widths)))

In [None]:
# Run the structure extractor over every URL and attach the results as columns
print("🔧 M4 Feature Engineering: Extracting URL structure features...")

# Column names, in the same order as the tuple returned by the extractor
m4_features = ['subdomain_count', 'path_depth', 'query_params_count', 'has_fragment', 'url_segments']

# One 5-tuple per row
structure_features = df['URL'].apply(extract_structure_features)

# Unpack tuple position i into the i-th feature column
for position, column_name in enumerate(m4_features):
    df[column_name] = [row_features[position] for row_features in structure_features]

print("✅ M4 Features extracted:")
print(f"   {m4_features}")

# Show the first rows alongside the original URL and label
print(f"\nSample of extracted M4 features:")
sample_cols = ['URL', 'Label'] + m4_features
sample_df = df[sample_cols].head(10)
print(sample_df.to_string(index=False))

# Summary statistics for the new columns
print(f"\nM4 Feature Statistics:")
feature_stats = df[m4_features].describe()
print(feature_stats.round(3))

## Data Analysis & Statistical Comparison

Analyze the relationship between URL structure features and phishing labels.

In [None]:
# Compare the M4 features between the two label groups
print("📊 M4 Analysis: URL Structure Features by Label")
print("=" * 60)

# Mean / std / median / max of each feature, per label
summary_stats = ['mean', 'std', 'median', 'max']
grouped_features = df.groupby('Label')[m4_features]
structure_analysis = grouped_features.agg(summary_stats).round(3)
print(structure_analysis)

# Row subsets reused by later cells
is_good = df['Label'] == 'good'
is_bad = df['Label'] == 'bad'
good_urls = df[is_good]
bad_urls = df[is_bad]

print(f"\n🔍 Detailed Comparison (Bad - Good URLs):")
print("-" * 50)

# Per-feature mean gap; positive means the feature is larger for bad URLs
for column in m4_features:
    mean_good = good_urls[column].mean()
    mean_bad = bad_urls[column].mean()
    gap = mean_bad - mean_good

    print(f"{column}:")
    print(f"  Good URLs: {mean_good:.3f} | Bad URLs: {mean_bad:.3f} | Diff: {gap:+.3f}")

# Pairwise correlation between the M4 features
print(f"\n📈 Feature Correlations:")
correlation_matrix = df[m4_features].corr()
print(correlation_matrix.round(3))

## Visualization: Required Violin Plot

Create the main visualization: a violin plot comparing the subdomain count distributions of good and bad URLs.

In [None]:
# Create comprehensive visualization with required violin plot
# Layout: a 2x3 grid. The violin plot spans grid slots 1-2; the remaining
# panels use slots 3-6. This cell reads `df`, `m4_features`, `good_urls`
# and `bad_urls`, which must already exist from earlier cells.
fig = plt.figure(figsize=(16, 12))

# Main required visualization: Violin plot for subdomain count (takes 2 columns)
ax1 = plt.subplot(2, 3, (1, 2))
# One series per label, good first and bad second; the same ordering is used
# for every panel so the `colors` list below lines up by index.
violin_data = [df[df['Label'] == 'good']['subdomain_count'], 
               df[df['Label'] == 'bad']['subdomain_count']]

# Create violin plot
parts = ax1.violinplot(violin_data, positions=[1, 2], showmeans=True, showmedians=True)

# Customize violin plot colors
colors = ['#2E8B57', '#DC143C']  # Green for good, red for bad
# presumably parts['bodies'] follows the order of violin_data - verify if the
# colours ever look swapped
for i, pc in enumerate(parts['bodies']):
    pc.set_facecolor(colors[i])
    pc.set_alpha(0.7)

ax1.set_xticks([1, 2])
ax1.set_xticklabels(['Good URLs', 'Bad URLs'])
ax1.set_ylabel('Subdomain Count', fontsize=12, fontweight='bold')
ax1.set_title('Subdomain Count Distribution by Label\n(M4 Required Visualization)', 
              fontsize=14, fontweight='bold', pad=20)
ax1.grid(axis='y', alpha=0.3)

# Add statistical annotations
# These four values are also reused by the insight printout after the figure.
good_sub_mean = good_urls['subdomain_count'].mean()
bad_sub_mean = bad_urls['subdomain_count'].mean()
good_sub_median = good_urls['subdomain_count'].median()
bad_sub_median = bad_urls['subdomain_count'].median()

# Text box anchored in axes coordinates (top-left corner of the violin panel)
ax1.text(0.02, 0.98, 
         f'Good URLs:\n  Mean: {good_sub_mean:.2f}\n  Median: {good_sub_median}\n\n'
         f'Bad URLs:\n  Mean: {bad_sub_mean:.2f}\n  Median: {bad_sub_median}\n\n'
         f'Difference: {bad_sub_mean-good_sub_mean:+.2f}', 
         transform=ax1.transAxes, verticalalignment='top',
         bbox=dict(boxstyle='round', facecolor='wheat', alpha=0.8))

# Path Depth Box Plot
ax2 = plt.subplot(2, 3, 3)
path_data = [df[df['Label'] == 'good']['path_depth'], 
             df[df['Label'] == 'bad']['path_depth']]
# patch_artist=True makes the boxes fillable so they can take the label colours
box_plot = ax2.boxplot(path_data, labels=['Good', 'Bad'], patch_artist=True)
for i, box in enumerate(box_plot['boxes']):
    box.set_facecolor(colors[i])
    box.set_alpha(0.7)

ax2.set_ylabel('Path Depth', fontweight='bold')
ax2.set_title('Path Depth Distribution', fontweight='bold')
ax2.grid(axis='y', alpha=0.3)

# Query Parameters Distribution
ax3 = plt.subplot(2, 3, 4)
query_data = [df[df['Label'] == 'good']['query_params_count'], 
              df[df['Label'] == 'bad']['query_params_count']]
# Integer bin edges start at 0 and stop below min(15, max + 2), so the edges
# never go past 14; presumably this keeps a long tail from stretching the axis.
ax3.hist([query_data[0], query_data[1]], bins=range(0, min(15, max(df['query_params_count'])+2)), 
         alpha=0.7, label=['Good URLs', 'Bad URLs'], color=colors)
ax3.set_xlabel('Query Parameters Count', fontweight='bold')
ax3.set_ylabel('Frequency', fontweight='bold')
ax3.set_title('Query Parameters Distribution', fontweight='bold')
ax3.legend()
ax3.grid(axis='y', alpha=0.3)

# URL Segments Distribution  
ax4 = plt.subplot(2, 3, 5)
segments_data = [df[df['Label'] == 'good']['url_segments'], 
                 df[df['Label'] == 'bad']['url_segments']]
# Same capped-bin approach as the query panel, with edges never past 19
ax4.hist([segments_data[0], segments_data[1]], bins=range(0, min(20, max(df['url_segments'])+2)), 
         alpha=0.7, label=['Good URLs', 'Bad URLs'], color=colors)
ax4.set_xlabel('URL Segments Count', fontweight='bold')
ax4.set_ylabel('Frequency', fontweight='bold')
ax4.set_title('URL Segments Distribution', fontweight='bold')
ax4.legend()
ax4.grid(axis='y', alpha=0.3)

# Feature Correlation Heatmap
ax5 = plt.subplot(2, 3, 6)
# Recomputed here so this cell does not depend on the value from the analysis cell
correlation_matrix = df[m4_features].corr()
# Colour scale fixed to [-1, 1], the full range of a correlation coefficient
im = ax5.imshow(correlation_matrix, cmap='coolwarm', aspect='auto', vmin=-1, vmax=1)
ax5.set_xticks(range(len(m4_features)))
ax5.set_yticks(range(len(m4_features)))
# Underscores become line breaks so the long feature names fit as tick labels
ax5.set_xticklabels([f.replace('_', '\n') for f in m4_features], rotation=45, ha='right')
ax5.set_yticklabels([f.replace('_', '\n') for f in m4_features])
ax5.set_title('Feature Correlations', fontweight='bold')

# Add correlation values
# imshow puts matrix row i on the y axis and column j on the x axis, hence (j, i).
# `text` is assigned but never read afterwards.
for i in range(len(m4_features)):
    for j in range(len(m4_features)):
        text = ax5.text(j, i, f'{correlation_matrix.iloc[i, j]:.2f}',
                       ha="center", va="center", color="black", fontsize=8)

plt.colorbar(im, ax=ax5, shrink=0.8)

plt.suptitle('M4: Subdomain & Path Depth Analysis', fontsize=16, fontweight='bold', y=0.98)
plt.tight_layout()
plt.show()

# Summarise the headline numbers behind the figure
subdomain_gap = bad_sub_mean - good_sub_mean

print(f"\n🔍 Key Insights from M4 Analysis:")
print("=" * 50)
print("1. Subdomain Analysis:")
print(f"   - Good URLs average: {good_sub_mean:.2f} subdomains")
print(f"   - Bad URLs average: {bad_sub_mean:.2f} subdomains")
print(f"   - Difference: {subdomain_gap:+.2f} subdomains")

# Path depth means are computed here because they are not needed earlier
good_path_mean = good_urls['path_depth'].mean()
bad_path_mean = bad_urls['path_depth'].mean()
path_gap = bad_path_mean - good_path_mean

print("\n2. Path Depth Analysis:")
print(f"   - Good URLs average: {good_path_mean:.2f} levels")
print(f"   - Bad URLs average: {bad_path_mean:.2f} levels")
print(f"   - Difference: {path_gap:+.2f} levels")

# Verdict lines: does each feature point in the expected direction?
subdomain_verdict = (
    "\n✅ Bad URLs use more subdomains - confirms complex structure pattern!"
    if bad_sub_mean > good_sub_mean
    else "\n⚠️  Good URLs use more subdomains - unexpected pattern!"
)
print(subdomain_verdict)

path_verdict = (
    "✅ Bad URLs have deeper paths - confirms complex structure pattern!"
    if bad_path_mean > good_path_mean
    else "ℹ️  Good URLs have deeper paths - legitimate sites may have complex structures"
)
print(path_verdict)

## Feature Summary & Export

Create a summary of extracted features and save the data for M5 (Normalization & Scaling).

In [None]:
# Build a one-row-per-feature summary table for the M4 columns
feature_descriptions = [
    'Number of subdomains in the URL hostname',
    'Depth of directory structure in URL path',
    'Number of query parameters in URL',
    'Binary flag for presence of URL fragment (#)',
    'Total number of URL segments separated by /'
]
feature_types = ['Integer', 'Integer', 'Integer', 'Binary', 'Integer']


def _per_feature(frame, statistic):
    """Return `statistic` (a Series method name) for each M4 column of `frame`."""
    return [getattr(frame[name], statistic)() for name in m4_features]


m4_feature_summary = pd.DataFrame({
    'Feature': m4_features,
    'Description': feature_descriptions,
    'Type': feature_types,
    'Mean_Good': _per_feature(good_urls, 'mean'),
    'Mean_Bad': _per_feature(bad_urls, 'mean'),
    'Min_Value': _per_feature(df, 'min'),
    'Max_Value': _per_feature(df, 'max')
})

print("M4 Feature Engineering Summary:")
print("=" * 80)
rounded_summary = m4_feature_summary.round(3)
print(rounded_summary.to_string(index=False))

# Shape and columns of the dataset after adding the M4 features
all_columns = df.columns.tolist()
print(f"\nFinal dataset shape: {df.shape}")
print(f"New M4 features added: {m4_features}")
print(f"\nAll dataset columns: {all_columns}")

# Variance of each M4 feature, largest first
print(f"\n📊 Feature Variance Analysis:")
feature_variance = df[m4_features].var().sort_values(ascending=False)
print(feature_variance.round(4))

# Persist the enriched dataset and the summary table for M5 (Normalization & Scaling)
output_dir = '../results/outputs'
os.makedirs(output_dir, exist_ok=True)

# Full dataset, including the new M4 columns
output_path = os.path.join(output_dir, 'm4_structure_features.csv')
df.to_csv(output_path, index=False)
print(f"\n📊 Dataset with M1+M2+M3+M4 features saved to: {output_path}")
print("✅ Ready for M5 (Normalization & Scaling)")

# Per-feature summary table
summary_path = os.path.join(output_dir, 'm4_feature_summary.csv')
m4_feature_summary.to_csv(summary_path, index=False)
print(f"📋 M4 Feature summary saved to: {summary_path}")

print("\n🎯 M4 Analysis Complete: URL structure features (subdomains & path depth) successfully extracted")

# List every column except the URL text and the label, i.e. what M5 will scale
print("\nData preview for M5:")
non_feature_columns = ('URL', 'Label')
numeric_cols = []
for column in df.columns:
    if column not in non_feature_columns:
        numeric_cols.append(column)
print(f"Numeric features ready for scaling: {len(numeric_cols)} features")
print(f"Features: {numeric_cols}")