# In [None]:
# Data Exploration Notebook for Heuristic Analysis
# Author: UWU/CST/21/083

import re
from pathlib import Path
from urllib.parse import urlparse

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Plot appearance: apply the matplotlib style sheet, then the seaborn palette
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")

# Read the URL dataset from disk (adjust the path to the real dataset).
# When the CSV is missing, fall back to a small hand-labelled sample so the
# remaining cells can still be exercised.
try:
    df = pd.read_csv('data/raw/url_dataset.csv')
except FileNotFoundError:
    print("Dataset not found. Creating sample data...")
    # Sample URLs and their labels, kept in matching order (0 and 1 are the
    # label values reported as benign and malicious further down)
    sample_urls = [
        "https://www.google.com/search?q=test",
        "http://192.168.1.1/login.php",
        "https://paypa1-login.secure.site.xyz/verify",
        "http://bit.ly/3abc123",
        "https://github.com/microsoft/vscode",
        "http://free-gift.reward.top/get",
        "https://my-bank-update.com/login/secure",
        "http://legitimate-site.com/about",
    ]
    sample_labels = [0, 1, 1, 1, 0, 1, 1, 0]

    df = pd.DataFrame(dict(url=sample_urls, label=sample_labels))
    print("Sample dataset created")
else:
    # Only reached when the CSV was read successfully
    print(f"Dataset loaded: {len(df)} URLs")
    print(f"Columns: {df.columns.tolist()}")

# Display basic info
print("\n=== Dataset Info ===")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() (that would emit a stray "None" line).
df.info()
print("\n=== First 5 rows ===")
print(df.head())

# Basic statistics: how many rows carry each label value
print("\n=== Label Distribution ===")
label_counts = df['label'].value_counts()
print(label_counts)
# .get(..., 0) keeps the summary working when one class is absent
print(f"\nBenign: {label_counts.get(0, 0)} URLs")
print(f"Malicious: {label_counts.get(1, 0)} URLs")

# Heuristic Feature Extraction Functions
def extract_features(url):
    """Extract basic heuristic features from a URL.

    Args:
        url: The URL to analyse. Values that are not ``str`` (for example
            the float NaN pandas produces for an empty CSV cell) are treated
            as an empty URL instead of raising.

    Returns:
        dict: Feature name -> numeric value, always with the same nine keys
        in the same order:

        * ``url_length``: number of characters in the URL.
        * ``domain_length``: number of characters in the hostname.
        * ``dot_count`` / ``hyphen_count``: ``.`` / ``-`` in the hostname.
        * ``has_ip``: 1 if the hostname *is* a dotted-quad IPv4 literal.
        * ``suspicious_tld``: 1 if the last hostname label is in the
          suspicious TLD list.
        * ``suspicious_keyword``: 1 if the lower-cased URL contains one of
          the suspicious keywords.
        * ``has_special_chars``: 1 if the authority part (userinfo, host,
          port) contains any of ``@#$%&``.
        * ``digit_ratio``: share of digit characters in the URL.

        If the URL cannot be parsed, every feature except ``url_length``
        is 0.
    """
    # Missing CSV cells arrive as float NaN; len() on them would raise.
    if not isinstance(url, str):
        url = ''

    features = {'url_length': len(url)}

    try:
        parsed = urlparse(url)
        # A scheme-less URL such as "www.example.com/about" is parsed as a
        # bare path with no host. Re-parse it as a network-path reference so
        # the domain features are not silently zero.
        if not parsed.scheme and not parsed.netloc:
            parsed = urlparse('//' + url)
        hostname = parsed.hostname or ''
        netloc = parsed.netloc or ''
    except ValueError:
        # urlparse raises ValueError for malformed authorities (for example
        # unbalanced IPv6 brackets); fall back to neutral defaults.
        for key in ('domain_length', 'dot_count', 'hyphen_count', 'has_ip',
                    'suspicious_tld', 'suspicious_keyword',
                    'has_special_chars', 'digit_ratio'):
            features[key] = 0
        return features

    # Domain features
    features['domain_length'] = len(hostname)
    features['dot_count'] = hostname.count('.')
    features['hyphen_count'] = hostname.count('-')

    # IP literal used in place of a domain name. fullmatch (rather than
    # search) avoids flagging names that merely embed a dotted quad, such as
    # "v1.2.3.4.example.com".
    ipv4_pattern = r'(?:\d{1,3}\.){3}\d{1,3}'
    features['has_ip'] = 1 if re.fullmatch(ipv4_pattern, hostname) else 0

    # Suspicious TLDs (urlparse already lower-cases the hostname)
    suspicious_tlds = ('.xyz', '.top', '.loan', '.tk', '.ml')
    tld = '.' + hostname.split('.')[-1] if '.' in hostname else ''
    features['suspicious_tld'] = 1 if tld in suspicious_tlds else 0

    # Suspicious keywords anywhere in the URL, case-insensitively
    suspicious_keywords = ('login', 'secure', 'verify', 'bank', 'account')
    url_lower = url.lower()
    features['suspicious_keyword'] = (
        1 if any(kw in url_lower for kw in suspicious_keywords) else 0
    )

    # Special characters are looked up in the whole authority component:
    # parsed.hostname has the "user@" part stripped, so checking it could
    # never detect the classic "http://paypal.com@evil.example/" trick.
    features['has_special_chars'] = (
        1 if any(c in netloc for c in '@#$%&') else 0
    )

    # Digit ratio over the full URL (guarded against the empty string)
    digit_count = len(re.findall(r'\d', url))
    features['digit_ratio'] = digit_count / len(url) if url else 0

    return features

# Apply feature extraction
print("\n=== Extracting Features ===")
# One feature dict per URL, collected in dataset order
features_list = [extract_features(u) for u in df['url']]

# Each dict becomes a row; its keys become the feature columns
features_df = pd.DataFrame(features_list)
# Append the feature columns to the right of the original dataset columns
df_features = pd.concat([df, features_df], axis=1)

extracted_names = features_df.columns.tolist()
print(f"Extracted {len(extracted_names)} features")
print("\nFeature columns:", extracted_names)

# Visualizations
print("\n=== Creating Visualizations ===")

# 1. Feature distributions by label: one box plot per feature comparing the
#    label == 0 (benign) rows with the label == 1 (malicious) rows.
feature_cols = ['url_length', 'domain_length', 'dot_count', 'hyphen_count',
                'digit_ratio', 'has_ip', 'suspicious_tld', 'suspicious_keyword']

fig, axes = plt.subplots(3, 3, figsize=(15, 12))
axes = axes.flatten()

# The class masks do not depend on the feature, so build them once
benign_mask = df_features['label'] == 0
malicious_mask = df_features['label'] == 1

for ax, feature in zip(axes, feature_cols):
    data_to_plot = [
        df_features.loc[benign_mask, feature].dropna(),
        df_features.loc[malicious_mask, feature].dropna(),
    ]

    ax.boxplot(data_to_plot)
    # Tick labels are set explicitly rather than through boxplot(labels=...),
    # which Matplotlib 3.9 deprecated in favour of tick_labels; this form
    # works on both older and newer releases.
    ax.set_xticklabels(['Benign', 'Malicious'])
    ax.set_title(f'{feature} Distribution')
    ax.set_ylabel('Value')
    ax.grid(True, alpha=0.3)

# Hide whatever grid cells are left over after the last feature
for unused_ax in axes[len(feature_cols):]:
    unused_ax.axis('off')

plt.suptitle('Feature Distributions: Benign vs Malicious URLs', fontsize=16, y=1.02)
plt.tight_layout()
# savefig does not create missing directories; make sure the target exists
Path('results').mkdir(parents=True, exist_ok=True)
plt.savefig('results/feature_distributions.png', dpi=300, bbox_inches='tight')
plt.show()

# 2. Correlation heatmap: pairwise correlation between the analysed
#    features and the label column.
print("\n=== Feature Correlation ===")
# NOTE: a column that is constant across the dataset has no defined
# correlation and shows up as NaN (blank cell) in the heatmap.
correlation_matrix = df_features[feature_cols + ['label']].corr()

plt.figure(figsize=(12, 10))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', center=0,
            square=True, linewidths=1, cbar_kws={"shrink": 0.8})
plt.title('Feature Correlation Matrix', fontsize=16)
plt.tight_layout()
# savefig does not create missing directories; make sure the target exists
Path('results').mkdir(parents=True, exist_ok=True)
plt.savefig('results/correlation_heatmap.png', dpi=300, bbox_inches='tight')
plt.show()

# 3. Feature importance analysis
print("\n=== Feature Importance Analysis ===")

# "Importance" here is the absolute difference between the per-class means
# of each feature (label == 1 vs label == 0). The values are not normalised,
# so features on larger scales (e.g. url_length) naturally rank higher than
# 0/1 flags.
benign_means = df_features.loc[df_features['label'] == 0, feature_cols].mean()
malicious_means = df_features.loc[df_features['label'] == 1, feature_cols].mean()
feature_importance = (malicious_means - benign_means).abs().to_dict()

# Sort by importance, largest first
feature_importance_sorted = dict(sorted(feature_importance.items(),
                                        key=lambda item: item[1], reverse=True))

print("\nFeature Importance (Difference between malicious and benign):")
for feature, importance in feature_importance_sorted.items():
    print(f"{feature}: {importance:.4f}")

# Plot feature importance as a horizontal bar chart
plt.figure(figsize=(12, 6))
features = list(feature_importance_sorted.keys())
importances = list(feature_importance_sorted.values())

bars = plt.barh(features, importances, color='skyblue')
# The plotted quantity is an absolute value, so the axis label says so
# (the previous "Malicious - Benign" wording suggested a signed difference).
plt.xlabel('Absolute Difference |Malicious - Benign|')
plt.title('Feature Importance for Malicious URL Detection', fontsize=16)
# barh draws the first entry at the bottom; invert so the largest is on top
plt.gca().invert_yaxis()

# Add value labels just to the right of each bar
for bar in bars:
    width = bar.get_width()
    plt.text(width + 0.001, bar.get_y() + bar.get_height() / 2,
             f'{width:.4f}', ha='left', va='center')

plt.tight_layout()
# savefig does not create missing directories; make sure the target exists
Path('results').mkdir(parents=True, exist_ok=True)
plt.savefig('results/feature_importance.png', dpi=300, bbox_inches='tight')
plt.show()

# 4. Statistical analysis: Welch's t-test (unequal variances) per feature,
#    comparing the label == 0 rows with the label == 1 rows.
print("\n=== Statistical Analysis ===")

benign_rows = df_features[df_features['label'] == 0]
malicious_rows = df_features[df_features['label'] == 1]

# The class sizes are the same for every feature, so check them once
if len(benign_rows) > 1 and len(malicious_rows) > 1:
    # Imported only when the tests will actually run, and only once rather
    # than on every loop iteration
    from scipy import stats

    for feature in feature_cols:
        benign = benign_rows[feature]
        malicious = malicious_rows[feature]

        # T-test
        t_stat, p_value = stats.ttest_ind(benign, malicious, equal_var=False)

        # A NaN p-value (e.g. the feature is constant in both classes) means
        # the test is undefined; report that instead of a misleading "No".
        if np.isnan(p_value):
            verdict = 'N/A (test undefined, e.g. zero variance)'
        else:
            verdict = 'Yes' if p_value < 0.05 else 'No'

        print(f"\n{feature}:")
        print(f"  Benign mean: {benign.mean():.4f}")
        print(f"  Malicious mean: {malicious.mean():.4f}")
        print(f"  T-statistic: {t_stat:.4f}")
        print(f"  P-value: {p_value:.4f}")
        print(f"  Significant: {verdict}")
else:
    # Previously this case produced no output at all
    print("Skipping t-tests: each class needs at least 2 samples")

# Save processed data: original columns plus all extracted feature columns
print("\n=== Saving Processed Data ===")
# to_csv does not create missing directories; make sure the target exists
Path('data/processed').mkdir(parents=True, exist_ok=True)
df_features.to_csv('data/processed/urls_with_features.csv', index=False)
print("Data saved to 'data/processed/urls_with_features.csv'")

print("\n=== Analysis Complete ===")
print(f"Total URLs analyzed: {len(df_features)}")
# Count the columns that were actually extracted (features_df), not the
# subset chosen for plotting, so this matches the "Extracted N features"
# line printed after extraction.
print(f"Features extracted: {len(features_df.columns)}")
print("Visualizations saved to 'results/' folder")