# URL PhishGuard: Complete Feature Engineering Pipeline

## Integrated Pipeline Overview

This notebook executes the complete phishing detection feature engineering pipeline, integrating all team member contributions:

**Pipeline Flow:**
1. **M1 (IT24103625)**: URL Length & Hostname Length → `m1_url_length_features.csv`
2. **M2 (IT24100950)**: Character Counts (uses M1 output) → `m2_character_features.csv`
3. **M3 (IT24103925)**: IP Detection & Digit Density (uses M2 output) → `m3_ip_features.csv`
4. **M4 (IT24103016)**: Subdomain & Path Depth (uses M3 output) → `m4_structure_features.csv`
5. **M5 (IT24100659)**: MinMax Scaling (uses M4 output) → `m5_scaled_features.csv`
6. **M6 (IT24104208)**: Feature Selection & Label Encoding (uses M5 output) → `m6_final_*.csv`

**Key Benefits:**
- **Sequential Processing**: Each member builds on previous work
- **No Duplication**: Each module focuses on specific feature types
- **Maintainable**: Clear separation of concerns
- **Scalable**: Easy to add new feature engineering steps

In [None]:
# Import required libraries for pipeline execution
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import time
from datetime import datetime
import warnings

# Environment setup: reset matplotlib to its default style, select seaborn's
# "husl" palette, and install a blanket "ignore" warnings filter.
plt.style.use('default')
sns.set_palette(palette="husl")
warnings.filterwarnings(action='ignore')

# Start-up banner: title, wall-clock start time, and a 60-character rule.
print(
    "🚀 URL PhishGuard Pipeline - Starting Complete Analysis",
    f"📅 Execution Time: {datetime.now():%Y-%m-%d %H:%M:%S}",
    "=" * 60,
    sep="\n",
)

## Pipeline Configuration & Data Validation

In [None]:
# Pipeline configuration: where the raw dataset lives, where module outputs
# are written, and one (id, name, student, output file) record per module.
_MODULE_FIELDS = ('id', 'name', 'student', 'output')
_MODULE_ROWS = (
    ('M1', 'URL Length Analysis', 'IT24103625', 'm1_url_length_features.csv'),
    ('M2', 'Character Counts', 'IT24100950', 'm2_character_features.csv'),
    ('M3', 'IP Detection', 'IT24103925', 'm3_ip_features.csv'),
    ('M4', 'Structure Analysis', 'IT24103016', 'm4_structure_features.csv'),
    ('M5', 'Scaling & Normalization', 'IT24100659', 'm5_scaled_features.csv'),
    ('M6', 'Feature Selection', 'IT24104208', 'm6_final_selected_features.csv'),
)
pipeline_config = {
    'data_path': '../data/raw/phishing_site_urls.csv',
    'output_dir': '../results/outputs',
    'modules': [dict(zip(_MODULE_FIELDS, row)) for row in _MODULE_ROWS],
}

# Make sure the output directory exists (no error if it is already there).
os.makedirs(pipeline_config['output_dir'], exist_ok=True)

# Validate the initial dataset: stop immediately if the file is absent,
# otherwise load it and print a short profile.
print("📊 Initial Dataset Validation")
print("-" * 30)

if not os.path.exists(pipeline_config['data_path']):
    raise FileNotFoundError(f"❌ Dataset not found at: {pipeline_config['data_path']}")

df_initial = pd.read_csv(pipeline_config['data_path'])
print("✅ Dataset loaded successfully")
print(f"   📈 Shape: {df_initial.shape}")
print(f"   📋 Columns: {df_initial.columns.tolist()}")
print(f"   🎯 Labels: {df_initial['Label'].value_counts().to_dict()}")
print(f"   🔍 Missing values: {df_initial.isnull().sum().sum()}")
# memory_usage() reports bytes; dividing by 1024**2 expresses it in MB.
print(f"   💾 Data size: {df_initial.memory_usage(deep=True).sum() / 1024**2:.1f} MB")

print("\n🔧 Pipeline Configuration:")
for module in pipeline_config['modules']:
    print(f"   {module['id']}: {module['name']} ({module['student']})")

## Pipeline Execution Status Checker

In [None]:
# Check pipeline execution status
def check_pipeline_status(config=None):
    """Report whether each configured module's output CSV exists and is readable.

    One line per module is printed, and the same information is returned as
    a list of rows. A file that exists but is empty or cannot be parsed as
    CSV is reported as unreadable instead of aborting the whole check.

    Args:
        config: Optional pipeline configuration dict with an ``'output_dir'``
            path and a ``'modules'`` list whose entries carry ``'id'``,
            ``'name'``, ``'student'`` and ``'output'``. When omitted, the
            notebook-level ``pipeline_config`` is used.

    Returns:
        A list with one dict per module, in configuration order, with the
        keys ``'Module'``, ``'Name'``, ``'Student'``, ``'Status'`` and
        ``'Details'``.
    """
    # Resolved at call time so the function can be defined before the
    # notebook-level configuration exists.
    if config is None:
        config = pipeline_config

    print("🔍 Checking Pipeline Execution Status")
    print("-" * 40)

    pipeline_status = []

    for module in config['modules']:
        output_path = os.path.join(config['output_dir'], module['output'])

        if not os.path.exists(output_path):
            status = "❌ Missing"
            details = "Output file not found"
        else:
            try:
                df_temp = pd.read_csv(output_path)
            except (pd.errors.EmptyDataError, pd.errors.ParserError) as exc:
                # The file is there but is not a usable CSV (e.g. a module
                # run that was interrupted while writing).
                status = "⚠️ Unreadable"
                details = f"Could not parse CSV: {exc}"
            else:
                status = "✅ Complete"
                details = f"Shape: {df_temp.shape}"

        pipeline_status.append({
            'Module': module['id'],
            'Name': module['name'],
            'Student': module['student'],
            'Status': status,
            'Details': details
        })

        print(f"{module['id']}: {module['name']:<25} {status} - {details}")

    return pipeline_status

# Run the status check and keep the returned per-module status rows.
status_report = check_pipeline_status()

## Execute Individual Modules

**Note**: If any outputs are missing, run the individual notebooks first, in this order (each one reads the previous module's output):
1. `IT24103625_M1_URL_Length.ipynb`
2. `IT24100950_M2_Char_Counts.ipynb`
3. `IT24103925_M3_IP_Detection.ipynb`
4. `IT24103016_M4_Depth_Count.ipynb`
5. `IT24100659_M5_Scaling_Norm.ipynb`
6. `IT24104208_M6_Selection_Encoding.ipynb`

In [None]:
# Load and analyze each pipeline stage
def analyze_pipeline_progression(config=None):
    """Load each available module output and summarise its feature columns.

    A module whose output file is missing, empty, or not parseable as CSV is
    reported with a warning and left out of the result, so one bad file does
    not stop the remaining stages from being analysed.

    Args:
        config: Optional pipeline configuration dict with an ``'output_dir'``
            path and a ``'modules'`` list whose entries carry ``'id'``,
            ``'name'`` and ``'output'``. When omitted, the notebook-level
            ``pipeline_config`` is used.

    Returns:
        A dict keyed by module id. Each value holds ``'data'`` (the loaded
        DataFrame), ``'shape'``, ``'features'`` (every column name other than
        ``URL``, ``Label`` and ``Label_Encoded``) and ``'feature_count'``.
    """
    # Resolved at call time so the function can be defined before the
    # notebook-level configuration exists.
    if config is None:
        config = pipeline_config

    print("📊 Pipeline Progression Analysis")
    print("=" * 40)

    # Bookkeeping columns that are not counted as engineered features.
    non_feature_cols = {'URL', 'Label', 'Label_Encoded'}
    pipeline_data = {}

    for module in config['modules']:
        output_path = os.path.join(config['output_dir'], module['output'])

        if not os.path.exists(output_path):
            print(f"⚠️ {module['id']} output not found - run {module['id']} notebook first")
            continue

        try:
            df = pd.read_csv(output_path)
        except (pd.errors.EmptyDataError, pd.errors.ParserError) as exc:
            print(f"⚠️ {module['id']} output could not be read ({exc}) - re-run {module['id']} notebook")
            continue

        # Selection is by column name only; dtypes are not inspected.
        feature_cols = [col for col in df.columns if col not in non_feature_cols]

        pipeline_data[module['id']] = {
            'data': df,
            'shape': df.shape,
            'features': feature_cols,
            'feature_count': len(feature_cols)
        }

        print(f"{module['id']}: {module['name']}")
        print(f"   📏 Shape: {df.shape}")
        print(f"   🔢 Features: {len(feature_cols)}")

        # Long feature lists are abbreviated to the first five names.
        if len(feature_cols) <= 10:
            print(f"   📋 Feature List: {feature_cols}")
        else:
            print(f"   📋 Sample Features: {feature_cols[:5]} ... (+{len(feature_cols)-5} more)")

        print()

    return pipeline_data

# Load whatever stage outputs exist; later cells look modules up in this dict by id.
pipeline_data = analyze_pipeline_progression()

## Pipeline Visualization & Summary

In [None]:
# Visualize pipeline progression as a 2x2 dashboard:
#   ax1 - feature count per loaded module
#   ax2 - label distribution of the M6 output
#   ax3 - feature categories present in the M6 output
#   ax4 - which configured modules have a loaded output
if pipeline_data:
    print("📈 Creating Pipeline Visualization")

    fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(16, 12))

    # 1. Feature Count Progression
    modules_completed = list(pipeline_data.keys())
    feature_counts = [pipeline_data[module]['feature_count'] for module in modules_completed]

    ax1.plot(modules_completed, feature_counts, 'bo-', linewidth=3, markersize=8)
    ax1.fill_between(modules_completed, feature_counts, alpha=0.3)
    ax1.set_xlabel('Pipeline Module', fontweight='bold')
    ax1.set_ylabel('Number of Features', fontweight='bold')
    ax1.set_title('Feature Engineering Progression', fontweight='bold', pad=20)
    ax1.grid(True, alpha=0.3)

    # String x-values sit at positions 0..n-1 on the categorical axis, so the
    # enumerate index is the x-coordinate of each point.
    for i, count in enumerate(feature_counts):
        ax1.annotate(f'{count}', (i, count), textcoords="offset points",
                     xytext=(0, 10), ha='center', fontweight='bold')

    if 'M6' in pipeline_data:
        # 2. Data Quality Check (label balance of the final dataset)
        final_data = pipeline_data['M6']['data']
        label_dist = final_data['Label'].value_counts()

        # value_counts() orders labels by frequency, so colours are looked up
        # by label name; a positional palette would paint 'bad' green
        # whenever it is the majority class.
        label_colors = {'good': '#2E8B57', 'bad': '#DC143C'}
        label_keys = [str(label).lower() for label in label_dist.index]
        if all(key in label_colors for key in label_keys):
            colors = [label_colors[key] for key in label_keys]
        else:
            # Unknown label vocabulary: keep the original positional palette.
            colors = ['#2E8B57', '#DC143C']

        wedges, texts, autotexts = ax2.pie(label_dist.values, labels=label_dist.index,
                                           autopct='%1.1f%%', colors=colors, startangle=90)
        ax2.set_title('Final Dataset Label Distribution', fontweight='bold')

        for autotext in autotexts:
            autotext.set_color('white')
            autotext.set_fontweight('bold')

        # 3. Feature Types Analysis
        category_members = {
            'Length Features': ['url_length', 'hostname_length'],
            'Character Counts': ['count_at', 'count_double_slash', 'count_dash', 'count_underscore'],
            'IP & Digits': ['has_ip', 'digit_density'],
            'Structure': ['subdomain_count', 'path_depth', 'query_params_count'],
        }

        all_features = pipeline_data['M6']['features']
        categorized = set()

        # Counts go into a fresh dict so the membership lists above are not
        # overwritten while they are being iterated.
        feature_categories = {}
        for category, members in category_members.items():
            matched = [name for name in members if name in all_features]
            categorized.update(matched)
            feature_categories[category] = len(matched)

        # Everything in the M6 output that no named category claimed.
        feature_categories['Other'] = len(all_features) - len(categorized)

        categories = list(feature_categories.keys())
        counts = list(feature_categories.values())

        bars = ax3.bar(categories, counts, color=plt.cm.Set3(np.linspace(0, 1, len(categories))))
        ax3.set_xlabel('Feature Category', fontweight='bold')
        ax3.set_ylabel('Feature Count', fontweight='bold')
        ax3.set_title('Feature Categories Distribution', fontweight='bold')
        ax3.tick_params(axis='x', rotation=45)

        # Value labels on non-empty bars only.
        for bar in bars:
            height = bar.get_height()
            if height > 0:
                ax3.text(bar.get_x() + bar.get_width() / 2., height,
                         f'{int(height)}', ha='center', va='bottom', fontweight='bold')
    else:
        # Panels 2 and 3 depend on the M6 output; say so instead of leaving
        # two blank axes in the figure.
        for empty_ax in (ax2, ax3):
            empty_ax.text(0.5, 0.5, 'M6 output not available', ha='center', va='center',
                          transform=empty_ax.transAxes)
            empty_ax.set_axis_off()

    # 4. Pipeline Execution Status
    module_names = [m['id'] for m in pipeline_config['modules']]
    completion_status = [1 if name in pipeline_data else 0 for name in module_names]

    colors_status = ['green' if status else 'red' for status in completion_status]
    bars = ax4.bar(module_names, completion_status, color=colors_status, alpha=0.7)
    ax4.set_xlabel('Pipeline Modules', fontweight='bold')
    ax4.set_ylabel('Completion Status', fontweight='bold')
    ax4.set_title('Module Execution Status', fontweight='bold')
    ax4.set_ylim(0, 1.2)

    # Check/cross marks are used instead of the emoji ✅/❌: matplotlib's
    # default font (DejaVu Sans) has no glyphs for those emoji, so they would
    # be drawn as empty boxes.
    for bar, status in zip(bars, completion_status):
        label = "✓" if status else "✗"
        ax4.text(bar.get_x() + bar.get_width() / 2., 0.5, label,
                 ha='center', va='center', fontsize=20)

    plt.suptitle('URL PhishGuard: Complete Pipeline Analysis', fontsize=16, fontweight='bold', y=0.98)
    plt.tight_layout()
    plt.show()

else:
    print("⚠️ No pipeline outputs found. Please run individual module notebooks first.")

## Final Pipeline Summary & Recommendations

In [None]:
# Generate comprehensive pipeline summary
def generate_pipeline_summary():
    """Print the end-of-run summary for the whole pipeline.

    Reads the notebook-level ``pipeline_config``, ``pipeline_data`` and
    ``df_initial``. After a header and the completion ratio, one of two
    reports is printed: the final-dataset report when as many outputs were
    loaded as there are configured modules, otherwise the list of notebooks
    that still have to be run. Nothing is returned.
    """
    rule = "=" * 50

    print("📋 FINAL PIPELINE SUMMARY")
    print(rule)

    print(f"📅 Analysis Date: {datetime.now():%Y-%m-%d %H:%M:%S}")
    print(f"📊 Initial Dataset: {df_initial.shape[0]:,} URLs")

    done = len(pipeline_data)
    modules = pipeline_config['modules']
    expected = len(modules)
    percent_done = (done / expected) * 100

    print("\n🚀 PIPELINE EXECUTION:")
    print(f"   ✅ Completed Modules: {done}/{expected} ({percent_done:.0f}%)")

    # Incomplete pipeline: name what is missing and stop after the footer.
    if done != expected:
        pending = [m for m in modules if m['id'] not in pipeline_data]
        print(f"   ❌ Missing Modules: {[m['id'] for m in pending]}")
        print("\n⚠️ ACTION REQUIRED:")
        print("   Please run the following notebooks to complete the pipeline:")
        for m in pending:
            print(f"   📔 {m['student']}_{m['id']}_*.ipynb")
        print("\n" + rule)
        return

    # Complete pipeline: everything below describes the M6 output.
    m6 = pipeline_data['M6']
    final_table = m6['data']
    selected = m6['features']

    print(f"   🎯 Final Dataset: {final_table.shape}")
    print(f"   🔢 Total Features: {len(selected)}")
    print(f"   📊 Label Distribution: {final_table['Label'].value_counts().to_dict()}")

    finished = [m for m in modules if m['id'] in pipeline_data]

    print("\n🔧 FEATURE ENGINEERING BREAKDOWN:")
    for m in finished:
        n_features = pipeline_data[m['id']]['feature_count']
        print(f"   {m['id']}: {n_features:2d} features - {m['name']}")

    print("\n📈 PIPELINE EFFECTIVENESS:")
    print(f"   📊 Features Created: {len(selected)} features")
    for line in (
        "   🎯 Data Transformation: Raw URLs → ML-ready features",
        "   ⚖️ Feature Scaling: Applied (MinMax normalization)",
        "   🔍 Feature Selection: Applied (Chi-squared test)",
        "   🏷️ Label Encoding: Applied (good=0, bad=1)",
    ):
        print(line)

    print("\n✅ PIPELINE STATUS: COMPLETE - Ready for Machine Learning")
    print("\n📁 OUTPUT FILES:")
    for m in finished:
        print(f"   📄 {m['output']}")

    print("\n🚀 NEXT STEPS:")
    for line in (
        "   1. Load final dataset: 'm6_final_selected_features.csv'",
        "   2. Split into training/testing sets",
        "   3. Train machine learning models (Random Forest, SVM, etc.)",
        "   4. Evaluate model performance",
        "   5. Deploy phishing detection system",
    ):
        print(line)

    print("\n" + rule)

generate_pipeline_summary()

## Quick ML Model Demo (if pipeline complete)

In [None]:
# Quick ML demonstration: trains a random forest on the M6 output and prints
# its hold-out accuracy and top features. Skipped when M6 was not loaded.
if 'M6' not in pipeline_data:
    print("🤖 ML Demo: Complete pipeline required (run all M1-M6 notebooks first)")

else:
    print("🤖 QUICK MACHINE LEARNING DEMO")
    print("=" * 40)

    try:
        from sklearn.model_selection import train_test_split
        from sklearn.ensemble import RandomForestClassifier
        from sklearn.metrics import classification_report, confusion_matrix
        from sklearn.preprocessing import LabelEncoder

        final_data = pipeline_data['M6']['data']

        # Model inputs: every column except the URL text and the label columns.
        feature_columns = [
            col for col in final_data.columns
            if col not in ('URL', 'Label', 'Label_Encoded')
        ]
        X = final_data[feature_columns]

        # Target: the stored encoded label when present, otherwise encode
        # the 'Label' column here.
        if 'Label_Encoded' not in final_data.columns:
            le = LabelEncoder()
            y = le.fit_transform(final_data['Label'])
        else:
            y = final_data['Label_Encoded']

        # 80/20 split, stratified on the target, with a fixed seed.
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=42, stratify=y
        )

        print(f"📊 Training Data: {X_train.shape}")
        print(f"📊 Testing Data: {X_test.shape}")
        print(f"🔢 Features Used: {len(feature_columns)}")

        print("\n🌲 Training Random Forest Classifier...")
        rf_model = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
        rf_model.fit(X_train, y_train)

        y_pred = rf_model.predict(X_test)

        # Accuracy is the fraction of test rows predicted correctly.
        accuracy = (y_pred == y_test).mean()
        print(f"✅ Model Accuracy: {accuracy:.4f} ({accuracy*100:.2f}%)")

        # Importances as reported by the fitted forest, highest first.
        feature_importance = pd.DataFrame({
            'Feature': feature_columns,
            'Importance': rf_model.feature_importances_
        }).sort_values('Importance', ascending=False)

        print("\n🔍 Top 5 Most Important Features:")
        for rank, entry in enumerate(feature_importance.head(5).itertuples(index=False), 1):
            print(f"   {rank}. {entry.Feature:<20} ({entry.Importance:.4f})")

        print("\n🎯 PIPELINE SUCCESS: Features successfully predict phishing URLs!")

    except ImportError:
        print("⚠️ scikit-learn not available for ML demo")
    except Exception as e:
        # Demo cell: report the problem rather than aborting the notebook.
        print(f"⚠️ ML demo error: {e}")

In [None]:
# Closing banner with the completion timestamp.
print(
    "🏁 PIPELINE EXECUTION COMPLETE",
    "🎯 URL PhishGuard feature engineering pipeline analysis finished",
    f"📅 Completed at: {datetime.now():%Y-%m-%d %H:%M:%S}",
    sep="\n",
)