# M6 - Label Encoding & Feature Selection

**Student ID**: IT24104208  
**Focus**: Encode labels and select top features using Chi-squared test  
**Visualization**: Feature importance bar plot showing χ² scores  
**Input**: M5 output (Scaled features)  
**Output**: Final processed dataset ready for machine learning

## Overview

1. Encode categorical labels to numeric format
2. Apply Chi-squared feature selection to identify most discriminative features
3. Create feature importance visualization
4. Prepare final dataset for machine learning models

In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_selection import SelectKBest, chi2, mutual_info_classif
from sklearn.model_selection import train_test_split
import os
import warnings

# Set up plotting
# Start from matplotlib's default style, then select the seaborn "husl"
# colour palette for subsequent plots.
# NOTE(review): presumably the order matters (a later style reset could undo
# the palette choice) — confirm before reordering these two calls.
plt.style.use('default')
sns.set_palette("husl")
# Silences every warning category for the remainder of the session, including
# ones raised by pandas / scikit-learn in later cells.
warnings.filterwarnings('ignore')

print("Libraries imported successfully!")

In [None]:
# Load data from M5 (Scaled features)
# Prefer the scaled feature table written by M5. If that file is missing the
# raw URL list is loaded instead so the label-encoding step can still run.
m5_output_path = '../results/outputs/m5_scaled_features.csv'
raw_data_path = '../data/raw/phishing_site_urls.csv'

if os.path.exists(m5_output_path):
    print("Loading M5 output (Scaled features)...")
    df = pd.read_csv(m5_output_path)
    print(f"Loaded M5 data: {df.shape}")
    print(f"Columns: {df.columns.tolist()}")
else:
    print("M5 output not found, loading raw data...")
    df = pd.read_csv(raw_data_path)
    print(f"Loaded raw data: {df.shape}")

print(f"\nDataset shape: {df.shape}")
print(f"Label distribution:\n{df['Label'].value_counts()}")

# Every column other than the URL text and the target label is treated as a
# feature by the rest of the notebook.
numeric_cols = [col for col in df.columns if col not in ['URL', 'Label']]
print(f"\nNumeric features available: {len(numeric_cols)}")
print(f"Features: {numeric_cols}")

if numeric_cols:
    # Display sample of the scaled data
    print(f"\nSample of scaled data:")
    print(df[numeric_cols].head())
else:
    # Without feature columns the feature-selection cells below have nothing
    # to score; say so here instead of printing an empty sample and letting a
    # later cell fail with a less obvious error.
    print("\nWARNING: no feature columns found besides 'URL' and 'Label'.")
    print(f"Feature selection needs the M5 output at: {m5_output_path}")

## Label Encoding

Convert the categorical labels ('good', 'bad') to integer codes for machine learning algorithms. `LabelEncoder` numbers the classes in sorted order, so 'bad' becomes 0 and 'good' becomes 1.

In [None]:
# M6 Step 1: Label Encoding
# Turn the string labels into integer codes and print the distribution before
# and after encoding so the mapping can be checked by eye.
print("M6 Step 1: Label Encoding")
print("=" * 40)

# Class balance of the original string labels: absolute counts, then percentages
label_counts = df['Label'].value_counts()
label_percentages = df['Label'].value_counts(normalize=True).round(4) * 100
print("Original labels:")
print(label_counts)
print("\nLabel percentages:")
print(label_percentages)

# Fit the encoder on the label column and store the codes in a new column
label_encoder = LabelEncoder()
df['Label_Encoded'] = label_encoder.fit_transform(df['Label'])

# Pair each known class with the code the fitted encoder assigns to it
known_classes = label_encoder.classes_
class_codes = label_encoder.transform(known_classes)
label_mapping = dict(zip(known_classes, class_codes))
print("\nLabel Encoding Results:")
print(f"Label mapping: {label_mapping}")

# One row per distinct (label, code) pair, ordered by code
encoding_check = (
    df[['Label', 'Label_Encoded']]
    .drop_duplicates()
    .sort_values('Label_Encoded')
)
print("\nEncoding verification:")
print(encoding_check)

# Counts per integer code, ordered by code
encoded_counts = df['Label_Encoded'].value_counts().sort_index()
print("\nEncoded label distribution:")
print(encoded_counts)

print("Label encoding completed successfully!")

## Feature Selection using Chi-squared Test

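Score every feature against the encoded label with the Chi-squared test and keep the highest-scoring ones as the most discriminative features for phishing detection. Because the test requires non-negative inputs, any feature containing negative values is shifted so that its minimum is zero before scoring.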

In [None]:
# M6 Step 2: Feature Selection using Chi-squared test
# Scores every feature column against the encoded label, ranks the features by
# chi-squared score and keeps the names of the best ones in `top_features`.
print("M6 Step 2: Feature Selection using Chi-squared Test")
print("=" * 60)

# Feature matrix and encoded target used for scoring
X = df[numeric_cols]
y = df['Label_Encoded']

print(f"Feature matrix shape: {X.shape}")
print(f"Target vector shape: {y.shape}")

# Report min / max / mean per feature so negative values are easy to spot
# (the chi-squared test requires non-negative values).
print("\nChecking feature value ranges:")
feature_columns = [X[col] for col in numeric_cols]
feature_ranges = pd.DataFrame({
    'Feature': numeric_cols,
    'Min': [series.min() for series in feature_columns],
    'Max': [series.max() for series in feature_columns],
    'Mean': [series.mean() for series in feature_columns],
})
print(feature_ranges.round(4))

# Work on a copy and shift any column with a negative minimum so that its
# minimum becomes zero; columns that are already non-negative are untouched.
X_chi = X.copy()
for col in numeric_cols:
    col_min = X_chi[col].min()
    # Written as "not (min < 0)" so a NaN minimum is skipped, as before
    if not (col_min < 0):
        continue
    X_chi[col] = X_chi[col] - col_min
    print(f"Adjusted {col} to make non-negative")

# Score all features (k='all' keeps every column; ranking happens below)
print("\nApplying Chi-squared feature selection...")
chi2_selector = SelectKBest(score_func=chi2, k='all')
X_chi2 = chi2_selector.fit_transform(X_chi, y)

# Per-feature statistics, in the same order as numeric_cols
chi2_scores = chi2_selector.scores_
chi2_pvalues = chi2_selector.pvalues_

# Ranking table, highest chi-squared score first
feature_importance = pd.DataFrame({
    'Feature': numeric_cols,
    'Chi2_Score': chi2_scores,
    'P_Value': chi2_pvalues,
})
feature_importance = feature_importance.sort_values('Chi2_Score', ascending=False)

print("\nChi-squared Feature Importance (Top 10):")
print(feature_importance.head(10).round(6))

# Keep at most the ten best-scoring features
k_best = min(10, len(numeric_cols))
top_rows = feature_importance.head(k_best)
top_features = top_rows['Feature'].tolist()

print(f"\nSelected Top {k_best} Features:")
for position, (feature, score) in enumerate(zip(top_features, top_rows['Chi2_Score']), 1):
    print(f"{position:2d}. {feature:<25} (χ² = {score:.2f})")

print("Feature selection completed successfully!")

## Additional Feature Analysis

Apply mutual information for comparison and comprehensive feature analysis.

In [None]:
# Additional analysis: Mutual Information for comparison
# Scores the same features with mutual information and builds one table that
# holds both rankings plus the per-class feature means.
print("Additional Analysis: Mutual Information Feature Selection")
print("="*60)


def _mi_score(features, target):
    """Return mutual-information scores computed with a fixed seed.

    mutual_info_classif uses random numbers internally, so without a seed the
    MI scores (and therefore MI_Rank) change from run to run. The seed matches
    the random_state=42 used for the train/test split in this notebook.
    """
    return mutual_info_classif(features, target, random_state=42)


# Apply Mutual Information feature selection (k='all' keeps every feature)
mi_selector = SelectKBest(score_func=_mi_score, k='all')
X_mi = mi_selector.fit_transform(X, y)

# Get Mutual Information scores, in the same order as numeric_cols
mi_scores = mi_selector.scores_

# Filter the rows of each class once instead of once per feature column
good_rows = df[df['Label'] == 'good']
bad_rows = df[df['Label'] == 'bad']

# Create comprehensive feature analysis
comprehensive_analysis = pd.DataFrame({
    'Feature': numeric_cols,
    'Chi2_Score': chi2_scores,
    # Placeholder that fixes the column position; real ranks are set after sorting
    'Chi2_Rank': range(1, len(chi2_scores) + 1),
    'MI_Score': mi_scores,
    'Mean_Good': [good_rows[col].mean() for col in numeric_cols],
    'Mean_Bad': [bad_rows[col].mean() for col in numeric_cols]
})

# Sort by Chi-squared score for ranking
comprehensive_analysis = comprehensive_analysis.sort_values('Chi2_Score', ascending=False)
comprehensive_analysis['Chi2_Rank'] = range(1, len(comprehensive_analysis) + 1)

# Sort by Mutual Information score for ranking
comprehensive_analysis_mi = comprehensive_analysis.sort_values('MI_Score', ascending=False)
comprehensive_analysis_mi['MI_Rank'] = range(1, len(comprehensive_analysis_mi) + 1)

# Merge rankings: attach MI_Rank by feature name, then restore chi-squared order
comprehensive_analysis = comprehensive_analysis.merge(
    comprehensive_analysis_mi[['Feature', 'MI_Rank']], on='Feature'
).sort_values('Chi2_Score', ascending=False)

print(f"\nComprehensive Feature Analysis:")
print(comprehensive_analysis.round(4))

# Side-by-side list of the k_best features chosen by each method
print(f"\nTop Features Comparison:")
print(f"{'Rank':<4} {'Chi² Top Features':<25} {'MI Top Features':<25}")
print("-" * 60)
mi_top_features = comprehensive_analysis.sort_values('MI_Score', ascending=False)['Feature'].head(k_best).tolist()
for i in range(k_best):
    # Guard against either list being shorter than k_best
    chi2_feat = top_features[i] if i < len(top_features) else ""
    mi_feat = mi_top_features[i] if i < len(mi_top_features) else ""
    print(f"{i+1:<4} {chi2_feat:<25} {mi_feat:<25}")

## Visualization: Required Feature Importance Bar Plot

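Create the main visualization: a horizontal bar plot of the χ² scores of the top-ranked features, accompanied by a normalized χ² vs. mutual-information comparison and a pie chart of the label distribution.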

In [None]:
# Create comprehensive visualization with required feature importance plot
# Layout: a 2x2 grid whose top row is a single wide axes with the chi-squared
# ranking; the bottom row holds the chi-squared vs. mutual-information
# comparison (left) and the label distribution (right).
fig = plt.figure(figsize=(16, 12))

# Main required visualization: Feature importance bar plot (Chi-squared scores)
ax1 = plt.subplot(2, 2, (1, 2))  # Spans 2 columns

# Plot top features by Chi-squared score
top_n = min(15, len(feature_importance))  # Show top 15 features
plot_data = feature_importance.head(top_n)

bars = ax1.barh(range(len(plot_data)), plot_data['Chi2_Score'],
                color=plt.cm.viridis(np.linspace(0, 1, len(plot_data))))

ax1.set_yticks(range(len(plot_data)))
ax1.set_yticklabels(plot_data['Feature'])
ax1.set_xlabel('Chi-squared Score (χ²)', fontsize=12, fontweight='bold')
ax1.set_title('Feature Importance Ranking using Chi-squared Test\n(M6 Required Visualization)',
              fontsize=14, fontweight='bold', pad=20)
ax1.grid(axis='x', alpha=0.3)

# Add value labels on bars, placed 1% of the largest score to the right of
# each bar end. The offset is the same for every bar, so compute it once.
label_offset = plot_data['Chi2_Score'].max() * 0.01
for bar, score in zip(bars, plot_data['Chi2_Score']):
    ax1.text(score + label_offset, bar.get_y() + bar.get_height()/2,
             f'{score:.1f}', ha='left', va='center', fontweight='bold')

# Invert y-axis to show highest score at top
ax1.invert_yaxis()

# Feature Selection Comparison
ax2 = plt.subplot(2, 2, 3)
comparison_data = comprehensive_analysis.head(10)
x_pos = np.arange(len(comparison_data))

# Normalize scores for comparison: each method is divided by its own maximum
# so both share a 0-1 scale
chi2_norm = comparison_data['Chi2_Score'] / comparison_data['Chi2_Score'].max()
mi_norm = comparison_data['MI_Score'] / comparison_data['MI_Score'].max()

width = 0.35
ax2.bar(x_pos - width/2, chi2_norm, width, label='Chi² (normalized)', color='skyblue')
ax2.bar(x_pos + width/2, mi_norm, width, label='Mutual Info (normalized)', color='lightcoral')

ax2.set_xlabel('Features (Top 10)', fontweight='bold')
ax2.set_ylabel('Normalized Score', fontweight='bold')
ax2.set_title('Feature Selection Method Comparison', fontweight='bold')
ax2.set_xticks(x_pos)
# Break feature names at underscores so long names wrap under the bars
ax2.set_xticklabels([f.replace('_', '\n') for f in comparison_data['Feature']], rotation=45, ha='right')
ax2.legend()
ax2.grid(axis='y', alpha=0.3)

# Label Distribution
ax3 = plt.subplot(2, 2, 4)
label_counts = df['Label'].value_counts()
# value_counts() orders labels by frequency, so colours are looked up by label
# name rather than by position: green for good, red for bad, grey for any
# other label value.
label_colors = {'good': '#2E8B57', 'bad': '#DC143C'}
colors = [label_colors.get(label, '#808080') for label in label_counts.index]
wedges, texts, autotexts = ax3.pie(label_counts.values, labels=label_counts.index,
                                  autopct='%1.1f%%', colors=colors, startangle=90)

ax3.set_title('Label Distribution\n(Original Dataset)', fontweight='bold')

# Make autopct text bold
for autotext in autotexts:
    autotext.set_color('white')
    autotext.set_fontweight('bold')
    autotext.set_fontsize(12)

plt.suptitle('M6: Label Encoding & Feature Selection Analysis', fontsize=16, fontweight='bold', y=0.98)
plt.tight_layout()
plt.show()

# Print key insights
# Text summary of the encoding and of the chi-squared ranking.
print("\nKey Insights from M6 Analysis:")
print("=" * 50)

# 1. Codes the fitted encoder assigns to each class, and the dataset size
print("1. Label Encoding:")
bad_code = label_encoder.transform(['bad'])[0]
print(f"   - 'bad' URLs encoded as: {bad_code}")
good_code = label_encoder.transform(['good'])[0]
print(f"   - 'good' URLs encoded as: {good_code}")
print(f"   - Total samples: {len(df):,}")

# 2. Headline numbers of the chi-squared ranking
print("\n2. Feature Selection Results:")
print(f"   - Total features analyzed: {len(numeric_cols)}")
print(f"   - Top {k_best} features selected based on Chi-squared test")
# feature_importance is sorted by score, so the first row is the best feature
best_row = feature_importance.iloc[0]
print(f"   - Highest χ² score: {best_row['Chi2_Score']:.2f} ({best_row['Feature']})")
smallest_p = feature_importance['P_Value'].min()
print(f"   - Most significant p-value: {smallest_p:.2e}")

# 3. The five best-scoring features with score and p-value
print("\n3. Top 5 Most Discriminative Features:")
top_five = feature_importance.head(5)
top_five_columns = zip(top_five['Feature'], top_five['Chi2_Score'], top_five['P_Value'])
for position, (name, chi2_value, p_value) in enumerate(top_five_columns, 1):
    print(f"   {position}. {name:<20} (χ² = {chi2_value:.2f}, p = {p_value:.2e})")

# Verdict based on the p-value of the best-scoring feature
verdict = (
    "\nStrong statistical significance detected - features are highly discriminative!"
    if best_row['P_Value'] < 0.001
    else "\nModerate statistical significance - consider additional feature engineering"
)
print(verdict)

## Final Dataset Preparation

Create the final processed dataset ready for machine learning models.

In [None]:
# Prepare final datasets
# Builds the two output tables (all features / selected features), shows a
# stratified 80/20 split for demonstration and summarises the selected features.
print("M6 Final Step: Dataset Preparation")
print("=" * 40)

# Identifier and label columns carried into both output tables
id_and_label_cols = ['URL', 'Label', 'Label_Encoded']

# Full (scaled) feature set, and the subset chosen by the chi-squared ranking
df_all_features = df[id_and_label_cols + numeric_cols].copy()
df_selected_features = df[id_and_label_cols + top_features].copy()

n_all = len(numeric_cols)
n_selected = len(top_features)
print("Final Dataset Information:")
print(f"All features dataset shape: {df_all_features.shape}")
print(f"Selected features dataset shape: {df_selected_features.shape}")
print(f"Features reduced from {n_all} to {n_selected} ({n_selected/n_all*100:.1f}%)")

# Create train-test split for demonstration
X_all = df_all_features[numeric_cols]
X_selected = df_selected_features[top_features]
y = df_all_features['Label_Encoded']

# Both splits share the same settings (20% test, seed 42, stratified on y)
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train_all, X_test_all, y_train, y_test = train_test_split(X_all, y, **split_options)
X_train_sel, X_test_sel, _, _ = train_test_split(X_selected, y, **split_options)

print("\nTrain-Test Split:")
print(f"Training set: {X_train_all.shape[0]:,} samples")
print(f"Test set: {X_test_all.shape[0]:,} samples")
print("Class distribution in training set:")
print(pd.Series(y_train).value_counts().sort_index())

# Feature summary for final dataset: rows of the analysis table that belong to
# the selected features, best chi-squared score first
print("\nSelected Features Summary:")
is_selected = comprehensive_analysis['Feature'].isin(top_features)
selected_summary = (
    comprehensive_analysis[is_selected]
    .copy()
    .sort_values('Chi2_Score', ascending=False)
)
summary_columns = ['Feature', 'Chi2_Score', 'MI_Score', 'Mean_Good', 'Mean_Bad']
print(selected_summary[summary_columns].round(4))

# Save final datasets
# Writes the two feature tables, the feature-importance table and the fitted
# label encoder into the shared outputs directory.
import pickle

output_dir = '../results/outputs'
os.makedirs(output_dir, exist_ok=True)

# Destination of every artefact written below
all_features_path = os.path.join(output_dir, 'm6_final_all_features.csv')
selected_features_path = os.path.join(output_dir, 'm6_final_selected_features.csv')
importance_path = os.path.join(output_dir, 'm6_feature_importance.csv')
encoder_path = os.path.join(output_dir, 'm6_label_encoder.pkl')

# (table, destination, message prefix) for each CSV export, in write order
csv_exports = [
    (df_all_features, all_features_path, "\nComplete dataset saved to: "),
    (df_selected_features, selected_features_path, "Selected features dataset saved to: "),
    (comprehensive_analysis, importance_path, "Feature importance analysis saved to: "),
]
for table, destination, message in csv_exports:
    table.to_csv(destination, index=False)
    print(f"{message}{destination}")

# Save label encoder so later stages can reuse the same label mapping
with open(encoder_path, 'wb') as encoder_file:
    pickle.dump(label_encoder, encoder_file)
print(f"🔧 Label encoder saved to: {encoder_path}")

print("\nReady for Machine Learning Pipeline Integration")
print("M6 Analysis Complete: Label encoding and feature selection successfully completed")

# Final summary
# The label mapping is read from the fitted encoder instead of being
# hard-coded: the previous text claimed "0=good, 1=bad", the opposite of the
# codes the encoder reports for 'bad' and 'good' earlier in this notebook.
encoded_classes = label_encoder.classes_
encoded_values = label_encoder.transform(encoded_classes)
label_mapping_text = ", ".join(
    f"{code}={name}" for code, name in zip(encoded_values, encoded_classes)
)

print(f"\nPIPELINE READY FOR MACHINE LEARNING:")
print(f"   • Dataset: {len(df):,} URLs processed")
print(f"   • Features: {len(top_features)} selected from {len(numeric_cols)} total")
print(f"   • Labels: Encoded ({label_mapping_text})")
print(f"   • Data: Scaled and normalized")
print(f"   • Quality: Statistical significance confirmed")