In [None]:
# ============================================
# STEP 1: LOAD THE DATASET
# ============================================

import os
os.environ['HF_HOME'] = 'E:/.cache/huggingface'

from datasets import load_dataset
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Plot defaults applied to every figure created below
sns.set_style("whitegrid")
plt.rc('figure', figsize=(12, 6))

# Fetch (or reuse from the on-disk cache) every split of the dataset
print("Loading PhreshPhish dataset...")
dataset = load_dataset(
    "phreshphish/phreshphish",
    cache_dir='E:/.cache/huggingface',
)

print(
    "\nDataset loaded successfully!",
    f"Available splits: {dataset.keys()}",
    sep="\n",
)

# ============================================
# STEP 2: DATASET SIZE INFO
# ============================================

print("\n" + "="*60)
print("DATASET SIZE")
print("="*60)

# Look the split sizes up once; they feed both the report and the sampling.
n_train_total = len(dataset['train'])
n_test_total = len(dataset['test'])

print(f"Train set size: {n_train_total:,} samples")
print(f"Test set size: {n_test_total:,} samples")
print(f"Total samples: {n_train_total + n_test_total:,}")

# Maximum number of rows taken from each split for the EDA.
# The progress message is derived from this constant so the two cannot
# drift apart.
SAMPLE_SIZE = 10000
print(f"\nDataset is HUGE (>100GB with HTML). Using up to {SAMPLE_SIZE:,} samples per split for EDA...")

# select(range(k)) keeps the first k rows of a split (not a random draw);
# min() protects against splits smaller than SAMPLE_SIZE.
train_sample = dataset['train'].select(range(min(SAMPLE_SIZE, n_train_total)))
test_sample = dataset['test'].select(range(min(SAMPLE_SIZE, n_test_total)))

# Convert samples to pandas
train_df = pd.DataFrame(train_sample)
test_df = pd.DataFrame(test_sample)

print(f"Working with {len(train_df):,} train samples and {len(test_df):,} test samples")

# ============================================
# STEP 3: BASIC DATASET STRUCTURE
# ============================================

print(f"\n{'=' * 60}")
print("DATASET STRUCTURE")
print('=' * 60)

# Schema of the sampled training frame: column names first, then dtypes
print("\nColumn names:", train_df.columns.tolist(), sep="\n")
print("\nData types:", train_df.dtypes, sep="\n")

# (rows, columns) of both sampled frames
print("\nDataset shape (rows, columns):")
for split_name, frame in (("Train", train_df), ("Test", test_df)):
    print(f"{split_name} sample: {frame.shape}")

# ============================================
# STEP 4: LOOK AT SAMPLE ROWS (WITHOUT HTML)
# ============================================

# Every column but the raw HTML, which is too large to print usefully
cols_to_show = train_df.columns[train_df.columns != 'html'].tolist()
preview = train_df.loc[:, cols_to_show]

print(f"\n{'=' * 60}")
print("SAMPLE DATA (First 3 rows, excluding HTML)")
print('=' * 60)
print(preview.iloc[:3])

print(f"\n{'=' * 60}")
print("SAMPLE DATA (Random 3 rows, excluding HTML)")
print('=' * 60)
# Fixed seed keeps the "random" preview reproducible between runs
print(preview.sample(n=3, random_state=42))

# ============================================
# STEP 5: HTML COLUMN ANALYSIS
# ============================================

print(f"\n{'=' * 60}")
print("HTML COLUMN SIZE ANALYSIS")
print('=' * 60)

if 'html' in train_df.columns:
    html_lengths = train_df['html'].str.len()

    # One row per reported statistic: (label, value in characters,
    # divisor used for the human-readable size, unit suffix)
    size_report = (
        ("Average", html_lengths.mean(), 1024, "KB"),
        ("Max", html_lengths.max(), 1024 * 1024, "MB"),
        ("Min", html_lengths.min(), 1024, "KB"),
        ("Median", html_lengths.median(), 1024, "KB"),
    )
    for stat_name, n_chars, divisor, unit in size_report:
        print(f"{stat_name} HTML length: {n_chars:,.0f} characters ({n_chars / divisor:,.1f} {unit})")

# ============================================
# STEP 6: TARGET VARIABLE ANALYSIS
# ============================================

print("\n" + "="*60)
print("TARGET VARIABLE ANALYSIS")
print("="*60)

# Find target column.
# Candidate names are tried in priority order instead of frame-column order,
# so a specific name such as 'label' always wins over the generic 'target'
# when both columns exist in the frame.
TARGET_CANDIDATES = ['label', 'is_phishing', 'phishing', 'class', 'target']

# Case-insensitive lookup: lower-cased name -> first column carrying it
columns_by_lower_name = {}
for col in train_df.columns:
    columns_by_lower_name.setdefault(str(col).lower(), col)

target_col = None
for candidate in TARGET_CANDIDATES:
    if candidate in columns_by_lower_name:
        target_col = columns_by_lower_name[candidate]
        break

if target_col is not None:
    print(f"\nTarget column found: '{target_col}'")
    print(f"\nClass distribution (Train sample):")
    print(train_df[target_col].value_counts())
    print(f"\nClass distribution (% in sample):")
    print(train_df[target_col].value_counts(normalize=True) * 100)

    print(f"\nClass distribution (Test sample):")
    print(test_df[target_col].value_counts())

    # Visualize class distribution.
    # Bars are ordered by class value rather than by frequency, so a class
    # keeps the same position and colour in both panels whenever the two
    # samples contain the same classes.
    class_colors = ['#2ecc71', '#e74c3c']
    plt.figure(figsize=(12, 5))

    for panel, (split_name, frame) in enumerate((('Train', train_df), ('Test', test_df)), start=1):
        plt.subplot(1, 2, panel)
        frame[target_col].value_counts().sort_index().plot(kind='bar', color=class_colors)
        plt.title(f'Class Distribution - {split_name} Sample', fontsize=14, fontweight='bold')
        plt.xlabel('Class', fontsize=12)
        plt.ylabel('Count', fontsize=12)
        plt.xticks(rotation=0)

    plt.tight_layout()
    plt.show()
else:
    # No known target name: show a few distinct values per column so the
    # target can be identified by eye.
    print("Target column not found. Inspecting column values...")
    for col in train_df.columns:
        if col != 'html':
            print(f"\n{col}: {train_df[col].unique()[:5]}")

# ============================================
# STEP 7: CHECK FOR MISSING VALUES
# ============================================

print(f"\n{'=' * 60}")
print("MISSING VALUES ANALYSIS")
print('=' * 60)

# Per-column count of null cells in each sampled frame
missing_train = train_df.isna().sum()
missing_test = test_df.isna().sum()

print("\nMissing values in TRAIN sample:")
if missing_train.sum() <= 0:
    print("No missing values!")
else:
    # Only list the columns that actually have gaps, as counts then as %
    print(missing_train[missing_train > 0])
    print("\nMissing value percentages:")
    missing_pct = (missing_train / len(train_df)) * 100
    print(missing_pct[missing_pct > 0])

print("\nMissing values in TEST sample:")
if missing_test.sum() > 0:
    print(missing_test[missing_test > 0])
else:
    print("No missing values!")

# ============================================
# STEP 8: CHECK FOR DUPLICATES
# ============================================

print(f"\n{'=' * 60}")
print("DUPLICATE ANALYSIS")
print('=' * 60)

# Rows that are exact copies (across all columns) of an earlier row
n_duplicates_train, n_duplicates_test = (
    frame.duplicated().sum() for frame in (train_df, test_df)
)

print(f"\nDuplicate rows in TRAIN sample: {n_duplicates_train}")
print(f"Duplicate rows in TEST sample: {n_duplicates_test}")

if n_duplicates_train > 0:
    duplicate_share = n_duplicates_train / len(train_df) * 100
    print(f"   → {duplicate_share:.2f}% of train sample")

# ============================================
# STEP 9: BASIC STATISTICS FOR NUMERICAL COLUMNS
# ============================================

print(f"\n{'=' * 60}")
print("NUMERICAL STATISTICS")
print('=' * 60)

# Names of all columns with a numeric dtype
numerical_cols = list(train_df.select_dtypes(include='number').columns)

print(
    train_df[numerical_cols].describe()
    if numerical_cols
    else "No numerical columns found (all are text/categorical)"
)

# ============================================
# STEP 10: TEXT/URL COLUMN ANALYSIS
# ============================================

print(f"\n{'=' * 60}")
print("COLUMN DETAILS (excluding HTML)")
print('=' * 60)

for col in train_df.columns:
    # The raw HTML is skipped to keep the report readable
    if col == 'html':
        continue

    series = train_df[col]
    print(f"\nColumn: {col}")
    print(f"   Type: {series.dtype}")
    print(f"   Unique values: {series.nunique():,}")
    print("   Sample values:")
    for position, value in enumerate(series.head(3), start=1):
        text = str(value)
        # Long values are cut to their first 100 characters plus an ellipsis
        if len(text) > 100:
            text = text[:100] + "..."
        print(f"      {position}. {text}")

# ============================================
# STEP 11: URL ANALYSIS
# ============================================

print("\n" + "="*60)
print("URL PATTERN ANALYSIS")
print("="*60)

if 'url' in train_df.columns:
    print("\nURL Statistics:")
    url_lengths = train_df['url'].str.len()
    print(f"   Average URL length: {url_lengths.mean():.1f} characters")
    print(f"   Max URL length: {url_lengths.max()} characters")
    print(f"   Min URL length: {url_lengths.min()} characters")

    # Check for HTTPS vs HTTP.
    # The scheme is read from the start of the URL only: a substring search
    # would also count a scheme that merely appears inside a query string or
    # path (e.g. "http://a.example/?next=https://b.example") as the URL's
    # own protocol. Lower-casing first keeps the test case-insensitive.
    url_lower = train_df['url'].str.lower()
    https_count = url_lower.str.startswith('https://', na=False).sum()
    http_count = url_lower.str.startswith('http://', na=False).sum()
    n_rows = len(train_df)
    print(f"\nProtocol Distribution:")
    print(f"   HTTPS: {https_count} ({https_count/n_rows*100:.1f}%)")
    print(f"   HTTP: {http_count} ({http_count/n_rows*100:.1f}%)")

    # Show some phishing vs legitimate URL examples.
    # NOTE(review): the class values 'phish' / 'benign' are hard-coded here;
    # confirm they match the values stored in the detected target column.
    if target_col:
        print(f"\nSample PHISHING URLs:")
        phish_urls = train_df.loc[train_df[target_col] == 'phish', 'url'].head(3)
        for i, url in enumerate(phish_urls, 1):
            print(f"   {i}. {url}")

        print(f"\nSample LEGITIMATE URLs:")
        legit_urls = train_df.loc[train_df[target_col] == 'benign', 'url'].head(3)
        for i, url in enumerate(legit_urls, 1):
            print(f"   {i}. {url}")

# Convert date to datetime
train_df['date_dt'] = pd.to_datetime(train_df['date'])

# Check distribution by label
print("="*60)
print("DATE ANALYSIS")
print("="*60)

# Bucket by year-month ("YYYY-MM") rather than by month number alone, so the
# same calendar month in different years is not merged into a single bar.
# The zero-padded text key sorts chronologically.
month_key = train_df['date_dt'].dt.strftime('%Y-%m')

# Count once per class and reuse the result for both the printout and plot.
monthly_counts = {
    class_value: month_key[train_df['label'] == class_value].value_counts().sort_index()
    for class_value in ('phish', 'benign')
}

print("\nPhishing sites by month:")
print(monthly_counts['phish'])

print("\nBenign sites by month:")
print(monthly_counts['benign'])

# Visualize
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

panels = (
    ('phish', 'Phishing Sites by Month', 'red'),
    ('benign', 'Benign Sites by Month', 'green'),
)
for ax, (class_value, title, color) in zip(axes, panels):
    counts = monthly_counts[class_value]
    # Skip drawing for a class with no dated rows; the panel keeps its labels
    if not counts.empty:
        counts.plot(kind='bar', ax=ax, color=color)
    ax.set_title(title)
    ax.set_xlabel('Month')
    ax.set_ylabel('Count')

plt.tight_layout()
plt.show()

# ============================================
# STEP 12: SUMMARY
# ============================================

print("\n" + "="*60)
print("EDA SUMMARY")
print("="*60)

# The HTML column is optional everywhere else in this notebook, so guard the
# lookup here as well instead of failing with a KeyError at the very end.
if 'html' in train_df.columns:
    html_summary = f"~{train_df['html'].str.len().mean()/1024:.0f} KB per sample (HUGE!)"
else:
    html_summary = "not present"

print(f"""
Dataset loaded successfully
Full dataset size: {len(dataset['train']) + len(dataset['test']):,} samples
Working with: {len(train_df) + len(test_df):,} samples for EDA
Features: {len(train_df.columns)}
Target column: {target_col if target_col else 'To be identified'}
Missing values: {'Yes' if missing_train.sum() > 0 else 'No'}
Duplicates: {'Yes' if n_duplicates_train > 0 else 'No'}
HTML column: {html_summary}
""")


In [None]:
# Run this in your EDA notebook
print("=" * 60)
print("BRAND DISTRIBUTION ANALYSIS")
print("=" * 60)

# Frequency of each brand; value_counts() leaves out missing entries
brand_counts = train_df['target'].value_counts()

print(f"\nTotal unique brands: {len(brand_counts)}")
print("\nTop 20 brands:")
print(brand_counts.head(20))

print("\n📊 Coverage Analysis:")
# Share of all brand-labelled rows captured by the k most frequent brands
labelled_total = brand_counts.sum()
top_5_coverage, top_10_coverage, top_20_coverage = (
    brand_counts.head(k).sum() / labelled_total * 100 for k in (5, 10, 20)
)

for k, coverage in ((5, top_5_coverage), (10, top_10_coverage), (20, top_20_coverage)):
    print(f"Top {k} brands cover: {coverage:.1f}% of phishing")

In [None]:
# ============================================
# TEST: Does language correlate with phishing?
# ============================================

print("="*60)
print("LANGUAGE vs PHISHING ANALYSIS")
print("="*60)

# Boolean phishing flag, computed once and shared by every rate below
is_phish = train_df['label'] == 'phish'

# Overall phishing rate in percent; it does not depend on the language, so
# it is evaluated a single time instead of once per loop iteration and
# again for the plot.
overall_rate = is_phish.mean() * 100

# Calculate phishing rate by language (percent of rows flagged as phishing).
# No minimum group size is applied, so a language with very few rows can
# show an extreme rate.
lang_phishing = (
    is_phish.groupby(train_df['lang']).mean().mul(100).sort_values(ascending=False)
)

print("\n% Phishing by Language (Top 10):")
print(lang_phishing.head(10))

print(f"\n📊 Overall phishing rate: {overall_rate:.2f}%")

# Check if languages differ significantly
print("\n🔍 Do languages differ from overall rate?")
for lang, rate in lang_phishing.head(10).items():
    diff = abs(rate - overall_rate)
    print(f"   {lang}: {rate:.1f}% (diff: {diff:.1f}%)")

# Visualize
plt.figure(figsize=(12, 6))
lang_phishing.head(15).plot(kind='barh')
plt.axvline(overall_rate, color='red', linestyle='--', label='Overall avg')
plt.xlabel('% Phishing')
plt.title('Phishing Rate by Language')
plt.legend()
plt.tight_layout()
plt.show()

In [None]:
# ============================================
# COMPREHENSIVE HTML ANALYSIS - 5000 SAMPLES
# ============================================

from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

banner = "=" * 60
print(banner)
print("ANALYZING HTML PATTERNS - 5000 SAMPLES")
print(banner)

# Work on an independent copy of the first ANALYSIS_SIZE training rows
ANALYSIS_SIZE = 5000
analysis_df = train_df.iloc[:ANALYSIS_SIZE].copy()

# Class make-up of the slice that is about to be parsed
n_phish_rows = analysis_df['label'].eq('phish').sum()
n_benign_rows = analysis_df['label'].eq('benign').sum()

print(f"\nAnalyzing {len(analysis_df):,} samples...")
print(f"   Phishing: {n_phish_rows:,}")
print(f"   Legitimate: {n_benign_rows:,}")
# Function to extract HTML features
# Feature names in the order they appear in the returned dict (and therefore
# in the columns of any DataFrame built from the results).
_HTML_STAT_KEYS = (
    'num_links',
    'num_forms',
    'num_input_fields',
    'has_password_field',
    'num_iframes',
    'num_images',
    'num_scripts',
    'num_external_links',
    'title_length',
    'html_length',
)


def _zero_html_stats():
    """Return a fresh all-zero feature dict for empty or unparseable HTML."""
    return dict.fromkeys(_HTML_STAT_KEYS, 0)


def extract_html_stats(html):
    """Extract simple structural counts from one HTML document.

    Args:
        html: Markup of a single page as ``str`` (or ``bytes``). Anything
            else, including ``None`` and the float NaN that pandas uses for
            missing cells, is treated as an empty document.

    Returns:
        dict with integer values for the keys ``num_links``, ``num_forms``,
        ``num_input_fields``, ``has_password_field`` (0 or 1),
        ``num_iframes``, ``num_images``, ``num_scripts``,
        ``num_external_links``, ``title_length`` and ``html_length``.
        Every value is 0 when the input is empty, is not text, or cannot be
        processed.
    """
    # Type check first: calling len() on a non-string such as NaN raises
    # TypeError, which used to escape because it happened outside the try.
    if not isinstance(html, (str, bytes)) or not html:
        return _zero_html_stats()

    try:
        soup = BeautifulSoup(html, 'html.parser')

        # "External" is approximated as an href that starts with "http"
        hrefs = [anchor.get('href', '') for anchor in soup.find_all('a', href=True)]
        external_links = [href for href in hrefs if href.startswith('http')]

        # <title>.string is None when the tag is missing its single text child
        title_text = soup.title.string if soup.title else ''
        has_password = soup.find('input', {'type': 'password'}) is not None

        return {
            # num_links counts every <a>, with or without an href
            'num_links': len(soup.find_all('a')),
            'num_forms': len(soup.find_all('form')),
            'num_input_fields': len(soup.find_all('input')),
            'has_password_field': 1 if has_password else 0,
            'num_iframes': len(soup.find_all('iframe')),
            'num_images': len(soup.find_all('img')),
            'num_scripts': len(soup.find_all('script')),
            'num_external_links': len(external_links),
            'title_length': len(title_text) if title_text else 0,
            'html_length': len(html),
        }
    except Exception:
        # Best effort: one malformed page must not abort a bulk run. Unlike a
        # bare "except:", this still lets KeyboardInterrupt/SystemExit through.
        return _zero_html_stats()

# Extract HTML features for all samples
print("\nExtracting HTML features... (this may take a few minutes)")
html_features_list = []

# Iterate only over the two columns that are needed, and count processed rows
# with enumerate() so the progress report does not depend on the frame's
# index labels (which stop being 0..n-1 after any filtering or shuffling).
rows = zip(analysis_df['html'], analysis_df['label'])
for n_done, (html, label) in enumerate(rows, start=1):
    features = extract_html_stats(html)
    features['label'] = label
    html_features_list.append(features)

    if n_done % 1000 == 0:
        print(f"   Processed {n_done:,} samples...")

# One row per page: the extracted features plus its label
html_features_df = pd.DataFrame(html_features_list)

print("Feature extraction complete!")

# ============================================
# STATISTICAL COMPARISON
# ============================================

print(f"\n{'=' * 60}")
print("STATISTICAL COMPARISON: PHISHING vs LEGITIMATE")
print('=' * 60)

# describe() summaries of the feature columns, one per class
phish_rows = html_features_df['label'] == 'phish'
benign_rows = html_features_df['label'] == 'benign'
phishing_stats = html_features_df[phish_rows].describe()
legitimate_stats = html_features_df[benign_rows].describe()

# Every column except the class label is treated as a feature
feature_cols = html_features_df.columns.drop('label').tolist()

print("\nMEAN VALUES COMPARISON:\n")
comparison_df = pd.DataFrame({
    'Feature': feature_cols,
    'Phishing (mean)': phishing_stats.loc['mean', feature_cols].to_numpy(),
    'Legitimate (mean)': legitimate_stats.loc['mean', feature_cols].to_numpy(),
})

legit_mean = comparison_df['Legitimate (mean)']
phish_mean = comparison_df['Phishing (mean)']
comparison_df['Difference'] = legit_mean - phish_mean
# The small constant keeps the ratio finite when the phishing mean is zero
comparison_df['Ratio (Legit/Phish)'] = legit_mean / (phish_mean + 0.001)

print(comparison_df.to_string(index=False))

# ============================================
# MEDIAN COMPARISON
# ============================================

print(f"\n{'=' * 60}")
print("MEDIAN VALUES COMPARISON:")
print('=' * 60)

# The '50%' row of describe() is the per-feature median
phish_median = phishing_stats.loc['50%', feature_cols].to_numpy()
legit_median = legitimate_stats.loc['50%', feature_cols].to_numpy()

comparison_median_df = pd.DataFrame({
    'Feature': feature_cols,
    'Phishing (median)': phish_median,
    'Legitimate (median)': legit_median,
})
comparison_median_df['Difference'] = (
    comparison_median_df['Legitimate (median)']
    - comparison_median_df['Phishing (median)']
)

print(comparison_median_df.to_string(index=False))

# ============================================
# VISUALIZATIONS
# ============================================

print("\nCreating visualizations...")

# Size the grid from the number of features (4 panels per row) so that an
# extra feature cannot overflow a fixed 3x4 layout; 9-12 features still
# produce the same 3x4 grid and 20x15 figure as before.
n_grid_cols = 4
n_grid_rows = max(1, -(-len(feature_cols) // n_grid_cols))  # ceiling division
fig, axes = plt.subplots(
    n_grid_rows, n_grid_cols, figsize=(20, 5 * n_grid_rows), squeeze=False
)
axes = axes.flatten()

# Split by class once instead of re-filtering the frame for every feature
phishing_rows = html_features_df[html_features_df['label'] == 'phish']
legitimate_rows = html_features_df[html_features_df['label'] == 'benign']

for idx, feature in enumerate(feature_cols):
    ax = axes[idx]

    phishing_data = phishing_rows[feature]
    legitimate_data = legitimate_rows[feature]

    # Create box plots.
    # Tick labels are set on the axes afterwards: the `labels=` keyword of
    # boxplot() is deprecated in recent Matplotlib releases, while
    # set_xticklabels() behaves the same in old and new versions.
    box_data = [phishing_data, legitimate_data]
    bp = ax.boxplot(box_data, patch_artist=True)
    ax.set_xticklabels(['Phishing', 'Legitimate'])

    # Color the boxes
    bp['boxes'][0].set_facecolor('#e74c3c')
    bp['boxes'][1].set_facecolor('#2ecc71')

    ax.set_title(feature, fontsize=12, fontweight='bold')
    ax.set_ylabel('Count')
    ax.grid(True, alpha=0.3)

# Remove extra subplots if any
for idx in range(len(feature_cols), len(axes)):
    fig.delaxes(axes[idx])

plt.tight_layout()
plt.savefig('html_features_comparison.png', dpi=300, bbox_inches='tight')
plt.show()

print("Visualization saved as 'html_features_comparison.png'")

# ============================================
# PASSWORD FIELD ANALYSIS
# ============================================

print(f"\n{'=' * 60}")
print("PASSWORD FIELD ANALYSIS:")
print('=' * 60)

# Row masks: class membership and presence of a password input
is_phish_row = html_features_df['label'] == 'phish'
is_benign_row = html_features_df['label'] == 'benign'
has_password = html_features_df['has_password_field'] == 1

total_phishing = is_phish_row.sum()
total_legitimate = is_benign_row.sum()
phishing_with_password = (is_phish_row & has_password).sum()
legitimate_with_password = (is_benign_row & has_password).sum()

phishing_share = phishing_with_password / total_phishing * 100
legitimate_share = legitimate_with_password / total_legitimate * 100

print(f"Phishing sites with password field: {phishing_with_password}/{total_phishing} ({phishing_share:.1f}%)")
print(f"Legitimate sites with password field: {legitimate_with_password}/{total_legitimate} ({legitimate_share:.1f}%)")

# ============================================
# CORRELATION ANALYSIS
# ============================================

print("\n" + "="*60)
print("CORRELATION WITH LABEL:")
print("="*60)

# Convert label to numeric (phish=1, benign=0)
html_features_df['label_numeric'] = (html_features_df['label'] == 'phish').astype(int)

# Guard against the numeric label already being listed as a feature (this
# happens if feature_cols is rebuilt after label_numeric was added).
correlation_features = [col for col in feature_cols if col != 'label_numeric']

# Pearson correlation of every feature with the numeric label.
# The label's own entry is removed BY NAME: after a descending sort it sits
# first (correlation 1.0), so slicing off the last element would discard the
# lowest-ranked feature and keep the label itself in the report.
correlations = (
    html_features_df[correlation_features + ['label_numeric']]
    .corr()['label_numeric']
    .drop('label_numeric')
    .sort_values(ascending=False)
)
print("\nFeature correlations with phishing label:")
print(correlations)

print("\n" + "="*60)
print("ANALYSIS COMPLETE!")
print("="*60)
print("\nKEY INSIGHTS:")
print("- Features with HIGH positive correlation → More common in PHISHING")
print("- Features with HIGH negative correlation → More common in LEGITIMATE")
print("- Features close to 0 → NOT useful for classification")