In [None]:
import os, re, joblib
import numpy as np
import pandas as pd
from urllib.parse import urlparse
from collections import Counter, defaultdict
from scipy.sparse import csr_matrix
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import RobustScaler
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score, confusion_matrix
from sklearn.ensemble import RandomForestClassifier
import warnings
warnings.filterwarnings('ignore')

# ---------------- CONFIG ----------------
# Path to the training CSV. It can be overridden with the PHISHING_DATA_FILE
# environment variable so the script is not tied to one machine; the original
# hard-coded location is kept as the fallback.
DATA_FILE = os.environ.get(
    'PHISHING_DATA_FILE',
    '/Users/yahyamohnd/Downloads/Phishing_dataset_full_large.csv',
)
# Fraction of rows held out for the test split.
TEST_SIZE = 0.25
# Seed shared by the train/test split and the random forest.
RANDOM_STATE = 42

# --------------- DATA LOADING ----------------
def load_and_prepare_data(file_path):
    """Load the CSV at *file_path* and return a cleaned two-column frame.

    The text column is the first of 'text', 'url', 'URL', 'link' present in
    the file; the label column is the first of 'label', 'class', 'target'.
    Rows with a missing value, a text of 5 characters or fewer, or a text
    already seen are dropped.

    Args:
        file_path: Path (or anything ``pd.read_csv`` accepts) of the dataset.

    Returns:
        DataFrame with columns 'text' (str) and 'label', indexed 0..n-1.

    Raises:
        ValueError: if no text column or no label column can be found.
    """
    print("Loading dataset...")
    df = pd.read_csv(file_path)
    print(f"Initial shape: {df.shape}")

    # Better column detection: the first candidate name present wins.
    text_col = next((c for c in ('text', 'url', 'URL', 'link') if c in df.columns), None)
    label_col = next((c for c in ('label', 'class', 'target') if c in df.columns), None)

    if text_col is None or label_col is None:
        print(f"Available columns: {df.columns.tolist()}")
        raise ValueError("Could not find text and label columns")

    df = df[[text_col, label_col]].rename(columns={text_col: 'text', label_col: 'label'})
    df = df.dropna()
    # Cast only after dropna so missing values are not turned into the string
    # 'nan'. Without the cast, a column that pandas parsed as numeric makes
    # the .str accessor below raise.
    df = df.assign(text=df['text'].astype(str))
    df = df[df['text'].str.len() > 5]
    df = df.drop_duplicates(subset=['text'])
    # The filters above leave gaps in the index; make it contiguous so that
    # positional arrays built from this frame line up with its rows.
    df = df.reset_index(drop=True)

    print(f"Final shape: {df.shape}")
    print(f"Label distribution: {df['label'].value_counts().to_dict()}")
    return df

# --------------- IMPROVED PATTERN LEARNING ----------------
def _collect_netlocs(urls):
    """Return the lower-cased network location of every URL that has one."""
    netlocs = []
    for url in urls:
        # urlparse only fills netloc when the string carries a scheme.
        if not url.startswith(('http://', 'https://')):
            url = 'http://' + url
        try:
            netloc = urlparse(url).netloc.lower()
        except ValueError:
            # urlparse rejects some malformed authorities; skip those URLs.
            continue
        if netloc:
            netlocs.append(netloc)
    return netlocs


def _collect_ngrams(urls, n_range=(3, 6), max_samples=2000):
    """Return all character n-grams (n in n_range, inclusive) of the first max_samples URLs."""
    ngrams = []
    # NOTE: this takes the first max_samples rows, not a random sample.
    for url in urls[:max_samples]:
        u = re.sub(r'https?://', '', url.lower())
        u = re.sub(r'[^a-z0-9.-]', '', u)
        # Include n == len(u): a cleaned string of exactly n characters still
        # contains one n-gram.
        longest = min(n_range[1], len(u))
        for n in range(n_range[0], longest + 1):
            for i in range(len(u) - n + 1):
                ngram = u[i:i + n]
                if len(set(ngram)) > 1:  # Not repetitive
                    ngrams.append(ngram)
    return ngrams


def learn_patterns_from_data(df):
    """Derive phishing-vs-legitimate statistics from a labelled URL frame.

    Args:
        df: DataFrame with a 'text' column of URL strings and a 'label'
            column in which 1 marks phishing rows and 0 legitimate rows.

    Returns:
        dict with the keys
        'length_stats' (mean/std/75th/95th percentile of URL length per class),
        'learned_legit_domains' (set of netlocs frequent among legitimate rows),
        'tld_suspicion_scores' (last netloc label -> phishing/legitimate rate ratio),
        'suspicious_chars' (char -> frequency ratio) and
        'suspicious_ngrams' (n-gram -> {'score', 'phish_count', 'legit_count'}).

    Raises:
        ValueError: if the frame has no phishing rows or no legitimate rows.
    """
    print("Learning patterns from data...")
    phishing_urls = df[df['label'] == 1]['text'].tolist()
    legitimate_urls = df[df['label'] == 0]['text'].tolist()

    print(f"Phishing URLs: {len(phishing_urls)}")
    print(f"Legitimate URLs: {len(legitimate_urls)}")

    # Every statistic below compares the two classes; with one of them empty
    # the results would be NaN or a ZeroDivisionError further down.
    if not phishing_urls or not legitimate_urls:
        raise ValueError(
            "Need at least one phishing (label 1) and one legitimate (label 0) URL to learn patterns"
        )

    patterns = {}

    # 1. Length patterns
    phish_lengths = [len(u) for u in phishing_urls]
    legit_lengths = [len(u) for u in legitimate_urls]

    patterns['length_stats'] = {
        'phish_mean': np.mean(phish_lengths),
        'legit_mean': np.mean(legit_lengths),
        'phish_std': np.std(phish_lengths),
        'legit_std': np.std(legit_lengths),
        'phish_75th': np.percentile(phish_lengths, 75),
        'legit_75th': np.percentile(legit_lengths, 75),
        'phish_95th': np.percentile(phish_lengths, 95),
        'legit_95th': np.percentile(legit_lengths, 95),
    }

    # 2. Domain analysis - learn legitimate vs phishing domains
    phish_domains = _collect_netlocs(phishing_urls)
    legit_domains = _collect_netlocs(legitimate_urls)

    phish_domain_freq = Counter(phish_domains)
    legit_domain_freq = Counter(legit_domains)

    # Keep domains that are frequent among legitimate rows and (almost)
    # absent from phishing rows.
    learned_legit_domains = set()
    for domain, count in legit_domain_freq.most_common(100):
        if count < 5:
            # most_common is sorted by descending count, so no later entry
            # can reach the minimum of 5 occurrences either.
            break
        phish_count = phish_domain_freq.get(domain, 0)
        if phish_count == 0 or count / (phish_count + 1) > 10:  # Strong legitimate signal
            learned_legit_domains.add(domain)

    patterns['learned_legit_domains'] = learned_legit_domains
    print(f"Learned {len(learned_legit_domains)} legitimate domains from data")

    # 3. TLD patterns (the "TLD" is the text after the last dot of the netloc)
    phish_tlds = [d.split('.')[-1] for d in phish_domains if '.' in d]
    legit_tlds = [d.split('.')[-1] for d in legit_domains if '.' in d]

    phish_tld_freq = Counter(phish_tlds)
    legit_tld_freq = Counter(legit_tlds)

    tld_suspicion_scores = {}
    for tld in phish_tld_freq.keys() | legit_tld_freq.keys():
        phish_count = phish_tld_freq.get(tld, 0)
        legit_count = legit_tld_freq.get(tld, 0)

        if phish_count + legit_count < 10:  # Minimum occurrences
            continue
        if legit_count == 0:
            # No legitimate occurrences: the ratio is undefined, so the TLD
            # gets no score (same outcome as the legit_rate > 0 check).
            continue

        phish_rate = phish_count / len(phish_tlds) if phish_tlds else 0
        legit_rate = legit_count / len(legit_tlds)
        tld_suspicion_scores[tld] = phish_rate / legit_rate

    patterns['tld_suspicion_scores'] = tld_suspicion_scores

    # 4. Character patterns
    phish_char_stats = Counter(c for url in phishing_urls for c in url.lower())
    legit_char_stats = Counter(c for url in legitimate_urls for c in url.lower())

    suspicious_chars = {}
    total_phish = sum(phish_char_stats.values())
    total_legit = sum(legit_char_stats.values())

    # Both totals are zero only when every URL of a class is an empty string;
    # skip the ratios then instead of dividing by zero.
    if total_phish and total_legit:
        for c in phish_char_stats.keys() | legit_char_stats.keys():
            phish_freq = phish_char_stats[c] / total_phish
            # Characters never seen in legitimate URLs count as seen once.
            legit_freq = legit_char_stats.get(c, 1) / total_legit

            # Only characters that are significantly more common in phishing
            score = phish_freq / (legit_freq + 1e-10)
            if score > 2.0 and phish_char_stats[c] > 50:
                suspicious_chars[c] = score

    patterns['suspicious_chars'] = suspicious_chars
    print(f"Found {len(suspicious_chars)} suspicious characters")

    # 5. N-gram patterns
    phish_ngrams = _collect_ngrams(phishing_urls)
    legit_ngrams = _collect_ngrams(legitimate_urls)

    phish_counts = Counter(phish_ngrams)
    legit_counts = Counter(legit_ngrams)
    legit_total = max(len(legit_ngrams), 1)

    suspicious_ngrams = {}
    for ng, cnt in phish_counts.most_common(500):
        if cnt < 10:
            # Sorted by descending count: nothing after this reaches 10.
            break

        legit_cnt = legit_counts.get(ng, 0)
        phish_rate = cnt / len(phish_ngrams)
        # An n-gram absent from legitimate URLs is treated as seen once, so
        # its score stays on the same "times more common in phishing" scale
        # that the threshold below is expressed in.
        legit_rate = max(legit_cnt, 1) / legit_total
        score = phish_rate / legit_rate

        if score >= 5.0:  # At least 5x more common in phishing
            suspicious_ngrams[ng] = {
                'score': score,
                'phish_count': cnt,
                'legit_count': legit_cnt
            }

    patterns['suspicious_ngrams'] = suspicious_ngrams
    print(f"Found {len(suspicious_ngrams)} suspicious n-grams")

    return patterns

# --------------- IMPROVED FEATURE EXTRACTION ----------------
def extract_improved_features(text, learned_patterns):
    """Turn one URL string into a flat dict of numeric features.

    Args:
        text: The URL (with or without an http/https scheme).
        learned_patterns: dict that may carry the keys 'length_stats',
            'learned_legit_domains', 'tld_suspicion_scores',
            'suspicious_chars' and 'suspicious_ngrams'. Missing keys fall
            back to neutral defaults; when 'length_stats' is missing or empty
            the length-derived features other than 'length' are omitted.

    Returns:
        dict mapping feature name to an int or float value.
    """
    features = {}
    t = text.lower()
    n_chars = len(text)

    # 1. Length features with learned thresholds
    features['length'] = n_chars
    length_stats = learned_patterns.get('length_stats', {})
    if length_stats:
        phish_mean = length_stats.get('phish_mean', 50)
        legit_mean = length_stats.get('legit_mean', 50)

        # Floor the divisor at 10 so a tiny learned mean cannot blow up the ratio.
        features['length_ratio_phish'] = n_chars / max(phish_mean, 10)
        features['length_ratio_legit'] = n_chars / max(legit_mean, 10)
        features['length_above_phish_95th'] = 1 if n_chars > length_stats.get('phish_95th', 200) else 0
        features['length_very_long'] = 1 if n_chars > 150 else 0
        features['length_very_short'] = 1 if n_chars < 20 else 0

    # 2. Domain legitimacy check (learned from data)
    host = ''  # lower-cased hostname without port/userinfo; '' if unparsable
    try:
        # urlparse only fills netloc when the string carries a scheme.
        text_parsed = text if text.startswith(('http://', 'https://')) else 'http://' + text

        parsed = urlparse(text_parsed)
        domain = parsed.netloc.lower() if parsed.netloc else ''
        host = parsed.hostname or ''

        learned_legit_domains = learned_patterns.get('learned_legit_domains', set())
        features['is_learned_legit_domain'] = 1 if domain in learned_legit_domains else 0

        # TLD suspicion based on learned patterns; unknown TLDs score a neutral 1.0
        tld_scores = learned_patterns.get('tld_suspicion_scores', {})
        if '.' in domain:
            tld = domain.split('.')[-1]
            features['tld_suspicion_score'] = tld_scores.get(tld, 1.0)
            features['tld_very_suspicious'] = 1 if tld_scores.get(tld, 1.0) > 3.0 else 0
        else:
            features['tld_suspicion_score'] = 1.0
            features['tld_very_suspicious'] = 0

        # Domain structure
        features['subdomain_count'] = max(0, domain.count('.') - 1) if domain else 0
        features['domain_length'] = len(domain)
        features['has_dash_in_domain'] = 1 if '-' in domain else 0

    except ValueError:
        # urlparse raises ValueError for some malformed authorities; fall back
        # to neutral domain features for those URLs.
        features.update({
            'is_learned_legit_domain': 0, 'tld_suspicion_score': 1.0,
            'tld_very_suspicious': 0, 'subdomain_count': 0,
            'domain_length': 0, 'has_dash_in_domain': 0
        })

    # 3. Character analysis against the learned suspicious characters
    suspicious_chars = learned_patterns.get('suspicious_chars', {})
    char_score = 0
    char_count = 0

    for c, score in suspicious_chars.items():
        count = t.count(c)
        if count > 0:
            char_score += count * score
            char_count += count

    features['suspicious_char_score'] = char_score
    features['suspicious_char_count'] = char_count

    # Normalize by the number of "ordinary" URL characters
    total_chars = sum(1 for c in text if c.isalnum() or c in '.-/')
    features['suspicious_char_ratio'] = char_count / max(total_chars, 1)
    features['high_suspicious_char_ratio'] = 1 if features['suspicious_char_ratio'] > 0.3 else 0

    # 4. N-gram analysis on the scheme-less URL reduced to [a-z0-9.-]
    suspicious_ngrams = learned_patterns.get('suspicious_ngrams', {})
    t_clean = re.sub(r'https?://', '', t)
    t_clean = re.sub(r'[^a-z0-9.-]', '', t_clean)

    ngram_score = 0
    ngram_count = 0
    high_score_ngrams = 0

    for ng, info in suspicious_ngrams.items():
        if ng in t_clean:
            score = info['score']
            ngram_score += score
            ngram_count += 1
            if score > 20:  # Very high threshold
                high_score_ngrams += 1

    features['ngram_suspicion_score'] = ngram_score
    features['ngram_match_count'] = ngram_count
    features['very_suspicious_ngrams'] = high_score_ngrams

    # 5. URL structure features
    features['digit_count'] = sum(c.isdigit() for c in text)
    features['digit_ratio'] = features['digit_count'] / max(1, n_chars)
    features['high_digit_ratio'] = 1 if features['digit_ratio'] > 0.3 else 0

    # Special characters
    features['dash_count'] = text.count('-')
    features['dot_count'] = text.count('.')
    features['slash_count'] = text.count('/')
    features['equals_count'] = text.count('=')
    features['question_count'] = text.count('?')
    features['ampersand_count'] = text.count('&')

    # URL structure indicators
    features['has_query'] = 1 if '?' in text else 0
    features['has_fragment'] = 1 if '#' in text else 0
    features['many_params'] = 1 if text.count('=') > 3 else 0

    # 6. Shannon entropy (bits per character) of the cleaned URL
    if t_clean:
        counts = {}
        for c in t_clean:
            counts[c] = counts.get(c, 0) + 1

        entropy = 0
        for v in counts.values():
            p = v / len(t_clean)
            entropy += -p * np.log2(p)

        features['entropy'] = entropy
        features['char_diversity'] = len(counts) / len(t_clean)
        features['high_entropy'] = 1 if entropy > 4.5 else 0
    else:
        features['entropy'] = 0
        features['char_diversity'] = 0
        features['high_entropy'] = 0

    # 7. Phishing-specific patterns
    features['has_security_words'] = 1 if any(word in t for word in ['secure', 'verify', 'update', 'login', 'account']) else 0
    features['has_brand_imitation'] = 1 if any(brand in t for brand in ['paypal', 'apple', 'microsoft', 'google', 'amazon', 'facebook']) else 0
    # Test the hostname suffix, not a substring of the whole URL: a substring
    # test flags 'www.information.com' (contains '.info') or a path such as
    # '/file.tk', and misses upper-case hosts. A trailing root dot is ignored.
    features['suspicious_tld'] = 1 if host.rstrip('.').endswith(('.tk', '.ml', '.ga', '.cf', '.info', '.biz')) else 0

    return features

# --------------- MODEL TRAINING WITH EVALUATION ----------------
def train_improved_model(df, learned_patterns):
    """Train and evaluate a random forest on features extracted from *df*.

    Args:
        df: DataFrame with a 'text' column of URLs and a binary 'label'
            column (0 = legitimate, 1 = phishing).
        learned_patterns: pattern dict passed to extract_improved_features.

    Returns:
        Tuple ``(model, scaler, feature_columns, feature_importance)``: the
        fitted RandomForestClassifier, the RobustScaler fitted on the training
        rows, the list of feature column names in training order, and a
        DataFrame of features sorted by importance.

    NOTE(review): if *learned_patterns* was derived from all rows of *df*
    (labels included), the held-out metrics printed here are optimistic,
    because those patterns have already seen the test rows.
    """
    print("Extracting improved features...")
    feature_data = [extract_improved_features(text, learned_patterns) for text in df['text'].values]
    features_df = pd.DataFrame(feature_data).fillna(0)

    print(f"Feature matrix shape: {features_df.shape}")
    print("Features:", list(features_df.columns))

    y = df['label'].values

    # Split BEFORE scaling so the scaler's statistics come from training rows
    # only; fitting it on every row would leak test-set information into
    # preprocessing.
    X_train_raw, X_test_raw, y_train, y_test = train_test_split(
        features_df.values, y, test_size=TEST_SIZE, stratify=y, random_state=RANDOM_STATE
    )

    # The scaled matrix is dense (centering removes the zeros), so it is kept
    # as a plain array rather than converted to a sparse matrix.
    scaler = RobustScaler()
    X_train = scaler.fit_transform(X_train_raw)
    X_test = scaler.transform(X_test_raw)

    print(f"Training set: {X_train.shape}, Test set: {X_test.shape}")

    # Train model
    model = RandomForestClassifier(
        n_estimators=300,
        max_depth=15,
        min_samples_split=5,
        min_samples_leaf=2,
        random_state=RANDOM_STATE,
        class_weight='balanced'  # Handle class imbalance
    )

    print("Training model...")
    model.fit(X_train, y_train)

    # Evaluate model
    print("\n" + "="*50)
    print("MODEL EVALUATION")
    print("="*50)

    # Cross-validation on the training portion only (cross_val_score fits
    # clones, so the model fitted above is left untouched).
    cv_scores = cross_val_score(model, X_train, y_train, cv=5, scoring='roc_auc')
    print(f"Cross-validation ROC-AUC: {cv_scores.mean():.4f} (+/- {cv_scores.std() * 2:.4f})")

    # Test set evaluation
    y_pred = model.predict(X_test)
    y_prob = model.predict_proba(X_test)[:, 1]

    print(f"Test Accuracy: {accuracy_score(y_test, y_pred):.4f}")
    print(f"Test ROC-AUC: {roc_auc_score(y_test, y_prob):.4f}")

    print("\nClassification Report:")
    print(classification_report(y_test, y_pred, target_names=['Legitimate', 'Phishing']))

    # Pin the label order so the matrix is always 2x2 and unpacks as
    # tn, fp, fn, tp even if one class is missing from the predictions.
    cm = confusion_matrix(y_test, y_pred, labels=[0, 1])
    tn, fp, fn, tp = cm.ravel()
    print("\nConfusion Matrix:")
    print(f"True Negatives (Correct Legitimate): {tn}")
    print(f"False Positives (Legitimate as Phishing): {fp}")
    print(f"False Negatives (Phishing as Legitimate): {fn}")
    print(f"True Positives (Correct Phishing): {tp}")

    # Feature importance
    feature_importance = pd.DataFrame({
        'feature': features_df.columns,
        'importance': model.feature_importances_
    }).sort_values('importance', ascending=False)

    print("\nTop 10 Most Important Features:")
    for _, row in feature_importance.head(10).iterrows():
        print(f"{row['feature']}: {row['importance']:.4f}")

    return model, scaler, features_df.columns.tolist(), feature_importance

# --------------- BEHAVIOR ANALYSIS ----------------
def analyze_model_behavior(model, scaler, feature_columns, learned_patterns, df):
    """Print a diagnostic report of how the model behaves on the URLs in *df*.

    Features are re-extracted for every row of *df*, scaled with *scaler* and
    scored with *model*; the report covers confidence distribution, errors,
    per-class feature means, URL length, learned domains, n-gram matches and
    threshold sensitivity.

    Args:
        model: fitted classifier exposing ``predict_proba``.
        scaler: fitted scaler exposing ``transform``.
        feature_columns: feature names in the order the model was trained on.
        learned_patterns: pattern dict passed to extract_improved_features.
        df: DataFrame with 'text' and binary 'label' columns; its index does
            not need to be contiguous.

    Returns:
        A copy of *df* with the added columns 'predicted_prob',
        'predicted_label', 'correct', 'has_learned_legit_domain' and, when
        suspicious n-grams exist, 'pattern_matches'.

    NOTE(review): every row of *df* is scored, so if *df* contains the rows
    the model was trained on, the accuracies printed here are optimistic.
    """
    print("\n" + "="*60)
    print("MODEL BEHAVIOR ANALYSIS")
    print("="*60)

    # Extract features for all URLs in dataset
    all_features = [extract_improved_features(text, learned_patterns) for text in df['text'].values]

    features_df = pd.DataFrame(all_features).fillna(0)
    features_df = features_df.reindex(columns=feature_columns, fill_value=0)

    X_scaled = scaler.transform(features_df.values)
    probabilities = model.predict_proba(X_scaled)[:, 1]

    # Add predictions to dataframe for analysis
    analysis_df = df.copy()
    analysis_df['predicted_prob'] = probabilities
    analysis_df['predicted_label'] = (probabilities >= 0.5).astype(int)
    analysis_df['correct'] = (analysis_df['predicted_label'] == analysis_df['label'])

    # Positional views: features_df has a fresh 0..n-1 index while analysis_df
    # keeps df's index, so label-aligned masks must not be mixed between them.
    labels = analysis_df['label'].values
    url_lengths = analysis_df['text'].str.len().values

    # 1. Confidence distribution analysis
    print("\n1. CONFIDENCE DISTRIBUTION:")
    high_conf = int(((probabilities > 0.8) | (probabilities < 0.2)).sum())
    medium_conf = int(((probabilities >= 0.2) & (probabilities <= 0.8)).sum())
    print(f"High confidence predictions (>0.8 or <0.2): {high_conf} ({high_conf/len(probabilities)*100:.1f}%)")
    print(f"Medium confidence predictions (0.2-0.8): {medium_conf} ({medium_conf/len(probabilities)*100:.1f}%)")

    # 2. Error analysis
    print("\n2. ERROR ANALYSIS:")
    false_positives = analysis_df[(analysis_df['label'] == 0) & (analysis_df['predicted_label'] == 1)]
    false_negatives = analysis_df[(analysis_df['label'] == 1) & (analysis_df['predicted_label'] == 0)]

    print(f"False Positives (Legitimate predicted as Phishing): {len(false_positives)}")
    print(f"False Negatives (Phishing predicted as Legitimate): {len(false_negatives)}")

    if len(false_positives) > 0:
        print("\nSample False Positives:")
        for i, (_, row) in enumerate(false_positives.head(3).iterrows()):
            print(f"  {i+1}. {row['text']} (prob: {row['predicted_prob']:.3f})")

    if len(false_negatives) > 0:
        print("\nSample False Negatives:")
        for i, (_, row) in enumerate(false_negatives.head(3).iterrows()):
            print(f"  {i+1}. {row['text']} (prob: {row['predicted_prob']:.3f})")

    # 3. Feature impact analysis
    print("\n3. FEATURE IMPACT ON DIFFERENT URL TYPES:")

    # Use NumPy masks: indexing features_df with a boolean Series that carries
    # df's (possibly gapped) index raises or misaligns.
    legit_features = features_df[labels == 0]
    phish_features = features_df[labels == 1]

    print("\nAverage feature values:")
    print(f"{'Feature':<25} {'Legitimate':<12} {'Phishing':<12} {'Difference':<12}")
    print("-" * 65)

    # First 10 feature columns in training order (not ranked by importance)
    for col in feature_columns[:10]:
        legit_mean = legit_features[col].mean()
        phish_mean = phish_features[col].mean()
        diff = phish_mean - legit_mean
        print(f"{col:<25} {legit_mean:<12.3f} {phish_mean:<12.3f} {diff:<12.3f}")

    # 4. Length behavior analysis
    print("\n4. URL LENGTH BEHAVIOR:")
    legit_lengths = url_lengths[labels == 0]
    phish_lengths = url_lengths[labels == 1]

    print(f"Legitimate URLs - Mean: {np.mean(legit_lengths):.1f}, Std: {np.std(legit_lengths):.1f}")
    print(f"Phishing URLs - Mean: {np.mean(phish_lengths):.1f}, Std: {np.std(phish_lengths):.1f}")

    # Length-based accuracy
    short_urls = analysis_df[url_lengths < 50]
    medium_urls = analysis_df[(url_lengths >= 50) & (url_lengths < 100)]
    long_urls = analysis_df[url_lengths >= 100]

    if len(short_urls) > 0:
        short_acc = short_urls['correct'].mean()
        print(f"Accuracy on short URLs (<50 chars): {short_acc:.3f} ({len(short_urls)} samples)")

    if len(medium_urls) > 0:
        medium_acc = medium_urls['correct'].mean()
        print(f"Accuracy on medium URLs (50-100 chars): {medium_acc:.3f} ({len(medium_urls)} samples)")

    if len(long_urls) > 0:
        long_acc = long_urls['correct'].mean()
        print(f"Accuracy on long URLs (>100 chars): {long_acc:.3f} ({len(long_urls)} samples)")

    # 5. Domain behavior analysis
    print("\n5. DOMAIN-BASED BEHAVIOR:")
    learned_legit_domains = learned_patterns.get('learned_legit_domains', set())

    # Flag URLs whose netloc is one of the learned legitimate domains
    legit_domain_mask = []
    for url in analysis_df['text']:
        if not url.startswith(('http://', 'https://')):
            url = 'http://' + url
        try:
            domain = urlparse(url).netloc.lower()
        except ValueError:
            # urlparse rejects some malformed authorities; treat as no match.
            legit_domain_mask.append(False)
        else:
            legit_domain_mask.append(domain in learned_legit_domains)

    analysis_df['has_learned_legit_domain'] = legit_domain_mask
    legit_domain_urls = analysis_df[analysis_df['has_learned_legit_domain']]

    if len(legit_domain_urls) > 0:
        legit_domain_acc = legit_domain_urls['correct'].mean()
        print(f"Accuracy on URLs with learned legitimate domains: {legit_domain_acc:.3f} ({len(legit_domain_urls)} samples)")

        # How often are learned legitimate domains correctly classified as legitimate?
        true_legit_with_legit_domain = legit_domain_urls[legit_domain_urls['label'] == 0]
        if len(true_legit_with_legit_domain) > 0:
            legit_recognition_rate = (true_legit_with_legit_domain['predicted_label'] == 0).mean()
            print(f"Recognition rate for legitimate URLs with learned domains: {legit_recognition_rate:.3f}")

    # 6. Pattern matching behavior
    print("\n6. PATTERN MATCHING BEHAVIOR:")
    suspicious_ngrams = learned_patterns.get('suspicious_ngrams', {})

    if suspicious_ngrams:
        pattern_matches = []
        for url in analysis_df['text']:
            # Same cleaning as the feature extractor's n-gram step
            url_clean = re.sub(r'https?://', '', url.lower())
            url_clean = re.sub(r'[^a-z0-9.-]', '', url_clean)
            pattern_matches.append(sum(1 for pattern in suspicious_ngrams if pattern in url_clean))

        analysis_df['pattern_matches'] = pattern_matches

        # URLs with many pattern matches
        high_pattern_urls = analysis_df[analysis_df['pattern_matches'] > 5]
        if len(high_pattern_urls) > 0:
            high_pattern_acc = high_pattern_urls['correct'].mean()
            print(f"Accuracy on URLs with >5 suspicious patterns: {high_pattern_acc:.3f} ({len(high_pattern_urls)} samples)")

        # URLs with no pattern matches
        no_pattern_urls = analysis_df[analysis_df['pattern_matches'] == 0]
        if len(no_pattern_urls) > 0:
            no_pattern_acc = no_pattern_urls['correct'].mean()
            print(f"Accuracy on URLs with no suspicious patterns: {no_pattern_acc:.3f} ({len(no_pattern_urls)} samples)")

    # 7. Probability threshold analysis
    print("\n7. THRESHOLD SENSITIVITY ANALYSIS:")
    thresholds = [0.3, 0.4, 0.5, 0.6, 0.7]

    print(f"{'Threshold':<12} {'Accuracy':<12} {'Precision':<12} {'Recall':<12}")
    print("-" * 50)

    for threshold in thresholds:
        pred_at_threshold = (probabilities >= threshold).astype(int)
        accuracy = accuracy_score(labels, pred_at_threshold)

        tp = int(((pred_at_threshold == 1) & (labels == 1)).sum())
        fp = int(((pred_at_threshold == 1) & (labels == 0)).sum())
        fn = int(((pred_at_threshold == 0) & (labels == 1)).sum())

        precision = tp / (tp + fp) if (tp + fp) > 0 else 0
        recall = tp / (tp + fn) if (tp + fn) > 0 else 0

        print(f"{threshold:<12} {accuracy:<12.3f} {precision:<12.3f} {recall:<12.3f}")

    return analysis_df

# --------------- IMPROVED PREDICTION ----------------
def predict_url_improved(url, model, scaler, feature_columns, learned_patterns):
    """Classify a single URL and explain the result.

    Args:
        url: URL string to classify.
        model: fitted classifier exposing ``predict_proba``.
        scaler: fitted scaler exposing ``transform``.
        feature_columns: feature names in the order the model was trained on.
        learned_patterns: pattern dict passed to extract_improved_features.

    Returns:
        dict with keys 'url', 'prediction' ('PHISHING' or 'LEGITIMATE'),
        'probability' (phishing probability), 'confidence' ('HIGH', 'MEDIUM'
        or 'LOW' by distance of the probability from 0.5) and 'reasons'
        (list of human-readable strings).
    """
    features = extract_improved_features(url, learned_patterns)

    # Go through a DataFrame so columns are in training order; features the
    # extractor did not produce are filled with 0.
    features_df = pd.DataFrame([features])
    features_df = features_df.reindex(columns=feature_columns, fill_value=0)

    X_scaled = scaler.transform(features_df.values)
    y_prob = model.predict_proba(X_scaled)[0, 1]
    y_pred = 1 if y_prob >= 0.5 else 0

    # The reasons are descriptive flags from the features; they are not
    # derived from the model's internal decision path.
    reasons = []

    is_legit_domain = features.get('is_learned_legit_domain', 0) == 1
    if is_legit_domain:
        reasons.append("Domain recognized as legitimate from training data")

    if features.get('tld_very_suspicious', 0) == 1:
        reasons.append(f"TLD has high suspicion score: {features.get('tld_suspicion_score', 0):.2f}")

    if features.get('very_suspicious_ngrams', 0) > 0:
        reasons.append(f"{features.get('very_suspicious_ngrams', 0)} very suspicious patterns detected")
    elif features.get('ngram_match_count', 0) > 5:
        reasons.append(f"{features.get('ngram_match_count', 0)} suspicious patterns found")

    if features.get('high_suspicious_char_ratio', 0) == 1:
        reasons.append(f"High suspicious character ratio: {features.get('suspicious_char_ratio', 0):.2f}")

    if features.get('length_very_long', 0) == 1:
        reasons.append(f"Unusually long URL ({features.get('length', 0)} chars)")

    if features.get('has_security_words', 0) == 1:
        reasons.append("Contains security-related words (common in phishing)")

    # A brand name inside a domain already recognized as legitimate is not
    # imitation; reporting both reasons together is contradictory.
    if features.get('has_brand_imitation', 0) == 1 and not is_legit_domain:
        reasons.append("Contains brand names (possible imitation)")

    if features.get('many_params', 0) == 1:
        reasons.append("Many parameters in URL")

    # Confidence grows with the distance of the probability from 0.5.
    margin = abs(y_prob - 0.5)
    if margin > 0.3:
        confidence = 'HIGH'
    elif margin > 0.1:
        confidence = 'MEDIUM'
    else:
        confidence = 'LOW'

    return {
        'url': url,
        'prediction': 'PHISHING' if y_pred == 1 else 'LEGITIMATE',
        'probability': y_prob,
        'confidence': confidence,
        'reasons': reasons if reasons else ['Based on learned patterns from training data']
    }

# --------------- MAIN ----------------
def main():
    """Train the detector on DATA_FILE, then classify URLs typed at the prompt.

    The interactive loop ends when the user types 'exit', when stdin reaches
    end-of-file, or on Ctrl-C. Any other error is printed with its traceback
    instead of propagating.

    NOTE(review): patterns are learned from the full dataset before the
    train/test split performed during training, so the printed test metrics
    are likely optimistic.
    """
    try:
        print("="*60)
        print("IMPROVED PHISHING DETECTION SYSTEM")
        print("Learning patterns purely from dataset...")
        print("="*60)

        # Load data
        df = load_and_prepare_data(DATA_FILE)

        # Learn patterns
        learned_patterns = learn_patterns_from_data(df)

        # Train model
        model, scaler, feature_columns, feature_importance = train_improved_model(df, learned_patterns)

        print(f"\n{'='*60}")
        print("INTERACTIVE TESTING")
        print("Enter URLs to test (type 'exit' to quit):")
        print("="*60)

        while True:
            try:
                user_url = input("\nURL: ").strip()
            except (EOFError, KeyboardInterrupt):
                # Closed stdin (e.g. piped input ran out) or Ctrl-C is a
                # normal way to leave the session, not an error to report.
                print()
                break

            if user_url.lower() == "exit":
                break

            if not user_url:
                continue

            result = predict_url_improved(user_url, model, scaler, feature_columns, learned_patterns)
            print(f"\nResult: {result['prediction']} ({result['confidence']} confidence)")
            print(f"Probability: {result['probability']:.3f}")
            print("Reasons:")
            for i, reason in enumerate(result['reasons'], 1):
                print(f"  {i}. {reason}")
            print("-" * 50)

    except Exception as e:
        # Top-level boundary: report the failure with its traceback rather
        # than letting it propagate.
        print(f"Error: {e}")
        import traceback
        traceback.print_exc()

# Run main() only when this file is executed directly, not when it is imported.
if __name__ == "__main__":
    main()

IMPROVED PHISHING DETECTION SYSTEM
Learning patterns purely from dataset...
Loading dataset...
Initial shape: (200000, 18)
Final shape: (60054, 2)
Label distribution: {1: 43918, 0: 16136}
Learning patterns from data...
Phishing URLs: 43918
Legitimate URLs: 16136
Learned 28 legitimate domains from data
Found 13 suspicious characters
Found 6 suspicious n-grams
Extracting improved features...
Feature matrix shape: (60054, 37)
Features: ['length', 'length_ratio_phish', 'length_ratio_legit', 'length_above_phish_95th', 'length_very_long', 'length_very_short', 'is_learned_legit_domain', 'tld_suspicion_score', 'tld_very_suspicious', 'subdomain_count', 'domain_length', 'has_dash_in_domain', 'suspicious_char_score', 'suspicious_char_count', 'suspicious_char_ratio', 'high_suspicious_char_ratio', 'ngram_suspicion_score', 'ngram_match_count', 'very_suspicious_ngrams', 'digit_count', 'digit_ratio', 'high_digit_ratio', 'dash_count', 'dot_count', 'slash_count', 'equals_count', 'question_count', 'amper


URL:  http://www.bankofamerica-login-update.com



Result: PHISHING (HIGH confidence)
Probability: 1.000
Reasons:
  1. Contains security-related words (common in phishing)
--------------------------------------------------



URL:  https://amazon-account-update.top



Result: PHISHING (HIGH confidence)
Probability: 0.958
Reasons:
  1. Contains security-related words (common in phishing)
  2. Contains brand names (possible imitation)
--------------------------------------------------



URL:  http://facebook.verify-user-login.xyz



Result: PHISHING (HIGH confidence)
Probability: 0.967
Reasons:
  1. Contains security-related words (common in phishing)
  2. Contains brand names (possible imitation)
--------------------------------------------------



URL:  http://secure-login-google.com/signin



Result: PHISHING (HIGH confidence)
Probability: 0.984
Reasons:
  1. Contains security-related words (common in phishing)
  2. Contains brand names (possible imitation)
--------------------------------------------------



URL:  https://www.amazon.com/product/B09XYZ



Result: LEGITIMATE (HIGH confidence)
Probability: 0.003
Reasons:
  1. Domain recognized as legitimate from training data
  2. Contains brand names (possible imitation)
--------------------------------------------------
