In [None]:
import os, html, re, joblib, json, hashlib
import numpy as np
import pandas as pd
from urllib.parse import urlparse, parse_qs
from scipy.sparse import hstack, csr_matrix
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.metrics import classification_report, accuracy_score, roc_auc_score, confusion_matrix
from sklearn.utils.class_weight import compute_class_weight
import warnings
warnings.filterwarnings('ignore')

# XGBoost is an optional dependency: use it when it is installed, otherwise
# fall back to scikit-learn's RandomForestClassifier. XGB_AVAILABLE records
# which classifier class was imported so main() can pick the matching one.
try:
    from xgboost import XGBClassifier
    XGB_AVAILABLE = True
except ImportError:
    from sklearn.ensemble import RandomForestClassifier
    XGB_AVAILABLE = False

# ---------------- ENHANCED CONFIG ----------------
# Location of the training CSV. The default is the original author's local
# path; set the PHISH_DATA_FILE environment variable to train from a
# different copy without editing this file.
DATA_FILE = os.environ.get(
    'PHISH_DATA_FILE',
    '/Users/yahyamohnd/Downloads/Phishing_dataset_full_large.csv',
)

# Artefacts written by main() with joblib.dump.
OUT_MODEL = 'enhanced_phish_model.pkl'
OUT_VECT_WORD = 'tfidf_word_enhanced.pkl'
OUT_VECT_CHAR = 'tfidf_char_enhanced.pkl'
OUT_SCALER = 'url_scaler_enhanced.pkl'
OUT_SUSPICIOUS_DOMAINS = 'suspicious_domains.pkl'

# Hold-out fraction and seed passed to train_test_split / the classifier.
TEST_SIZE = 0.3
RANDOM_STATE = 42

# Keywords counted (by substring match) for the 'suspicious_keyword_count'
# feature.
SUSPICIOUS_KEYWORDS = [
    'login', 'verify', 'account', 'suspended', 'click', 'urgent', 'secure',
    'update', 'confirm', 'bank', 'paypal', 'amazon', 'apple', 'microsoft',
    'google', 'facebook', 'twitter', 'instagram', 'netflix', 'ebay',
    'winner', 'prize', 'lottery', 'claim', 'free', 'offer', 'limited',
    'expires', 'act now', 'immediate', 'warning', 'alert', 'notice'
]

# URL-shortener hosts; a match (exact or as a parent domain) sets the
# 'is_url_shortener' feature.
SUSPICIOUS_DOMAINS = [
    'bit.ly', 'tinyurl.com', 'short.link', 'rebrandly.com', 't.co',
    'ow.ly', 'buff.ly', 'soo.gd', 'cutt.ly', 'linktr.ee', 'is.gd',
    'goo.gl', 'tiny.cc', 'ur1.ca'
]

# TLD strings that set the 'has_invalid_tld' feature.
INVALID_TLDS = {
    'csa', 'fd', 'dfs', 'xyz123', 'fake', 'test', 'invalid', 'spam',
    'phish', 'scam', 'hack', 'malware', 'virus', 'temp', 'tmp'
}

# Hand-assigned popularity score per TLD, used for the 'tld_popularity'
# feature; TLDs missing from this table get a default score at lookup time.
LEGITIMATE_TLDS = {
    'com': 1000, 'org': 800, 'net': 700, 'edu': 900, 'gov': 950,
    'mil': 950, 'int': 850, 'co': 600, 'io': 400, 'ly': 200,
    'me': 300, 'tv': 250, 'info': 150, 'biz': 100, 'name': 50,
    'ca': 800, 'uk': 800, 'de': 750, 'fr': 750, 'jp': 700,
    'au': 650, 'br': 600, 'cn': 500, 'ru': 400, 'in': 400
}

# Regexes searched against the (lowercased) network location; any hit sets
# the 'matches_suspicious_pattern' feature.
SUSPICIOUS_URL_PATTERNS = [
    r'[0-9]+[a-z]+[0-9]+',  # Mixed numbers and letters randomly
    r'[a-z]{1,3}[0-9]{3,}',  # Short letters followed by many numbers
    r'[0-9]{2,}[a-z]{1,2}[0-9]{2,}',  # Numbers-letters-numbers pattern
    r'@.*\.',  # @ symbol in domain (invalid)
    r'-{2,}',  # Multiple consecutive hyphens
    r'[a-z]{10,}[0-9]{3,}',  # Long random string + numbers
]

# --------------- ENHANCED DATA LOADING ----------------
def load_and_validate_data(file_path):
    """Load the dataset CSV and normalise it to ``text`` / ``label`` columns.

    The text column is the first of ``text``, ``url``, ``URL``, ``link``,
    ``website`` that exists, falling back to the first object-dtype column.
    The label column is the first of ``label``, ``class``, ``target``,
    ``is_phishing``, ``phishing`` that exists.

    Rows with a missing text or label value are dropped before casting, then
    duplicate texts (first occurrence kept) and texts of three characters or
    fewer are removed.

    Args:
        file_path: Path of the CSV file to read.

    Returns:
        A DataFrame holding the original columns plus ``text`` (str) and
        ``label`` (int).

    Raises:
        FileNotFoundError: If ``file_path`` does not exist.
        ValueError: If no text or label column can be found, or a label
            value cannot be converted to ``int``.
    """
    if not os.path.exists(file_path):
        raise FileNotFoundError(f"Data file not found: {file_path}")

    df = pd.read_csv(file_path)

    # Auto-detect text column
    text_candidates = ('text', 'url', 'URL', 'link', 'website')
    text_column = next((c for c in text_candidates if c in df.columns), None)
    if text_column is None:
        # Fall back to the first object (string-like) column
        object_cols = [c for c in df.columns if df[c].dtype == object]
        if not object_cols:
            raise ValueError("No text column found")
        text_column = object_cols[0]

    # Auto-detect label column
    label_candidates = ('label', 'class', 'target', 'is_phishing', 'phishing')
    label_column = next((c for c in label_candidates if c in df.columns), None)
    if label_column is None:
        raise ValueError("No label column found. Expected: 'label', 'class', 'target', 'is_phishing', or 'phishing'")

    # A NaN label makes astype(int) raise, and a NaN text would turn into the
    # literal string 'nan'; drop such rows up front instead.
    rows_before = len(df)
    df = df.dropna(subset=[text_column, label_column]).copy()
    dropped = rows_before - len(df)
    if dropped:
        print(f"Dropped {dropped} rows with missing text or label")

    df['text'] = df[text_column].astype(str)
    df['label'] = df[label_column].astype(int)

    # Remove duplicates and invalid entries
    df = df.drop_duplicates(subset=['text'])
    # Remove very short URLs; copy so callers can add columns to the result
    # without writing into a slice of the intermediate frame.
    df = df[df['text'].str.len() > 3].copy()

    print(f"Loaded {len(df)} samples from {text_column}")
    print(f"Label distribution: {df['label'].value_counts().to_dict()}")

    return df

# --------------- ENHANCED TEXT PREPROCESSING ----------------
def advanced_clean_text(text):
    """Normalise raw URL/message text into a lowercase string for TF-IDF.

    Steps, in order: HTML-unescape, drop HTML tags, collapse ``https://`` to
    ``http://`` and remove ``www.``, replace IPv4 addresses, e-mail
    addresses, dash-separated 16-digit card numbers and runs of three or
    more digits with placeholder tokens, blank out every character other
    than word characters, whitespace and ``- . / :``, then collapse
    whitespace and lowercase.

    Because the final character filter also removes ``<`` and ``>``, the
    placeholders appear in the result as the bare tokens ``ip``, ``email``,
    ``card`` and ``num``.

    Args:
        text: Raw input. A missing value (per ``pd.isna``) or the empty
            string yields ``''``.

    Returns:
        The cleaned, lowercased string.
    """
    if pd.isna(text) or text == '':
        return ''

    # HTML decode, then strip tags
    text = html.unescape(str(text).strip())
    text = re.sub(r'<[^>]+>', ' ', text)

    # Normalize URLs
    text = re.sub(r'https?://', 'http://', text)
    text = re.sub(r'www\.', '', text)

    # Replace special patterns with placeholders. Order matters: IPs and card
    # numbers must be handled before the generic digit-run rule.
    text = re.sub(r'\b\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}\b', '<IP>', text)
    # The TLD class is [A-Za-z]; the previous [A-Z|a-z] also accepted a
    # literal '|' as part of the TLD.
    text = re.sub(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b', '<EMAIL>', text)
    text = re.sub(r'\b\d{4}-\d{4}-\d{4}-\d{4}\b', '<CARD>', text)
    text = re.sub(r'\b\d{3,}\b', '<NUM>', text)

    # Keep only URL-relevant characters
    text = re.sub(r'[^\w\s\-\.\/\:]', ' ', text)

    # Normalize whitespace
    return re.sub(r'\s+', ' ', text).strip().lower()

# Domains treated as genuinely belonging to each brand; brands not listed
# here fall back to '<brand>.com'.
_OFFICIAL_BRAND_DOMAINS = {
    'google': ['google.com', 'gmail.com', 'googleusercontent.com', 'gstatic.com', 'youtube.com'],
    'apple': ['apple.com', 'icloud.com', 'me.com', 'mac.com'],
    'microsoft': ['microsoft.com', 'outlook.com', 'hotmail.com', 'live.com', 'msn.com', 'office.com'],
    'amazon': ['amazon.com', 'amazonaws.com', 'cloudfront.net'],
    'paypal': ['paypal.com', 'paypalobjects.com'],
    'facebook': ['facebook.com', 'instagram.com', 'whatsapp.com'],
    'netflix': ['netflix.com', 'nflxext.com', 'nflximg.net'],
}

# Brand names looked for inside the network location.
_POPULAR_BRANDS = ['paypal', 'amazon', 'google', 'microsoft', 'apple', 'facebook', 'netflix', 'bank']


def extract_domain_reputation_features(url):
    """Compute heuristic reputation features for the domain of ``url``.

    The URL is parsed with ``urlparse`` (``http://`` is prepended when no
    scheme is present) and the lowercased network location is inspected.

    Args:
        url: URL string, with or without a scheme.

    Returns:
        A dict with the integer features ``domain_age_days``,
        ``is_url_shortener``, ``has_homograph``, ``has_invalid_tld``,
        ``matches_suspicious_pattern``, ``high_entropy_domain``,
        ``brand_impersonation`` and ``has_invalid_chars``. If anything goes
        wrong while parsing or analysing the URL, a fixed dict of neutral
        defaults is returned instead (best-effort behaviour).
    """
    try:
        parsed = urlparse(url if '://' in url else 'http://' + url)
        # Full network location (may include userinfo and port); the '@'
        # checks below rely on the userinfo part being present.
        domain = parsed.netloc.lower()
        # Bare host name without userinfo/port, used only for the TLD.
        host = parsed.hostname or ''

        features = {}

        # Domain age simulation (in real implementation, use WHOIS): a
        # deterministic pseudo-age in 1..3650 derived from an MD5 of the
        # network location. It carries no real-world information.
        digest = hashlib.md5(domain.encode()).hexdigest()
        features['domain_age_days'] = (int(digest[:8], 16) % 3650) + 1

        # Check against suspicious domains (URL shorteners only)
        features['is_url_shortener'] = int(any(
            domain == shortener or domain.endswith('.' + shortener)
            for shortener in SUSPICIOUS_DOMAINS
        ))

        # Homograph detection (simplified): any non-ASCII character
        features['has_homograph'] = int(any(ord(ch) > 127 for ch in domain))

        # Invalid TLD detection. The TLD is taken from the port-free host so
        # that e.g. 'evil.test:8080' is still recognised as '.test'.
        tld = host.rsplit('.', 1)[-1]
        features['has_invalid_tld'] = int(tld in INVALID_TLDS)

        # Suspicious pattern detection
        features['matches_suspicious_pattern'] = int(any(
            re.search(pattern, domain) for pattern in SUSPICIOUS_URL_PATTERNS
        ))

        # Random string detection: many distinct characters, none repeated
        # three or more times (a cheap stand-in for an entropy measure).
        high_entropy = 0
        if len(domain) > 5:
            letters = domain.replace('.', '').replace('-', '')
            counts = {ch: letters.count(ch) for ch in set(letters)}
            if len(counts) > 8 and max(counts.values()) < 3:
                high_entropy = 1
        features['high_entropy_domain'] = high_entropy

        # Brand impersonation detection
        features['brand_impersonation'] = 0
        for brand in _POPULAR_BRANDS:
            if brand not in domain:
                continue

            officials = _OFFICIAL_BRAND_DOMAINS.get(brand, [f'{brand}.com'])
            # Official domain or any of its subdomains (covers 'www.' too)
            if any(domain == official or domain.endswith(f'.{official}') for official in officials):
                continue

            # Only flag non-official domains that also use a typical lure
            lures = (
                f'{brand}-', f'-{brand}', f'secure{brand}', f'{brand}secure',
                f'login{brand}', f'{brand}login', f'{brand}account', f'verify{brand}',
            )
            if any(lure in domain for lure in lures):
                features['brand_impersonation'] = 1
                break

        # Invalid characters in domain
        features['has_invalid_chars'] = int(
            '@' in domain or any(ch in domain for ch in ['<', '>', '"', "'", '\\'])
        )

        return features

    except Exception:
        # Best-effort: a malformed URL must not abort feature extraction for
        # the whole dataset. Unlike a bare ``except:``, this still lets
        # KeyboardInterrupt / SystemExit propagate.
        return {
            'domain_age_days': 1, 'is_url_shortener': 0, 'has_homograph': 0, 'brand_impersonation': 0,
            'has_invalid_tld': 0, 'matches_suspicious_pattern': 0, 'high_entropy_domain': 0, 'has_invalid_chars': 0
        }

# --------------- COMPREHENSIVE URL FEATURES ----------------
def extract_comprehensive_url_features(text):
    """Build the numeric feature dict for one URL or free-text sample.

    The input is treated as a URL when it contains ``http``, ``www.`` or
    ``://``, is a single whitespace-free token containing a dot, or starts
    with something shaped like ``name.tld``. Anything else gets a dict of
    mostly neutral defaults computed from the raw text.

    Args:
        text: The raw sample string.

    Returns:
        A dict of lexical URL features (lengths, counts, flags, ratios)
        merged with the output of ``extract_domain_reputation_features``.
        Both the URL and the non-URL branch return the same set of keys.
    """
    lowered = text.lower()

    # Check if text looks like a URL
    looks_like_url = (
        'http' in lowered
        or 'www.' in lowered
        or '://' in text
        or (text.count('.') >= 1 and len(text.split()) == 1)
        or re.match(r'^[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}', text.strip()) is not None
    )

    if not looks_like_url:
        # Return default values for non-URLs
        digit_count = sum(c.isdigit() for c in text)
        return {
            'url_length': len(text), 'has_ip_address': 0, 'dot_count': text.count('.'),
            'https_flag': 0, 'url_entropy': 0, 'token_count': len(text.split()),
            'subdomain_count': 0, 'query_param_count': 0, 'tld_length': 0,
            'path_length': 0, 'has_hyphen_in_domain': 0, 'number_of_digits': digit_count,
            'tld_popularity': 10, 'suspicious_file_extension': 0, 'domain_name_length': len(text),
            'percentage_numeric_chars': (digit_count / max(1, len(text))) * 100,
            'suspicious_keyword_count': sum(1 for kw in SUSPICIOUS_KEYWORDS if kw in lowered),
            'domain_age_days': 1, 'is_url_shortener': 0, 'has_homograph': 0, 'brand_impersonation': 0,
            'has_invalid_tld': 0, 'matches_suspicious_pattern': 0, 'high_entropy_domain': 0, 'has_invalid_chars': 0,
            'path_depth': 0, 'has_port': 0, 'fragment_length': 0, 'vowel_consonant_ratio': 0
        }

    # Parse URL
    url = text if '://' in text else 'http://' + text
    try:
        parsed = urlparse(url)
    except ValueError:
        # urlparse rejects e.g. malformed IPv6 literals; fall back to a
        # neutral placeholder so the remaining features can be computed.
        parsed = urlparse('http://example.com')

    netloc = parsed.netloc or ''
    path = parsed.path or ''
    query = parsed.query or ''
    fragment = parsed.fragment or ''
    url_lower = url.lower()

    # IP address detection
    has_ip_address = 1 if re.search(r'\b(?:[0-9]{1,3}\.){3}[0-9]{1,3}\b', netloc) else 0

    # Domain analysis
    domain_parts = netloc.split('.')
    subdomain_count = max(0, len(domain_parts) - 2) if len(domain_parts) >= 2 else 0

    # TLD analysis. Use the lowercased, port-free host so that
    # 'Example.COM:8080' is looked up as 'com' instead of 'COM:8080'.
    host = parsed.hostname or ''
    tld = host.rsplit('.', 1)[-1]
    # Unknown but potentially valid TLDs default to 50
    tld_popularity = LEGITIMATE_TLDS.get(tld, 50)

    # Security features
    https_flag = 1 if parsed.scheme == 'https' else 0
    has_port = 1 if ':' in netloc and not netloc.endswith(':80') and not netloc.endswith(':443') else 0

    # Complexity: share of distinct characters, scaled to 0..10 (a rough
    # proxy, not a true Shannon entropy).
    url_entropy = (len(set(url)) / max(1, len(url))) * 10

    # Path analysis
    path_segments = [segment for segment in path.split('/') if segment]

    # Query parameters
    query_params = parse_qs(query)

    # Suspicious patterns
    number_of_digits = sum(c.isdigit() for c in url)
    percentage_numeric_chars = (number_of_digits / max(1, len(url))) * 100

    # File extension check: test the path as well as the whole URL so that a
    # query string or fragment after 'payload.exe' does not hide it.
    suspicious_extensions = ('.exe', '.zip', '.rar', '.bat', '.scr', '.php', '.asp', '.jsp')
    suspicious_file_extension = int(
        url_lower.endswith(suspicious_extensions) or path.lower().endswith(suspicious_extensions)
    )

    # Content analysis
    suspicious_keyword_count = sum(1 for keyword in SUSPICIOUS_KEYWORDS if keyword in url_lower)

    # Token analysis
    all_tokens = path_segments + list(query_params.keys()) + domain_parts
    token_count = len([token for token in all_tokens if token])

    # Linguistic features
    letters = [c for c in netloc if c.isalpha()]
    vowels = sum(1 for c in letters if c.lower() in 'aeiou')
    consonants = len(letters) - vowels

    # Combine all features; reputation features are appended last
    return {
        'url_length': len(url),
        'has_ip_address': has_ip_address,
        'dot_count': netloc.count('.'),
        'https_flag': https_flag,
        'url_entropy': url_entropy,
        'token_count': token_count,
        'subdomain_count': subdomain_count,
        'query_param_count': len(query_params),
        'tld_length': len(tld),
        'path_length': len(path),
        'has_hyphen_in_domain': 1 if '-' in netloc else 0,
        'number_of_digits': number_of_digits,
        'tld_popularity': tld_popularity,
        'suspicious_file_extension': suspicious_file_extension,
        'domain_name_length': len(netloc),
        'percentage_numeric_chars': percentage_numeric_chars,
        'suspicious_keyword_count': suspicious_keyword_count,
        'path_depth': len(path_segments),
        'has_port': has_port,
        'fragment_length': len(fragment),
        'vowel_consonant_ratio': vowels / max(1, consonants),
        **extract_domain_reputation_features(url)
    }

# --------------- MAIN PROCESSING ----------------
# Domains whose exact host or subdomains skip the shortener / impersonation
# rules and get a reduced rule score.
_KNOWN_SAFE_DOMAINS = (
    'microsoft.com', 'apple.com', 'google.com', 'amazon.com', 'paypal.com',
    'facebook.com', 'instagram.com', 'twitter.com', 'linkedin.com', 'netflix.com',
    'github.com', 'stackoverflow.com', 'wikipedia.org', 'reddit.com', 'youtube.com',
    'gmail.com', 'outlook.com', 'hotmail.com', 'yahoo.com', 'icloud.com',
)

# Domains that raise the rule-based phishing threshold from 40 to 50.
_KNOWN_LEGIT_DOMAINS = (
    'instagram.com', 'apple.com', 'paypal.com', 'dropbox.com', 'google.com',
    'amazon.com', 'microsoft.com', 'facebook.com', 'twitter.com', 'netflix.com',
    'github.com', 'stackoverflow.com', 'wikipedia.org', 'reddit.com',
)

# Domains that are suspicious when merely *mentioned* next to pressure language.
_MENTIONED_LEGIT_DOMAINS = (
    'instagram.com', 'apple.com', 'paypal.com', 'dropbox.com', 'google.com',
    'amazon.com', 'microsoft.com', 'facebook.com', 'twitter.com', 'netflix.com',
)

_SOCIAL_ENGINEERING_PHRASES = (
    "you've won", 'claim now', 'gift card', 'winner', 'congratulations',
    'has been compromised', 'suspended', 'verify now', 'update immediately',
    'avoid suspension', 'account locked', 'security alert', 'expired',
    'click here', 'act now', 'limited time', 'urgent', 'immediate action',
)

_URGENCY_WORDS = ('immediate', 'urgent', 'now', 'quickly', 'asap', 'expire', 'suspend')

_FAKE_DOMAIN_PATTERNS = (
    r'apple.*-.*\.', r'paypal.*-.*\.', r'amazon.*-.*\.', r'google.*-.*\.',
    r'microsoft.*-.*\.', r'facebook.*-.*\.', r'netflix.*-.*\.',
    r'.*-apple.*\.', r'.*-paypal.*\.', r'.*-amazon.*\.', r'.*-google.*\.',
)


def _hostname_of(text):
    """Return the lowercased host of ``text`` (no userinfo/port), or ''."""
    try:
        return urlparse(text if '://' in text else 'http://' + text).hostname or ''
    except ValueError:
        return ''


def _is_domain_or_subdomain(host, domains):
    """Return True if ``host`` equals, or is a subdomain of, any of ``domains``."""
    return any(host == domain or host.endswith('.' + domain) for domain in domains)


def _rule_based_assessment(text, url_features):
    """Score ``text`` with hand-written rules.

    Returns a ``(risk_score, flags, phishing_threshold)`` tuple: the summed
    rule weights, the human-readable reasons, and the score at or above which
    the rules alone classify the input as phishing.
    """
    score = 0
    flags = []

    if url_features.get('has_invalid_tld', 0):
        score += 50
        flags.append("Invalid/suspicious TLD detected")

    if url_features.get('matches_suspicious_pattern', 0):
        score += 40
        flags.append("Suspicious character pattern detected")

    if url_features.get('high_entropy_domain', 0):
        score += 30
        flags.append("Random-looking domain name")

    if url_features.get('has_invalid_chars', 0):
        score += 60
        flags.append("Invalid characters in URL (@ symbol detected)")

    # WHITELIST CHECK. Matching is done on the parsed host with an exact or
    # dot-suffix comparison. Deleting 'www.' anywhere in the host (as before)
    # let 'paywww.pal.com' collapse into 'paypal.com'.
    host = _hostname_of(text)
    is_whitelisted = _is_domain_or_subdomain(host, _KNOWN_SAFE_DOMAINS)

    if is_whitelisted:
        # Reduce false positive score for known legitimate domains
        score = max(0, score - 50)
    else:
        if url_features.get('is_url_shortener', 0):
            score += 25
            flags.append("URL shortener detected")

        if url_features.get('brand_impersonation', 0):
            score += 45
            flags.append("Potential brand impersonation")

    # CONTENT-BASED ANALYSIS (for social engineering detection)
    text_lower = text.lower()

    social_eng_count = sum(1 for phrase in _SOCIAL_ENGINEERING_PHRASES if phrase in text_lower)
    if social_eng_count >= 2:
        score += 40
        flags.append(f"Social engineering language detected ({social_eng_count} phrases)")
    elif social_eng_count == 1:
        score += 20
        flags.append("Potential social engineering language")

    urgency_count = sum(1 for word in _URGENCY_WORDS if word in text_lower)
    if urgency_count >= 2:
        score += 25
        flags.append("High urgency language (pressure tactics)")

    # Fake domain with legitimate brand + hyphen
    if any(re.search(pattern, text_lower) for pattern in _FAKE_DOMAIN_PATTERNS):
        score += 60
        flags.append("Fake domain impersonating legitimate brand")

    # Legitimate domain mentioned in suspicious context
    mentions_legit_domain = any(domain in text_lower for domain in _MENTIONED_LEGIT_DOMAINS)
    if mentions_legit_domain and (social_eng_count >= 1 or urgency_count >= 1):
        score += 35
        flags.append("Legitimate domain used in suspicious context (possible impersonation)")

    if url_features.get('suspicious_keyword_count', 0) > 2:
        score += 25
        flags.append(f"Multiple suspicious keywords ({url_features['suspicious_keyword_count']})")

    if url_features.get('has_ip_address', 0):
        score += 40
        flags.append("IP address instead of domain name")

    # Very low TLD popularity - but only for truly invalid TLDs
    if url_features.get('tld_popularity', 100) < 20 and url_features.get('has_invalid_tld', 0):
        score += 30
        flags.append("Uncommon/suspicious top-level domain")

    # Known legitimate hosts get a higher bar. A substring test here would
    # also grant it to look-alikes such as 'paypal.com.evil.ru', so the host
    # must equal the domain or be one of its subdomains.
    phishing_threshold = 50 if _is_domain_or_subdomain(host, _KNOWN_LEGIT_DOMAINS) else 40

    return score, flags, phishing_threshold


def main():
    """Train, evaluate and save the phishing model, then start a prompt loop.

    Pipeline: load the dataset, clean the text, extract URL features, split
    into train/test, fit word- and character-level TF-IDF vectorizers and a
    RobustScaler on the training split only, train XGBoost (or RandomForest
    when XGBoost is unavailable), print evaluation metrics, dump all fitted
    artefacts with joblib, and finally classify user-entered strings until
    the user exits.
    """
    # Load data
    df = load_and_validate_data(DATA_FILE)

    # Enhanced text preprocessing
    print("Applying enhanced text preprocessing...")
    df['text_cleaned'] = df['text'].apply(advanced_clean_text)

    # Extract comprehensive URL features
    print("Extracting comprehensive URL features...")
    url_features_df = pd.DataFrame(
        [extract_comprehensive_url_features(text) for text in df['text'].values]
    )
    feature_columns = list(url_features_df.columns)

    # Combine with original dataframe
    for col in feature_columns:
        df[col] = url_features_df[col].values

    y = df['label'].values

    # Split BEFORE fitting any transformer so that vocabulary, IDF weights
    # and scaling statistics are learned from training rows only; fitting on
    # the full dataset leaks test information into the features.
    train_idx, test_idx = train_test_split(
        np.arange(len(df)), test_size=TEST_SIZE, stratify=y, random_state=RANDOM_STATE
    )
    train_df, test_df = df.iloc[train_idx], df.iloc[test_idx]
    y_train, y_test = y[train_idx], y[test_idx]

    # Enhanced TF-IDF vectorization
    print("Creating enhanced TF-IDF features...")

    # Word-level TF-IDF with custom preprocessing
    vect_word = TfidfVectorizer(
        analyzer='word',
        ngram_range=(1, 3),  # Include trigrams
        max_features=5000,
        min_df=2,
        max_df=0.95,
        stop_words=None,  # Keep all words for URLs
        lowercase=True,
        token_pattern=r'\b\w+\b'
    )

    # Character-level TF-IDF
    vect_char = TfidfVectorizer(
        analyzer='char_wb',
        ngram_range=(2, 6),  # Wider range
        max_features=4000,
        min_df=2,
        max_df=0.95
    )

    # Scale URL features using RobustScaler (better for outliers)
    scaler = RobustScaler()

    def build_matrix(frame, fit):
        """Stack word TF-IDF, char TF-IDF and scaled URL features for ``frame``."""
        if fit:
            word_part = vect_word.fit_transform(frame['text_cleaned'])
            char_part = vect_char.fit_transform(frame['text_cleaned'])
            url_part = scaler.fit_transform(frame[feature_columns].values)
        else:
            word_part = vect_word.transform(frame['text_cleaned'])
            char_part = vect_char.transform(frame['text_cleaned'])
            url_part = scaler.transform(frame[feature_columns].values)
        return hstack([word_part, char_part, csr_matrix(url_part)]).tocsr()

    X_train = build_matrix(train_df, fit=True)
    X_test = build_matrix(test_df, fit=False)

    print(f"Feature matrix shape: {(X_train.shape[0] + X_test.shape[0], X_train.shape[1])}")
    print(f"Label distribution: {np.bincount(y)}")

    # Enhanced model training
    print("Training enhanced model...")

    # Compute class weights; for 'balanced' weights the ratio w1/w0 equals
    # n_negative / n_positive, which is what scale_pos_weight expects.
    class_weights = compute_class_weight('balanced', classes=np.unique(y_train), y=y_train)
    scale_pos_weight = class_weights[1] / class_weights[0]

    if XGB_AVAILABLE:
        xgb_params = {
            'n_estimators': 500,
            'max_depth': 8,
            'learning_rate': 0.1,
            'subsample': 0.8,
            'colsample_bytree': 0.8,
            'scale_pos_weight': scale_pos_weight,
            'random_state': RANDOM_STATE,
            'eval_metric': 'logloss',
            'use_label_encoder': False,
            'n_jobs': -1
        }
        clf = XGBClassifier(**xgb_params)
    else:
        # Random Forest as fallback
        clf = RandomForestClassifier(
            n_estimators=500,
            max_depth=20,
            class_weight='balanced',
            random_state=RANDOM_STATE,
            n_jobs=-1
        )

    # Fit model
    clf.fit(X_train, y_train)

    # Evaluation
    print("\nModel Evaluation:")
    y_pred = clf.predict(X_test)
    y_prob = clf.predict_proba(X_test)[:, 1] if hasattr(clf, "predict_proba") else None

    print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
    if y_prob is not None:
        print(f"ROC AUC: {roc_auc_score(y_test, y_prob):.4f}")

    print("\nClassification Report:")
    print(classification_report(y_test, y_pred))

    print("\nConfusion Matrix:")
    print(confusion_matrix(y_test, y_pred))

    # Save models
    print("\nSaving models...")
    joblib.dump(clf, OUT_MODEL)
    joblib.dump(vect_word, OUT_VECT_WORD)
    joblib.dump(vect_char, OUT_VECT_CHAR)
    joblib.dump(scaler, OUT_SCALER)

    # Save feature columns and suspicious domains
    joblib.dump(feature_columns, 'feature_columns.pkl')
    joblib.dump(SUSPICIOUS_DOMAINS, OUT_SUSPICIOUS_DOMAINS)

    print("Models saved successfully!")

    def enhanced_classify_input(text, threshold=0.3):  # Lower threshold for better detection
        """Classify ``text`` using the rules first and the trained model second.

        Returns ``(prediction, probability, flags)`` where prediction is
        ``"PHISHING"`` or ``"LEGITIMATE"``.
        """
        text_cleaned = advanced_clean_text(text)
        url_features = extract_comprehensive_url_features(text)

        high_risk_score, suspicious_flags, phishing_threshold = _rule_based_assessment(text, url_features)

        # Rule-based override: a high risk score means phishing regardless of ML
        if high_risk_score >= phishing_threshold:
            prob = min(0.95, 0.5 + (high_risk_score / 150))  # More aggressive probability scaling
            return "PHISHING", prob, suspicious_flags

        # If no major red flags, proceed with ML classification
        try:
            url_feature_vals = np.array([url_features[col] for col in feature_columns]).reshape(1, -1)
            X_new = hstack([
                vect_word.transform([text_cleaned]),
                vect_char.transform([text_cleaned]),
                csr_matrix(scaler.transform(url_feature_vals)),
            ])

            if hasattr(clf, "predict_proba"):
                ml_prob = clf.predict_proba(X_new)[0, 1]

                # Boost the model probability by the rule score, capped at 99%
                prob = min(0.99, ml_prob + (high_risk_score / 200))
                pred = "PHISHING" if prob >= threshold else "LEGITIMATE"
            else:
                pred_val = clf.predict(X_new)[0]
                pred = "PHISHING" if pred_val == 1 else "LEGITIMATE"
                prob = pred_val

        except Exception as e:
            # Fallback to rule-based if ML fails
            pred = "PHISHING" if high_risk_score >= 20 else "LEGITIMATE"
            prob = high_risk_score / 100
            suspicious_flags.append(f"ML classification failed: {str(e)}")

        return pred, prob, suspicious_flags

    # Interactive mode
    print("\n" + "="*50)
    print("ENHANCED PHISHING DETECTION SYSTEM")
    print("="*50)
    print("Enter URLs or text to analyze (type 'exit' to quit)")

    while True:
        try:
            user_input = input("\nEnter URL/text: ").strip()
            if user_input.lower() in ['exit', 'quit', 'q']:
                break

            if not user_input:
                continue

            pred, prob, flags = enhanced_classify_input(user_input)

            print(f"\nPrediction: {pred}")
            print(f"Phishing Probability: {prob:.3f}")

            if flags:
                print("Suspicious Indicators:")
                for flag in flags:
                    print(f"  • {flag}")

            # Risk level
            if prob >= 0.7:
                risk = "HIGH RISK ⚠️"
            elif prob >= 0.5:
                risk = "HIGH RISK"
            elif prob >= 0.3:
                risk = "MEDIUM RISK"
            else:
                risk = "LOW RISK"

            print(f"Risk Level: {risk}")

        except (KeyboardInterrupt, EOFError):
            # EOFError: stdin was closed (non-interactive run, piped input).
            # Treating it like any other error would loop forever, because
            # every following input() call raises it again.
            print("\nExiting...")
            break
        except Exception as e:
            print(f"Error processing input: {e}")

# Run the full train / evaluate / save / interactive-prompt pipeline only when
# this file is executed as a script, not when it is imported.
if __name__ == "__main__":
    main()

Loaded 60054 samples from text
Label distribution: {1: 43918, 0: 16136}
Applying enhanced text preprocessing...
Extracting comprehensive URL features...
Creating enhanced TF-IDF features...
Feature matrix shape: (60054, 8554)
Label distribution: [16136 43918]
Training enhanced model...

Model Evaluation:
Accuracy: 1.0000
ROC AUC: 1.0000

Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      4841
           1       1.00      1.00      1.00     13176

    accuracy                           1.00     18017
   macro avg       1.00      1.00      1.00     18017
weighted avg       1.00      1.00      1.00     18017


Confusion Matrix:
[[ 4841     0]
 [    0 13176]]

Saving models...
Models saved successfully!

ENHANCED PHISHING DETECTION SYSTEM
Enter URLs or text to analyze (type 'exit' to quit)
