In [None]:
import os, html, re, joblib
import numpy as np
import pandas as pd
from urllib.parse import urlparse
from scipy.sparse import hstack, csr_matrix
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, accuracy_score, roc_auc_score, confusion_matrix

try:
    from xgboost import XGBClassifier
    XGB_AVAILABLE = True
except ImportError:
    from sklearn.ensemble import RandomForestClassifier
    XGB_AVAILABLE = False

# ---------------- CONFIG ----------------
# Path of the labelled CSV. The default is the original author's local path;
# set the PHISH_DATA_FILE environment variable to point at another copy
# without editing this file.
DATA_FILE = os.environ.get(
    'PHISH_DATA_FILE',
    '/Users/yahyamohnd/Downloads/Phishing_dataset_full_large.csv',
)
# Output artefacts written with joblib at the end of the run.
OUT_MODEL = 'xgb_phish_model.pkl'
OUT_VECT_WORD = 'tfidf_word.pkl'
OUT_VECT_CHAR = 'tfidf_char.pkl'
OUT_SCALER = 'url_scaler.pkl'
# Fraction of rows held out for evaluation, and the seed shared by the split
# and the classifier.
TEST_SIZE = 0.4
RANDOM_STATE = 42

# --------------- Load Data ----------------
# Reads DATA_FILE into `df` and guarantees two columns for the rest of the
# script: 'text' (string input for the vectorizers) and 'label' (int 0/1).
if not os.path.exists(DATA_FILE):
    raise FileNotFoundError(f"Data file not found: {DATA_FILE}")

df = pd.read_csv(DATA_FILE)
if 'text' not in df.columns:
    if 'URL' in df.columns:
        source_col = 'URL'
    else:
        # Fall back to the first string-typed column, but never the label
        # column itself (string labels would otherwise be used as the text).
        text_cols = [c for c in df.columns
                     if c != 'label' and df[c].dtype == object]
        if not text_cols:
            raise ValueError("No text column found")
        source_col = text_cols[0]
    # fillna('') first: astype(str) alone turns missing values into the
    # literal string 'nan', which would then be vectorized as a real token.
    df['text'] = df[source_col].fillna('').astype(str)

if 'label' not in df.columns:
    raise ValueError("CSV must contain a 'label' column with 0/1")
if df['label'].isna().any():
    # astype(int) would fail here with an opaque cast error.
    raise ValueError("CSV 'label' column contains missing values; expected 0/1")
df['label'] = df['label'].astype(int)
unexpected_labels = set(df['label'].unique()) - {0, 1}
if unexpected_labels:
    # The training code below counts y==1 and y==0 only.
    raise ValueError(
        "CSV 'label' column must contain only 0/1, found: "
        f"{sorted(int(v) for v in unexpected_labels)}"
    )

# --------------- Text Cleaning ----------------
def clean_text(s):
    """Normalize one raw message/URL string for the TF-IDF vectorizers.

    Missing values (anything ``pd.isna`` accepts) become the empty string.
    Otherwise the input is HTML-unescaped and lowercased, then rewritten by
    the substitution table below, applied in order, and finally stripped.
    """
    if pd.isna(s):
        return ''

    cleaned = html.unescape(str(s)).lower()
    # Order matters: tags are removed before URLs are replaced, and the
    # punctuation pass keeps '<' and '>' so the placeholders survive it.
    substitutions = (
        (r'<[^>]+>', ' '),                # drop HTML tags
        (r'http\S+|www\.\S+', '<URL>'),   # collapse URLs to a placeholder
        (r'\d+', '<NUM>'),                # collapse digit runs
        (r'[^\w\s<>]', ' '),              # remaining punctuation -> space
        (r'\s+', ' '),                    # squeeze whitespace
    )
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned.strip()

# Keep the uncleaned text before it is overwritten. clean_text() replaces
# every URL with a placeholder and strips punctuation, so URL features
# computed from the cleaned column would be all zeros during training while
# classify_input() computes them from raw text at prediction time.
raw_text = df['text'].copy()
df['text'] = df['text'].apply(clean_text)

# --------------- URL Features ----------------
url_feature_cols = [
    'url_length','has_ip_address','dot_count','https_flag','url_entropy','token_count',
    'subdomain_count','query_param_count','tld_length','path_length','has_hyphen_in_domain',
    'number_of_digits','tld_popularity','suspicious_file_extension','domain_name_length','percentage_numeric_chars'
]

# First URL-looking token in free text: "<scheme>://..." or "www....".
_URL_IN_TEXT_RE = re.compile(
    r'\b[a-z][a-z0-9+.\-]{0,30}://\S+|\bwww\.\S+', re.IGNORECASE
)
# Punctuation that commonly trails a URL quoted inside a sentence.
_TRAILING_PUNCTUATION = '.,;:!?\'")]}>'
_TLD_POPULARITY = {'com': 1000, 'org': 300, 'net': 200, 'io': 100, 'co': 150, 'info': 50}
_SUSPICIOUS_EXTENSIONS = ('.exe', '.zip', '.php', '.asp')


def extract_url_features(s):
    """Return a dict with one numeric value per name in ``url_feature_cols``.

    ``s`` is raw (uncleaned) text. The first URL found in it is analysed, so
    a URL embedded in a sentence is handled the same way as a bare URL. If
    no URL is found, a single token starting with ``http``/``www`` is still
    treated as a URL (``http://`` is prepended when there is no scheme).
    Anything else, including missing values and URLs that ``urlparse``
    rejects, yields all-zero features.

    Host-derived checks (IP detection, TLD) use the parsed hostname so a
    port or ``user:pass@`` prefix does not distort them. The remaining
    domain features are computed on the full netloc.
    """
    zero_features = dict.fromkeys(url_feature_cols, 0)
    if not isinstance(s, str):
        s = '' if pd.isna(s) else str(s)
    s = s.strip()

    found = _URL_IN_TEXT_RE.search(s)
    if found:
        candidate = found.group(0).rstrip(_TRAILING_PUNCTUATION)
    elif s.lower().startswith(('http', 'www')):
        candidate = s.split()[0]
    else:
        return zero_features

    full = candidate if '://' in candidate else 'http://' + candidate
    try:
        p = urlparse(full)
        host = (p.hostname or '').rstrip('.')
    except ValueError:
        # urlparse rejects some malformed netlocs (e.g. unbalanced IPv6
        # brackets); do not abort the whole run for one bad row.
        return zero_features

    netloc = p.netloc
    path = p.path or ''
    query = p.query or ''

    host_parts = host.split('.')
    has_ip_address = 1 if len(host_parts) == 4 and all(pt.isdigit() for pt in host_parts) else 0
    tld = host_parts[-1] if '.' in host else ''

    number_of_digits = sum(c.isdigit() for c in full)
    path_and_query = path + ('?' + query if query else '')
    lowered_full = full.lower()
    # The path is checked as well so a query string cannot hide the extension.
    has_suspicious_ext = (lowered_full.endswith(_SUSPICIOUS_EXTENSIONS)
                          or path.lower().endswith(_SUSPICIOUS_EXTENSIONS))

    return {
        "url_length": len(full),
        "has_ip_address": has_ip_address,
        "dot_count": netloc.count('.') + path.count('.'),
        "https_flag": 1 if p.scheme == 'https' else 0,
        # Share of distinct characters scaled to 0-10 (a cheap diversity
        # score, not Shannon entropy).
        "url_entropy": round((len(set(full)) / (len(full) + 1)) * 10, 3),
        "token_count": max(1, len([t for t in path_and_query.split('/') if t])),
        "subdomain_count": max(0, netloc.count('.') - 1),
        # Number of non-empty '&'-separated parameters in the query string.
        "query_param_count": len([kv for kv in query.split('&') if kv]),
        "tld_length": len(tld),
        "path_length": len(path),
        "has_hyphen_in_domain": 1 if '-' in netloc else 0,
        "number_of_digits": number_of_digits,
        "tld_popularity": _TLD_POPULARITY.get(tld, 10),
        "suspicious_file_extension": 1 if has_suspicious_ext else 0,
        "domain_name_length": len(netloc),
        "percentage_numeric_chars": round((number_of_digits / (len(full) + 1)) * 100, 3),
    }

# Compute missing URL features
missing_features = [c for c in url_feature_cols if c not in df.columns]
if missing_features:
    # One dict per row -> a single DataFrame. `columns=` keeps the lookup
    # below valid even when the frame has no rows.
    feats_df = pd.DataFrame(
        [extract_url_features(t) for t in raw_text],
        columns=url_feature_cols,
    )
    for c in missing_features:
        df[c] = feats_df[c].values
existing_url_features = url_feature_cols

# ---------------- Feature Matrix ----------------
vect_word = TfidfVectorizer(analyzer='word', ngram_range=(1,2), max_features=4000)
vect_char = TfidfVectorizer(analyzer='char_wb', ngram_range=(3,5), max_features=3000)

y = df['label'].values

# Split row positions first, so the vectorizers learn their vocabulary and
# IDF weights from the training rows only. Fitting them on every row lets
# the held-out rows influence the features and inflates the test metrics.
train_idx, test_idx = train_test_split(np.arange(len(df)), test_size=TEST_SIZE,
                                       stratify=y, random_state=RANDOM_STATE)

train_text = df['text'].iloc[train_idx]
vect_word.fit(train_text)
vect_char.fit(train_text)

Xw = vect_word.transform(df['text'])
Xc = vect_char.transform(df['text'])
# The URL-feature scaler is intentionally still fitted on all rows: the
# scaler that is saved and the one used by classify_input() are both fitted
# on the full frame, and training must use the same scaling as prediction.
X_url_scaled = StandardScaler().fit_transform(df[existing_url_features].values)
# CSR so the matrix can be row-indexed with the split positions.
X = hstack([Xw, Xc, csr_matrix(X_url_scaled)], format='csr')

X_train, X_test = X[train_idx], X[test_idx]
y_train, y_test = y[train_idx], y[test_idx]

# ---------------- Train ----------------
# Weight the positive class by the negative/positive ratio of the training
# labels (1.0 when there are no positives, to avoid dividing by zero).
pos = (y_train==1).sum(); neg = (y_train==0).sum()
scale_pos_weight = neg/pos if pos>0 else 1.0
if XGB_AVAILABLE:
    # `use_label_encoder` is no longer passed: XGBoost deprecated the
    # argument and current releases only warn that it is unused.
    clf = XGBClassifier(n_estimators=300,max_depth=6,learning_rate=0.1,
                        eval_metric='logloss',
                        scale_pos_weight=scale_pos_weight,random_state=RANDOM_STATE,n_jobs=-1)
else:
    # Local import keeps this branch usable even if XGB_AVAILABLE was set
    # to False by hand while xgboost is installed.
    from sklearn.ensemble import RandomForestClassifier
    clf = RandomForestClassifier(n_estimators=300,class_weight='balanced',random_state=RANDOM_STATE,n_jobs=-1)
clf.fit(X_train, y_train)

# ---------------- Evaluation ----------------
# Hard predictions for the report, plus positive-class probabilities for
# ROC AUC when the classifier exposes predict_proba.
y_pred = clf.predict(X_test)
y_prob = None
if hasattr(clf, "predict_proba"):
    y_prob = clf.predict_proba(X_test)[:, 1]

print("Accuracy:", accuracy_score(y_test, y_pred))
if y_prob is not None:
    print("ROC AUC:", roc_auc_score(y_test, y_prob))
print(classification_report(y_test, y_pred))
print("Confusion matrix:\n", confusion_matrix(y_test, y_pred))

# ---------------- Save ----------------
# Persist the classifier and both fitted vectorizers, in that order.
for _artifact, _target in (
    (clf, OUT_MODEL),
    (vect_word, OUT_VECT_WORD),
    (vect_char, OUT_VECT_CHAR),
):
    joblib.dump(_artifact, _target)
# The scaler written to disk is a fresh one fitted on the full URL-feature
# table of `df`.
joblib.dump(StandardScaler().fit(df[existing_url_features].values), OUT_SCALER)

# ---------------- Interactive ----------------
_URL_SCALER_CACHE = []


def _get_url_scaler():
    """Return the URL-feature scaler, fitting it on `df` on first use only."""
    if not _URL_SCALER_CACHE:
        _URL_SCALER_CACHE.append(
            StandardScaler().fit(df[existing_url_features].values)
        )
    return _URL_SCALER_CACHE[0]


def classify_input(text, threshold=0.5):
    """Classify one raw message or URL.

    Builds the same three feature groups used for training: word TF-IDF and
    char TF-IDF of the cleaned text, and scaled URL features of the raw
    text. The result is then scored with the trained classifier.

    Args:
        text: raw user input (message body or URL).
        threshold: score at or above which the input is called "Phishing".

    Returns:
        ``(pred, prob)`` where ``pred`` is ``"Phishing"`` or ``"Legit"``.
        ``prob`` is the positive-class probability, or the predicted label
        itself if the classifier has no ``predict_proba``.
    """
    txt_clean = clean_text(text)
    v_w = vect_word.transform([txt_clean])
    v_c = vect_char.transform([txt_clean])

    # Extract once and read every column from the same dict, instead of
    # re-parsing the input once per feature.
    feats = extract_url_features(text)
    vals = np.array([feats[c] for c in existing_url_features], dtype=float).reshape(1, -1)
    # The scaler is fitted once and reused; refitting it on the whole
    # frame for every prediction gives the same statistics at far higher cost.
    vals_scaled = _get_url_scaler().transform(vals)

    X_in = hstack([v_w, v_c, csr_matrix(vals_scaled)])
    if hasattr(clf, "predict_proba"):
        prob = clf.predict_proba(X_in)[:, 1][0]
    else:
        prob = clf.predict(X_in)[0]
    pred = "Phishing" if prob >= threshold else "Legit"
    return pred, prob

if __name__=="__main__":
    # Simple read-classify-print loop; ends on 'exit', end of input, or Ctrl-C.
    print("\nInteractive mode — enter text or URL (type 'exit'):")
    while True:
        try:
            s = input("Enter text/URL: ").strip()
        except (EOFError, KeyboardInterrupt):
            # stdin was closed or the user interrupted: leave the loop
            # quietly instead of ending the session with a traceback.
            print()
            break
        if s.lower()=='exit': break
        if not s:
            # Nothing to classify on a blank line; prompt again.
            continue
        pred, prob = classify_input(s)
        print(f"Prediction: {pred} | Prob(phish) = {prob:.3f}")

Accuracy: 1.0
ROC AUC: 1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     20000
           1       1.00      1.00      1.00     20000

    accuracy                           1.00     40000
   macro avg       1.00      1.00      1.00     40000
weighted avg       1.00      1.00      1.00     40000

Confusion matrix:
 [[20000     0]
 [    0 20000]]

Interactive mode — enter text or URL (type 'exit'):


Enter text/URL:  https://www.amazon.com


Prediction: Legit | Prob(phish) = 0.000


Enter text/URL:  http://amazon.verify-user.com


Prediction: Phishing | Prob(phish) = 1.000


Enter text/URL:  Update your PayPal account immediately to avoid suspension: http://paypal-security-update.com


Prediction: Legit | Prob(phish) = 0.300


Enter text/URL:   "You've won a $1000 gift card! Click to claim: http://free-giftcard-win.com",


Prediction: Legit | Prob(phish) = 0.220


Enter text/URL:          "Alert: Unusual activity detected on your Gmail account. Verify: http://gmail-security-check.com"


Prediction: Phishing | Prob(phish) = 0.687


Enter text/URL:          "Your Amazon account has been locked. Verify here: http://amazon.verify-user.com",


Prediction: Phishing | Prob(phish) = 0.520


Enter text/URL:          "Update your PayPal account immediately to avoid suspension: http://paypal-security-update.com",


Prediction: Legit | Prob(phish) = 0.300


Enter text/URL:  http://paypal-security-update.com


Prediction: Phishing | Prob(phish) = 1.000


Enter text/URL:  https://www.nytimes.com


Prediction: Legit | Prob(phish) = 0.000


Enter text/URL:  http://netflix.account-update.com


Prediction: Phishing | Prob(phish) = 1.000


Enter text/URL:  Update your PayPal account immediately to avoid suspension: http://paypal-security-update.com


Prediction: Legit | Prob(phish) = 0.300


Enter text/URL:  You've won a $1000 gift card! Click to claim: http://free-giftcard-win.com


Prediction: Legit | Prob(phish) = 0.217


Enter text/URL:  http://gmail-security-check.com


Prediction: Phishing | Prob(phish) = 1.000


Enter text/URL:  http://yanya-hellp.ned


Prediction: Phishing | Prob(phish) = 0.503


Enter text/URL:  https://dz.iq


Prediction: Legit | Prob(phish) = 0.003


Enter text/URL:  http://yaya-lasmdf-dfbjnad/.das


Prediction: Phishing | Prob(phish) = 0.743


Enter text/URL:  https://hell0.cds


Prediction: Legit | Prob(phish) = 0.187


Enter text/URL:  https://dasda


Prediction: Legit | Prob(phish) = 0.020


Enter text/URL:  https


Prediction: Legit | Prob(phish) = 0.260


Enter text/URL:  https://alskndlasndl.asdaksn


Prediction: Legit | Prob(phish) = 0.263


Enter text/URL:  http://alskndlasndl.asdaksn


Prediction: Legit | Prob(phish) = 0.470


Enter text/URL:  https://alskndlasndl-sd.sad


Prediction: Phishing | Prob(phish) = 0.620


Enter text/URL:  https://hello-yahya.cpm


Prediction: Legit | Prob(phish) = 0.327


Enter text/URL:  https://hello-yahya.cpm


Prediction: Legit | Prob(phish) = 0.327
