<a href="https://colab.research.google.com/github/Ashwathi1901/Link-Guardian/blob/main/notebooks/URL_dataset_feature_extraction_%26_training_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import pandas as pd

# Step 1: read the dataset CSV into a DataFrame
df = pd.read_csv("Phishing_Legitimate_full.csv")

# Step 2: print a quick overview, in this order:
#   (rows, columns) tuple, the column index, and the first rows
for overview in (df.shape, df.columns, df.head()):
    print(overview)


(10000, 50)
Index(['id', 'NumDots', 'SubdomainLevel', 'PathLevel', 'UrlLength', 'NumDash',
       'NumDashInHostname', 'AtSymbol', 'TildeSymbol', 'NumUnderscore',
       'NumPercent', 'NumQueryComponents', 'NumAmpersand', 'NumHash',
       'NumNumericChars', 'NoHttps', 'RandomString', 'IpAddress',
       'DomainInSubdomains', 'DomainInPaths', 'HttpsInHostname',
       'HostnameLength', 'PathLength', 'QueryLength', 'DoubleSlashInPath',
       'NumSensitiveWords', 'EmbeddedBrandName', 'PctExtHyperlinks',
       'PctExtResourceUrls', 'ExtFavicon', 'InsecureForms',
       'RelativeFormAction', 'ExtFormAction', 'AbnormalFormAction',
       'PctNullSelfRedirectHyperlinks', 'FrequentDomainNameMismatch',
       'FakeLinkInStatusBar', 'RightClickDisabled', 'PopUpWindow',
       'SubmitInfoToEmail', 'IframeOrFrame', 'MissingTitle',
       'ImagesOnlyInForm', 'SubdomainLevelRT', 'UrlLengthRT',
       'PctExtResourceUrlsRT', 'AbnormalExtFormActionR', 'ExtMetaScriptLinkRT',
       'PctExtNullSelfRe

In [3]:
# Identifier columns; they are left out of the feature list built further down
id_cols = ["id"]

# Name of the column holding the target label
label_col = "CLASS_LABEL"

In [4]:
# Every column that is neither the label nor an identifier is used as a feature;
# the original column order of df is preserved.
non_feature_cols = {label_col, *id_cols}
feature_cols = [col for col in df.columns if col not in non_feature_cols]

In [5]:
# Report how many feature columns were selected and preview the first ten
n_features = len(feature_cols)
feature_preview = feature_cols[:10]
print("Number of features:", n_features)
print("First few features:", feature_preview)

Number of features: 48
First few features: ['NumDots', 'SubdomainLevel', 'PathLevel', 'UrlLength', 'NumDash', 'NumDashInHostname', 'AtSymbol', 'TildeSymbol', 'NumUnderscore', 'NumPercent']


In [6]:
# Feature matrix (the selected feature columns) and target vector (the label column)
X, y = df[feature_cols], df[label_col]

# Sanity check: print both shapes side by side
print(X.shape, y.shape)

(10000, 48) (10000,)


In [7]:
# Split the data into train and test
from sklearn.model_selection import train_test_split

# 80% train / 20% test, fixed seed, stratified on y so both parts keep the
# same class balance.
split_kwargs = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_kwargs)


In [8]:
#Train the XGBoost model
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, roc_auc_score

In [9]:
# XGBoost classifier configuration.
# subsample / colsample_bytree are below 1.0, so each tree is built from a
# random subset of rows / columns. The seed is therefore pinned explicitly
# (same value as the train/test split) instead of relying on the library's
# default seed handling, so a fresh "Restart & Run All" yields the same model.
model = XGBClassifier(
    n_estimators=200,        # number of boosting rounds
    max_depth=5,             # maximum depth of each tree
    learning_rate=0.1,
    subsample=0.8,           # fraction of training rows sampled per tree
    colsample_bytree=0.8,    # fraction of columns sampled per tree
    eval_metric="logloss",
    random_state=42,         # explicit seed for the row/column subsampling
)

In [10]:
# Fit on the training split only
model.fit(X_train, y_train)

# Column 1 of predict_proba is the probability of class 1; it feeds the AUC.
y_proba = model.predict_proba(X_test)[:, 1]
# Hard class predictions feed the per-class precision/recall report.
y_pred = model.predict(X_test)

test_report = classification_report(y_test, y_pred)
test_auc = roc_auc_score(y_test, y_proba)

print(test_report)
print("AUC:", test_auc)

              precision    recall  f1-score   support

           0       0.99      0.98      0.99      1000
           1       0.99      0.99      0.99      1000

    accuracy                           0.99      2000
   macro avg       0.99      0.99      0.99      2000
weighted avg       0.99      0.99      0.99      2000

AUC: 0.99879


In [11]:
#Saving the model
import joblib
import json
import os

In [12]:
# Make sure the output folder exists (no error if it is already there)
os.makedirs("model_artifacts", exist_ok=True)

# Serialize the fitted model
joblib.dump(model, "model_artifacts/model.joblib")

# Store the feature column names, in the order of X's columns, next to the model
metadata = {"feature_cols": list(X.columns)}
with open("model_artifacts/metadata.json", "w") as metadata_file:
    json.dump(metadata, metadata_file)