In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, recall_score, confusion_matrix, classification_report
import re
from urllib.parse import urlparse
import time
import os
import tldextract

In [2]:
data = pd.read_csv("malicious_phish.csv")
df = pd.DataFrame(data)
label_mapping = {
    'benign': 0,
    'phishing': 1,
    'defacement': 1,
    'malware': 1
}

df['target'] = df['type'].map(label_mapping)

if df['target'].isnull().sum() > 0:
    print("\nWarning: Ada label yang tidak dikenali, menghapus baris tersebut...")
    df = df.dropna(subset=['target'])

Feature Extraction

In [3]:
cache_file = 'extracted_features.csv'

In [4]:
def get_features(url):
    features = {}

    # Konversi ke string jaga-jaga kalau ada data bukan string
    url = str(url)

    # A. Fitur Panjang
    features['url_length'] = len(url)
    features['hostname_length'] = len(urlparse(url).netloc)
    features['path_length'] = len(urlparse(url).path)

    # B. Fitur Karakter Spesial
    features['count_dot'] = url.count('.')
    features['count_hyphen'] = url.count('-')
    features['count_at'] = url.count('@')
    features['count_question'] = url.count('?')
    features['count_percent'] = url.count('%')
    features['count_www'] = url.count('www')

    # C. Fitur Pola 
    features['count_digits'] = sum(c.isdigit() for c in url)
    features['count_letters'] = sum(c.isalpha() for c in url)

    return pd.Series(features)




In [5]:
if os.path.exists(cache_file):
    print(f"File cache ditemuakn: '{cache_file}'.")
    final_df = pd.read_csv(cache_file)
else:
    print('cache file tidak ditemukan. Ekstraksi...')
    feature_df = df['url'].apply(get_features)

    final_df = pd.concat([feature_df, df['target']], axis=1)

    final_df.to_csv(cache_file, index=False)
    print(f"Ekstraksi Selesai!, data disimpan ke '{cache_file}'")

print("Shape Data: ", final_df.shape )
final_df.head()

File cache ditemuakn: 'extracted_features.csv'.
Shape Data:  (651191, 12)


Unnamed: 0,url_length,hostname_length,path_length,count_dot,count_hyphen,count_at,count_question,count_percent,count_www,count_digits,count_letters,target
0,16,0,16,2,1,0,0,0,0,0,13,1
1,35,0,35,2,0,0,0,0,0,1,29,0
2,31,0,31,2,0,0,0,0,0,1,25,0
3,88,21,10,3,1,0,1,0,1,7,63,1
4,235,23,10,2,1,0,1,0,0,22,199,1


In [6]:
print(final_df['target'].value_counts())

target
0    428103
1    223088
Name: count, dtype: int64


Split Data

In [7]:
# fitur, pake drop untuk ngehapus kolom target.
X = final_df.drop('target', axis=1)

# target dismpan di y, outputnya 0 atau 1
y = final_df['target']

# bagian train data, pake function train_test_split(), test_size=0.2 itu agar dibagi 80% untuk train dan 20% untuk test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

Inisiasi Model

In [8]:
models = {}
models["rf"] = RandomForestClassifier(n_estimators=100, random_state=42)
models['xgb'] = xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)

TRAINING MODEL

In [9]:
import joblib
model_filename = "model_phishing.pkl" # xgboost tanpa tfidf

final_model = None # xgboost model tanpa tfidf

if os.path.exists(model_filename):
    print(f"Model Ditemukan!: '{model_filename}'. \nLoad model... ")
    final_model = joblib.load(model_filename)
else:
    print("model belum ada. Memulai proses training...")
    for name, model in models.items():
        start_time = time.time()
        print(f"Training model: {name} ... ")

        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        acc = accuracy_score(y_test, y_pred)
        elapsed = time.time() - start_time

        print(f"   Akurasi: {acc*100:.2f}%")
        print(f"   Waktu Training: {elapsed:.4f} detik")
        print("   Laporan Klasifikasi:")
        print(classification_report(y_test, y_pred))
        
        
        tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
        print(f"   Deteksi Benar (Phishing tertangkap): {tp}")
        print(f"   Salah Prediksi (Phishing lolos/False Negative): {fn}  <-- INI YANG BAHAYA")
        print("-" * 40)

        final_model = model

    joblib.dump(final_model, model_filename)
    print(f"Training Selesai! Model disimpan sebagai '{model_filename}'")


Model Ditemukan!: 'model_phishing.pkl'. 
Load model... 


Optimasi Model XGBoost 

TF-IDF Fitur

In [10]:
from scipy.sparse import hstack
numeric_features = pd.read_csv(cache_file)
numeric_without_target = numeric_features.drop('target', axis=1)

filename_tfidf = "tfidf_vectorizer.pkl" 

if os.path.exists(filename_tfidf):
    print(f"file tfidf sudah ada!. Melakukan Load File...")
    tfidf = joblib.load(filename_tfidf)
    text_features = tfidf.transform(df['url'])
    print('load file selesai.')
else:
    tfidf = TfidfVectorizer(analyzer='char', ngram_range=(3, 5), max_features=5000)
    text_features = tfidf.fit_transform(df['url'])

    text_features.shape
    joblib.dump(tfidf, filename_tfidf)

X_final = hstack([numeric_without_target.astype(float), text_features]) # untuk training pada xgboost yang baru
y_final = df['target'] # target untuk xgboost yang baru

y_final 


file tfidf sudah ada!. Melakukan Load File...
load file selesai.


0         1
1         0
2         0
3         1
4         1
         ..
651186    1
651187    1
651188    1
651189    1
651190    1
Name: target, Length: 651191, dtype: int64

Split Data Baru (Untuk Optimasi XGBoost)

In [11]:
X_train_new, X_test_new, y_train_new, y_test_new = train_test_split(X_final, y_final, test_size=0.2, random_state=42)

Model XGBoost baru (Tuned)

In [12]:

filename_xgboost = 'xgboost_phising_98acc.pkl' # xgboost baru

if os.path.exists(filename_xgboost):
    print(f"Model sudah tersedia di {filename_xgboost}. Melakukan Load...")
    model_xgb_tuned = joblib.load(filename_xgboost)
else:
    model_xgb_tuned = xgb.XGBClassifier(
        n_estimators=500,        
        learning_rate=0.05,      
        max_depth=10,            
        scale_pos_weight=2,      
        eval_metric='logloss',
        use_label_encoder=False,
        random_state=42,
        n_jobs=-1                
    )
    start_time = time.time()
    model_xgb_tuned.fit(X_train_new, y_train_new)
    print(f"Selesai dalam {time.time() - start_time:.2f} detik")
    joblib.dump(model_xgb_tuned, filename_xgboost)


y_pred_new = model_xgb_tuned.predict(X_test_new)

print("\n=== HASIL SETELAH UPGRADE (NUMERIC + TEXT) ===")
print(classification_report(y_test_new, y_pred_new))

tn, fp, fn, tp = confusion_matrix(y_test_new, y_pred_new).ravel()
print(f"False Negative (Bahaya) Sekarang: {fn}")



Model sudah tersedia di xgboost_phising_98acc.pkl. Melakukan Load...

=== HASIL SETELAH UPGRADE (NUMERIC + TEXT) ===
              precision    recall  f1-score   support

           0       0.98      0.98      0.98     85778
           1       0.97      0.97      0.97     44461

    accuracy                           0.98    130239
   macro avg       0.98      0.98      0.98    130239
weighted avg       0.98      0.98      0.98    130239

False Negative (Bahaya) Sekarang: 1296


Model XGB Tuned V2

In [17]:
filename_xgboost_V2 = r"models\xgboost_tuned_v2.pkl"

if os.path.exists(filename_xgboost_V2):
    print(f"Model sudah tersedia di {filename_xgboost_V2} \nmelakukan Load... ")
    model_xgb_tuned_V2 = joblib.load(filename_xgboost_V2)
else:
    print("Model belum ada, Melakukan training")
    model_xgb_tuned_V2 = xgb.XGBClassifier(
        booster="gbtree",
        # Boosting strategy
        n_estimators=600,
        learning_rate=0.05,

        # Tree complexity (PALING PENTING)
        max_depth=5,
        min_child_weight=5,
        gamma=0.1,

        # Randomness & generalization (WAJIB utk TF-IDF)
        subsample=0.8,
        colsample_bytree=0.6,

        # Regularization (KRUSIAL utk text)
        reg_alpha=0.5,
        reg_lambda=1.0,

        # Class imbalance (HITUNG, JANGAN TEBAK)
        scale_pos_weight= 1.92,

        # Performance
        tree_method="hist",
        eval_metric="logloss",

        # Reproducibility & system
        random_state=42,
        n_jobs=1    
    )
    start_time = time.time()
    model_xgb_tuned_V2.fit(X_train_new, y_train_new)
    print(f"Selesai dalam {time.time() - start_time:.2f} detik")
    joblib.dump(model_xgb_tuned_V2, filename_xgboost_V2)

y_pred_new_V2 = model_xgb_tuned_V2.predict(X_test_new)

print("Hasil dari xgb tuned v2")
print(classification_report(y_test_new, y_pred_new_V2))

tn, fp, fn, tp = confusion_matrix(y_test_new, y_pred_new_V2).ravel()
print(f"False Negative (Bahaya) Sekarang: {fn}")
    


Model belum ada, Melakukan training
Selesai dalam 452.55 detik
Hasil dari xgb tuned v2
              precision    recall  f1-score   support

           0       0.98      0.98      0.98     85778
           1       0.96      0.96      0.96     44461

    accuracy                           0.97    130239
   macro avg       0.97      0.97      0.97    130239
weighted avg       0.97      0.97      0.97    130239

False Negative (Bahaya) Sekarang: 1708


Testing Model

In [14]:
df_whitelist = pd.read_csv(r"dataset\top-1m.csv", names=['no','domain'])
set_whitelist = set(df_whitelist['domain'].astype(str).values)

print(f"berhasil memuat {len(set_whitelist)}")

berhasil memuat 1000000


In [20]:
def predict_phishing(url_input):
    
    ext = tldextract.extract(url_input)
    clean_domain = f"{ext.domain}.{ext.suffix}"
  


    if clean_domain in set_whitelist :
        return print("Link Aman!"), print(f"clean domain: {clean_domain}")
    else:

        features_num = get_features(url_input) # fitur untuk nomor
        features_num_df = pd.DataFrame([features_num]) # biar jadi dataframe
        features_text = tfidf.transform([url_input]) # fitur text
        
        X_predict = hstack([features_num_df.astype(float), features_text ]) 

        prediction = model_xgb_tuned_V2.predict(X_predict)[0]
        probabilitas = model_xgb_tuned_V2.predict_proba(X_predict)[0][1]
        percent = probabilitas * 100

        if prediction == 1 :
            return print(f"Link tidak aman!, tingkat bahaya {percent:.2f}%"), print(prediction), print(f"clean domain: {clean_domain}")
        else :
            return print(f"Link aman!, tingkat bahaya hanya {percent:.2f}%"), print(prediction), clean_domain
        

        


    



In [None]:
hasil = predict_phishing("https://www.ladanglima.com")
hasil

Link tidak aman!, tingkat bahaya 99.98%
1
clean domain: commm.


(None, None, None)

======================================================================================