In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, recall_score, confusion_matrix, classification_report
import re
from urllib.parse import urlparse
import time
import os
import tldextract
import joblib

In [2]:
print(xgb.__version__)

2.0.3


In [3]:
data = pd.read_csv("malicious_phish.csv")
df = pd.DataFrame(data)
label_mapping = {
    'benign': 0,
    'phishing': 1,
    'defacement': 1,
    'malware': 1
}

df['target'] = df['type'].map(label_mapping)

if df['target'].isnull().sum() > 0:
    print("\nWarning: Ada label yang tidak dikenali, menghapus baris tersebut...")
    df = df.dropna(subset=['target'])

# Cleaning Data

print(f"Jumlah baris sebelum cleaning: {len(df)}")

df['url'] = df['url'].str.replace(r'^htpps?://', '', regex=True)
df['url'] = df['url'].str.replace(r'^http://', '', regex=True)


df['url'] = df['url'].str.replace(r'^www\.', '', regex=True )


df = df.drop_duplicates(subset=['url'])

df = df.reset_index(drop=True)

print(f"Jumlah baris setelah cleaning: {len(df)}")

Jumlah baris sebelum cleaning: 651191
Jumlah baris setelah cleaning: 636316


In [4]:
df

Unnamed: 0,url,type,target
0,br-icloud.com.br,phishing,1
1,mp3raid.com/music/krizz_kaliko.html,benign,0
2,bopsecrets.org/rexroth/cr/1.htm,benign,0
3,garage-pirenne.be/index.php?option=com_content...,defacement,1
4,adventure-nicaragua.net/index.php?option=com_m...,defacement,1
...,...,...,...
636311,xbox360.ign.com/objects/850/850402.html,phishing,1
636312,games.teamxbox.com/xbox-360/1860/Dead-Space/,phishing,1
636313,gamespot.com/xbox360/action/deadspace/,phishing,1
636314,en.wikipedia.org/wiki/Dead_Space_(video_game),phishing,1


Feature Extraction

In [5]:
cache_file = 'extracted_features.csv'

In [7]:
def get_features(url):
    features = {}

    # Konversi ke string jaga-jaga kalau ada data bukan string
    url = str(url)

    if not url.startswith(('http://', 'https://')):
        parse_url = "http://" + url
    else:
        parse_url = url
    
    try:
        parsed = urlparse(parse_url)
        hostname = parsed.netloc
        path = parsed.path
    except ValueError:
        temp_url = parse_url.replace("http://", "").replace("https://", "")
        
        if "/" in temp_url:
            parts = temp_url.split('/', 1)
            hostname = parts[0]
            path = "/" + parts[1]
        else:
            hostname = temp_url
            path = ""

    # A. Fitur Panjang
    features['url_length'] = len(url)
    features['hostname_length'] = len(hostname)
    features['path_length'] = len(path)

    # B. Fitur Karakter Spesial
    features['count_dot'] = url.count('.')
    features['count_hyphen'] = url.count('-')
    features['count_at'] = url.count('@')
    features['count_question'] = url.count('?')
    features['count_percent'] = url.count('%')
    features['count_www'] = url.count('www')

    # C. Fitur Pola 
    features['count_digits'] = sum(c.isdigit() for c in url)
    features['count_letters'] = sum(c.isalpha() for c in url)

    return pd.Series(features)




In [9]:
if os.path.exists(cache_file):
    print(f"File cache ditemuakn: '{cache_file}'.")
    df_feature = pd.read_csv(cache_file)
else:
    print('cache file tidak ditemukan. Ekstraksi...')
    feature_df = df['url'].apply(get_features) 

    df_feature = pd.concat([feature_df, df['target']], axis=1)

    df_feature.to_csv(cache_file, index=False)
    print(f"Ekstraksi Selesai!, data disimpan ke '{cache_file}'")

print("Shape Data: ", df_feature.shape )
df_feature.head()

File cache ditemuakn: 'extracted_features.csv'.
Shape Data:  (636316, 12)


Unnamed: 0,url_length,hostname_length,path_length,count_dot,count_hyphen,count_at,count_question,count_percent,count_www,count_digits,count_letters,target
0,16,16,0,2,1,0,0,0,0,0,13,1
1,35,11,24,2,0,0,0,0,0,1,29,0
2,31,14,17,2,0,0,0,0,0,1,25,0
3,77,17,10,2,1,0,1,0,0,7,56,1
4,228,23,10,2,1,0,1,0,0,22,195,1


In [10]:
df_feature

Unnamed: 0,url_length,hostname_length,path_length,count_dot,count_hyphen,count_at,count_question,count_percent,count_www,count_digits,count_letters,target
0,16,16,0,2,1,0,0,0,0,0,13,1
1,35,11,24,2,0,0,0,0,0,1,29,0
2,31,14,17,2,0,0,0,0,0,1,25,0
3,77,17,10,2,1,0,1,0,0,7,56,1
4,228,23,10,2,1,0,1,0,0,22,195,1
...,...,...,...,...,...,...,...,...,...,...,...,...
636311,39,15,24,3,0,0,0,0,0,12,21,1
636312,44,18,26,2,2,0,0,0,0,7,29,1
636313,38,12,26,1,0,0,0,0,0,3,30,1
636314,45,16,29,2,0,0,0,0,0,0,36,1


In [11]:
print(df_feature['target'].value_counts())

target
0    423592
1    212724
Name: count, dtype: int64


Split Data

In [13]:
# fitur, pake drop untuk ngehapus kolom target.
X = df_feature.drop('target', axis=1)

# target dismpan di y, outputnya 0 atau 1
y = df_feature['target']

# bagian train data, pake function train_test_split(), test_size=0.2 itu agar dibagi 80% untuk train dan 20% untuk test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

NameError: name 'y_train_new' is not defined

Inisiasi Model

In [16]:
# models = {}
# models["rf"] = RandomForestClassifier(n_estimators=100, random_state=42)
# models['xgb'] = xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
model_xgb_tuned = xgb.XGBClassifier()
model_xgb_tuned_V2 = xgb.XGBClassifier()

TRAINING MODEL

In [12]:
# import joblib
# model_filename = "model_phishing.pkl" # xgboost tanpa tfidf

# final_model = None # xgboost model tanpa tfidf

# if os.path.exists(model_filename):
#     print(f"Model Ditemukan!: '{model_filename}'. \nLoad model... ")
#     final_model = joblib.load(model_filename)
# else:
#     print("model belum ada. Memulai proses training...")
#     for name, model in models.items():
#         start_time = time.time()
#         print(f"Training model: {name} ... ")

#         model.fit(X_train, y_train)
#         y_pred = model.predict(X_test)
#         acc = accuracy_score(y_test, y_pred)
#         elapsed = time.time() - start_time

#         print(f"   Akurasi: {acc*100:.2f}%")
#         print(f"   Waktu Training: {elapsed:.4f} detik")
#         print("   Laporan Klasifikasi:")
#         print(classification_report(y_test, y_pred))
        
        
#         tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
#         print(f"   Deteksi Benar (Phishing tertangkap): {tp}")
#         print(f"   Salah Prediksi (Phishing lolos/False Negative): {fn}  <-- INI YANG BAHAYA")
#         print("-" * 40)

#         final_model = model

#     joblib.dump(final_model, model_filename)
#     print(f"Training Selesai! Model disimpan sebagai '{model_filename}'")


Optimasi Model XGBoost 

TF-IDF Fitur

In [16]:
from scipy.sparse import hstack
numeric_features = pd.read_csv(cache_file)
numeric_without_target = numeric_features.drop('target', axis=1)

filename_tfidf = "tfidf_vectorizer.pkl" 

if os.path.exists(filename_tfidf):
    print(f"file tfidf sudah ada!. Melakukan Load File...")
    tfidf = joblib.load(filename_tfidf)
    text_features = tfidf.transform(df['url'])
    print('load file selesai.')
else:
    tfidf = TfidfVectorizer(analyzer='char', ngram_range=(3, 5), max_features=5000)
    text_features = tfidf.fit_transform(df['url'])

    text_features.shape
    joblib.dump(tfidf, filename_tfidf)

X_final = hstack([numeric_without_target.astype(float), text_features]) # untuk training pada xgboost yang baru
y_final = df['target'] # target untuk xgboost yang baru

y_final 


file tfidf sudah ada!. Melakukan Load File...


https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


load file selesai.


0         1
1         0
2         0
3         1
4         1
         ..
636311    1
636312    1
636313    1
636314    1
636315    1
Name: target, Length: 636316, dtype: int64

Split Data Baru (Untuk Optimasi XGBoost)

In [17]:
X_train_new, X_test_new, y_train_new, y_test_new = train_test_split(X_final, y_final, test_size=0.2, random_state=42)

In [18]:

aman = (y_train_new == 0).sum()
bahaya = (y_train_new == 1).sum()
aman

np.int64(339006)

Model XGBoost baru (Tuned)

In [21]:

filename_xgboost = r'xgboost_phising_98acc.json' # xgboost baru

model_xgb_tuned = xgb.XGBClassifier(
        n_estimators=1000,        
        device='cuda',
        learning_rate=0.05,      
        max_depth=6,            
        scale_pos_weight= aman / bahaya,      
        eval_metric='logloss',
        use_label_encoder=False,
        random_state=42,
        n_jobs= -1 ,
        predictor="gpu_predictor",   
        tree_method='hist'         
    )

if os.path.exists(filename_xgboost):
    print(f"Model sudah tersedia di {filename_xgboost}. Melakukan Load...")
    model_xgb_tuned._estimator_type = "classifier"
    model_xgb_tuned.load_model(filename_xgboost)
    model_xgb_tuned.n_classes_ = 2

else:
    print("Model belum ada, melakukan training")
    
    start_time = time.time()
    model_xgb_tuned.fit(
        X_train_new,
        y_train_new,
        eval_set=[(X_test_new, y_test_new)],
        early_stopping_rounds=30,
        verbose=True
    )
    print(f"Selesai dalam {time.time() - start_time:.2f} detik")
    model_xgb_tuned.save_model(filename_xgboost)


y_pred_new = model_xgb_tuned.predict(X_test_new)

print("\n=== HASIL SETELAH UPGRADE (NUMERIC + TEXT) ===")
print(classification_report(y_test_new, y_pred_new))

tn, fp, fn, tp = confusion_matrix(y_test_new, y_pred_new).ravel()
print(f"False Negative (Bahaya) Sekarang: {fn}")



Model sudah tersedia di xgboost_phising_98acc.json. Melakukan Load...

=== HASIL SETELAH UPGRADE (NUMERIC + TEXT) ===
              precision    recall  f1-score   support

           0       0.96      0.93      0.94     84586
           1       0.87      0.92      0.89     42678

    accuracy                           0.92    127264
   macro avg       0.91      0.92      0.92    127264
weighted avg       0.93      0.92      0.92    127264

False Negative (Bahaya) Sekarang: 3592


Model XGB Tuned V2

In [22]:
filename_xgboost_V2 = r"xgboost_tuned_v2.json"

model_xgb_tuned_V2 = xgb.XGBClassifier(
        booster="gbtree",
        n_estimators=10000,
        learning_rate=0.05,
        max_depth=5,
        min_child_weight=5,
        gamma=0.1,
        subsample=0.8,
        colsample_bytree=0.6,
        reg_alpha=0.5,
        reg_lambda=1.0,
        scale_pos_weight= aman / bahaya,
        device='cuda',
        eval_metric="logloss",
        random_state=42,
        n_jobs=-1,    
        predictor="gpu_predictor",  
        tree_method='hist'
        
    )

if os.path.exists(filename_xgboost_V2):
    print(f"Model sudah tersedia di {filename_xgboost_V2} \nmelakukan Load... ")
    model_xgb_tuned_V2._estimator_type = "classifier"
    model_xgb_tuned_V2.load_model(filename_xgboost_V2)
    model_xgb_tuned_V2.n_classes_ = 2

else:
    print("Model belum ada, Melakukan training")
    
    start_time = time.time()
    model_xgb_tuned_V2.fit(
        X_train_new, y_train_new,
        eval_set=[(X_test_new, y_test_new)],
        early_stopping_rounds=50,
        verbose=True
    )
    print(f"Selesai dalam {time.time() - start_time:.2f} detik")
    model_xgb_tuned_V2.save_model(filename_xgboost_V2)

y_pred_new_V2 = model_xgb_tuned_V2.predict(X_test_new)

print("Hasil dari xgb tuned v2")
print(classification_report(y_test_new, y_pred_new_V2))

tn, fp, fn, tp = confusion_matrix(y_test_new, y_pred_new_V2).ravel()
print(f"False Negative (Bahaya) Sekarang: {fn}")
    


Model sudah tersedia di xgboost_tuned_v2.json 
melakukan Load... 
Hasil dari xgb tuned v2
              precision    recall  f1-score   support

           0       0.97      0.95      0.96     84586
           1       0.91      0.94      0.93     42678

    accuracy                           0.95    127264
   macro avg       0.94      0.95      0.94    127264
weighted avg       0.95      0.95      0.95    127264

False Negative (Bahaya) Sekarang: 2492


Testing Model

In [23]:
df_whitelist = pd.read_csv(r"dataset\top-1m.csv", names=['no','domain'])
set_whitelist = set(df_whitelist['domain'].astype(str).values)

print(f"berhasil memuat {len(set_whitelist)}")

berhasil memuat 1000000


In [None]:
# def predict_phishing(url_input):
    
#     ext = tldextract.extract(url_input)
#     clean_domain = f"{ext.domain}.{ext.suffix}"
  


#     if clean_domain in set_whitelist :
#         return print("Link Aman!"), print(f"clean domain: {clean_domain}")
#     else:

#         features_num = get_features(url_input) # fitur untuk nomor
#         features_num_df = pd.DataFrame([features_num]) # biar jadi dataframe
#         features_text = tfidf.transform([url_input]) # fitur text
        
#         X_predict = hstack([features_num_df.astype(float), features_text ]) 

#         prediction = model_xgb_tuned_V2.predict(X_predict)[0]
#         probabilitas = model_xgb_tuned_V2.predict_proba(X_predict)[0][1]
#         percent = probabilitas * 100

#         if prediction == 1 :
#             return print(f"Link tidak aman!, tingkat bahaya {percent:.2f}%"), print(prediction), print(f"clean domain: {clean_domain}")
#         else :
#             return print(f"Link aman!, tingkat bahaya hanya {percent:.2f}%"), print(prediction), clean_domain
        

        


    



======================================================================================

In [25]:
import re

# Fungsi cleaning yang sama dengan saat training
def clean_url_text(url):
    # Hapus http/https
    url = re.sub(r'^https?://', '', url)
    # Hapus www.
    url = re.sub(r'^www\.', '', url)
    return url

def predict_phishing(url_input):
    # 1. Cek Whitelist dulu (Wajib untuk domain valid tapi tidak dikenal model)
    ext = tldextract.extract(url_input)
    clean_domain = f"{ext.domain}.{ext.suffix}"
    
    if clean_domain in set_whitelist:
        return f"Link Aman (Whitelisted)! Domain: {clean_domain}", 0

    # 2. Siapkan Fitur
    # A. Fitur Numerik (Pakai URL ASLI/MENTAH karena butuh http untuk parsing)
    features_num = get_features(url_input) 
    features_num_df = pd.DataFrame([features_num]) 
    
    # B. Fitur Teks (WAJIB CLEANING DULU!)
    # --- PERBAIKAN DI SINI ---
    url_cleaned = clean_url_text(url_input) 
    features_text = tfidf.transform([url_cleaned]) 
    # -------------------------

    # 3. Gabung & Prediksi
    X_predict = hstack([features_num_df.astype(float), features_text])
    
    # Ambil probabilitas kelas 1 (Phishing)
    probabilitas = model_xgb_tuned_V2.predict_proba(X_predict)[0][1]
    prediction = model_xgb_tuned_V2.predict(X_predict)[0]
    percent = probabilitas * 100

    if prediction == 1:
        return f"Link BERBAHAYA! ({percent:.2f}%)", prediction, clean_domain
    else:
        return f"Link Aman. ({percent:.2f}%)", prediction, clean_domain

# Test ulang
hasil = predict_phishing("https://ladanglima.com/distribution-faq/")
print(hasil)

('Link BERBAHAYA! (56.08%)', np.int64(1), 'ladanglima.com')
