In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, recall_score, confusion_matrix, classification_report
import re
from urllib.parse import urlparse
import time
import os

command utk pertama kali run

In [None]:
# df_whitelist = pd.read_csv(r"dataset\top-1m.csv", names=['no','domain'])
# set_whitelist = set(df_whitelist['domain'].astype(str).values)

# print(f"berhasil memuat {len(set_whitelist)}")

berhasil memuat 1000000


In [None]:
data = pd.read_csv("malicious_phish.csv")
df = pd.DataFrame(data)
label_mapping = {
    'benign': 0,
    'phishing': 1,
    'defacement': 1,
    'malware': 1
}

df['target'] = df['type'].map(label_mapping)

if df['target'].isnull().sum() > 0:
    print("\nWarning: Ada label yang tidak dikenali, menghapus baris tersebut...")
    df = df.dropna(subset=['target'])

Feature Extraction

In [None]:
cache_file = 'extracted_features.csv'

In [None]:
def get_features(url):
    features = {}

    # Konversi ke string jaga-jaga kalau ada data bukan string
    url = str(url)

    # A. Fitur Panjang
    features['url_length'] = len(url)
    features['hostname_length'] = len(urlparse(url).netloc)
    features['path_length'] = len(urlparse(url).path)

    # B. Fitur Karakter Spesial
    features['count_dot'] = url.count('.')
    features['count_hyphen'] = url.count('-')
    features['count_at'] = url.count('@')
    features['count_question'] = url.count('?')
    features['count_percent'] = url.count('%')
    features['count_www'] = url.count('www')

    # C. Fitur Pola 
    features['count_digits'] = sum(c.isdigit() for c in url)
    features['count_letters'] = sum(c.isalpha() for c in url)

    return pd.Series(features)




In [None]:
if os.path.exists(cache_file):
    print(f"File cache ditemuakn: '{cache_file}'.")
    final_df = pd.read_csv(cache_file)
else:
    print('cache file tidak ditemukan. Ekstraksi...')
    feature_df = df['url'].apply(get_features)

    final_df = pd.concat([feature_df, df['target']], axis=1)

    final_df.to_csv(cache_file, index=False)
    print(f"Ekstraksi Selesai!, data disimpan ke '{cache_file}'")

print("Shape Data: ", final_df.shape )
final_df.head()

Split Data

In [None]:
# fitur, pake drop untuk ngehapus kolom target.
X = final_df.drop('target', axis=1)

# target dismpan di y, outputnya 0 atau 1
y = final_df['target']

# bagian train data, pake function train_test_split(), test_size=0.2 itu agar dibagi 80% untuk train dan 20% untuk test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

Inisiasi Model

In [None]:
models = {}
models["rf"] = RandomForestClassifier(n_estimators=100, random_state=42)
models['xgb'] = xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)

TRAINING MODEL

In [None]:
import joblib
model_filename = "model_phishing.pkl"

final_model = None

if os.path.exists(model_filename):
    print(f"Model Ditemukan!: '{model_filename}'. \nLoad model... ")
    final_model = joblib.load(model_filename)
else:
    print("model belum ada. Memulai proses training...")
    for name, model in models.items():
        start_time = time.time()
        print(f"Training model: {name} ... ")

        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        acc = accuracy_score(y_test, y_pred)
        elapsed = time.time() - start_time

        print(f"   Akurasi: {acc*100:.2f}%")
        print(f"   Waktu Training: {elapsed:.4f} detik")
        print("   Laporan Klasifikasi:")
        print(classification_report(y_test, y_pred))
        
        
        tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
        print(f"   Deteksi Benar (Phishing tertangkap): {tp}")
        print(f"   Salah Prediksi (Phishing lolos/False Negative): {fn}  <-- INI YANG BAHAYA")
        print("-" * 40)

        final_model = model

    joblib.dump(final_model, model_filename)
    print(f"Training Selesai! Model disimpan sebagai '{model_filename}'")


Optimasi Model XGBoost 

TF-IDF Fitur

In [None]:
from scipy.sparse import hstack
numeric_features = pd.read_csv(cache_file)
numeric_without_target = numeric_features.drop('target', axis=1)

filename_tfidf = "tfidf_vectorizer.pkl"

if os.path.exists(filename_tfidf):
    print(f"file tfidf sudah ada!. Melakukan Load File...")
    tfidf = joblib.load(filename_tfidf)
    text_features = tfidf.transform(df['url'])
    print('load file selesai.')
else:
    tfidf = TfidfVectorizer(analyzer='char', ngram_range=(3, 5), max_features=5000)
    text_features = tfidf.fit_transform(df['url'])

    text_features.shape
    joblib.dump(tfidf, filename_tfidf)

X_final = hstack([numeric_without_target.astype(float), text_features])
y_final = df['target']

y_final


Split Data Baru (Untuk Optimasi XGBoost)

In [None]:
X_train_new, X_test_new, y_train_new, y_test_new = train_test_split(X_final, y_final, test_size=0.2, random_state=42)

Model XGBoost baru (Tuned)

In [None]:

filename_xgboost = 'xgboost_phising_98acc.pkl'

if os.path.exists(filename_xgboost):
    print(f"Model sudah tersedia di {filename_xgboost}. Melakukan Load...")
    model_xgb_tuned = joblib.load(filename_xgboost)
else:
    model_xgb_tuned = xgb.XGBClassifier(
        n_estimators=500,        
        learning_rate=0.05,      
        max_depth=10,            
        scale_pos_weight=2,      
        eval_metric='logloss',
        use_label_encoder=False,
        random_state=42,
        n_jobs=-1                
    )
    start_time = time.time()
    model_xgb_tuned.fit(X_train_new, y_train_new)
    print(f"Selesai dalam {time.time() - start_time:.2f} detik")
    joblib.dump(model_xgb_tuned, filename_xgboost)


y_pred_new = model_xgb_tuned.predict(X_test_new)

print("\n=== HASIL SETELAH UPGRADE (NUMERIC + TEXT) ===")
print(classification_report(y_test_new, y_pred_new))

tn, fp, fn, tp = confusion_matrix(y_test_new, y_pred_new).ravel()
print(f"False Negative (Bahaya) Sekarang: {fn}")



Testing Model

In [44]:
def predict_phishing(url_input):

    features_num = get_features(url_input) # fitur untuk nomor
    features_num_df = pd.DataFrame([features_num]) # biar jadi dataframe
    features_text = tfidf.transform([url_input]) # fitur text

    X_predict = hstack([features_num_df.astype(float), features_text ]) 

    prediction = model_xgb_tuned.predict(X_predict)[0]
    probabilitas = model_xgb_tuned.predict_proba(X_predict)[0][1]


    
    return features_num, features_num_df, features_text, prediction, probabilitas



In [50]:
hasil = predict_phishing("groups.google.com/group/alt.conspiracy.jfk/browse_thread/thread/885ffad05b486021")
hasil

(url_length         80
 hostname_length     0
 path_length        80
 count_dot           4
 count_hyphen        0
 count_at            0
 count_question      0
 count_percent       0
 count_www           0
 count_digits       11
 count_letters      59
 dtype: int64,
    url_length  hostname_length  path_length  count_dot  count_hyphen  \
 0          80                0           80          4             0   
 
    count_at  count_question  count_percent  count_www  count_digits  \
 0         0               0              0          0            11   
 
    count_letters  
 0             59  ,
 <Compressed Sparse Row sparse matrix of dtype 'float64'
 	with 71 stored elements and shape (1, 5000)>,
 np.int64(0),
 np.float32(0.024274217))