In [17]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, recall_score, confusion_matrix, classification_report
import re
from urllib.parse import urlparse
import time

In [18]:
data = pd.read_csv("malicious_phish.csv")

In [19]:
df = pd.DataFrame(data)

In [20]:
len(df)

651191

In [21]:
df['type'].value_counts()

type
benign        428103
defacement     96457
phishing       94111
malware        32520
Name: count, dtype: int64

In [22]:
label_mapping = {
    'benign': 0,
    'phishing': 1,
    'defacement': 1,
    'malware': 1
}

In [23]:
df['target'] = df['type'].map(label_mapping)

In [24]:
if df['target'].isnull().sum() > 0:
    print("\nWarning: Ada label yang tidak dikenali, menghapus baris tersebut...")
    df = df.dropna(subset=['target'])

Feature Extraction

In [25]:
def get_features(url):
    features = {}

    # Konversi ke string jaga-jaga kalau ada data bukan string
    url = str(url)

    # A. Fitur Panjang
    features['url_length'] = len(url)
    features['hostname_length'] = len(urlparse(url).netloc)
    features['path_length'] = len(urlparse(url).path)

    # B. Fitur Karakter Spesial
    features['count_dot'] = url.count('.')
    features['count_hyphen'] = url.count('-')
    features['count_at'] = url.count('@')
    features['count_question'] = url.count('?')
    features['count_percent'] = url.count('%')
    features['count_www'] = url.count('www')

    # C. Fitur Pola 
    features['count_digits'] = sum(c.isdigit() for c in url)
    features['count_letters'] = sum(c.isalpha() for c in url)

    return pd.Series(features)

print("\nSedang mengekstrak fitur...")
feature_df = df['url'].apply(get_features)


    


Sedang mengekstrak fitur...


In [26]:
final_df = pd.concat([feature_df, df['target']], axis=1)

In [27]:
final_df

Unnamed: 0,url_length,hostname_length,path_length,count_dot,count_hyphen,count_at,count_question,count_percent,count_www,count_digits,count_letters,target
0,16,0,16,2,1,0,0,0,0,0,13,1
1,35,0,35,2,0,0,0,0,0,1,29,0
2,31,0,31,2,0,0,0,0,0,1,25,0
3,88,21,10,3,1,0,1,0,1,7,63,1
4,235,23,10,2,1,0,1,0,0,22,199,1
...,...,...,...,...,...,...,...,...,...,...,...,...
651186,39,0,39,3,0,0,0,0,0,12,21,1
651187,44,0,44,2,2,0,0,0,0,7,29,1
651188,42,0,42,2,0,0,0,0,1,3,33,1
651189,45,0,45,2,0,0,0,0,0,0,36,1


Training Data

In [28]:
# fitur, pake drop untuk ngehapus kolom target.
X = final_df.drop('target', axis=1)

In [29]:
# target dismpan di y, outputnya 0 atau 1
y = final_df['target']

In [30]:
# bagian train data, pake function train_test_split(), test_size=0.2 itu agar dibagi 80% untuk train dan 20% untuk test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [31]:
X_train.shape

(520952, 11)

In [32]:
models = {}

In [33]:
models["rf"] = RandomForestClassifier(n_estimators=100, random_state=42)

In [34]:
models['xgb'] = xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)

TRAINING MODEL

In [35]:
for name, model in models.items():
    start_time = time.time()

    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    elapsed = time.time() - start_time

    print(f"\nðŸ”¹ Model: {name}")
    print(f"   Akurasi: {acc*100:.2f}%")
    print(f"   Waktu Training: {elapsed:.4f} detik")
    print("   Laporan Klasifikasi:")
    print(classification_report(y_test, y_pred))
    
    
    tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
    print(f"   Deteksi Benar (Phishing tertangkap): {tp}")
    print(f"   Salah Prediksi (Phishing lolos/False Negative): {fn}  <-- INI YANG BAHAYA")
    print("-" * 40)


ðŸ”¹ Model: rf
   Akurasi: 96.61%
   Waktu Training: 77.2166 detik
   Laporan Klasifikasi:
              precision    recall  f1-score   support

           0       0.97      0.98      0.97     85778
           1       0.97      0.93      0.95     44461

    accuracy                           0.97    130239
   macro avg       0.97      0.96      0.96    130239
weighted avg       0.97      0.97      0.97    130239

   Deteksi Benar (Phishing tertangkap): 41445
   Salah Prediksi (Phishing lolos/False Negative): 3016  <-- INI YANG BAHAYA
----------------------------------------


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)



ðŸ”¹ Model: xgb
   Akurasi: 95.66%
   Waktu Training: 1.2022 detik
   Laporan Klasifikasi:
              precision    recall  f1-score   support

           0       0.95      0.98      0.97     85778
           1       0.97      0.90      0.93     44461

    accuracy                           0.96    130239
   macro avg       0.96      0.94      0.95    130239
weighted avg       0.96      0.96      0.96    130239

   Deteksi Benar (Phishing tertangkap): 40168
   Salah Prediksi (Phishing lolos/False Negative): 4293  <-- INI YANG BAHAYA
----------------------------------------


In [36]:
import joblib

joblib.dump(model, "model_phishing.pkl")

['model_phishing.pkl']

Optimasi Model XGBoost 

In [37]:
numeric_features = df['url'].apply(get_features)


In [38]:
numeric_features

Unnamed: 0,url_length,hostname_length,path_length,count_dot,count_hyphen,count_at,count_question,count_percent,count_www,count_digits,count_letters
0,16,0,16,2,1,0,0,0,0,0,13
1,35,0,35,2,0,0,0,0,0,1,29
2,31,0,31,2,0,0,0,0,0,1,25
3,88,21,10,3,1,0,1,0,1,7,63
4,235,23,10,2,1,0,1,0,0,22,199
...,...,...,...,...,...,...,...,...,...,...,...
651186,39,0,39,3,0,0,0,0,0,12,21
651187,44,0,44,2,2,0,0,0,0,7,29
651188,42,0,42,2,0,0,0,0,1,3,33
651189,45,0,45,2,0,0,0,0,0,0,36


TF-IDF Fitur

In [39]:
tfidf = TfidfVectorizer(analyzer='char', ngram_range=(3, 5), max_features=5000)
text_features = tfidf.fit_transform(df['url'])

In [40]:
text_features.shape

(651191, 5000)

In [41]:
from scipy.sparse import hstack

In [42]:
X_final = hstack([numeric_features.astype(float), text_features])
y_final = df['target']

In [43]:
y_final

0         1
1         0
2         0
3         1
4         1
         ..
651186    1
651187    1
651188    1
651189    1
651190    1
Name: target, Length: 651191, dtype: int64

In [44]:
import joblib
loaded_model = joblib.load("model_phishing.pkl")
print("Model berhasil di-load!")

Model berhasil di-load!


Split