In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, recall_score, confusion_matrix
import re
from urllib.parse import urlparse
import time

In [2]:
data = pd.read_csv("malicious_phish.csv")

In [3]:
df = pd.DataFrame(data)

In [4]:
len(df)

651191

In [5]:
df['type'].value_counts()

type
benign        428103
defacement     96457
phishing       94111
malware        32520
Name: count, dtype: int64

In [6]:
label_mapping = {
    'benign': 0,
    'phishing': 1,
    'defacement': 1,
    'malware': 1
}

In [7]:
df['target'] = df['type'].map(label_mapping)

In [8]:
if df['target'].isnull().sum() > 0:
    print("\nWarning: Ada label yang tidak dikenali, menghapus baris tersebut...")
    df = df.dropna(subset=['target'])

Feature Extraction

In [9]:
def get_features(url):
    features = {}

    # Konversi ke string jaga-jaga kalau ada data bukan string
    url = str(url)

    # A. Fitur Panjang
    features['url_length'] = len(url)
    features['hostname_length'] = len(urlparse(url).netloc)
    features['path_length'] = len(urlparse(url).path)

    # B. Fitur Karakter Spesial
    features['count_dot'] = url.count('.')
    features['count_hyphen'] = url.count('-')
    features['count_at'] = url.count('@')
    features['count_question'] = url.count('?')
    features['count_percent'] = url.count('%')
    features['count_www'] = url.count('www')

    # C. Fitur Pola 
    features['count_digits'] = sum(c.isdigit() for c in url)
    features['count_letters'] = sum(c.isalpha() for c in url)

    return pd.Series(features)

print("\nSedang mengekstrak fitur...")
feature_df = df['url'].apply(get_features)


    


Sedang mengekstrak fitur...


In [16]:
feature_df

Unnamed: 0,url_length,hostname_length,path_length,count_dot,count_hyphen,count_at,count_question,count_percent,count_www,count_digits,count_letters
0,16,0,16,2,1,0,0,0,0,0,13
1,35,0,35,2,0,0,0,0,0,1,29
2,31,0,31,2,0,0,0,0,0,1,25
3,88,21,10,3,1,0,1,0,1,7,63
4,235,23,10,2,1,0,1,0,0,22,199
...,...,...,...,...,...,...,...,...,...,...,...
651186,39,0,39,3,0,0,0,0,0,12,21
651187,44,0,44,2,2,0,0,0,0,7,29
651188,42,0,42,2,0,0,0,0,1,3,33
651189,45,0,45,2,0,0,0,0,0,0,36


In [11]:
final_df = pd.concat([feature_df, df['target']], axis=1)

In [17]:
final_df

Unnamed: 0,url_length,hostname_length,path_length,count_dot,count_hyphen,count_at,count_question,count_percent,count_www,count_digits,count_letters,target
0,16,0,16,2,1,0,0,0,0,0,13,1
1,35,0,35,2,0,0,0,0,0,1,29,0
2,31,0,31,2,0,0,0,0,0,1,25,0
3,88,21,10,3,1,0,1,0,1,7,63,1
4,235,23,10,2,1,0,1,0,0,22,199,1
...,...,...,...,...,...,...,...,...,...,...,...,...
651186,39,0,39,3,0,0,0,0,0,12,21,1
651187,44,0,44,2,2,0,0,0,0,7,29,1
651188,42,0,42,2,0,0,0,0,1,3,33,1
651189,45,0,45,2,0,0,0,0,0,0,36,1


Training Data

In [12]:
# fitur, pake drop untuk ngehapus kolom target.
X = final_df.drop('target', axis=1)

In [13]:
# target dismpan di y, outputnya 0 atau 1
y = final_df['target']

In [14]:
# bagian train data, pake function train_test_split(), test_size=0.2 itu agar dibagi 80% untuk train dan 20% untuk test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [15]:
X_train.shape

(520952, 11)

In [18]:
models = {}

In [None]:
models["rf"] = RandomForestClassifier(n_estimators=100, random_state=42)

In [None]:
try :
    import xgboost as xgb
    print('ada')
    models['xgb'] = xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)