In [None]:
# Hapus semua library yang berpotensi konflik sekaligus
!pip uninstall scikit-learn imbalanced-learn category_encoders -y

# Install kembali semua library yang dibutuhkan
!pip install scikit-learn imbalanced-learn category_encoders

#import model from local /kdn/docs

# Import Data

In [None]:
import pandas as pd
import numpy as np 

# Load the flow-meter dataset from the Kaggle input directory.
df = pd.read_csv('/kaggle/input/hikari-zenkii/ALLFLOWMETER_HIKARI2021.csv')

# Drop the columns that are not used as features: the flow identifier, the two
# 'Unnamed' columns (presumably index columns left over from earlier CSV exports --
# TODO confirm) and the two URG flag counters.
toDrop = ['uid', 'Unnamed: 0', 'Unnamed: 0.1','fwd_URG_flag_count', 'bwd_URG_flag_count']
df = df.drop(columns=toDrop)

# Alternative sets of `traffic_category` values that can be excluded from the data.
# NOTE(review): `target1` was referenced by the filter below but never defined, so the
# cell raised NameError on a fresh kernel. ['Background'] is assumed because it is the
# one category shared by target2..target4 -- confirm against the intended experiment.
target1 = ['Background']
target2 = ['Background', 'Probing', 'XMRIGCC CryptoMiner']
target3 = ['Background', 'XMRIGCC CryptoMiner']
target4 = ['Background', 'Probing']

# Keep only the rows whose category is NOT in the chosen exclusion list.
df = df[~df['traffic_category'].isin(target1)].copy()

# Quick sanity checks: class counts, size, column names and the first rows.
print(df['traffic_category'].value_counts())
print(df.shape)
print(df.columns.tolist())
print(df.head(10))

# Exploratory Data Analysis (Cek Kriteria)

In [None]:
import pandas as pd
# Show every column when a wide frame is printed.
pd.set_option('display.max_columns', None)

# Volume / timing features to inspect per traffic category.
features_id = [
    'fwd_data_pkts_tot',
    'fwd_iat.tot',
    'flow_pkts_payload.tot',
    'fwd_pkts_payload.tot',
    'bwd_pkts_payload.tot',
    'flow_duration',
    'flow_iat.tot',
    'fwd_iat.max',
    'bwd_iat.tot',
    'fwd_subflow_pkts',
    'bwd_subflow_pkts',
    'fwd_subflow_bytes',
    'bwd_subflow_bytes',
    'down_up_ratio'
]

# TCP flag counters to inspect per traffic category.
features = [
    'flow_FIN_flag_count',
    'flow_SYN_flag_count',
    'flow_ACK_flag_count',
    'flow_RST_flag_count',
    'fwd_PSH_flag_count',
    'bwd_PSH_flag_count',
    'flow_CWR_flag_count',
    'flow_ECE_flag_count'
]

# The original cell grouped on an undefined name (`ppt`) and failed with NameError on
# a fresh kernel. Both lists declared above are used instead.
# NOTE(review): narrow this to a single list if only one group was intended.
eda_features = features_id + features

# Per category and per feature: number of rows and number of distinct values.
# Further aggregations ('min', 'max', 'mean', 'median', 'std') can be added to the list.
statistik = df.groupby('traffic_category')[eda_features].agg(
    ['count', 'nunique']
)

print("--- Statistik Deskriptif Fitur Teratas untuk Setiap Jenis Serangan ---")
print(statistik)

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# `df` is expected to be the full DataFrame loaded earlier in the notebook.

# Features to compare across the traffic categories (one figure per feature).
features = ['flow_duration']

for feature in features:
    # One figure/axes pair per feature; seaborn draws one bar per traffic_category.
    fig, ax = plt.subplots(figsize=(12, 7))
    sns.barplot(data=df, x='traffic_category', y=feature, ax=ax)

    ax.set_title(f'Persebaran Fitur "{feature}" per Jenis Serangan', fontsize=16)
    ax.set_xlabel('Jenis Serangan', fontsize=12)
    ax.set_ylabel(f'Nilai {feature}', fontsize=12)

    # Tilt the category names so long labels do not overlap.
    plt.xticks(rotation=45)
    ax.grid(True)
    fig.tight_layout()
    plt.show()

In [None]:
# List the columns of df that are stored with the 'object' dtype (i.e. non-numeric
# columns that would need encoding before they could be used as features).
object_columns = df.select_dtypes(include='object')
object_columns.dtypes

# Split Data

In [None]:
# Separate the feature matrix (x) from the target (y).
# Only numeric columns are kept as features; 'Label' is removed from them when present
# (errors='ignore' makes the drop a no-op otherwise).
numeric_columns = df.select_dtypes(include=['number'])
x = numeric_columns.drop(columns=['Label'], errors='ignore')

# The target is the 'Label' column of the original frame.
y = df['Label']

print(x.shape)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as the test set. The split is stratified on y and uses a
# fixed seed so that it is repeatable.
split_parts = train_test_split(x, y, test_size=0.2, stratify=y, random_state=42)
xTrain, xTest, yTrain, yTest = split_parts

# Report the dimensions of both partitions.
for heading, frame in (
    ("Dimensi DATA TRAIN (xTrain): ", xTrain),
    ("Dimensi DATA TES (xTest): ", xTest),
):
    print(heading)
    print(frame.shape)

In [None]:
# Share of each class in the training labels, expressed in percent.
train_class_share = yTrain.value_counts(normalize=True) * 100
print(train_class_share)

# Feature Selection (Permutation)

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score
from tqdm.notebook import tqdm # Progress bar

# Baseline: fit a random forest on every training feature.
print("Melatih model baseline...")
model = RandomForestClassifier(random_state=42, n_jobs=-1)
model.fit(xTrain, yTrain)

# Baseline performance.
# NOTE(review): the importances below are measured on xTest/yTest, so the feature
# selection has seen the test set and later test metrics will be optimistic. Consider
# carving a validation split out of xTrain for this step.
yPred_baseline = model.predict(xTest)
baseline_score = f1_score(yTest, yPred_baseline)
print(f"Skor F1 Baseline: {baseline_score:.4f}")

# Seeded generator: without it every run shuffles differently and the ranking (and
# therefore the selected features) is not reproducible.
rng = np.random.default_rng(42)

# Permutation importance: drop in F1 when a single feature is shuffled.
importances = {}
for col in tqdm(xTrain.columns, desc="Menghitung Importance Fitur"):
    # Work on a copy so that xTest itself is never modified.
    xTest_shuffled = xTest.copy()

    # Assign a permuted copy of the column. The previous in-place shuffle of `.values`
    # only worked while `.values` happened to be a writable view of the frame; under
    # pandas copy-on-write that array is read-only and the shuffle fails.
    xTest_shuffled[col] = rng.permutation(xTest[col].to_numpy())

    # Predict with exactly one feature scrambled and score the result.
    yPred_shuffled = model.predict(xTest_shuffled)
    shuffled_score = f1_score(yTest, yPred_shuffled)

    # Importance = how much the score dropped relative to the baseline.
    importances[col] = baseline_score - shuffled_score

# Rank the features from the largest to the smallest score drop.
print("\n--- Hasil Permutation Feature Importance ---")
importance_df = pd.DataFrame(
    list(importances.items()),
    columns=['Fitur', 'Importance']
).sort_values('Importance', ascending=False)

print(importance_df.head(15))

# --- Build the reduced data sets ---

# Number of top-ranked features to keep (tunable).
top_features = 15

# Names of the N highest-ranked features.
selected_feature = importance_df['Fitur'].head(top_features).to_list()

# Train / test frames restricted to the selected features.
xTrain_selected = xTrain[selected_feature]
xTest_selected = xTest[selected_feature]

# --- Verification ---
print(f"\nBerhasil memilih {top_features} fitur teratas.")
print(f"Bentuk xTrain baru: {xTrain_selected.shape}")
print(f"Bentuk xTest baru: {xTest_selected.shape}")

In [None]:
# Manual feature selection.
# Assigning xTrain_selected / xTest_selected here replaces the frames produced by the
# permutation-based selection cell that uses the same names.

# Base hand-picked set: idle/active time, payload totals and averages, duration, subflows.
features = [
    'idle.tot', 'active.tot',
    'flow_pkts_payload.tot', 'flow_pkts_payload.avg',
    'fwd_pkts_payload.tot', 'fwd_pkts_payload.avg',
    'bwd_pkts_payload.tot', 'bwd_pkts_payload.avg',
    'flow_duration',
    'fwd_subflow_pkts', 'bwd_subflow_pkts',
    'fwd_subflow_bytes', 'bwd_subflow_bytes',
]

# Features noted by the author as appearing consistently in the permutation ranking.
features2 = [
    'bwd_header_size_tot', 'bwd_pkts_payload.tot', 'fwd_pkts_payload.max',
    'flow_pkts_payload.tot', 'bwd_pkts_tot', 'fwd_subflow_bytes',
    'fwd_data_pkts_tot', 'fwd_iat.tot', 'bwd_iat.tot',
    'flow_RST_flag_count', 'flow_duration', 'idle.tot', 'active.tot',
]

# Set labelled "class x" by the author: the base set followed by three responh_* columns.
# NOTE(review): no visible cell creates responh_2..4 -- confirm they exist in the CSV.
features3 = features + ['responh_2', 'responh_3', 'responh_4']

# Restrict the train / test frames to the chosen set.
xTrain_selected = xTrain.loc[:, features3]
xTest_selected = xTest.loc[:, features3]

xTrain_selected.shape

In [None]:
# Export DF dengan fitur terpilih
import pandas as pd

# Write the permutation-selected feature names (`selected_feature`) to a one-column CSV.
export_path = 'feature_permutasi.csv'
df_fitur = pd.DataFrame({'Nama Fitur Pilihan': selected_feature})
df_fitur.to_csv(export_path, index=False)

print(f"File '{export_path}' berhasil dibuat!")

# Scaling

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardise the selected features. The scaler is fitted on the training data only
# and the same fitted scaler is then applied to both partitions.
scaler = StandardScaler().fit(xTrain_selected)
xTrain_scaled = scaler.transform(xTrain_selected)
xTest_scaled = scaler.transform(xTest_selected)

# Show the shape followed by the scaled training matrix.
print(xTrain_scaled.shape, xTrain_scaled, sep="\n")

In [None]:
import joblib

# Simpan objek 'scaler' 
# joblib.dump(scaler, 'scaler.joblib')
# xTest_selected.to_csv('xTest_selected.csv', index=False)
# xTrain_selected.to_csv('xTrain_selected.csv', index=False)
# joblib.dump(rf_model, 'rf_model_3.joblib')
# joblib.dump(xTest_selected, 'xTest_selected.joblib')

# Hybrid-sampling (SMOTE-ENN)

In [None]:
from imblearn.combine import SMOTEENN

# Hybrid resampling of the scaled training data with SMOTE-ENN (seeded, all cores).
# Only the training partition is resampled; the test data is left untouched.
smote_enn = SMOTEENN(n_jobs=-1, random_state=42)

print("Memulai resampling SMOTEEENN...")
resampled = smote_enn.fit_resample(xTrain_scaled, yTrain)
xTrain_resampled, yTrain_resampled = resampled
print("Resampling selesai!!")

In [None]:
# Compare the class shares (in percent) before and after the hybrid-sampling step:
# first the original training labels, then the resampled ones.
for label_series in (yTrain, yTrain_resampled):
    print(label_series.value_counts(normalize=True) * 100)

In [None]:
# Traffic category of every test row, looked up in df through the test-set index labels.
yTest_category = df.loc[yTest.index, 'traffic_category']

# Selected test features side by side with their category. Both inputs keep their
# original index, so concat aligns the rows on it.
yTest_category_label = pd.concat([
    xTest_selected,
    yTest_category
], axis=1)

# The original statement `yTest_category_label.head` lacked the call parentheses and
# therefore did nothing; show the first rows explicitly.
print(yTest_category_label.head())
print(yTest_category_label.shape)

# The index is written too, so rows can be traced back to df.
yTest_category_label.to_csv('yTest_category.csv')

In [None]:
# Display the size of the full (unselected) training frame as (rows, features).
n_train_rows, n_train_features = xTrain.shape
(n_train_rows, n_train_features)

# Training Model (Random Forest)

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Hyper-parameters of the final random forest, gathered in one place.
rf_params = dict(
    n_estimators=200,
    criterion='entropy',
    max_depth=30,
    min_samples_split=2,
    min_samples_leaf=10,
    max_features='sqrt',
    bootstrap=True,
    class_weight='balanced',   # alternative tried by the author: {0:1, 1:10}
    random_state=42,
    n_jobs=-1,
    verbose=2,
)
rf_model = RandomForestClassifier(**rf_params)

# Fit on the SMOTE-ENN resampled training data.
# Without resampling the call would be: rf_model.fit(xTrain_scaled, yTrain)
rf_model.fit(xTrain_resampled, yTrain_resampled)

# Evaluasi Model (Random Forest)

In [None]:
import joblib

# Previously saved random-forest models stored in the Kaggle input dataset.
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary code --
# only load model files from a trusted source.
_model_paths = (
    '/kaggle/input/rf-models/rf_model_1.joblib',
    '/kaggle/input/rf-models/rf_model_2.joblib',
    '/kaggle/input/rf-models/rf_model_3.joblib',
)
rf_model_all, rf_model_2, rf_model_3 = map(joblib.load, _model_paths)

In [None]:
#KLASIFIKASI BINER

from sklearn.metrics import (
    classification_report, 
    confusion_matrix, 
    roc_auc_score, 
    roc_curve, 
    accuracy_score,
    precision_recall_curve
)
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Predicted probability of the positive class (column 1 of predict_proba).
yPred_prob = rf_model.predict_proba(xTest_scaled)[:, 1]

# Precision / recall for every candidate threshold, and the F1 score of each point
# (defined as 0 where precision + recall is 0 to avoid dividing by zero).
# NOTE(review): the threshold is tuned on the test set itself, so the metrics reported
# below are optimistic; tune it on a validation split if an unbiased estimate is needed.
precision, recall, thresholds = precision_recall_curve(yTest, yPred_prob)
f1_scores = np.divide(2 * recall * precision, recall + precision,
                      out=np.zeros_like(recall), where=(recall+precision) != 0)

# precision/recall hold one more entry than thresholds (the final point has no
# threshold), so the search is limited to the entries that do have one.
best_threshold = thresholds[np.argmax(f1_scores[:-1])]
print(f"Threshold optimal yang ditemukan: {best_threshold:.4f}")

# precision_recall_curve counts a sample as positive when score >= threshold, so the
# same rule is applied here. A strict '>' would turn the samples sitting exactly on
# the threshold negative and evaluate a different operating point than the one chosen.
yPred_binary = (yPred_prob >= best_threshold).astype(int)

# Classification report at the chosen threshold.
print("\nEvaluasi model dengan threshold optimal:")
print(classification_report(yTest, yPred_binary, target_names=['Normal(0)', 'Anomali(1)']))

print("\nConfusion Matrix:")
cm = confusion_matrix(yTest, yPred_binary)
plt.figure(figsize=(8,6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=['Normal(0)', 'Anomali(1)'],
            yticklabels=['Normal(0)', 'Anomali(1)'])
plt.title('Confusion Matrix', fontsize=16)
plt.ylabel('Label Aktual')
plt.xlabel('Label Prediksi')
plt.show()

# ROC AUC is computed from the probabilities, independent of the chosen threshold.
auc_score = roc_auc_score(yTest, yPred_prob)
print(f"\nROC AUC score: {auc_score:.4f}")

fpr, tpr, _ = roc_curve(yTest, yPred_prob)
plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (AUC = {auc_score:.2f})')
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--', label='Random Chance')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate (Recall)')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.legend(loc="lower right")
plt.grid(True)
plt.show()

# Cross-tabulation: binary prediction per traffic category of the test rows.
# A single .loc lookup is used instead of selecting whole rows and then the column.
yTest_category = df.loc[yTest.index, 'traffic_category']
evaluasi_detail = pd.crosstab(yTest_category, yPred_binary)
print(f"\nHasil evaluasi per jenis serangan:\n{evaluasi_detail}")

In [None]:
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

# Evaluation using the model's own class predictions (no custom threshold).
# The printed titles say "Multi-Kelas", but the classes reported are simply the
# distinct values present in yTest; the model evaluated is `rf_model`.
yPred = rf_model.predict(xTest_scaled)

# Per-class precision / recall / F1.
print("--- Laporan Klasifikasi Multi-Kelas ---")
report_text = classification_report(yTest, yPred)
print(report_text)

# Confusion matrix with rows and columns in sorted class order.
print("\n--- Confusion Matrix Multi-Kelas ---")
class_names = sorted(yTest.unique())
cm = confusion_matrix(yTest, yPred, labels=class_names)

fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_names,
    yticklabels=class_names,
    ax=ax,
)
ax.set_title('Confusion Matrix Multi-Kelas', fontsize=16)
ax.set_ylabel('Label Aktual')
ax.set_xlabel('Label Prediksi')
plt.show()

# Cross-tabulation of actual labels (rows) against predicted labels (columns).
evaluasi_detail = pd.crosstab(yTest, yPred)
print(f"\nHasil evaluasi per jenis serangan:\n{evaluasi_detail}")

# Cek Variabel & Export Hasil

In [None]:
import joblib
# IPython magic: list the variables currently defined in the notebook namespace.
%whos

# Save variables to separate files. Only the selected test features are exported
# right now; the remaining exports are kept below, disabled, for optional use.
# x.to_csv('x_data.csv', index=False)
# y.to_csv('y_data.csv', index=False)
xTest_selected.to_csv('xTest.csv', index=False)
# sampel_terfilter.to_csv('sample.csv', index=False)
# xTrain.to_csv('xTrain_data.csv', index=False)
# yTest.to_csv('yTest_data.csv', index=False)
# yTrain.to_csv('yTrain_data.csv', index=False)
# joblib.dump(scaler, 'scaler.joblib')
# joblib.dump(rf_model, 'ult_model.joblib')
# joblib.dump(xTest_selected, 'xTest_selected.joblib')

In [None]:
import pandas as pd

# Collect every column name of df (this includes any label columns still in the frame).
feature_list = list(df.columns)

# One-column table holding the names, under the header 'Nama Fitur'.
df_features = pd.DataFrame({'Nama Fitur': feature_list})

# Write the table to an Excel workbook, without the index column.
filename = 'hikari2021_features.xlsx'
df_features.to_excel(filename, index=False)

# Confirm the export and report how many names were written.
print(f"File '{filename}' berhasil dibuat!")
print(f"Total fitur yang diekspor: {len(feature_list)}")