In [None]:
# Mengimpor pustaka yang esensial untuk proses SMOTE
import joblib
import pandas as pd
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler

In [None]:
# Load the dataset produced by the earlier preprocessing notebook (EDA_v2.ipynb).
# Per the original author's notes, this CSV is already cleaned, encoded and
# manually balanced.
# NOTE(review): this cell reads from '../data/' (lowercase) while the cells that
# write the resampled CSV and the scaler use '../Data/'. On a case-sensitive
# filesystem these are different directories -- confirm which spelling matches
# the repository layout.
processed_csv_path = '../data/Bot_IoT_processedV2.csv'
df = pd.read_csv(processed_csv_path)

# Print dtypes and non-null counts as a quick sanity check of the loaded frame.
df.info()
# Display the first five rows for a visual inspection.
df.head(n=5)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26265 entries, 0 to 26264
Data columns (total 41 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   sport            26265 non-null  int64  
 1   dport            26265 non-null  int64  
 2   pkts             26265 non-null  int64  
 3   bytes            26265 non-null  int64  
 4   dur              26265 non-null  float64
 5   mean             26265 non-null  float64
 6   stddev           26265 non-null  float64
 7   sum              26265 non-null  float64
 8   min              26265 non-null  float64
 9   max              26265 non-null  float64
 10  spkts            26265 non-null  int64  
 11  dpkts            26265 non-null  int64  
 12  sbytes           26265 non-null  int64  
 13  dbytes           26265 non-null  int64  
 14  rate             26265 non-null  float64
 15  srate            26265 non-null  float64
 16  drate            26265 non-null  float64
 17  proto_icmp  

Unnamed: 0,sport,dport,pkts,bytes,dur,mean,stddev,sum,min,max,...,flgs_e F,flgs_e t,flgs_e &,flgs_e *,flgs_e d,flgs_e g,flgs_e r,flgs_e s,flgs_eU,attack
0,7108,8976,2,176,5.047946,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
1,9003,11459,2,120,0.000114,0.000114,0.0,0.000114,0.000114,0.000114,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
2,8004,1648,2,120,0.033741,0.033741,0.0,0.033741,0.033741,0.033741,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
3,8827,3718,2,120,0.032499,0.032499,0.0,0.032499,0.032499,0.032499,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
4,7644,2395,6,662,0.000244,0.000244,0.0,0.000244,0.000244,0.000244,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1


In [None]:
# Split the dataset into the target vector (y) and the feature matrix (X).
# 'attack' is the dependent variable to be predicted; every other column is
# kept as a feature.
target_column = 'attack'
y = df[target_column]
X = df.drop(columns=[target_column])

In [None]:
# --- Applying SMOTE (Synthetic Minority Over-sampling Technique) ---
#
# Background: even after the earlier undersampling step, the minority class
# (attack=0) still has fewer samples than the majority class. SMOTE is used to
# balance the class distribution by creating synthetic minority samples.
#
# How it works: SMOTE picks a minority-class sample, finds its k nearest
# neighbours, and creates a new data point along the line segment that joins
# the sample to one of those neighbours.
#
# NOTE(review): X still contains the columns that a later cell lists as
# categorical (label-encoded 'sport'/'dport' and the one-hot columns). Plain
# SMOTE interpolates them like any numeric feature, so synthetic rows may carry
# in-between values there -- consider whether SMOTENC is more appropriate.
# NOTE(review): resampling is applied to the whole dataset; if a train/test
# split happens downstream, confirm that synthetic rows cannot leak into the
# evaluation set.
#
# A fixed seed keeps the resampling reproducible.
SMOTE_RANDOM_STATE = 42
smote = SMOTE(random_state=SMOTE_RANDOM_STATE)
resampled_pair = smote.fit_resample(X, y)
X_resampled, y_resampled = resampled_pair

In [None]:
# Verify the SMOTE result by showing the class distribution after resampling.
resampled_class_counts = pd.Series(y_resampled).value_counts()
print(resampled_class_counts)

# Every class is expected to end up with the same number of samples. Make that
# expectation an explicit check instead of relying on reading the printout.
assert resampled_class_counts.nunique() == 1, (
    f"Classes are still imbalanced after SMOTE: {resampled_class_counts.to_dict()}"
)

attack
0    19941
1    19941
Name: count, dtype: int64


In [None]:
# --- Identify numeric and categorical columns after resampling ---
#
# Goal: separate the purely numeric columns from the (already encoded)
# categorical columns so that scaling is applied to the numeric ones only.

# All columns that represent categorical features: the label-encoded columns
# ('sport', 'dport') and the one-hot encoded columns.
# Several 'flgs_*' names contain runs of spaces; they must match the DataFrame
# column names exactly.
categorical_cols = ["sport", "dport", "proto_icmp", "proto_ipv6-icmp", "proto_tcp",
                    "proto_udp", "state_CON", "state_ECO", "state_FIN", "state_INT",
                    "state_NRS", "state_PAR", "state_REQ", "state_RST", "state_URP",
                    "flgs_e", "flgs_e    F", "flgs_e   t", "flgs_e &", "flgs_e *",
                    "flgs_e d", "flgs_e g", "flgs_e r", "flgs_e s", "flgs_eU"]

# Column names of the resampled feature matrix, in their original order.
all_cols = X_resampled.columns.tolist()

# Fail fast when a hard-coded name does not exist in the data. Without this
# check a misspelled name would silently leave the real column in `num_cols`
# (so it would be scaled) and only surface later as a KeyError when the
# categorical columns are selected.
missing_categorical = [col for col in categorical_cols if col not in all_cols]
if missing_categorical:
    raise KeyError(
        f"categorical_cols entries not found in X_resampled: {missing_categorical!r}"
    )

# Everything that is not declared categorical is treated as purely numeric.
categorical_lookup = set(categorical_cols)
num_cols = [col for col in all_cols if col not in categorical_lookup]

print(num_cols)

['pkts', 'bytes', 'dur', 'mean', 'stddev', 'sum', 'min', 'max', 'spkts', 'dpkts', 'sbytes', 'dbytes', 'rate', 'srate', 'drate']


In [None]:
# --- Scaling the numeric features after resampling ---
#
# IMPORTANT (original author's note): scaling must be applied to the data that
# will be used to train the model, which here is the resampled data
# (X_resampled).
# NOTE(review): the scaler is therefore fit on rows that include the synthetic
# SMOTE samples -- confirm this is intended if the saved scaler is later
# applied to real, unseen data.
scaler = StandardScaler()

# Fit the scaler on the numeric columns only and transform them in one step.
numeric_features = X_resampled[num_cols]
scaled_result = scaler.fit_transform(numeric_features)

# Wrap the scaled values in a DataFrame that keeps the original column names
# and row index so it can be recombined with the categorical columns.
X_resampled_scaled = pd.DataFrame(
    data=scaled_result,
    index=X_resampled.index,
    columns=num_cols,
)

In [None]:
# --- Final recombination ---
#
# Put the scaled numeric columns back together with the categorical columns
# of the resampled DataFrame.

# Select the categorical columns from the resampled feature matrix.
X_resampled_categorical = X_resampled.loc[:, categorical_cols]

# Concatenate column-wise: scaled numeric columns first, categorical columns after.
feature_parts = [X_resampled_scaled, X_resampled_categorical]
X_resampled_processed = pd.concat(feature_parts, axis=1)

In [None]:
# Build the final DataFrame by appending the resampled target vector
# (y_resampled) as the last column of the processed feature matrix
# (X_resampled_processed).
final_parts = [X_resampled_processed, y_resampled]
final_df = pd.concat(final_parts, axis=1)

In [None]:
# Print a concise summary of the final DataFrame as a last verification step.
final_df.info()

# Enforce what the summary is meant to confirm: the column-wise concatenations
# kept exactly one row per resampled sample and did not introduce missing
# values through index misalignment.
assert len(final_df) == len(y_resampled), (
    f"final_df has {len(final_df)} rows but y_resampled has {len(y_resampled)}"
)
assert not final_df.isna().any().any(), "final_df contains missing values after concatenation"

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26265 entries, 0 to 26264
Data columns (total 41 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   pkts             26265 non-null  float64
 1   bytes            26265 non-null  float64
 2   dur              26265 non-null  float64
 3   mean             26265 non-null  float64
 4   stddev           26265 non-null  float64
 5   sum              26265 non-null  float64
 6   min              26265 non-null  float64
 7   max              26265 non-null  float64
 8   spkts            26265 non-null  float64
 9   dpkts            26265 non-null  float64
 10  sbytes           26265 non-null  float64
 11  dbytes           26265 non-null  float64
 12  rate             26265 non-null  float64
 13  srate            26265 non-null  float64
 14  drate            26265 non-null  float64
 15  sport            26265 non-null  int64  
 16  dport            26265 non-null  int64  
 17  proto_icmp  

In [None]:
# Display the first five rows of the final DataFrame for a visual check.
final_df.head(n=5)

Unnamed: 0,pkts,bytes,dur,mean,stddev,sum,min,max,spkts,dpkts,...,flgs_e F,flgs_e t,flgs_e &,flgs_e *,flgs_e d,flgs_e g,flgs_e r,flgs_e s,flgs_eU,attack
0,-0.077967,-0.07111,-0.111816,-0.183322,-0.106786,-0.105419,-0.169021,-0.191095,-0.080777,-0.051129,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
1,-0.077967,-0.071119,-0.135191,-0.183126,-0.106786,-0.105418,-0.168765,-0.190916,-0.081002,-0.050742,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
2,-0.077967,-0.071119,-0.135035,-0.12548,-0.106786,-0.105234,-0.093207,-0.138104,-0.081002,-0.050742,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
3,-0.077967,-0.071119,-0.135041,-0.127609,-0.106786,-0.105241,-0.095998,-0.140055,-0.081002,-0.050742,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
4,-0.077333,-0.071029,-0.13519,-0.182903,-0.106786,-0.105418,-0.168473,-0.190712,-0.080552,-0.049967,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1


In [None]:
# Persist the resampled and scaled DataFrame to a new CSV file (without the
# row index). Per the original notes, this file is the main input for the
# model-training stage.
resampled_csv_path = "../Data/Bot_IoT_SMOTE_Resampled.csv"
final_df.to_csv(resampled_csv_path, index=False)
print("Resampling dan Scaling selesai. File disimpan sebagai Bot_IoT_SMOTE_Resampled.csv")

Resampling dan Scaling selesai. File disimpan sebagai Bot_IoT_SMOTE_Resampled.csv


In [None]:
# --- Class distribution comparison ---
#
# Show the number of samples per class before and after SMOTE to demonstrate
# the effect of the resampling.
counts_before_smote = pd.Series(y).value_counts()
counts_after_smote = pd.Series(y_resampled).value_counts()

print("Perbandingan Kelas 1 dan 0 Sebelum SMOTE:\n", counts_before_smote)
print("|--------------------------------|")
print("Perbandingan Kelas 1 dan 0 Sesudah SMOTE:\n", counts_after_smote)

Perbandingan Kelas 1 dan 0 Sebelum SMOTE:
 attack
1    19941
0     6324
Name: count, dtype: int64
|--------------------------------|
Perbandingan Kelas 1 dan 0 Sesudah SMOTE:
 attack
0    19941
1    19941
Name: count, dtype: int64


In [18]:
# Persist the fitted scaler object to disk so the same scaling can be reused
# later, for example on new data. joblib is imported in the notebook's import
# cell together with the other dependencies.
# NOTE(review): this writes to '../Data/' while the input CSV is read from
# '../data/' -- confirm the directory spelling on case-sensitive filesystems.
joblib.dump(scaler, "../Data/scalerV2.pkl")

['../Data/scalerV2.pkl']