<a href="https://colab.research.google.com/github/Frz1927/UTSPBO/blob/main/UAS.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# LIBRARY

In [42]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder, OneHotEncoder
from sklearn.feature_selection import SelectKBest, chi2
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import zscore

# TUGAS 4 MENCARI DATASET

In [None]:
# Colab-only step: call Colab's upload helper and keep its return value in
# `uploaded`. A later cell reads 'cybersecurity_attacks.csv', so that is
# presumably the file to upload here (TODO confirm).
from google.colab import files
uploaded = files.upload()


# TUGAS 5 DATA *UNDERSTANDING*

1. Memahami Struktur Data


In [None]:
# Load the dataset (pandas is already imported as `pd` in the LIBRARY cell).
df = pd.read_csv('cybersecurity_attacks.csv')

# Only a cell's last expression is rendered automatically, so a bare
# `df.head()` in the middle of the cell is silently discarded. `display` is
# the built-in the notebook kernel provides for rendering intermediate results.
display(df.head())

# info() prints its own summary, so it needs no display call.
df.info()

# Last expression of the cell: rendered as the cell output.
df.dtypes

2. Statistik Deskriptif

In [None]:
# Descriptive statistics using describe()'s default column selection.
numeric_summary = df.describe()
numeric_summary

In [None]:
# Descriptive statistics restricted to the object-dtype columns.
object_summary = df.describe(include='object')
object_summary

3. Memeriksa Missing Value

In [None]:
# Number of missing values in each column.
df.isna().sum()

In [None]:
# Share of missing values in each column, expressed as a percentage.
df.isna().mean().mul(100)

4. Distribusi Dan Data Unik

In [None]:
# Distinct-value count per column (a low count can hint at a categorical column).
for col, n_unique in df.nunique().items():
    print(f"{col}: {n_unique} nilai unik")

In [None]:
# Frequency of each value, using "Severity Level" as the example column.
severity_counts = df['Severity Level'].value_counts()
severity_counts


5. Visualisasi

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# One histogram per numeric column.
df.hist(figsize=(12, 10), bins=20)
plt.show()

# Pairwise correlation between the numeric features, drawn as an annotated heatmap.
numeric_corr = df.corr(numeric_only=True)
sns.heatmap(numeric_corr, cmap='coolwarm', annot=True)
plt.show()


6. Cek Outlier Sederhana

In [None]:
# Quick outlier check: box plot of one numeric column.
sns.boxplot(data=df, x='Packet Length')


# TUGAS 6 DATA PREPROCESSING HINGGA PEMODELAN

1. Pembersihan data

In [None]:
 # Fill missing values with an explicit placeholder label per column.
# The filled Series is assigned back to its column instead of calling
# `df[col].fillna(..., inplace=True)`: that chained in-place form is
# deprecated in pandas 2.x and does not modify `df` under Copy-on-Write.
missing_value_labels = {
    'Alerts/Warnings': "Tidak Ada Peringatan",
    'IDS/IPS Alerts': "Tidak Ada Serangan",
    'Malware Indicators': "Tidak Terdeteksi",
    'Firewall Logs': "Log Tidak Tersedia",
    'Proxy Information': "Data Proxy Hilang",
}
for col, label in missing_value_labels.items():
    df[col] = df[col].fillna(label)

# Remove exact duplicate rows. This acts on the frame itself (not on a
# column selection), so the in-place form is kept for the cells that
# continue to use `df`.
df.drop_duplicates(inplace=True)

# Outlier inspection using the 1.5 * IQR rule. Outliers are only counted
# and plotted here; no rows are removed or modified.
numeric_cols = df.select_dtypes(include=np.number).columns.tolist()

for col in numeric_cols:
    Q1 = df[col].quantile(0.25)
    Q3 = df[col].quantile(0.75)
    IQR = Q3 - Q1
    batas_bawah = Q1 - 1.5 * IQR  # lower fence
    batas_atas = Q3 + 1.5 * IQR   # upper fence

    # Count on the boolean mask directly rather than building a filtered
    # DataFrame just to read its length. Missing values compare as False,
    # so they are not counted as outliers.
    outlier_mask = (df[col] < batas_bawah) | (df[col] > batas_atas)
    outlier_count = int(outlier_mask.sum())

    print(f"Kolom: {col}")
    print(f"Jumlah outlier: {outlier_count}")

    # One explicit figure per column, closed after showing so figures do
    # not accumulate across iterations.
    fig, ax = plt.subplots()
    sns.boxplot(data=df, x=col, ax=ax)
    ax.set_title(f'boxplot {col}')
    plt.show()
    plt.close(fig)

2. Transformasi Data

In [None]:
# Categorical encoding demo: one-hot encode a small hand-written
# 'Protocol' frame (this cell does not touch `df`).
data = pd.DataFrame({'Protocol': ['TCP', 'UDP', 'ICMP']})

encoder = OneHotEncoder(sparse_output=False)
encoder.fit(data)
encoded = encoder.transform(data)

# Label the generated columns with the encoder's feature names.
feature_names = encoder.get_feature_names_out(['Protocol'])
encoded_df = pd.DataFrame(encoded, columns=feature_names)

print(encoded_df)

In [None]:
# Min-max scaling of selected numeric columns; the result goes into a copy
# so `df` itself stays unscaled.
numerical_cols = ['Source Port', 'Destination Port', 'Packet Length', 'Anomaly Scores']

scaler = MinMaxScaler()
scaled_values = scaler.fit_transform(df[numerical_cols])

df_normalized = df.copy()
df_normalized[numerical_cols] = scaled_values

print(df_normalized[numerical_cols].head())

3. Rekayasa Fitur (Feature Engineering) & Pemilihan Fitur (Feature Selection)

In [None]:
# Feature engineering

# Number of rows for every (Traffic Type, Severity Level) combination.
group_keys = ['Traffic Type', 'Severity Level']
combination_sizes = df.groupby(group_keys).size()
traffic_severity_combination = combination_sizes.reset_index(name='Count')

print(traffic_severity_combination)

In [None]:
# Label-encode the data and score the encoded columns against the target
# with a chi-squared test. LabelEncoder is imported in the LIBRARY cell.
#
# NOTE: this cell overwrites columns of `df` with integer codes because the
# following cell consumes the encoded frame. It is therefore not
# re-runnable: reload `df` before running it a second time.

# The target gets a dedicated encoder so that `le` keeps the Attack Type
# class mapping (le.classes_ / le.inverse_transform) after this cell.
le = LabelEncoder()
df['Attack Type'] = le.fit_transform(df['Attack Type'])

# Remaining object/bool columns; the target is already numeric at this point.
categorical_cols = df.select_dtypes(include=['object', 'bool']).columns
if len(categorical_cols) == 0:
    raise RuntimeError(
        "No object/bool columns left to encode - `df` appears to be encoded "
        "already. Reload the dataset before re-running this cell."
    )

# One encoder per column, kept by column name. Refitting a single shared
# encoder on every column would discard each previous mapping, including
# the target's.
feature_encoders = {}
for col in categorical_cols:
    col_encoder = LabelEncoder()
    df[col] = col_encoder.fit_transform(df[col].astype(str))
    feature_encoders[col] = col_encoder

X = df[categorical_cols]
y = df['Attack Type']

# NOTE(review): chi2 expects non-negative features such as counts or
# booleans. Label codes satisfy the sign requirement, but their numeric
# order is arbitrary, so treat these scores as a rough ranking only.
selector = SelectKBest(score_func=chi2, k='all')
selector.fit(X, y)

scores = pd.DataFrame({'Feature': X.columns, 'Chi2 Score': selector.scores_})
scores = scores.sort_values(by='Chi2 Score', ascending=False)
print(scores)

In [None]:
# Model selection: random forest classifier predicting 'Attack Type'.
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# One seed for both the split and the model. Without a model seed the
# forest is rebuilt differently on every run, so the printed report would
# not be reproducible even though the split is fixed.
SEED = 42

# Target column = 'Attack Type'; every other column is used as a feature.
X = df.drop(columns='Attack Type')
y = df['Attack Type']

# Hold out 30% of the rows for evaluation.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=SEED
)

# Fit the model on the training portion.
model = RandomForestClassifier(random_state=SEED)
model.fit(X_train, y_train)

# Predict on the held-out rows and print per-class precision/recall/F1.
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))
