In [15]:
import pandas as pd
import math

In [16]:
def euclidean_distance(point1, point2):
    """
    Compute the Euclidean distance between two points.

    Coordinates are paired by position. The iteration is driven by
    ``point1``; ``point2`` is read at each index that exists in ``point1``.

    Args:
        point1: Indexable sequence of numeric coordinates.
        point2: Indexable sequence of numeric coordinates.

    Returns:
        float: Square root of the sum of squared coordinate differences.
    """
    squared_total = 0
    for axis, coordinate in enumerate(point1):
        squared_total += (coordinate - point2[axis]) ** 2
    return math.sqrt(squared_total)

In [17]:
def get_neighbors(X_train, y_train, test_point, k):
    """
    Find the k training samples closest to a query point.

    Args:
        X_train: Iterable of training feature vectors.
        y_train: Labels, indexed by the position of each vector in X_train.
        test_point: Feature vector to compare against every training vector.
        k: Number of leading entries to keep from the distance-sorted list.

    Returns:
        list[tuple]: ``(distance, label)`` pairs in ascending distance order,
        truncated with the slice ``[:k]``.
    """
    scored = [
        (euclidean_distance(test_point, sample), y_train[position])
        for position, sample in enumerate(X_train)
    ]

    # sorted() is stable, so equally distant samples keep their training order.
    ranked = sorted(scored, key=lambda pair: pair[0])
    return ranked[:k]

In [18]:
def predict_point(neighbors):
    """
    Pick the majority label among a point's nearest neighbors.

    Args:
        neighbors: Iterable of ``(distance, label)`` pairs; only the label
            takes part in the vote.

    Returns:
        The label with the highest vote count. On a tie, the label that was
        encountered first in ``neighbors`` wins.

    Raises:
        ValueError: If ``neighbors`` is empty (raised by ``max``).
    """
    tally = {}
    for _distance, label in neighbors:
        tally[label] = tally.get(label, 0) + 1

    # Dicts iterate in insertion order and max() keeps the first maximum,
    # so ties resolve to the earliest-seen label.
    return max(tally, key=tally.get)

In [19]:
def knn_predict(X_train, y_train, X_test, k):
    """
    Predict a label for every test point using k-nearest-neighbors voting.

    Args:
        X_train: Iterable of training feature vectors.
        y_train: Labels aligned by position with X_train.
        X_test: Iterable of feature vectors to classify.
        k: Number of nearest neighbors that vote; must be at least 1.

    Returns:
        list: One predicted label per element of X_test, in the same order.

    Raises:
        ValueError: If ``k`` is smaller than 1.
    """
    # Reject invalid k up front. k == 0 would otherwise fail inside max() with
    # an unrelated "empty sequence" message, and a negative k would be used as
    # a slice bound, silently voting over all but the last |k| samples.
    if k < 1:
        raise ValueError(f"k must be a positive integer, got {k!r}")

    predictions = []
    for test_point in X_test:
        neighbors = get_neighbors(X_train, y_train, test_point, k)
        predictions.append(predict_point(neighbors))

    return predictions

In [20]:
def load_data(train_path, test_path):
    """
    Load the training and testing datasets from CSV files.

    Both files are read with ``pd.read_csv``. The same five feature columns
    are selected from each; the training file additionally supplies the
    ``label`` column and the testing file the ``ip_address`` column.

    Args:
        train_path: Path of the training CSV.
        test_path: Path of the testing CSV (no label column is read from it).

    Returns:
        tuple: ``(X_train, y_train, X_test, ip_test, train_df, test_df)``
        where the first four items are the ``.values`` arrays of the selected
        columns and the last two are the full DataFrames.
    """
    # Feature columns used for both training and testing matrices.
    feature_columns = ['request_count', 'error_count', 'avg_bytes', 'error_rate', 'suspicious_score']

    train_df, test_df = pd.read_csv(train_path), pd.read_csv(test_path)

    # Training side: feature matrix plus labels.
    train_matrix = train_df[feature_columns].values
    train_labels = train_df['label'].values

    # Testing side: feature matrix plus the IP address of each row.
    test_matrix = test_df[feature_columns].values
    test_ips = test_df['ip_address'].values

    return train_matrix, train_labels, test_matrix, test_ips, train_df, test_df

In [21]:
def get_attack_category(label):
    """
    Translate a numeric class label into its attack-category name.

    Args:
        label: Class label; 0 through 3 are the known values.

    Returns:
        str: The category name, or ``"Unknown"`` for any other label.
    """
    names_by_label = {
        0: "Normal Traffic",
        1: "Suspicious Activity",
        2: "Potential Attack",
        3: "Confirmed Attack",
    }
    if label in names_by_label:
        return names_by_label[label]
    return "Unknown"

In [22]:
def print_dataset_info(X_train, y_train, X_test, ip_test, train_df, test_df):
    """
    Print the dataset sizes and a preview of the first rows of each set.

    Args:
        X_train: Training feature rows; each row unpacks into five values
            (request count, error count, avg bytes, error rate, suspicious score).
        y_train: Training labels aligned by position with X_train.
        X_test: Testing feature rows with the same five-value layout.
        ip_test: IP address of each testing row, aligned by position with X_test.
        train_df: Training DataFrame; its ``ip_address`` column is read for the preview.
        test_df: Testing DataFrame (not read by this function).

    Returns:
        None. Everything is written to stdout; the average-bytes feature is
        not included in the preview tables.
    """
    divider = "-" * 100
    preview_rows = 5  # show at most the first five rows of each set

    print("\nINFORMASI DATASET")
    print("=" * 50)

    print(f"\nJumlah Data Training: {len(X_train)}")
    print(f"Jumlah Data Testing: {len(X_test)}")

    # --- training preview ---
    print("\nData Training Sample:")
    print(divider)
    print(f"{'IP Address':<20} {'Label':<20} {'Request Count':<15} {'Error Count':<15} {'Error Rate':<15} {'Suspicious Score':<15}")
    print(divider)

    for row in range(min(preview_rows, len(X_train))):
        address = train_df['ip_address'].iloc[row]
        category = get_attack_category(y_train[row])
        requests, errors, _, rate, score = X_train[row]
        print(f"{address:<20} {category:<20} {requests:<15.4f} {errors:<15.4f} {rate:<15.4f} {score:<15.4f}")

    # --- testing preview (no label column) ---
    print("\nData Testing Sample:")
    print(divider)
    print(f"{'IP Address':<20} {'Request Count':<15} {'Error Count':<15} {'Error Rate':<15} {'Suspicious Score':<15}")
    print(divider)

    for row in range(min(preview_rows, len(X_test))):
        address = ip_test[row]
        requests, errors, _, rate, score = X_test[row]
        print(f"{address:<20} {requests:<15.4f} {errors:<15.4f} {rate:<15.4f} {score:<15.4f}")

In [23]:
def print_classification_results(ip_addresses, predictions, test_df):
    """
    Print one line per classified IP: address, predicted category, metrics.

    Args:
        ip_addresses: IP address of each classified row.
        predictions: Predicted numeric label for each entry of ip_addresses.
        test_df: Testing DataFrame containing ``ip_address``, ``request_count``,
            ``error_count``, ``error_rate`` and ``suspicious_score`` columns.

    Returns:
        None. The report is written to stdout.

    Raises:
        IndexError: If an IP address has no row in ``test_df``.
    """
    print("\nHASIL KLASIFIKASI")
    print("=" * 80)
    print(f"{'IP Address':<20} {'Predicted Category':<25} {'Details'}")
    print("-" * 80)

    ip_column = test_df['ip_address']
    row_count = len(test_df)

    for position, (ip, pred) in enumerate(zip(ip_addresses, predictions)):
        predicted_category = get_attack_category(pred)

        # Prefer the row at the same position as the prediction. Looking the
        # row up by IP alone would show the first match's metrics for every
        # duplicate IP, i.e. metrics of a row that was not the one classified.
        if position < row_count and ip_column.iloc[position] == ip:
            ip_data = test_df.iloc[position]
        else:
            # Inputs are not positionally aligned with test_df: fall back to
            # the first row carrying this IP.
            ip_data = test_df[ip_column == ip].iloc[0]

        details = f"RC:{ip_data['request_count']:.3f}, EC:{ip_data['error_count']:.3f}, ER:{ip_data['error_rate']:.3f}, SS:{ip_data['suspicious_score']:.3f}"

        print(f"{ip:<20} {predicted_category:<25} {details}")

In [24]:
def print_summary_statistics(predictions):
    """
    Print how many predictions fell into each category, with percentages.

    Args:
        predictions: Sequence of predicted numeric labels.

    Returns:
        None. One line per category that occurs in ``predictions`` is written
        to stdout, in order of first appearance. Nothing beyond the heading is
        printed for an empty sequence.
    """
    print("\nRINGKASAN KLASIFIKASI")
    print("=" * 50)

    categories = {
        0: "Normal Traffic",
        1: "Suspicious Activity",
        2: "Potential Attack",
        3: "Confirmed Attack"
    }

    predicted_counts = {}
    for pred in predictions:
        # Labels outside the known set are grouped under "Unknown", the same
        # fallback get_attack_category uses, instead of raising KeyError.
        category = categories.get(pred, "Unknown")
        predicted_counts[category] = predicted_counts.get(category, 0) + 1

    total = len(predictions)
    for category, count in predicted_counts.items():
        percentage = (count / total) * 100
        print(f"{category:<20}: {count:3d} ({percentage:5.1f}%)")

In [25]:
def main(train_path='data/normalized_logs.csv', test_path='data/testing.csv', k=3):
    """
    Run the KNN classification pipeline end to end and print a report.

    Loads both datasets, prints a dataset preview, classifies every test row
    and prints the per-IP results followed by the category summary.

    Args:
        train_path: Path of the training CSV. Defaults to the path that was
            previously hard-coded.
        test_path: Path of the testing CSV. Defaults to the path that was
            previously hard-coded.
        k: Number of nearest neighbors that vote on each prediction.

    Returns:
        None. All results are written to stdout.
    """
    # Load data
    print("Loading data...")
    X_train, y_train, X_test, ip_test, train_df, test_df = load_data(train_path, test_path)

    # Show dataset information
    print_dataset_info(X_train, y_train, X_test, ip_test, train_df, test_df)

    print(f"\nMelakukan prediksi dengan k={k}...")

    # Classify every test row
    predictions = knn_predict(X_train, y_train, X_test, k)

    # Show per-IP classification results
    print_classification_results(ip_test, predictions, test_df)

    # Show the category summary
    print_summary_statistics(predictions)

# Entry point: run the pipeline only when this code executes as the top-level
# module, not when it is imported.
if __name__ == "__main__":
    main()

Loading data...

INFORMASI DATASET

Jumlah Data Training: 124
Jumlah Data Testing: 15

Data Training Sample:
----------------------------------------------------------------------------------------------------
IP Address           Label                Request Count   Error Count     Error Rate      Suspicious Score
----------------------------------------------------------------------------------------------------
1.34.111.115         Normal Traffic       0.0000          0.0000          0.0000          0.0000         
101.36.106.89        Normal Traffic       0.0148          0.0084          0.2857          0.3000         
103.149.26.249       Suspicious Activity  0.0025          0.0042          0.5000          0.0000         
103.164.60.19        Potential Attack     1.0000          0.1506          0.0887          1.0000         
103.186.212.5        Normal Traffic       0.0049          0.0042          0.3333          0.0000         

Data Testing Sample:
------------------------------