In [1]:
import pandas as pd
import re
from datetime import datetime
import numpy as np

In [2]:
def parse_nginx_log(log_line):
    """Parse one line of an nginx access log into a dict of fields.

    Expected layout::

        <ipv4> - <remote_user> [<timestamp>] "<request>" <status> <bytes> "<referrer>" "<user_agent>"

    The remote-user field may be ``-`` (anonymous) or any non-blank token
    (an authenticated user name); it is matched but not returned.

    Args:
        log_line: A single access-log line.

    Returns:
        A dict with keys ``ip_address``, ``timestamp`` (the raw text between
        the square brackets), ``method``, ``path``, ``status_code`` (int),
        ``bytes_sent`` (int), ``referrer`` and ``user_agent``; or ``None``
        when the line does not match the expected layout.
    """
    # The remote-user field is matched with ``\S+`` rather than a literal "-":
    # requests made by authenticated users would otherwise fail to match and
    # silently disappear from the analysis.
    pattern = r'(\d+\.\d+\.\d+\.\d+) - \S+ \[(.*?)\] "(.*?)" (\d+) (\d+) "(.*?)" "(.*?)"'

    match = re.match(pattern, log_line)
    if match:
        ip, timestamp, request, status, bytes_sent, referrer, user_agent = match.groups()

        # Split the request line into method and path; an empty or truncated
        # request line falls back to empty strings instead of raising.
        request_parts = request.split()
        method = request_parts[0] if len(request_parts) > 0 else ''
        path = request_parts[1] if len(request_parts) > 1 else ''

        return {
            'ip_address': ip,
            'timestamp': timestamp,
            'method': method,
            'path': path,
            'status_code': int(status),
            'bytes_sent': int(bytes_sent),
            'referrer': referrer,
            'user_agent': user_agent
        }

    return None

In [3]:
def process_logs(log_path='data/access.log'):
    """Read an access log and return its parseable lines as a DataFrame.

    Each line is stripped and passed to ``parse_nginx_log``; lines for which
    it returns a falsy value are skipped.

    Args:
        log_path: Path of the log file to read. Defaults to
            ``'data/access.log'``.

    Returns:
        A ``pandas.DataFrame`` with one row per parsed log line.
    """
    log_entries = []
    # Access logs carry client-controlled text (paths, user agents) that is
    # not guaranteed to be valid UTF-8. Decode explicitly and replace bad
    # bytes so a single malformed line cannot abort the whole run.
    with open(log_path, 'r', encoding='utf-8', errors='replace') as file:
        for line in file:
            parsed = parse_nginx_log(line.strip())
            if parsed:
                log_entries.append(parsed)

    # Build the frame once from the collected records
    return pd.DataFrame(log_entries)

**Request count**
- Mendeteksi Brute Force Attack: Penyerang sering melakukan banyak request dalam waktu singkat
- Identifikasi DDoS: Volume request yang sangat tinggi dari satu IP
- Scanner Detection: Tools scanning otomatis biasanya melakukan request berulang
- Rate Limiting Check: Mendeteksi IP yang melebihi batas normal request

**Error count** 
- Deteksi Path Traversal: Banyak error 404 menunjukkan scanning direktori
- SQL Injection Attempts: Error 500 berulang menunjukkan percobaan injeksi
- Authentication Attacks: Error 401/403 berulang menunjukkan percobaan unauthorized access
- Vulnerability Scanning: Tool scanning menghasilkan banyak error response

**avg_bytes (Rata-rata Ukuran Response)**
- Data Exfiltration: Response size besar menunjukkan potensi pencurian data
- DoS Detection: Request yang menghasilkan response sangat besar
- Resource Abuse: Identifikasi penggunaan bandwidth berlebihan
- Anomaly Detection: Penyimpangan dari ukuran response normal

**error_rate (Rasio Error)**
- Attack Success Rate: Rasio tinggi menunjukkan serangan berulang
- Tool Fingerprinting: Scanner otomatis menghasilkan rasio error tinggi
- Attack Pattern Recognition: Perbedaan rasio error normal vs serangan
- Security Misconfiguration: Error rate tinggi bisa menunjukkan misconfig

Contoh ambang batas error rate:
- Normal: <10% error rate
- Warning: 10-30% error rate
- Critical: >30% error rate

In [4]:

def calculate_suspicious_score(row):
    """Compute a heuristic suspicion score for one row of per-IP features.

    Each rule that fires adds its weight to the score:

    * ``request_count`` > 100   -> +0.3
    * ``error_count``   > 10    -> +0.4
    * ``avg_bytes``     > 10000 -> +0.3

    Args:
        row: Mapping-like object (e.g. a DataFrame row) exposing the keys
            ``request_count``, ``error_count`` and ``avg_bytes``.

    Returns:
        The sum of the weights of the rules that fired; ``0`` if none did.
    """
    # (feature, threshold, weight), evaluated top to bottom
    rules = (
        ('request_count', 100, 0.3),
        ('error_count', 10, 0.4),
        ('avg_bytes', 10000, 0.3),
    )

    total = 0
    for feature, threshold, weight in rules:
        if row[feature] > threshold:
            total += weight
    return total

In [5]:
def extract_features(df):
    """Aggregate parsed log entries into one feature row per IP address.

    Resulting columns:

    * ``ip_address``
    * ``request_count``    - number of non-null ``method`` values for the IP
    * ``error_count``      - number of entries with ``status_code`` >= 400
    * ``avg_bytes``        - mean of ``bytes_sent``
    * ``error_rate``       - ``error_count / request_count``
    * ``suspicious_score`` - result of ``calculate_suspicious_score`` per row

    Args:
        df: DataFrame with at least the columns ``ip_address``, ``method``,
            ``status_code`` and ``bytes_sent``.

    Returns:
        A new DataFrame with one row per distinct ``ip_address``.
    """
    def _count_errors(status_codes):
        # Status codes of 400 and above are counted as errors
        return (status_codes >= 400).sum()

    # Named aggregation gives the output columns their final names directly
    per_ip = (
        df.groupby('ip_address')
        .agg(
            request_count=('method', 'count'),
            error_count=('status_code', _count_errors),
            avg_bytes=('bytes_sent', 'mean'),
        )
        .reset_index()
    )

    # Share of this IP's requests that ended in an error
    per_ip['error_rate'] = per_ip['error_count'] / per_ip['request_count']

    # Row-wise heuristic score built from the raw (un-normalized) features
    per_ip['suspicious_score'] = per_ip.apply(calculate_suspicious_score, axis=1)

    return per_ip

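**Menyeragamkan Skala Data (Normalisasi Min-Max)**
- Semua nilai diubah ke rentang 0-1
- Memudahkan perbandingan antarfitur

Formula: $x' = \dfrac{x - \min(x)}{\max(x) - \min(x)}$

Jika $\max(x) = \min(x)$ (semua nilai pada kolom sama), seluruh nilai kolom diisi 0 agar tidak terjadi pembagian dengan nol.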


In [6]:
def min_max_normalize(data, column):
    """Min-max scale one column to the range 0-1, implemented by hand.

    Formula: ``(x - min(x)) / (max(x) - min(x))``

    Args:
        data: Object indexable by ``column`` that yields an iterable of
            numbers (e.g. a DataFrame or a dict of lists).
        column: Key of the column to normalize.

    Returns:
        A list of floats, one per input value. An empty column yields an
        empty list. A column whose values are all equal yields all ``0.0``,
        since the formula would otherwise divide by zero.
    """
    # Materialize once so the column is not re-iterated for min, max and
    # the scaling pass.
    values = list(data[column])

    # min()/max() raise ValueError on an empty sequence; nothing to scale.
    if not values:
        return []

    min_val = min(values)
    max_val = max(values)

    if max_val == min_val:
        # Float zeros keep the result type consistent with the scaled branch
        return [0.0] * len(values)

    return [(x - min_val) / (max_val - min_val) for x in values]

In [7]:
def normalize_data(features_df):
    """Return a copy of the feature table with its numeric columns min-max scaled.

    The columns ``request_count``, ``error_count``, ``avg_bytes``,
    ``error_rate`` and ``suspicious_score`` are each replaced by the output
    of ``min_max_normalize``. Other columns are carried over unchanged.

    Args:
        features_df: DataFrame containing the five columns listed above.

    Returns:
        A new DataFrame; ``features_df`` itself is not modified.
    """
    # Columns to normalize
    columns_to_normalize = ['request_count', 'error_count', 'avg_bytes', 'error_rate', 'suspicious_score']

    # Work on a copy so the caller keeps the raw features and re-running
    # this step on the same input gives the same result.
    normalized_df = features_df.copy()

    # Normalize each column independently
    for column in columns_to_normalize:
        normalized_df[column] = min_max_normalize(normalized_df, column)

    return normalized_df

1. Request Count + Error Rate:
   - Normal: Banyak request, error rate rendah
   - Attack: Banyak request, error rate tinggi

2. Error Count + Avg Bytes:
   - Normal: Error sedikit, bytes normal
   - Attack: Error banyak, bytes tidak normal

3. Suspicious Score sebagai Agregat:
   - Mengkombinasikan semua metrik
   - Memberikan penilaian komprehensif

In [8]:
def assign_labels(row):
    """Map one row of normalized features to a traffic label.

    Three features each contribute 0, 1 or 2 points:

    * ``request_count``:    > 0.8 -> 2, > 0.5 -> 1
    * ``error_rate``:       > 0.7 -> 2, > 0.4 -> 1
    * ``suspicious_score``: > 0.8 -> 2, > 0.5 -> 1

    The point total is then converted to a label:

    * 0: Normal Traffic       (total 0)
    * 1: Suspicious Activity  (total 1-2)
    * 2: Potential Attack     (total 3-4)
    * 3: Confirmed Attack     (total 5-6)

    Args:
        row: Mapping-like object exposing ``request_count``, ``error_rate``
            and ``suspicious_score``.

    Returns:
        The integer label, 0 through 3.
    """
    def _tier_points(value, high, low):
        # 2 points above the high mark, 1 above the low mark, else none
        if value > high:
            return 2
        if value > low:
            return 1
        return 0

    total = (
        _tier_points(row['request_count'], 0.8, 0.5)
        + _tier_points(row['error_rate'], 0.7, 0.4)
        + _tier_points(row['suspicious_score'], 0.8, 0.5)
    )

    # (minimum total, label), checked from the most severe downwards
    for minimum, label in ((5, 3), (3, 2), (1, 1)):
        if total >= minimum:
            return label
    return 0

In [9]:
def print_statistics(df):
    """Print summary statistics of the labelled feature table to stdout.

    Prints, in order: the row count, the minimum of every column except
    ``ip_address`` and ``label``, the maximum of those same columns, and the
    number of rows per label value (sorted by label).

    Args:
        df: DataFrame with a ``label`` column; every column other than
            ``ip_address`` and ``label`` must hold values that can be
            formatted with ``.4f``.
    """
    # Columns that get a min/max line (everything but the id and the label)
    stat_columns = [name for name in df.columns if name not in ('ip_address', 'label')]

    print("\nStatistik Data:")
    print("-" * 50)
    print("\nJumlah data:", len(df))

    # Same report layout twice: once with min, once with max
    for heading, pick in (
        ("\nNilai minimum setiap kolom:", min),
        ("\nNilai maksimum setiap kolom:", max),
    ):
        print(heading)
        for name in stat_columns:
            print(f"{name}: {pick(df[name]):.4f}")

    print("\nDistribusi Label:")
    per_label = df['label'].value_counts().sort_index()
    for label, count in per_label.items():
        print(f"Label {label}: {count} data")

In [10]:
def main():
    """Run the whole pipeline: parse, featurize, normalize, label, save, report.

    Side effects:
        * writes the parsed log entries to ``data/logs.csv``
        * writes the normalized, labelled features to
          ``data/normalized_logs_full.csv``
        * prints progress messages, summary statistics and a 5-row sample
    """
    # Parse the access log
    print("Membaca dan memproses access.log...")
    df = process_logs()

    # Keep a CSV copy of the parsed, pre-aggregation entries
    df.to_csv('data/logs.csv', index=False)

    # Extract per-IP features
    print("Mengekstrak fitur...")
    features_df = extract_features(df)

    # Normalize the feature columns
    print("Melakukan normalisasi data...")
    normalized_df = normalize_data(features_df)

    # Add the label column
    print("Menambahkan label...")
    normalized_df['label'] = normalized_df.apply(assign_labels, axis=1)

    # Save to CSV
    output_file = 'data/normalized_logs_full.csv'
    normalized_df.to_csv(output_file, index=False)
    print(f"\nData telah disimpan ke {output_file}")

    # Show statistics
    print_statistics(normalized_df)

    # The heading announces the first 5 rows, so print exactly 5 rather
    # than dumping up to 100 rows into the cell output.
    print("\nSampel data (5 baris pertama):")
    print(normalized_df.head(5).to_string())

# Entry point: run the pipeline only when this code executes as the main module.
if __name__ == "__main__":
    main()

Membaca dan memproses access.log...
Mengekstrak fitur...
Melakukan normalisasi data...
Menambahkan label...

Data telah disimpan ke data/normalized_logs_full.csv

Statistik Data:
--------------------------------------------------

Jumlah data: 124

Nilai minimum setiap kolom:
request_count: 0.0000
error_count: 0.0000
avg_bytes: 0.0000
error_rate: 0.0000
suspicious_score: 0.0000

Nilai maksimum setiap kolom:
request_count: 1.0000
error_count: 1.0000
avg_bytes: 1.0000
error_rate: 1.0000
suspicious_score: 1.0000

Distribusi Label:
Label 0: 55 data
Label 1: 65 data
Label 2: 4 data

Sampel data (5 baris pertama):
         ip_address  request_count  error_count  avg_bytes  error_rate  suspicious_score  label
0      1.34.111.115       0.000000     0.000000   0.000970    0.000000               0.0      0
1     101.36.106.89       0.014815     0.008368   0.089766    0.285714               0.3      0
2    103.149.26.249       0.002469     0.004184   0.000922    0.500000               0.0      1
