In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
import seaborn as sns
import os

import glob

# ==========================================
# CONFIGURATION
# ==========================================
# Path of the combined FTP capture CSV read by load_and_label_data().
# The loader's error message says it is produced by combine_csvs.py.
DATASET_FILE = '/kaggle/input/adsqwer/ftp_combined_dataset.csv'

def load_and_label_data():
    """Read the combined dataset and promote post-exploit rows to label 2.

    The CSV at ``DATASET_FILE`` is expected to carry a ``label`` column
    (per the original author's note: 0 or 1, written by the merge script)
    and an ``ftp.request.command`` column.

    Returns:
        The loaded DataFrame with refined labels, or ``None`` (after
        printing an error) when ``DATASET_FILE`` does not exist.
    """
    if os.path.exists(DATASET_FILE):
        print(f"Loading {DATASET_FILE}...")
        packets = pd.read_csv(DATASET_FILE)

        # Commands treated as post-exploitation activity.
        post_exploit_cmds = ['RETR', 'STOR', 'DELE', 'MKD', 'RMD', 'SITE']

        # Only rows already flagged as attack traffic (label 1) are upgraded,
        # so label 0 rows that happen to use one of these commands keep label 0.
        already_attack = packets['label'].eq(1)
        uses_exploit_cmd = packets['ftp.request.command'].isin(post_exploit_cmds)
        packets.loc[already_attack & uses_exploit_cmd, 'label'] = 2

        print(f"Data Distribution:\n{packets['label'].value_counts()}")
        return packets

    print(f"ERROR: {DATASET_FILE} not found. Please run combine_csvs.py first.")
    return None

def preprocess_data(df):
    """Build the per-packet feature matrix and label vector.

    Missing FTP fields are filled with sentinel values, the FTP command and
    TCP flag strings are integer-encoded with ``LabelEncoder``, and a fixed
    feature subset is selected. IP columns are deliberately not used so the
    model cannot key on specific addresses.

    The input frame is NOT modified: all filling and encoding happens on the
    new frame returned by ``fillna``, so the function can be re-run on the
    same ``df`` and always yields the same result.

    Args:
        df: DataFrame containing at least ``frame.len``, ``tcp.srcport``,
            ``tcp.dstport``, ``tcp.flags``, ``ftp.request.command``,
            ``ftp.response.code`` and ``label``.

    Returns:
        ``(X, y)`` where ``X`` holds the six feature columns and ``y`` is the
        ``label`` column.
    """
    print("Preprocessing data...")

    # fillna without inplace returns a new frame; the caller's df is untouched.
    data = df.fillna({
        'ftp.request.command': 'NONE',
        'ftp.response.code': 0,
        'ftp.response.arg': 'NONE',
    })

    # Encode the categorical FTP command as integers.
    le_cmd = LabelEncoder()
    data['ftp.command_enc'] = le_cmd.fit_transform(data['ftp.request.command'].astype(str))

    # TCP flags may be strings (e.g. 'PA', 'A'); astype(str) also turns a
    # missing flag into a regular category instead of failing the encoder.
    le_flags = LabelEncoder()
    data['tcp.flags_enc'] = le_flags.fit_transform(data['tcp.flags'].astype(str))

    # Encoded columns replace the raw string columns in the feature set.
    features = ['frame.len', 'tcp.srcport', 'tcp.dstport', 'tcp.flags_enc', 'ftp.command_enc', 'ftp.response.code']

    X = data[features]
    y = data['label']

    return X, y

def train_model(X, y):
    """Fit a random forest on a stratified 70/30 split and print its report.

    Args:
        X: Feature matrix (as returned by ``preprocess_data``).
        y: Label vector aligned with ``X``.

    Returns:
        The fitted ``RandomForestClassifier``.
    """
    print("Training Random Forest Model...")
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=42, stratify=y
    )

    clf = RandomForestClassifier(n_estimators=100, random_state=42)
    clf.fit(X_train, y_train)

    print("Model Training Complete.")

    # Evaluation
    y_pred = clf.predict(X_test)

    # Derive the label list and the display names together and hand BOTH to
    # classification_report. Without an explicit labels= the report infers
    # its classes from y_test/y_pred, and a name list of a different length
    # (e.g. only labels {0, 2} present, or a class absent from the test
    # split) makes it raise.
    labels = np.unique(np.asarray(y)).tolist()
    if 2 in labels:
        names_by_label = {0: 'Benign', 1: 'BruteForce', 2: 'PostExploit'}
    else:
        names_by_label = {0: 'Benign', 1: 'Attack'}
    # Unexpected label values fall back to their own string form.
    target_names = [names_by_label.get(lbl, str(lbl)) for lbl in labels]

    print("\nClassification Report:")
    print(classification_report(y_test, y_pred, labels=labels, target_names=target_names))

    return clf

if __name__ == "__main__":
    # Pipeline: load + label -> per-packet features -> train and evaluate.
    df = load_and_label_data()

    # load_and_label_data() returns None when the CSV is missing; an empty
    # frame is skipped as well because there is nothing to train on.
    if df is not None and len(df) > 0:
        X, y = preprocess_data(df)
        model = train_model(X, y)
        print("\nDone! The model is trained and ready.")

        # Placeholder: only the heading below is printed, no prediction is run.
        # A real example would build one row matching the feature columns
        # (len=100, ports, flags='PA', cmd='RETR', code=226) and would have to
        # reuse the fitted LabelEncoders, which preprocess_data() does not return.
        print("\nTest Prediction (Simulated 'RETR' command):")


Loading /kaggle/input/adsqwer/ftp_combined_dataset.csv...
Data Distribution:
label
1    11348
0     1869
2       27
Name: count, dtype: int64
Preprocessing data...
Training Random Forest Model...
Model Training Complete.

Classification Report:
              precision    recall  f1-score   support

      Benign       0.68      0.64      0.66       561
  BruteForce       0.94      0.95      0.95      3405
 PostExploit       0.78      0.88      0.82         8

    accuracy                           0.91      3974
   macro avg       0.80      0.82      0.81      3974
weighted avg       0.90      0.91      0.91      3974


Done! The model is trained and ready.

Test Prediction (Simulated 'RETR' command):


In [None]:
# ==========================================================
# FTP Intrusion Detection using Time‑Window Aggregation
# ==========================================================

import pandas as pd
import numpy as np
import os

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix

# ==========================================================
# CONFIGURATION
# ==========================================================
# Path of the combined FTP capture CSV read by load_and_label_data().
DATASET_FILE = '/kaggle/input/adsqwer/ftp_combined_dataset.csv'
# Width of each aggregation window, passed to create_time_window_features().
# Other values the author noted for experimentation: '2s', '5s'.
TIME_WINDOW = '10s'

# FTP commands that turn an attack row (label 1) into a post-exploit row
# (label 2); also counted per window as post_exploit_cmd_count.
POST_EXPLOIT_CMDS = ['RETR', 'STOR', 'DELE', 'MKD', 'RMD', 'SITE']

# ==========================================================
# LOAD + AUTO LABEL
# ==========================================================
def load_and_label_data():
    """Load the combined dataset, clean key fields and refine attack labels.

    Steps, in order:
      1. ``frame.time_epoch`` is coerced to numeric (unparseable values
         become NaN).
      2. Missing FTP command / response code / TCP flags get sentinel values.
      3. Rows labelled 1 whose command is in ``POST_EXPLOIT_CMDS`` are
         relabelled 2.

    Returns:
        The cleaned, relabelled DataFrame.

    Raises:
        FileNotFoundError: if ``DATASET_FILE`` does not exist.
    """
    if not os.path.exists(DATASET_FILE):
        raise FileNotFoundError(f"{DATASET_FILE} not found")

    print(f"[+] Loading dataset: {DATASET_FILE}")
    packets = pd.read_csv(DATASET_FILE)

    # Timestamps must be numeric for the later epoch -> datetime conversion.
    epoch = pd.to_numeric(packets['frame.time_epoch'], errors='coerce')
    packets['frame.time_epoch'] = epoch

    # Sentinels for fields that are absent on packets without an FTP payload.
    sentinels = {
        'ftp.request.command': 'NONE',
        'ftp.response.code': 0,
        'tcp.flags': 'NONE'
    }
    packets = packets.fillna(sentinels)

    # Upgrade attack rows that issue a post-exploit command.
    already_attack = packets['label'].eq(1)
    uses_exploit_cmd = packets['ftp.request.command'].isin(POST_EXPLOIT_CMDS)
    packets.loc[already_attack & uses_exploit_cmd, 'label'] = 2

    print("\n[+] Label distribution:")
    print(packets['label'].value_counts())

    return packets


# ==========================================================
# TIME WINDOW FEATURE ENGINEERING
# ==========================================================
def create_time_window_features(df, window_size='5s'):
    """Aggregate packet rows into per-flow, fixed-width time-window samples.

    A flow is identified by the ``tcp.srcport``/``tcp.dstport`` pair. For each
    flow, windows of width ``window_size`` are laid out back to back starting
    at the flow's earliest timestamp; every non-empty window becomes one
    output row. Rows are ordered by flow id, then by window position.

    The window index of each packet is computed arithmetically
    (``(timestamp - flow_start) // window``) and all statistics come from a
    single groupby, so the cost depends on the number of packets only. It
    does not depend on how much idle time separates a flow's first and last
    packet, which is what made a step-by-step walk over every (mostly empty)
    window impractically slow for flows whose port pair recurs far apart in
    time.

    Args:
        df: Packet-level DataFrame with ``frame.time_epoch`` (seconds),
            ``tcp.srcport``, ``tcp.dstport``, ``frame.len``,
            ``ftp.request.command``, ``ftp.response.code``, ``tcp.flags`` and
            ``label``. It is not modified.
        window_size: Window width, anything ``pd.to_timedelta`` accepts
            (e.g. ``'5s'``).

    Returns:
        DataFrame with columns ``pkt_count``, ``bytes_sum``, ``bytes_mean``,
        ``user_cmd_count``, ``pass_cmd_count``, ``post_exploit_cmd_count``,
        ``login_success_count``, ``syn_count``, ``pa_count`` and ``label``
        (the most frequent label in the window; ties resolve to the smallest
        label). The columns are present even when no window is produced.

    Raises:
        ValueError: if ``window_size`` is not a positive duration.
    """
    print(f"\n[+] Creating time‑window features ({window_size})")

    td_window = pd.to_timedelta(window_size)
    # A zero or negative width would never advance past the flow's packets.
    if pd.isna(td_window) or td_window <= pd.Timedelta(0):
        raise ValueError(f"window_size must be a positive duration, got {window_size!r}")

    feature_columns = [
        'pkt_count', 'bytes_sum', 'bytes_mean',
        'user_cmd_count', 'pass_cmd_count', 'post_exploit_cmd_count',
        'login_success_count', 'syn_count', 'pa_count', 'label',
    ]

    work = df.copy()

    # Flow identifier: source and destination port joined as strings.
    work['flow_id'] = (
        work['tcp.srcport'].astype(str) + "_" +
        work['tcp.dstport'].astype(str)
    )
    work['timestamp'] = pd.to_datetime(work['frame.time_epoch'], unit='s')

    # Packets without a usable timestamp cannot be placed in any window.
    work = work[work['timestamp'].notna()].copy()

    if work.empty:
        windowed_df = pd.DataFrame(columns=feature_columns)
        print(f"[+] Generated {len(windowed_df)} window samples")
        return windowed_df

    # Windows are anchored at each flow's first packet.
    flow_start = work.groupby('flow_id')['timestamp'].transform('min')
    work['window_idx'] = (work['timestamp'] - flow_start) // td_window

    # 0/1 indicator columns so that a plain sum yields the per-window counts.
    command = work['ftp.request.command']
    flags = work['tcp.flags']
    work['_is_user'] = (command == 'USER').astype('int64')
    work['_is_pass'] = (command == 'PASS').astype('int64')
    work['_is_post_exploit'] = command.isin(POST_EXPLOIT_CMDS).astype('int64')
    work['_is_login_ok'] = (work['ftp.response.code'] == 230).astype('int64')
    work['_is_syn'] = (flags == 'S').astype('int64')
    work['_is_pa'] = (flags == 'PA').astype('int64')

    windowed_df = (
        work.groupby(['flow_id', 'window_idx'], sort=True)
        .agg(
            pkt_count=('frame.len', 'size'),
            bytes_sum=('frame.len', 'sum'),
            bytes_mean=('frame.len', 'mean'),
            user_cmd_count=('_is_user', 'sum'),
            pass_cmd_count=('_is_pass', 'sum'),
            post_exploit_cmd_count=('_is_post_exploit', 'sum'),
            login_success_count=('_is_login_ok', 'sum'),
            syn_count=('_is_syn', 'sum'),
            pa_count=('_is_pa', 'sum'),
            # mode() is sorted, so the first entry is the majority label
            # (smallest label on a tie).
            label=('label', lambda labels: labels.mode().iloc[0]),
        )
        .reset_index(drop=True)
    )

    print(f"[+] Generated {len(windowed_df)} window samples")
    return windowed_df

# ==========================================================
# TRAIN + EVALUATE MODEL
# ==========================================================
def train_model(df_windowed):
    """Train and evaluate a class-balanced random forest on window features.

    Args:
        df_windowed: Output of ``create_time_window_features``: one row per
            window with a ``label`` column; every other column is used as a
            feature.

    Returns:
        The fitted ``RandomForestClassifier``.
    """
    print("\n[+] Training model")

    X = df_windowed.drop(columns=['label'])
    y = df_windowed['label']

    X_train, X_test, y_train, y_test = train_test_split(
        X, y,
        test_size=0.3,
        random_state=42,
        stratify=y
    )

    clf = RandomForestClassifier(
        n_estimators=200,
        max_depth=12,
        random_state=42,
        class_weight='balanced'
    )

    clf.fit(X_train, y_train)

    print("[+] Model training complete")

    y_pred = clf.predict(X_test)

    # One explicit, sorted label list drives both the report and the matrix.
    # Inferring names from y_test alone breaks when the classifier predicts a
    # class that is absent from the test split (the report then sees more
    # classes than names), and leaves the matrix row/column order implicit.
    class_labels = np.unique(np.asarray(y)).tolist()
    display_names = {0: 'Benign', 1: 'BruteForce', 2: 'PostExploit'}
    # Labels outside the known set get their own distinct name.
    target_names = [display_names.get(lbl, f"Class {lbl}") for lbl in class_labels]

    print("\n===== Classification Report =====")
    print(classification_report(
        y_test, y_pred,
        labels=class_labels,
        target_names=target_names
    ))
    print("\n===== Confusion Matrix =====")
    print(confusion_matrix(y_test, y_pred, labels=class_labels))

    return clf


# ==========================================================
# MAIN
# ==========================================================
# Runs at cell level (the __main__ guard is intentionally not used here), so
# df, df_windowed and model remain available as notebook globals afterwards.
# Order matters: each step consumes the previous step's result.
df = load_and_label_data()
df_windowed = create_time_window_features(df, TIME_WINDOW)
model = train_model(df_windowed)

print("\n[✓] FTP IDS model is ready!")


[+] Loading dataset: /kaggle/input/adsqwer/ftp_combined_dataset.csv

[+] Label distribution:
label
1    11348
0     1869
2       27
Name: count, dtype: int64

[+] Creating time‑window features (10s)
