<a href="https://colab.research.google.com/github/AirisAgraf/Anser1/blob/main/IPS_Ansser2_Success.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [6]:
import pandas as pd
import numpy as np
import requests
from io import BytesIO
import gzip
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from imblearn.over_sampling import SMOTENC
from imblearn.pipeline import Pipeline as ImbPipeline
from sklearn.compose import ColumnTransformer

# Κατέβασμα και αποσυμπίεση του συνόλου δεδομένων KDD Cup 1999
url = "http://kdd.ics.uci.edu/databases/kddcup99/kddcup.data_10_percent.gz"
response = requests.get(url, stream=True)
compressed_file = BytesIO(response.content)
decompressed_file = gzip.GzipFile(fileobj=compressed_file)


# Φόρτωση του συνόλου δεδομένων
columns = ["duration", "protocol_type", "service", "flag", "src_bytes", "dst_bytes", "land", "wrong_fragment", "urgent", "hot", "num_failed_logins", "logged_in", "num_compromised", "root_shell", "su_attempted", "num_root", "num_file_creations", "num_shells", "num_access_files", "num_outbound_cmds", "is_host_login", "is_guest_login", "count", "srv_count", "serror_rate", "srv_serror_rate", "rerror_rate", "srv_rerror_rate", "same_srv_rate", "diff_srv_rate", "srv_diff_host_rate", "dst_host_count", "dst_host_srv_count", "dst_host_same_srv_rate", "dst_host_diff_srv_rate", "dst_host_same_src_port_rate", "dst_host_srv_diff_host_rate", "dst_host_serror_rate", "dst_host_srv_serror_rate", "dst_host_rerror_rate", "dst_host_srv_rerror_rate", "target"]
df = pd.read_csv(decompressed_file, header=None, names=columns)

# Δημιουργία των ετικετών (Normal=0, Attack=1)
X = df.drop("target", axis=1)
y = df["target"].apply(lambda x: 0 if x == "normal." else 1)

# Διαχωρισμός δεδομένων σε training και test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

# Καθορισμός κατηγορικών και αριθμητικών χαρακτηριστικών
categorical_features = ['protocol_type', 'service', 'flag']
numeric_features = X.columns.difference(categorical_features)

# Μετασχηματισμός χαρακτηριστικών
 # Define the column transformer outside the loop
preprocessor = ColumnTransformer([
    ('num', StandardScaler(), numeric_features),
    ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_features) #sparse=False for SMOTENC compatibility
])
# Δημιουργία του ταξινομητή και pipeline
classifier = RandomForestClassifier(n_estimators=100, warm_start=True, random_state=42)
pipeline = ImbPipeline([
    ('preprocessor', preprocessor),
    ('classifier', classifier)
])

# Fit the preprocessor on the training data
preprocessor.fit(X_train)

# Συνεχής μάθηση με batch processing
batch_size = 10000
blocking_threshold = 0.9  # Δυναμικά θα προσαρμόζεται
for epoch in range(1, 3):
    for i in range(0, len(X_train), batch_size):
        X_batch = X_train.iloc[i:i + batch_size]
        y_batch = y_train.iloc[i:i + batch_size]

        # Εφαρμογή oversampling για αντιμετώπιση ανισορροπίας κλάσεων
        categorical_features_indices = [X_batch.columns.get_loc(col) for col in categorical_features]

        smote = SMOTENC(categorical_features=categorical_features_indices, random_state=42)

        # Apply preprocessor before SMOTE
        X_batch_transformed = preprocessor.transform(X_batch)

        # Resample the transformed batch
        X_batch_resampled, y_batch_resampled = smote.fit_resample(X_batch_transformed, y_batch)

        # Instead of converting back to DataFrame with original column names,
        # fit the classifier directly on the transformed data

        # Σταδιακή εκπαίδευση του μοντέλου
        # The pipeline already contains the preprocessor,
        # so you only need to fit the classifier part
        pipeline.named_steps['classifier'].fit(X_batch_resampled, y_batch_resampled)

# Τελική αξιολόγηση
y_pred_proba = pipeline.predict_proba(X_test)[:, 1]
blocked_indices = np.where(y_pred_proba > blocking_threshold)[0]
accuracy = accuracy_score(y_test, y_pred_proba > blocking_threshold)
classification_rep = classification_report(y_test, y_pred_proba > blocking_threshold)
print(f"Final Test Accuracy: {accuracy:.4f}")
print("Classification Report:\n", classification_rep)

  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(


Final Test Accuracy: 0.9953
Classification Report:
               precision    recall  f1-score   support

           0       0.98      1.00      0.99     19456
           1       1.00      0.99      1.00     79349

    accuracy                           1.00     98805
   macro avg       0.99      1.00      0.99     98805
weighted avg       1.00      1.00      1.00     98805

