In [2]:
import os
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.metrics import f1_score, recall_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

current_path = os.getcwd()
label_encoder = LabelEncoder()

# Loading Data

In [None]:
selected_features_path = os.path.join(
    current_path,
    "data",
    "original",
    "phase2",
    "selected_features.json"
)
selected_features = pd.read_json(selected_features_path)

In [None]:
df = pd.read_csv(os.path.join(current_path, "data", "processed", "phase2", "processed_data.csv"))
df = df[selected_features]
X = df.drop(columns=['Target'])
y = df['Target']
y = label_encoder.fit_transform(y)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Models Training

### Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(random_state=0, n_jobs=-1)
rf.fit(X_train, y_train)

In [154]:
y_pred = rf.predict(X_test)
pd.DataFrame(classification_report(y_test, y_pred, output_dict=True))

Unnamed: 0,0,1,2,3,4,5,accuracy,macro avg,weighted avg
precision,0.824944,0.885615,0.834291,0.982045,0.820818,0.848,0.86475,0.865952,0.866635
recall,0.924152,0.782542,0.89517,0.952128,0.792104,0.838773,0.86475,0.864145,0.86475
f1-score,0.871735,0.830894,0.863659,0.966855,0.806205,0.843362,0.86475,0.863785,0.864442
support,2004.0,1959.0,1946.0,2068.0,2001.0,2022.0,0.86475,12000.0,12000.0


### Neural Network

In [155]:
from sklearn.neural_network import MLPClassifier

nn = MLPClassifier(random_state=0, max_iter=3000)
nn.fit(X_train, y_train)

In [156]:
y_pred = nn.predict(X_test)
pd.DataFrame(classification_report(y_test, y_pred, output_dict=True))

Unnamed: 0,0,1,2,3,4,5,accuracy,macro avg,weighted avg
precision,0.757344,0.880347,0.712369,0.968473,0.781229,0.759372,0.804,0.809856,0.81084
recall,0.939122,0.72486,0.873073,0.950677,0.590705,0.741345,0.804,0.803297,0.804
f1-score,0.838494,0.795073,0.784576,0.959492,0.672738,0.75025,0.804,0.800104,0.801005
support,2004.0,1959.0,1946.0,2068.0,2001.0,2022.0,0.804,12000.0,12000.0


In [32]:
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression(random_state=0, max_iter=3000)
lr.fit(X_train, y_train)

In [33]:
y_pred = lr.predict(X_test)
pd.DataFrame(classification_report(y_test, y_pred, output_dict=True))

Unnamed: 0,0,1,2,3,4,5,accuracy,macro avg,weighted avg
precision,0.575208,0.821211,0.543757,0.877385,0.674946,0.552351,0.652983,0.674143,0.674036
recall,0.639614,0.451862,0.971486,0.991779,0.43318,0.429744,0.652983,0.652944,0.652983
f1-score,0.605704,0.582958,0.697251,0.931081,0.527689,0.483394,0.652983,0.638013,0.638007
support,40046.0,39979.0,39980.0,40021.0,39771.0,40203.0,0.652983,240000.0,240000.0


In [34]:
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier(n_neighbors=5, n_jobs=-1)
knn.fit(X_train, y_train)

In [35]:
y_pred = knn.predict(X_test)
pd.DataFrame(classification_report(y_test, y_pred, output_dict=True))

In [36]:
from sklearn.ensemble import StackingClassifier

estimators = [
    ("rf", rf),
    ("nn", nn),
    ("xgb", xgb),
    ("lr", lr),
    ("knn", knn)
]

stacking = StackingClassifier(
    n_jobs=-1,
    estimators=estimators,
    final_estimator=RandomForestClassifier(n_jobs=-1)
)

stacking.fit(X_train, y_train)

In [37]:
y_pred = stacking.predict(X_test)
pd.DataFrame(classification_report(y_test, y_pred, output_dict=True))

Unnamed: 0,0,1,2,3,4,5,accuracy,macro avg,weighted avg
precision,0.998653,0.909896,0.666022,0.999625,0.99995,0.999776,0.911871,0.928987,0.929022
recall,0.999825,0.524375,0.947474,0.999675,0.999925,0.999677,0.911871,0.911825,0.911871
f1-score,0.999239,0.665323,0.7822,0.99965,0.999937,0.999726,0.911871,0.907679,0.907726
support,40046.0,39979.0,39980.0,40021.0,39771.0,40203.0,0.911871,240000.0,240000.0


# Submission

In [1]:
# choose your model
final_model = stacking
submit = False

NameError: name 'stacking' is not defined

In [None]:
testing_data = pd.read_csv(os.path.join(current_path, "data", "original", "phase2", "test.csv"))

id = testing_data['Id']
testing_data = testing_data[selected_features]
testing_data
y_pred = stacking.predict(testing_data)
result = pd.DataFrame(label_encoder.inverse_transform(y_pred), columns=['Target'])
result['Id'] = id
result = result[['Id', 'Target']]

correct_names = ['DDoS', 'Recon', 'BenignTraffic', 'Mitm', 'DoS', 'Mirai']
result['Target'] = result['Target'].map({name.lower(): name for name in correct_names})

result.to_csv(os.path.join(current_path, "data", "original", "phase2", "submission.csv"), index=False)
# !kaggle competitions submit -c csai-253-project-phase-2 -f submission.csv -m "Message"