In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from scipy import stats
from sklearn.feature_selection import chi2
from sklearn.preprocessing import LabelEncoder
from scipy.stats import chi2_contingency
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import OneHotEncoder, StandardScaler, PowerTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from scipy.sparse import csr_matrix
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import f1_score
from sklearn.compose import ColumnTransformer
import warnings
warnings.filterwarnings('ignore')

In [2]:
DATA_PATH = './data/'
TRAIN_PATH = 'train/'
TEST_PATH = 'test/'

In [3]:
NB15_FEATURES = 'UNSW-NB15_features-modified.csv'

In [4]:
metadata = pd.read_csv(DATA_PATH + TRAIN_PATH + NB15_FEATURES)

metadata

Unnamed: 0,No.,Name,Type,Description,feature_type
0,5,proto,nominal,Transaction protocol,flow
1,6,state,nominal,Indicates to the state and its dependent proto...,basic
2,7,dur,Float,Record total duration,basic
3,8,sbytes,Integer,Source to destination transaction bytes,basic
4,9,dbytes,Integer,Destination to source transaction bytes,basic
5,10,sttl,Integer,Source to destination time to live value,basic
6,11,dttl,Integer,Destination to source time to live value,basic
7,12,sloss,Integer,Source packets retransmitted or dropped,basic
8,13,dloss,Integer,Destination packets retransmitted or dropped,basic
9,14,service,nominal,"http, ftp, smtp, ssh, dns, ftp-data ,irc and ...",basic


In [5]:
ADDITIONAL_FEATURES_TRAIN = 'additional_features_train.csv'
BASIC_FEATURES_TRAIN = 'basic_features_train.csv'
CONTENT_FEATURES_TRAIN = 'content_features_train.csv'
FLOW_FEATURES_TRAIN = 'flow_features_train.csv'
LABELS_TRAIN = 'labels_train.csv'
TIME_FEATURES_TRAIN = 'time_features_train.csv'

In [6]:
dataset_paths_train = [
    ADDITIONAL_FEATURES_TRAIN,
    BASIC_FEATURES_TRAIN,
    CONTENT_FEATURES_TRAIN,
    FLOW_FEATURES_TRAIN,
    LABELS_TRAIN,
    TIME_FEATURES_TRAIN
]

In [7]:
all_data_train = [pd.read_csv(DATA_PATH + TRAIN_PATH + dp) for dp in dataset_paths_train]

df_train = all_data_train[0]

for df_i in all_data_train[1:]:
    df_train = pd.merge(df_train, df_i, on='id')

df_train.drop(["label"], axis=1, inplace=True)

df_train

Unnamed: 0,is_sm_ips_ports,ct_state_ttl,ct_flw_http_mthd,is_ftp_login,ct_ftp_cmd,ct_srv_src,ct_srv_dst,ct_dst_ltm,ct_src_ltm,ct_src_dport_ltm,...,response_body_len,proto,attack_cat,sjit,djit,sinpkt,dinpkt,tcprtt,synack,ackdat
0,0.0,0.0,0.0,0.0,0.0,11.0,,5.0,4.0,2.0,...,0.0,tcp,Normal,4449.110313,3234.831566,11.845558,6.261361,,0.000444,0.000114
1,0.0,2.0,0.0,0.0,0.0,10.0,10.0,10.0,10.0,,...,0.0,udp,Generic,0.000000,0.000000,0.009000,0.000000,0.000000,0.000000,
2,0.0,,0.0,0.0,0.0,4.0,4.0,2.0,2.0,1.0,...,0.0,tcp,Exploits,8561.040438,249.950547,165.386453,172.345750,0.158826,0.057902,0.100924
3,0.0,0.0,0.0,0.0,0.0,9.0,9.0,3.0,2.0,2.0,...,0.0,tcp,Normal,4053.086020,2918.730804,8.669644,4.496707,0.000558,0.000448,
4,0.0,0.0,0.0,0.0,0.0,3.0,3.0,4.0,3.0,1.0,...,0.0,udp,Normal,0.000000,0.000000,0.008000,0.007000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
175336,0.0,2.0,0.0,0.0,0.0,38.0,38.0,32.0,,32.0,...,0.0,udp,Generic,0.000000,0.000000,0.005000,0.000000,0.000000,0.000000,0.000000
175337,0.0,1.0,0.0,1.0,1.0,1.0,1.0,2.0,3.0,,...,,tcp,Exploits,15464.286700,531.848125,193.445428,209.600766,0.066524,0.047738,0.018786
175338,0.0,2.0,0.0,,0.0,33.0,33.0,16.0,16.0,16.0,...,0.0,udp,Generic,0.000000,0.000000,0.008000,0.000000,0.000000,0.000000,0.000000
175339,0.0,2.0,0.0,0.0,0.0,36.0,36.0,,37.0,36.0,...,0.0,udp,Generic,0.000000,0.000000,0.004000,0.000000,0.000000,0.000000,


In [8]:
ADDITIONAL_FEATURES_TEST = 'additional_features_test.csv'
BASIC_FEATURES_TEST = 'basic_features_test.csv'
CONTENT_FEATURES_TEST = 'content_features_test.csv'
FLOW_FEATURES_TEST = 'flow_features_test.csv'
TIME_FEATURES_TEST = 'time_features_test.csv'

In [9]:
dataset_paths_test = [
    ADDITIONAL_FEATURES_TEST,
    BASIC_FEATURES_TEST,
    CONTENT_FEATURES_TEST,
    FLOW_FEATURES_TEST,
    TIME_FEATURES_TEST
]

In [10]:
all_data_test = [pd.read_csv(DATA_PATH + TEST_PATH + dp) for dp in dataset_paths_test]

df_test = all_data_test[0]

for df_i in all_data_test[1:]:
    df_test = pd.merge(df_test, df_i, on='id')

df_test

Unnamed: 0,is_sm_ips_ports,ct_state_ttl,ct_flw_http_mthd,is_ftp_login,ct_ftp_cmd,ct_srv_src,ct_srv_dst,ct_dst_ltm,ct_src_ltm,ct_src_dport_ltm,...,trans_depth,response_body_len,proto,sjit,djit,sinpkt,dinpkt,tcprtt,synack,ackdat
0,0.0,1.0,0.0,0.0,0.0,5.0,5.0,2.0,2.0,2.0,...,0.0,0.0,tcp,2737.954123,118.833969,48.756556,76.593602,0.165117,0.072001,0.093116
1,0.0,1.0,0.0,,0.0,6.0,6.0,1.0,1.0,1.0,...,0.0,0.0,tcp,2938.299144,165.780563,49.812539,109.557602,0.223604,0.100248,0.123356
2,0.0,1.0,0.0,0.0,0.0,4.0,4.0,1.0,2.0,1.0,...,0.0,0.0,tcp,4287.453629,129.471406,69.765530,94.395906,0.113189,0.082498,0.030691
3,0.0,2.0,0.0,0.0,0.0,10.0,4.0,2.0,4.0,2.0,...,0.0,0.0,udp,0.000000,0.000000,0.001000,0.000000,0.000000,0.000000,0.000000
4,,0.0,,0.0,0.0,13.0,11.0,10.0,7.0,6.0,...,0.0,0.0,tcp,1119.063538,26.748141,17.628799,15.543294,0.000655,0.000526,0.000129
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20578,0.0,1.0,0.0,0.0,0.0,2.0,2.0,1.0,1.0,1.0,...,0.0,0.0,tcp,1902.551328,143.722203,16.088482,87.730055,0.117474,0.071119,0.046355
20579,0.0,0.0,0.0,0.0,0.0,14.0,7.0,1.0,9.0,1.0,...,0.0,0.0,tcp,858.139465,22.118752,14.832222,13.803000,0.000647,0.000491,0.000156
20580,0.0,2.0,0.0,0.0,0.0,6.0,6.0,,4.0,4.0,...,0.0,0.0,udp,0.000000,0.000000,,0.000000,0.000000,,0.000000
20581,0.0,2.0,0.0,0.0,0.0,11.0,11.0,4.0,8.0,4.0,...,0.0,0.0,unas,0.000000,0.000000,0.011000,0.000000,0.000000,0.000000,0.000000


In [11]:
list_drop_train = ['id']

df_train.drop(list_drop_train,axis=1,inplace=True)

In [12]:
df_numeric = df_train.select_dtypes(include=[np.number])
df_cat = df_train.select_dtypes(exclude=[np.number])

In [13]:
num_imputer = SimpleImputer(strategy='median')
cat_imputer = SimpleImputer(strategy='most_frequent')

df_train[df_numeric.columns] = num_imputer.fit_transform(df_numeric)

df_train[df_cat.columns] = cat_imputer.fit_transform(df_cat)

In [14]:
for col in df_numeric.columns:
    if df_numeric[col].skew() > 1:  # Adjust the threshold as needed
        df_train[col] = np.log1p(df_train[col]) 

In [15]:
# for feature in df_numeric.columns:
#     # Check if the maximum value is greater than 10 times the median and greater than 10
#     if df_numeric[feature].max() > 10 * df_numeric[feature].median() and df_numeric[feature].max() > 10:
#         # Cap the values at the 95th percentile
#         df_train[feature] = np.where(df_train[feature] < df_numeric[feature].quantile(0.95), 
#                                       df_train[feature], 
#                                       df_numeric[feature].quantile(0.95))

In [16]:
Q1 = df_numeric.quantile(0.25)
Q3 = df_numeric.quantile(0.75)
IQR = Q3 - Q1

# Determine lower and upper limits
lower_limit = Q1 - 1.5 * IQR
upper_limit = Q3 + 1.5 * IQR

# Clip the values in the original DataFrame
for column in df_numeric.columns:
    df_train[column] = np.clip(df_train[column], lower_limit[column], upper_limit[column])

In [17]:
# from scipy import stats

# z_scores = np.abs(stats.zscore(df_numeric))
# threshold = 3  # Common threshold for Z-score
# df_train = df_train[(z_scores < threshold).all(axis=1)]

In [18]:
# for feature in df_cat.columns:
#     if df_cat[feature].nunique()>6:
#         df_train[feature] = np.where(df_train[feature].isin(df_train[feature].value_counts().head().index), df_train[feature], '-')

In [19]:
X = df_train.drop("attack_cat", axis = 1)
y = df_train["attack_cat"]

In [20]:
categorical_cols = ['state', 'service', 'proto']
ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), categorical_cols)], remainder='passthrough')
X = ct.fit_transform(X)

In [21]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size = 0.2, 
                                                    random_state = 0,
                                                    stratify=y)

In [22]:
# smote = SMOTE(random_state=0)
# X_resampled, y_resampled = smote.fit_resample(X, y)

In [23]:
from sklearn.feature_selection import SelectKBest, f_classif, VarianceThreshold

In [24]:
var_threshold = 0.01
var_filter = VarianceThreshold(threshold=var_threshold)
X_train_filtered = var_filter.fit_transform(X_train)

In [25]:
selector = SelectKBest(score_func=f_classif, k=10)  # Select the top 10 features
X_selected = selector.fit_transform(X_train_filtered, y_train)

In [26]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_selected)

ValueError: Cannot center sparse matrices: pass `with_mean=False` instead. See docstring for motivation and alternatives.

In [27]:
power_transformer = PowerTransformer(method='yeo-johnson')
X_power_transformed = power_transformer.fit_transform(X_scaled)

In [28]:
pca = PCA(n_components=0.95)  # Retain 95% of variance
X_pca = pca.fit_transform(X_power_transformed)

In [None]:
# models = {
#     'Decision Tree': DecisionTreeClassifier(criterion="entropy"),
#     'KNN': KNeighborsClassifier(n_neighbors=5, metric='euclidean'),
#     'Gaussian Naive Bayes': GaussianNB(),
# }

# f1_scores = {}

# # Train and evaluate each model
# for model_name, model in models.items():
#     model.fit(X_train, y_train)
#     y_pred = model.predict(X_test)
#     f1 = f1_score(y_test, y_pred, average='weighted')  # Use 'macro' or 'micro' as needed
#     f1_scores[model_name] = f1

# # Step 13: Display F1 Scores
# print("F1 Scores:")
# for model_name, score in f1_scores.items():
#     print(f"{model_name}: {score:.4f}")

In [30]:
# from sklearn.model_selection import cross_val_score, StratifiedKFold

# cv_scores = {}

# # Set up Stratified K-Fold cross-validation
# cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)

# # Evaluate each model using cross-validation
# for model_name, model in models.items():
#     scores = cross_val_score(model, X_resampled, y_resampled, cv=cv, scoring='f1_weighted')
#     cv_scores[model_name] = scores
#     print(f"{model_name} Cross-Validation F1 Scores: {scores}")
#     print(f"{model_name} Mean F1 Score: {scores.mean():.4f} ± {scores.std():.4f}")

# # Optionally, you can also find the best model based on mean F1 score
# best_model_name = max(cv_scores, key=lambda k: cv_scores[k].mean())
# print(f"Best Model: {best_model_name} with Mean F1 Score: {cv_scores[best_model_name].mean():.4f}")

In [31]:
# from sklearn.model_selection import GridSearchCV

# dt_model = DecisionTreeClassifier()

# param_grid = {
#     'criterion': ['gini', 'entropy'],
#     'max_depth': [None, 10, 20, 30],
#     'min_samples_split': [2, 5, 10],
#     'min_samples_leaf': [1, 2, 4]
# }

# # Set up GridSearchCV
# grid_search = GridSearchCV(estimator=dt_model, param_grid=param_grid, 
#                            scoring='f1_weighted', cv=5, n_jobs=-1)

# # Fit the model
# grid_search.fit(X_train, y_train)

# # Best parameters and score
# print("Best parameters:", grid_search.best_params_)
# print("Best F1 Score:", grid_search.best_score_)

In [32]:
# from sklearn.preprocessing import StandardScaler
# sc = StandardScaler()
# X_train[:, 18:] = sc.fit_transform(X_train[:, 18:])
# X_test[:, 18:] = sc.transform(X_test[:, 18:])

In [33]:
from sklearn.metrics import classification_report, accuracy_score

In [34]:
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier(n_neighbors=5, metric='euclidean')

knn.fit(X_train, y_train)
knn_pred = knn.predict(X_test)
print("Accuracy: ", accuracy_score(y_test, knn_pred))
print("Classification Report: \n", classification_report(y_test, knn_pred))

Accuracy:  0.770252929938122
Classification Report: 
               precision    recall  f1-score   support

           -       0.55      0.59      0.57      3100
         DoS       0.32      0.34      0.33      2453
    Exploits       0.65      0.67      0.66      6679
     Fuzzers       0.61      0.60      0.61      3637
     Generic       1.00      0.98      0.99      8000
      Normal       0.91      0.88      0.89     11200

    accuracy                           0.77     35069
   macro avg       0.67      0.68      0.68     35069
weighted avg       0.78      0.77      0.77     35069



In [35]:
# from main import KNNScratch

# knn_scratch = KNNScratch(neighbors=5, metric='euclidean')
# knn_scratch.fit(X_train, y_train)
# knn_scratch_pred = knn_scratch.predict(X_test)
# print("Accuracy: ", accuracy_score(y_test, knn_scratch_pred))
# print("Classification Report: \n", classification_report(y_test, knn_scratch_pred))

In [36]:
from sklearn.naive_bayes import GaussianNB

gnb = GaussianNB()
gnb.fit(X_train, y_train)
gnb_pred = gnb.predict(X_test)
print("Accuracy: ", accuracy_score(y_test, gnb_pred))
print("Classification Report: \n", classification_report(y_test, gnb_pred))

Accuracy:  0.6875873278394024
Classification Report: 
               precision    recall  f1-score   support

           -       0.57      0.26      0.36      3100
         DoS       0.32      0.74      0.45      2453
    Exploits       0.69      0.41      0.51      6679
     Fuzzers       0.42      0.75      0.54      3637
     Generic       0.87      0.98      0.92      8000
      Normal       0.96      0.73      0.83     11200

    accuracy                           0.69     35069
   macro avg       0.64      0.64      0.60     35069
weighted avg       0.75      0.69      0.69     35069



In [37]:
# from main import GaussianNaiveBayesScratch

# gnb_scratch = GaussianNaiveBayesScratch()
# gnb_scratch.fit(X_train, y_train)
# gnb_scratch_pred = gnb_scratch.predict(X_test)
# print("Accuracy: ", accuracy_score(y_test, gnb_scratch_pred))
# print("Classification Report: \n", classification_report(y_test, gnb_scratch_pred))

In [38]:
from sklearn.tree import DecisionTreeClassifier

dt = DecisionTreeClassifier(criterion="entropy")
dt.fit(X_train, y_train)
dt_pred = dt.predict(X_test)
print("Accuracy: ", accuracy_score(y_test, dt_pred))
print("Classification Report: \n", classification_report(y_test, dt_pred))

Accuracy:  0.790726852775956
Classification Report: 
               precision    recall  f1-score   support

           -       0.65      0.63      0.64      3100
         DoS       0.32      0.33      0.33      2453
    Exploits       0.67      0.68      0.68      6679
     Fuzzers       0.67      0.65      0.66      3637
     Generic       0.99      0.98      0.98      8000
      Normal       0.91      0.91      0.91     11200

    accuracy                           0.79     35069
   macro avg       0.70      0.70      0.70     35069
weighted avg       0.79      0.79      0.79     35069



In [39]:
# from main import DecisionTreeClassifierScratch

# dt_scratch = DecisionTreeClassifierScratch()
# dt_scratch.fit(X_train, y_train)
# dt_scratch_pred = dt_scratch.predict(X_test)
# print("Accuracy: ", accuracy_score(y_test, dt_scratch_pred))
# print("Classification Report: \n", classification_report(y_test, dt_scratch_pred))