In [21]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from scipy import stats
from sklearn.feature_selection import chi2
from sklearn.preprocessing import LabelEncoder
from scipy.stats import chi2_contingency
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import OneHotEncoder, StandardScaler, PowerTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from scipy.sparse import csr_matrix
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import f1_score
from sklearn.compose import ColumnTransformer
import warnings
warnings.filterwarnings('ignore')

In [22]:
DATA_PATH = './data/'
TRAIN_PATH = 'train/'
TEST_PATH = 'test/'

In [23]:
NB15_FEATURES = 'UNSW-NB15_features-modified.csv'

In [24]:
metadata = pd.read_csv(DATA_PATH + TRAIN_PATH + NB15_FEATURES)

metadata

Unnamed: 0,No.,Name,Type,Description,feature_type
0,5,proto,nominal,Transaction protocol,flow
1,6,state,nominal,Indicates to the state and its dependent proto...,basic
2,7,dur,Float,Record total duration,basic
3,8,sbytes,Integer,Source to destination transaction bytes,basic
4,9,dbytes,Integer,Destination to source transaction bytes,basic
5,10,sttl,Integer,Source to destination time to live value,basic
6,11,dttl,Integer,Destination to source time to live value,basic
7,12,sloss,Integer,Source packets retransmitted or dropped,basic
8,13,dloss,Integer,Destination packets retransmitted or dropped,basic
9,14,service,nominal,"http, ftp, smtp, ssh, dns, ftp-data ,irc and ...",basic


In [25]:
ADDITIONAL_FEATURES_TRAIN = 'additional_features_train.csv'
BASIC_FEATURES_TRAIN = 'basic_features_train.csv'
CONTENT_FEATURES_TRAIN = 'content_features_train.csv'
FLOW_FEATURES_TRAIN = 'flow_features_train.csv'
LABELS_TRAIN = 'labels_train.csv'
TIME_FEATURES_TRAIN = 'time_features_train.csv'

In [26]:
dataset_paths_train = [
    ADDITIONAL_FEATURES_TRAIN,
    BASIC_FEATURES_TRAIN,
    CONTENT_FEATURES_TRAIN,
    FLOW_FEATURES_TRAIN,
    LABELS_TRAIN,
    TIME_FEATURES_TRAIN
]

In [27]:
all_data_train = [pd.read_csv(DATA_PATH + TRAIN_PATH + dp) for dp in dataset_paths_train]

df_train = all_data_train[0]

for df_i in all_data_train[1:]:
    df_train = pd.merge(df_train, df_i, on='id')

df_train.drop(["label"], axis=1, inplace=True)

df_train

Unnamed: 0,is_sm_ips_ports,ct_state_ttl,ct_flw_http_mthd,is_ftp_login,ct_ftp_cmd,ct_srv_src,ct_srv_dst,ct_dst_ltm,ct_src_ltm,ct_src_dport_ltm,...,response_body_len,proto,attack_cat,sjit,djit,sinpkt,dinpkt,tcprtt,synack,ackdat
0,0.0,0.0,0.0,0.0,0.0,11.0,,5.0,4.0,2.0,...,0.0,tcp,Normal,4449.110313,3234.831566,11.845558,6.261361,,0.000444,0.000114
1,0.0,2.0,0.0,0.0,0.0,10.0,10.0,10.0,10.0,,...,0.0,udp,Generic,0.000000,0.000000,0.009000,0.000000,0.000000,0.000000,
2,0.0,,0.0,0.0,0.0,4.0,4.0,2.0,2.0,1.0,...,0.0,tcp,Exploits,8561.040438,249.950547,165.386453,172.345750,0.158826,0.057902,0.100924
3,0.0,0.0,0.0,0.0,0.0,9.0,9.0,3.0,2.0,2.0,...,0.0,tcp,Normal,4053.086020,2918.730804,8.669644,4.496707,0.000558,0.000448,
4,0.0,0.0,0.0,0.0,0.0,3.0,3.0,4.0,3.0,1.0,...,0.0,udp,Normal,0.000000,0.000000,0.008000,0.007000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
175336,0.0,2.0,0.0,0.0,0.0,38.0,38.0,32.0,,32.0,...,0.0,udp,Generic,0.000000,0.000000,0.005000,0.000000,0.000000,0.000000,0.000000
175337,0.0,1.0,0.0,1.0,1.0,1.0,1.0,2.0,3.0,,...,,tcp,Exploits,15464.286700,531.848125,193.445428,209.600766,0.066524,0.047738,0.018786
175338,0.0,2.0,0.0,,0.0,33.0,33.0,16.0,16.0,16.0,...,0.0,udp,Generic,0.000000,0.000000,0.008000,0.000000,0.000000,0.000000,0.000000
175339,0.0,2.0,0.0,0.0,0.0,36.0,36.0,,37.0,36.0,...,0.0,udp,Generic,0.000000,0.000000,0.004000,0.000000,0.000000,0.000000,


In [28]:
ADDITIONAL_FEATURES_TEST = 'additional_features_test.csv'
BASIC_FEATURES_TEST = 'basic_features_test.csv'
CONTENT_FEATURES_TEST = 'content_features_test.csv'
FLOW_FEATURES_TEST = 'flow_features_test.csv'
TIME_FEATURES_TEST = 'time_features_test.csv'

In [29]:
dataset_paths_test = [
    ADDITIONAL_FEATURES_TEST,
    BASIC_FEATURES_TEST,
    CONTENT_FEATURES_TEST,
    FLOW_FEATURES_TEST,
    TIME_FEATURES_TEST
]

In [30]:
all_data_test = [pd.read_csv(DATA_PATH + TEST_PATH + dp) for dp in dataset_paths_test]

df_test = all_data_test[0]

for df_i in all_data_test[1:]:
    df_test = pd.merge(df_test, df_i, on='id')

df_test

Unnamed: 0,is_sm_ips_ports,ct_state_ttl,ct_flw_http_mthd,is_ftp_login,ct_ftp_cmd,ct_srv_src,ct_srv_dst,ct_dst_ltm,ct_src_ltm,ct_src_dport_ltm,...,trans_depth,response_body_len,proto,sjit,djit,sinpkt,dinpkt,tcprtt,synack,ackdat
0,0.0,1.0,0.0,0.0,0.0,5.0,5.0,2.0,2.0,2.0,...,0.0,0.0,tcp,2737.954123,118.833969,48.756556,76.593602,0.165117,0.072001,0.093116
1,0.0,1.0,0.0,,0.0,6.0,6.0,1.0,1.0,1.0,...,0.0,0.0,tcp,2938.299144,165.780563,49.812539,109.557602,0.223604,0.100248,0.123356
2,0.0,1.0,0.0,0.0,0.0,4.0,4.0,1.0,2.0,1.0,...,0.0,0.0,tcp,4287.453629,129.471406,69.765530,94.395906,0.113189,0.082498,0.030691
3,0.0,2.0,0.0,0.0,0.0,10.0,4.0,2.0,4.0,2.0,...,0.0,0.0,udp,0.000000,0.000000,0.001000,0.000000,0.000000,0.000000,0.000000
4,,0.0,,0.0,0.0,13.0,11.0,10.0,7.0,6.0,...,0.0,0.0,tcp,1119.063538,26.748141,17.628799,15.543294,0.000655,0.000526,0.000129
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20578,0.0,1.0,0.0,0.0,0.0,2.0,2.0,1.0,1.0,1.0,...,0.0,0.0,tcp,1902.551328,143.722203,16.088482,87.730055,0.117474,0.071119,0.046355
20579,0.0,0.0,0.0,0.0,0.0,14.0,7.0,1.0,9.0,1.0,...,0.0,0.0,tcp,858.139465,22.118752,14.832222,13.803000,0.000647,0.000491,0.000156
20580,0.0,2.0,0.0,0.0,0.0,6.0,6.0,,4.0,4.0,...,0.0,0.0,udp,0.000000,0.000000,,0.000000,0.000000,,0.000000
20581,0.0,2.0,0.0,0.0,0.0,11.0,11.0,4.0,8.0,4.0,...,0.0,0.0,unas,0.000000,0.000000,0.011000,0.000000,0.000000,0.000000,0.000000


In [31]:
list_drop_train = ['id']

df_train.drop(list_drop_train,axis=1,inplace=True)

In [32]:
df_numeric = df_train.select_dtypes(include=[np.number])
df_cat = df_train.select_dtypes(exclude=[np.number])

In [33]:
num_imputer = SimpleImputer(strategy='median')
cat_imputer = SimpleImputer(strategy='most_frequent')

df_train[df_numeric.columns] = num_imputer.fit_transform(df_numeric)

df_train[df_cat.columns] = cat_imputer.fit_transform(df_cat)

In [34]:
for col in df_numeric.columns:
    if df_numeric[col].skew() > 1:  # Adjust the threshold as needed
        df_train[col] = np.log1p(df_train[col]) 

In [35]:
# for feature in df_numeric.columns:
#     # Check if the maximum value is greater than 10 times the median and greater than 10
#     if df_numeric[feature].max() > 10 * df_numeric[feature].median() and df_numeric[feature].max() > 10:
#         # Cap the values at the 95th percentile
#         df_train[feature] = np.where(df_train[feature] < df_numeric[feature].quantile(0.95), 
#                                       df_train[feature], 
#                                       df_numeric[feature].quantile(0.95))

In [36]:
Q1 = df_numeric.quantile(0.25)
Q3 = df_numeric.quantile(0.75)
IQR = Q3 - Q1

# Determine lower and upper limits
lower_limit = Q1 - 1.5 * IQR
upper_limit = Q3 + 1.5 * IQR

# Clip the values in the original DataFrame
for column in df_numeric.columns:
    df_train[column] = np.clip(df_train[column], lower_limit[column], upper_limit[column])

In [37]:
# from scipy import stats

# z_scores = np.abs(stats.zscore(df_numeric))
# threshold = 3  # Common threshold for Z-score
# df_train = df_train[(z_scores < threshold).all(axis=1)]

In [38]:
# for feature in df_cat.columns:
#     if df_cat[feature].nunique()>6:
#         df_train[feature] = np.where(df_train[feature].isin(df_train[feature].value_counts().head().index), df_train[feature], '-')

In [39]:
X = df_train.drop("attack_cat", axis = 1)
y = df_train["attack_cat"]

In [40]:
categorical_cols = ['state', 'service', 'proto']
ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(sparse_output=False), categorical_cols)], remainder='passthrough')
X = np.array(ct.fit_transform(X))

In [41]:
# smote = SMOTE(random_state=0)
# X_resampled, y_resampled = smote.fit_resample(X, y)

In [42]:
from sklearn.feature_selection import SelectKBest, f_classif, VarianceThreshold

In [43]:
var_threshold = 0.01
var_filter = VarianceThreshold(threshold=var_threshold)
X_train_filtered = var_filter.fit_transform(X)

In [44]:
selector = SelectKBest(score_func=f_classif, k=20)
X_selected = selector.fit_transform(X_train_filtered, y )

In [45]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_selected)

In [46]:
# power_transformer = PowerTransformer(method='yeo-johnson')  # You can also use 'box-cox' if all data is positive
# X_power_transformed = power_transformer.fit_transform(X_scaled)

In [47]:
pca = PCA(n_components=0.95)  # Retain 95% of variance
X_pca = pca.fit_transform(X_scaled)

In [48]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_pca, y, 
                                                    test_size = 0.2, 
                                                    random_state = 0,
                                                    stratify=y)

In [49]:
# models = {
#     'Decision Tree': DecisionTreeClassifier(criterion="entropy"),
#     'KNN': KNeighborsClassifier(n_neighbors=5, metric='euclidean'),
#     'Gaussian Naive Bayes': GaussianNB(),
# }

# f1_scores = {}

# # Train and evaluate each model
# for model_name, model in models.items():
#     model.fit(X_train, y_train)
#     y_pred = model.predict(X_test)
#     f1 = f1_score(y_test, y_pred, average='weighted')  # Use 'macro' or 'micro' as needed
#     f1_scores[model_name] = f1

# # Step 13: Display F1 Scores
# print("F1 Scores:")
# for model_name, score in f1_scores.items():
#     print(f"{model_name}: {score:.4f}")

In [50]:
# from sklearn.model_selection import cross_val_score, StratifiedKFold

# cv_scores = {}

# # Set up Stratified K-Fold cross-validation
# cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)

# # Evaluate each model using cross-validation
# for model_name, model in models.items():
#     scores = cross_val_score(model, X_resampled, y_resampled, cv=cv, scoring='f1_weighted')
#     cv_scores[model_name] = scores
#     print(f"{model_name} Cross-Validation F1 Scores: {scores}")
#     print(f"{model_name} Mean F1 Score: {scores.mean():.4f} ± {scores.std():.4f}")

# # Optionally, you can also find the best model based on mean F1 score
# best_model_name = max(cv_scores, key=lambda k: cv_scores[k].mean())
# print(f"Best Model: {best_model_name} with Mean F1 Score: {cv_scores[best_model_name].mean():.4f}")

In [51]:
# from sklearn.model_selection import GridSearchCV

# dt_model = DecisionTreeClassifier()

# param_grid = {
#     'criterion': ['gini', 'entropy'],
#     'max_depth': [None, 10, 20, 30],
#     'min_samples_split': [2, 5, 10],
#     'min_samples_leaf': [1, 2, 4]
# }

# # Set up GridSearchCV
# grid_search = GridSearchCV(estimator=dt_model, param_grid=param_grid, 
#                            scoring='f1_weighted', cv=5, n_jobs=-1)

# # Fit the model
# grid_search.fit(X_train, y_train)

# # Best parameters and score
# print("Best parameters:", grid_search.best_params_)
# print("Best F1 Score:", grid_search.best_score_)

In [52]:
# from sklearn.preprocessing import StandardScaler
# sc = StandardScaler()
# X_train[:, 18:] = sc.fit_transform(X_train[:, 18:])
# X_test[:, 18:] = sc.transform(X_test[:, 18:])

In [53]:
from collections import Counter

class KNNScratch:
    def __init__(self, neighbors=5, metric='euclidean', p=2):
        self.neighbors = neighbors
        self.metric = metric
        self.p = p
        
    def fit(self, X_train, y_train):
        # Convert to NumPy arrays if they are not already
        self.X_train = X_train.to_numpy() if hasattr(X_train, 'to_numpy') else X_train
        self.y_train = y_train.to_numpy() if hasattr(y_train, 'to_numpy') else y_train

    def distance(self, x1, x2):
        if self.metric == 'euclidean':
            return np.sqrt(np.sum(np.square(x1 - x2))) 
        elif self.metric == 'manhattan':
            return np.sum(np.abs(x1 - x2)) 
        elif self.metric == 'minkowski':
            return np.sum(np.abs(x1 - x2) ** self.p) ** (1 / self.p)
        else:
            raise ValueError("Unsupported metric: choose 'euclidean', 'manhattan', or 'minkowski'.")

    def predict(self, X_test):
        X_test = X_test.to_numpy() if hasattr(X_test, 'to_numpy') else X_test
        
        predictions = []

        for x_test in X_test:
            distances = [self.distance(x_test, x_train) for x_train in self.X_train]
            sorted_distances_indexes = np.argsort(distances)[:self.neighbors]
            k_nearest = [self.y_train[i] for i in sorted_distances_indexes]
            most_class = Counter(k_nearest).most_common(1)
            predictions.append(most_class[0][0])
        
        return np.array(predictions)

class GaussianNaiveBayesScratch:
    def gauss_dist(self, class_idx, x):
        mean = self.mean[class_idx]
        var = self.var[class_idx]
        numerator = np.exp(-((x - mean) ** 2) / (2 * var))
        denominator = np.sqrt(2 * np.pi * var)
        return numerator / denominator
    
    def fit(self, X, y):
        self.classes = np.unique(y)
        self.mean = np.zeros((len(self.classes), X.shape[1]))
        self.var = np.zeros((len(self.classes), X.shape[1]))
        self.priors = np.zeros(len(self.classes))

        for idx, c in enumerate(self.classes):
            X_c = X[y == c]
            self.mean[idx, :] = X_c.mean(axis=0)
            self.var[idx, :] = X_c.var(axis=0)
            self.priors[idx] = X_c.shape[0] / X.shape[0]

    def predict(self, X):
        y_pred = [self._predict(x) for x in X]
        return np.array(y_pred)

    def _predict(self, x):
        likelihoods = []
        for idx, c in enumerate(self.classes):
            prior = np.log(self.priors[idx])
            likelihood = np.sum(np.log(self.gauss_dist(idx, x)))
            likelihood += prior
            likelihoods.append(likelihood)
        return self.classes[np.argmax(likelihoods)]

In [54]:
from sklearn.metrics import classification_report, accuracy_score

In [55]:
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier(n_neighbors=100, metric='euclidean')

knn.fit(X_train, y_train)
knn_pred = knn.predict(X_test)
print("Accuracy: ", accuracy_score(y_test, knn_pred))
print("Classification Report: \n", classification_report(y_test, knn_pred))

Accuracy:  0.7739028771849782
Classification Report: 
                 precision    recall  f1-score   support

      Analysis       0.60      0.06      0.11       400
      Backdoor       0.00      0.00      0.00       349
           DoS       0.35      0.13      0.19      2453
      Exploits       0.59      0.86      0.70      6679
       Fuzzers       0.57      0.64      0.61      3637
       Generic       0.99      0.97      0.98      8000
        Normal       0.92      0.86      0.89     11200
Reconnaissance       0.63      0.58      0.60      2098
     Shellcode       0.17      0.02      0.03       227
         Worms       0.00      0.00      0.00        26

      accuracy                           0.77     35069
     macro avg       0.48      0.41      0.41     35069
  weighted avg       0.76      0.77      0.76     35069



In [227]:
from main import KNNScratch

knn_scratch = KNNScratch(neighbors=5, metric='euclidean')
knn_scratch.fit(X_train, y_train)
knn_scratch_pred = knn_scratch.predict(X_test)
print("Accuracy: ", accuracy_score(y_test, knn_scratch_pred))
print("Classification Report: \n", classification_report(y_test, knn_scratch_pred))

In [228]:
from sklearn.naive_bayes import GaussianNB

gnb = GaussianNB()
gnb.fit(X_train, y_train)
gnb_pred = gnb.predict(X_test)
print("Accuracy: ", accuracy_score(y_test, gnb_pred))
print("Classification Report: \n", classification_report(y_test, gnb_pred))

Accuracy:  0.6782343380193333
Classification Report: 
                 precision    recall  f1-score   support

      Analysis       0.00      0.00      0.00       400
      Backdoor       0.00      0.00      0.00       349
           DoS       0.33      0.71      0.45      2453
      Exploits       0.60      0.49      0.54      6679
       Fuzzers       0.41      0.63      0.50      3637
       Generic       1.00      0.93      0.96      8000
        Normal       0.89      0.76      0.82     11200
Reconnaissance       0.33      0.22      0.26      2098
     Shellcode       0.10      0.13      0.11       227
         Worms       0.09      0.12      0.10        26

      accuracy                           0.68     35069
     macro avg       0.38      0.40      0.38     35069
  weighted avg       0.71      0.68      0.68     35069



In [229]:
from main import GaussianNaiveBayesScratch

gnb_scratch = GaussianNaiveBayesScratch()
gnb_scratch.fit(X_train, y_train)
gnb_scratch_pred = gnb_scratch.predict(X_test)
print("Accuracy: ", accuracy_score(y_test, gnb_scratch_pred))
print("Classification Report: \n", classification_report(y_test, gnb_scratch_pred))

Accuracy:  0.6782343380193333
Classification Report: 
                 precision    recall  f1-score   support

      Analysis       0.00      0.00      0.00       400
      Backdoor       0.00      0.00      0.00       349
           DoS       0.33      0.71      0.45      2453
      Exploits       0.60      0.49      0.54      6679
       Fuzzers       0.41      0.63      0.50      3637
       Generic       1.00      0.93      0.96      8000
        Normal       0.89      0.76      0.82     11200
Reconnaissance       0.33      0.22      0.26      2098
     Shellcode       0.10      0.13      0.11       227
         Worms       0.09      0.12      0.10        26

      accuracy                           0.68     35069
     macro avg       0.38      0.40      0.38     35069
  weighted avg       0.71      0.68      0.68     35069



In [230]:
from sklearn.tree import DecisionTreeClassifier

dt = DecisionTreeClassifier(criterion="entropy")
dt.fit(X_train, y_train)
dt_pred = dt.predict(X_test)
print("Accuracy: ", accuracy_score(y_test, dt_pred))
print("Classification Report: \n", classification_report(y_test, dt_pred))

Accuracy:  0.7463286663434943
Classification Report: 
                 precision    recall  f1-score   support

      Analysis       0.18      0.13      0.15       400
      Backdoor       0.07      0.04      0.05       349
           DoS       0.28      0.29      0.28      2453
      Exploits       0.62      0.67      0.64      6679
       Fuzzers       0.58      0.54      0.56      3637
       Generic       0.98      0.98      0.98      8000
        Normal       0.88      0.88      0.88     11200
Reconnaissance       0.64      0.60      0.62      2098
     Shellcode       0.27      0.23      0.25       227
         Worms       0.03      0.04      0.04        26

      accuracy                           0.75     35069
     macro avg       0.45      0.44      0.44     35069
  weighted avg       0.74      0.75      0.74     35069



In [231]:
from sklearn.model_selection import KFold, cross_val_score

# Set up k-fold cross-validation
k = 5  # You can change this to any number of folds you prefer
kf = KFold(n_splits=k, shuffle=True, random_state=0)

# Perform cross-validation
cv_scores = cross_val_score(dt, X, y, cv=kf)

# Print the results
print(f"Cross-Validation Scores: {cv_scores}")
print(f"Mean CV Score: {np.mean(cv_scores)}")
print(f"Standard Deviation of CV Scores: {np.std(cv_scores)}")

Cross-Validation Scores: [0.79018506 0.78721341 0.78858218 0.78758412 0.78658606]
Mean CV Score: 0.7880301689000585
Standard Deviation of CV Scores: 0.0012569633513535383


In [232]:
from main import DecisionTreeClassifierScratch

dt_scratch = DecisionTreeClassifierScratch()
dt_scratch.fit(X_train, y_train)
dt_scratch_pred = dt_scratch.predict(X_test)
print("Accuracy: ", accuracy_score(y_test, dt_scratch_pred))
print("Classification Report: \n", classification_report(y_test, dt_scratch_pred))

KeyboardInterrupt: 