In [1]:
import pandas as pd
import numpy as np
from functools import reduce

def _read_stripped_csv(url):
    """Read one CSV from ``url`` and strip surrounding whitespace from its column names."""
    frame = pd.read_csv(url)
    frame.columns = frame.columns.str.strip()
    return frame


def _inner_join_on_id(frames):
    """Inner-join the frames left to right on the shared 'id' column."""
    joined = frames[0]
    for frame in frames[1:]:
        joined = pd.merge(joined, frame, on='id', how='inner')
    return joined


# One CSV per feature group; every file is keyed by 'id'.
basic_df = _read_stripped_csv('https://drive.google.com/uc?id=1Dzu0PazbrVpG7-hQC2Hhq2mxzApN8fYV')
additional_df = _read_stripped_csv('https://drive.google.com/uc?id=123EJxDLw9PFmuNsOQVNTaJIEFHZ_U5A3')
# NOTE(review): feature_df is loaded but is not part of `dfs` below — confirm that is intended.
feature_df = _read_stripped_csv('https://drive.google.com/uc?id=1BSXjJibF6fJEir6_FoqBVs7c_66l3s6n')
label_df = _read_stripped_csv('https://drive.google.com/uc?id=1S3PPHmstBMERnUwQIdyBo6bzFi-Dm4oM')
proto_df = _read_stripped_csv('https://drive.google.com/uc?id=1AW15tOqt_eDHoTKSg1xcE6eeAGE1c1yP')
content_df = _read_stripped_csv('https://drive.google.com/uc?id=1cxtyP4EOIfIAfdsDDP34aA1cnidfsim4')
time_df = _read_stripped_csv('https://drive.google.com/uc?id=1pnYsyi20SO70IrEuRG7jHuM6N7jbi1WA')

dfs = [proto_df, basic_df, additional_df, label_df, content_df, time_df]
merged = _inner_join_on_id(dfs)

merged.head()

Unnamed: 0,proto,id,state,dur,sbytes,dbytes,sttl,dttl,sloss,dloss,...,dmean,trans_depth,response_body_len,sjit,djit,sinpkt,dinpkt,tcprtt,synack,ackdat
0,tcp,0,FIN,2.736664,13350.0,548216.0,31.0,29.0,21.0,197.0,...,,0.0,0.0,4449.110313,3234.831566,11.845558,6.261361,,0.000444,0.000114
1,udp,1,INT,9e-06,114.0,0.0,254.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.009,0.0,0.0,0.0,
2,tcp,2,FIN,5.788526,6102.0,3892.0,254.0,252.0,13.0,13.0,...,114.0,0.0,0.0,8561.040438,249.950547,165.386453,172.34575,0.158826,0.057902,0.100924
3,tcp,3,FIN,3.849634,25160.0,,31.0,29.0,38.0,390.0,...,1276.0,0.0,0.0,4053.08602,2918.730804,8.669644,4.496707,0.000558,0.000448,
4,udp,4,CON,0.001052,130.0,162.0,,29.0,0.0,0.0,...,81.0,0.0,0.0,0.0,0.0,0.008,0.007,0.0,0.0,0.0


In [21]:
from typing import Any, List, Callable, Tuple
from heapq import heappush, heappop
import concurrent.futures

MatrixLike = List[List[Any]]
ArrayLike = List[Any]

class KNeighborsClassifierFromScratch():
    n_neighbors: int
    distance_func: Callable
    X: MatrixLike
    y: MatrixLike | ArrayLike
    _categorical_column_indexes: List[int]
    _is_categorical_list: List[bool]
    algorithm: str

    def __init__(
        self,
        n_neighbors: int = 5,
        categorical_column_indexes: List[int] = [],
        algorithm: str = "euclidean" # euclidean, manhattan, minkowski
    ) -> None:
        self.n_neighbors = n_neighbors
        self._categorical_column_indexes = categorical_column_indexes
        self._is_categorical_list: List[bool] = []
        self.algorithm = algorithm

    def fit(self, X: MatrixLike, y: MatrixLike | ArrayLike) -> 'KNeighborsClassifierFromScratch':
        self.X = X
        self.y = y
        self._is_categorical_list = [i in self._categorical_column_indexes for i in range(len(X[0]))] # instead of just [2, 4], we do [False, False, True, False, True, False, False] (assuming len(X[0]) == 7)
        return self
    
    def euclidean_distance(self, feature: ArrayLike, group: ArrayLike) -> float:
        distance = 0
        for i in range(len(feature)):
            if isinstance(feature[i], str) or isinstance(group[i], str): continue
            distance += (feature[i] - group[i]) ** 2
        # distance = distance ** 0.5
        return distance
    
    def manhattan_distance(self, feature: ArrayLike, group: ArrayLike) -> float:
        distance = 0
        for i in range(len(feature)):
            if isinstance(feature[i], str) or isinstance(group[i], str): continue
            distance += abs(feature[i] - group[i])
        return distance
    
    def minkowski_distance(self, feature: ArrayLike, group: ArrayLike, p: float = 3) -> float:
        distance = 0
        for i in range(len(feature)):
            if isinstance(feature[i], str) or isinstance(group[i], str): continue
            distance += abs(feature[i] - group[i]) ** p
        # distance = distance ** 1/p
        return distance

    def predict(self, X: MatrixLike) -> ArrayLike:
        predictions: List[any] = []

        distance_func = self.euclidean_distance
        if self.algorithm == "manhattan":
            distance_func = self.manhattan_distance
        elif self.algorithm == "minkowski":
            distance_func = self.minkowski_distance
        self.distance_func = distance_func
        
        progress = 0
        max_progress = len(X)
        for new_row in X:
            progress += 1
            print(f"\rProgress: {progress}/{max_progress}|{progress/max_progress}", end="")


            distance_list = []
            for i, dataset_row in enumerate(self.X):
                distance = distance_func(new_row, dataset_row)
                heappush(distance_list, (distance, dataset_row, self.y[i]))
                if len(distance_list) > self.n_neighbors:
                    heappop(distance_list)
            group_by = {}
            for distance, row, classification in distance_list:
                if classification not in group_by:
                    group_by[classification] = 0
                group_by[classification] += 1
            
            max_group = None
            max_count = 0
            for classification, count in group_by.items():
                if count > max_count:
                    max_count = count
                    max_group = classification

            predictions.append(max_group)
        
        print()

        return predictions

    def process_row(self, new_row):
        distance_func = self.distance_func
        distance_list = []
        for i, dataset_row in enumerate(self.X):
            distance = distance_func(new_row, dataset_row)
            heappush(distance_list, (distance, dataset_row, self.y[i]))
            if len(distance_list) > self.n_neighbors:
                heappop(distance_list)
        
        group_by = {}
        for distance, row, classification in distance_list:
            if classification not in group_by:
                group_by[classification] = 0
            group_by[classification] += 1
        
        max_group = None
        max_count = 0
        for classification, count in group_by.items():
            if count > max_count:
                max_count = count
                max_group = classification
        
        return max_group
        
    def predict_multiprocessing(self, X: MatrixLike) -> ArrayLike:
        predictions: List[any] = []

        distance_func = self.euclidean_distance
        if self.algorithm == "manhattan":
            distance_func = self.manhattan_distance
        elif self.algorithm == "minkowski":
            distance_func = self.minkowski_distance
        self.distance_func = distance_func


        with concurrent.futures.ProcessPoolExecutor() as executor:
            predictions = list(executor.map(self.process_row, X))

        return predictions
    



In [3]:
# Toy data set used to sanity-check the classifiers.
# Row layout: [number, weather, number, number, temperature]; columns 1 and 4 hold strings.
X_data = [
    [1, "sunny", 3, 1, "hot"],
    [2, "sunny", 2, 2, "hot"],
    [1, "rainy", 1, 4, "cool"],
    [2, "rainy", 1, 3, "cool"],
    [4, "rainy", 3, 5, "hot"],
    [2, "rainy", 1, 4, "cool"],
]
# One label per row of X_data, in the same order.
y_data = ["no", "no", "yes", "yes", "no", "yes"]

# Unlabelled rows to classify.
new_data = [
    [1, "rainy", 1, 4, "cool"],
    [5, "sunny", 2, 3, "cool"],
]

# Positions of the string-valued columns in each row.
categorical_column_indexes = [1, 4]

In [4]:
# Fit the from-scratch classifier on the toy data and classify the two new rows.
knn = KNeighborsClassifierFromScratch(
    n_neighbors=5,
    categorical_column_indexes=categorical_column_indexes,
)
# fit() returns the classifier itself, so the call can be chained with predict().
predictions = knn.fit(X=X_data, y=y_data).predict(X=new_data)
print(predictions)

Progress: 2/2|1.0
['no', 'yes']


In [5]:
import pickle
# Persist the fitted classifier. The context manager closes (and flushes) the
# file handle even if pickling fails, instead of leaving it to the garbage collector.
with open("knn.pkl", 'wb') as model_file:
    pickle.dump(knn, model_file)

In [6]:
# Reload the classifier written by the previous cell and check that it still predicts.
# pickle.load can execute arbitrary code, so only load files this notebook produced itself.
with open("knn.pkl", 'rb') as model_file:
    knn_loaded: KNeighborsClassifierFromScratch = pickle.load(model_file)
predictions = knn_loaded.predict(X=new_data)
print(predictions)

Progress: 2/2|1.0
['no', 'yes']


In [7]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import LabelEncoder
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
import numbers

def get_sklearn_prediction(X_data=X_data, y_data=y_data, new_data=new_data, categorical_column_indexes=categorical_column_indexes, n_neighbors=5):
    """Reference prediction using scikit-learn's KNeighborsClassifier (manhattan metric).

    Missing values are filled with the most frequent value of each column, the
    columns listed in ``categorical_column_indexes`` are label-encoded, and the
    targets are label-encoded for fitting and decoded again before returning.

    The defaults are the module-level ``X_data``, ``y_data``, ``new_data`` and
    ``categorical_column_indexes`` as they exist when this function is defined.

    Returns:
        The predicted labels for ``new_data`` in their original (decoded) form.
    """
    train_matrix = np.array(X_data, dtype=object)
    query_matrix = np.array(new_data, dtype=object)

    # Column-wise mode imputation, fitted on the training rows only.
    mode_imputer = SimpleImputer(strategy="most_frequent")
    train_matrix = mode_imputer.fit_transform(train_matrix)
    query_matrix = mode_imputer.transform(query_matrix)

    # Replace each categorical column by integer codes learned from the training rows.
    for column in categorical_column_indexes:
        column_encoder = LabelEncoder()
        train_matrix[:, column] = column_encoder.fit_transform(train_matrix[:, column])
        query_matrix[:, column] = column_encoder.transform(query_matrix[:, column])

    target_encoder = LabelEncoder()
    encoded_targets = target_encoder.fit_transform(y_data)

    classifier = KNeighborsClassifier(n_neighbors=n_neighbors, metric="manhattan")
    classifier.fit(train_matrix, encoded_targets)

    return target_encoder.inverse_transform(classifier.predict(query_matrix))

def get_sklearn_prediction_onehot(X_data, y_data, new_data, categorical_column_indexes, n_neighbors=5):
    """Reference prediction: one-hot encode categoricals, impute the rest, then KNN.

    Args:
        X_data: training features as a 2-D table (DataFrame, ndarray or list of rows).
        y_data: training labels, one per row of ``X_data``.
        new_data: rows to classify, with the same column layout as ``X_data``.
        categorical_column_indexes: positional indexes of the columns to one-hot encode.
            Every other column is imputed with its most frequent value.
        n_neighbors: number of neighbours for the manhattan-metric KNeighborsClassifier.

    Returns:
        The labels predicted for ``new_data``.
    """
    # np.shape reads .shape when present and otherwise measures the nested lists,
    # so list-of-rows input works as well as a DataFrame.
    n_features = np.shape(X_data)[1]
    categorical = set(categorical_column_indexes)
    numeric_column_indexes = [i for i in range(n_features) if i not in categorical]

    imputer = SimpleImputer(strategy="most_frequent")
    try:
        # scikit-learn >= 1.2 spells the dense-output switch `sparse_output`;
        # the old `sparse` keyword was removed in 1.4.
        onehot_encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
    except TypeError:
        # Older scikit-learn only knows the original `sparse` keyword.
        onehot_encoder = OneHotEncoder(sparse=False, handle_unknown='ignore')

    preprocessor = ColumnTransformer(
        transformers=[
            ('num', imputer, numeric_column_indexes),  # Apply imputer on non-categorical
            ('cat', onehot_encoder, categorical_column_indexes)  # Apply OneHotEncoder on categorical columns
        ]
    )

    knn_pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('classifier', KNeighborsClassifier(n_neighbors=n_neighbors, metric="manhattan"))
    ])

    knn_pipeline.fit(X_data, y_data)
    predictions = knn_pipeline.predict(new_data)
    return predictions



# Run the scikit-learn reference on its default arguments and print the predicted labels.
print(get_sklearn_prediction())


['yes' 'yes']


In [8]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats
from functools import reduce

In [9]:
# Split the data into a training set and a validation set here; store them in train_df and val_df.
# Remember to also keep the original data set from before the split. This will become important later.

from sklearn.model_selection import train_test_split

# Hold out 20% of the merged rows for validation; the fixed seed keeps the split reproducible.
test_size = 0.2
random_state = 43

train_df, val_df = train_test_split(
    merged,
    test_size=test_size,
    random_state=random_state,
)

# Report how many rows ended up in each part.
print(
    f"Original dataset size: {len(merged)}",
    f"Training set size: {len(train_df)}",
    f"Validation set size: {len(val_df)}",
    sep="\n",
)

Original dataset size: 175341
Training set size: 140272
Validation set size: 35069


In [10]:
# Show the training split as the cell's rich output.
train_df

Unnamed: 0,proto,id,state,dur,sbytes,dbytes,sttl,dttl,sloss,dloss,...,dmean,trans_depth,response_body_len,sjit,djit,sinpkt,dinpkt,tcprtt,synack,ackdat
55598,ospf,55598,REQ,58.119190,17408.0,0.0,254.0,0.0,0.0,0.0,...,0.0,,,1042.675000,0.000000,921.043312,0.000000,0.000000,0.000000,0.000000
84062,tcp,84062,FIN,0.006253,3936.0,2456.0,31.0,29.0,7.0,7.0,...,136.0,0.0,0.0,14.209823,0.481391,0.283765,0.360187,0.000616,0.000487,0.000129
36107,udp,36107,INT,0.000012,104.0,0.0,254.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.000000,0.012000,0.000000,0.000000,,0.000000
74113,tcp,74113,FIN,0.899680,544.0,,254.0,252.0,2.0,1.0,...,44.0,0.0,,5712.086967,164.971937,97.180000,108.220141,0.247478,0.142138,0.105340
175165,tcp,175165,FIN,0.698121,588.0,354.0,254.0,252.0,,1.0,...,44.0,,0.0,4882.868634,147.099203,73.837111,89.634711,0.137728,0.070676,0.067052
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
129082,tcp,129082,FIN,0.205702,866.0,268.0,254.0,252.0,2.0,1.0,...,45.0,1.0,0.0,1116.137984,62.388355,21.158444,39.106199,0.058383,0.010162,0.048221
125205,udp,125205,INT,,114.0,0.0,254.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.000000,0.005000,0.000000,0.000000,0.000000,0.000000
40753,tcp,40753,FIN,0.018372,1540.0,1644.0,31.0,29.0,4.0,4.0,...,91.0,0.0,0.0,73.033493,,1.204400,1.053529,0.000588,0.000459,0.000129
150848,tcp,150848,FIN,0.106734,3728.0,5474.0,31.0,29.0,7.0,7.0,...,228.0,0.0,0.0,340.248166,11.673940,,4.610957,0.000676,,0.000122


In [11]:
# Columns treated as categorical (this list includes the targets 'attack_cat' and 'label').
categorical_feats = [
    'proto', 'state', 'service',
    'is_sm_ips_ports', 'is_ftp_login',
    'attack_cat', 'label',
]

# Columns treated as numeric.
numeric_feats = [
    'dur', 'sbytes', 'dbytes', 'sttl', 'dttl', 'sloss', 'dloss',
    'sload', 'dload', 'spkts', 'dpkts',
    'swin', 'dwin', 'stcpb', 'dtcpb',
    'smean', 'dmean', 'trans_depth', 'response_body_len',
    'sjit', 'djit', 'sinpkt', 'dinpkt',
    'tcprtt', 'synack', 'ackdat',
    'ct_state_ttl', 'ct_flw_http_mthd', 'ct_ftp_cmd',
    'ct_srv_src', 'ct_srv_dst', 'ct_dst_ltm', 'ct_src_ltm',
    'ct_src_dport_ltm', 'ct_dst_sport_ltm', 'ct_dst_src_ltm',
]

# Categorical columns first, then numeric ones.
feats = [*categorical_feats, *numeric_feats]

def get_index_of_columns(df: pd.DataFrame, columns: List[str]) -> List[int]:
    """Look up the position of each name in ``columns`` within ``df.columns``.

    The result follows the order of ``columns``; each entry is whatever
    ``df.columns.get_loc`` returns for that name.
    """
    positions = []
    for name in columns:
        positions.append(df.columns.get_loc(name))
    return positions

# Positions of the categorical columns inside the frame restricted to `feats`
# (this rebinds the name used for the toy data set earlier in the notebook).
categorical_column_indexes = get_index_of_columns(
    df=train_df[feats],
    columns=categorical_feats,
)
print(categorical_column_indexes)

[0, 1, 2, 3, 4, 5, 6]


In [12]:
# Write your code here

from sklearn.impute import SimpleImputer

def impute_missing_values(df, verbose=True, numeric_cols=None, categorical_cols=None):
    """Return a copy of ``df`` with missing values filled in; ``df`` itself is not modified.

    Numeric columns are filled with their mean when the column's skewness is
    below 1 in absolute value, otherwise with their median. Categorical columns
    are filled with their most frequent value. All statistics are computed from
    ``df`` itself.

    Args:
        df: frame to impute.
        verbose: print one line per imputed column.
        numeric_cols: columns to treat as numeric.
        categorical_cols: columns to treat as categorical.
            When both column lists are empty or ``None``, they are inferred from
            the dtypes (numeric dtypes vs. everything else). When at least one
            list is given, exactly the listed columns are processed.

    Returns:
        The imputed copy. Columns that contain no non-missing value are left unchanged.
    """
    processed_df = df.copy()

    # Fresh lists: avoids mutable default arguments and accepts any iterable (e.g. an Index).
    numeric_cols = [] if numeric_cols is None else list(numeric_cols)
    categorical_cols = [] if categorical_cols is None else list(categorical_cols)

    # Infer only when the caller specified nothing, so an explicitly passed
    # categorical list is never silently replaced.
    if not numeric_cols and not categorical_cols:
        numeric_cols = list(processed_df.select_dtypes(include=np.number).columns)
        categorical_cols = list(processed_df.select_dtypes(exclude=np.number).columns)

    for col in numeric_cols:
        column = processed_df[col]
        if not column.isnull().any():
            continue

        # A NaN skewness (too few values) fails the comparison and selects the median.
        strategy = 'mean' if abs(column.skew()) < 1 else 'median'
        fill_value = column.mean() if strategy == 'mean' else column.median()
        if pd.isna(fill_value):
            continue  # nothing to impute from: the column has no usable value

        processed_df[col] = column.fillna(fill_value)
        if verbose:
            print(f"Imputed numerical column '{col}' using {strategy}.")

    for col in categorical_cols:
        column = processed_df[col]
        if not column.isnull().any():
            continue

        modes = column.mode(dropna=True)
        if modes.empty:
            continue  # column is entirely missing

        processed_df[col] = column.fillna(modes.iloc[0])
        if verbose:
            print(f"Imputed categorical column '{col}' using most_frequent.")

    return processed_df



In [13]:
# Number of missing values in each column of the training split.
print(train_df.isnull().sum())

proto                7089
id                      0
state                7062
dur                  6950
sbytes               6892
dbytes               7097
sttl                 7106
dttl                 6953
sloss                7056
dloss                7199
service              7022
sload                7061
dload                7122
spkts                6879
dpkts                6906
is_sm_ips_ports      7102
ct_state_ttl         6918
ct_flw_http_mthd     6955
is_ftp_login         6885
ct_ftp_cmd           7112
ct_srv_src           7119
ct_srv_dst           7039
ct_dst_ltm           6960
ct_src_ltm           7032
ct_src_dport_ltm     7039
ct_dst_sport_ltm     7077
ct_dst_src_ltm       7152
attack_cat              0
label                   0
swin                 6947
dwin                 6994
stcpb                6911
dtcpb                7039
smean                6994
dmean                7103
trans_depth          7030
response_body_len    6974
sjit                 6984
djit        

In [14]:
# Fill the gaps in the training split (quietly), then re-count missing values per column.
train_df = impute_missing_values(
    train_df,
    verbose=False,
    numeric_cols=numeric_feats,
    categorical_cols=categorical_feats,
)
print(train_df.isnull().sum())

proto                0
id                   0
state                0
dur                  0
sbytes               0
dbytes               0
sttl                 0
dttl                 0
sloss                0
dloss                0
service              0
sload                0
dload                0
spkts                0
dpkts                0
is_sm_ips_ports      0
ct_state_ttl         0
ct_flw_http_mthd     0
is_ftp_login         0
ct_ftp_cmd           0
ct_srv_src           0
ct_srv_dst           0
ct_dst_ltm           0
ct_src_ltm           0
ct_src_dport_ltm     0
ct_dst_sport_ltm     0
ct_dst_src_ltm       0
attack_cat           0
label                0
swin                 0
dwin                 0
stcpb                0
dtcpb                0
smean                0
dmean                0
trans_depth          0
response_body_len    0
sjit                 0
djit                 0
sinpkt               0
dinpkt               0
tcprtt               0
synack               0
ackdat     

In [22]:
# Separate the prediction target ('attack_cat') from the remaining columns of both splits.
# NOTE(review): val_df is used as-is here, whereas train_df went through
# impute_missing_values in an earlier cell — confirm the validation rows should keep their NaNs.
# NOTE(review): only 'attack_cat' is dropped, so 'id' and 'label' remain feature columns —
# confirm that is intended.
attack_cat_df = train_df['attack_cat']
train_df_no_attack_cat = train_df.drop(columns=['attack_cat'])
val_df_attack_cat = val_df['attack_cat']
val_df_no_attack_cat = val_df.drop(columns=['attack_cat'])

# Categorical column names without the target, and their positions in the feature frame.
categorical_feats_no_attack_cat = categorical_feats.copy()
categorical_feats_no_attack_cat.remove('attack_cat')
train_df_categorical_column_indexes = get_index_of_columns(
    train_df_no_attack_cat,
    categorical_feats_no_attack_cat,
)

# Peek at the first training row, its label, the first validation row, and the indexes.
print(train_df_no_attack_cat.values[0].tolist())
print(attack_cat_df.values[0])
print(val_df_no_attack_cat.values[0].tolist())
print(train_df_categorical_column_indexes)

['ospf', 55598, 'REQ', 58.11919, 17408.0, 0.0, 254.0, 0.0, 0.0, 0.0, '-', 2358.739014, 0.0, 64.0, 0.0, 0.0, 6.0, 0.0, 0.0, 0.0, 1.0, 1.0, 19.0, 20.0, 1.0, 1.0, 21.0, 1, 0.0, 0.0, 0.0, 0.0, 272.0, 0.0, 0.0, 0.0, 1042.675, 0.0, 921.043312, 0.0, 0.0, 0.0, 0.0]
Exploits
['tcp', 80109, 'FIN', 0.550909, 796.0, 652.0, 254.0, 252.0, 3.0, 2.0, 'smtp', 10600.66211, 8524.09375, 12.0, nan, 0.0, 1.0, 0.0, 0.0, 0.0, 2.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1, 255.0, 255.0, 3369757399.0, 2383282739.0, 66.0, 65.0, 0.0, 0.0, 3276.020352, 111.681938, 50.082636, 58.280445, 0.082811, 0.025384, nan]
[0, 2, 10, 15, 18, 27]


In [16]:
# Row counts of the training features, training labels and validation features.
# len() on the frame/series gives the same number without first converting
# every row into a nested Python list.
print(len(train_df_no_attack_cat))
print(len(attack_cat_df))
print(len(val_df_no_attack_cat))

140272
140272
35069


In [17]:
# Run the from-scratch classifier on a 100-row sample of the training split
# and classify every validation row.
# NOTE(review): n_neighbors=200 is larger than the 100 training rows kept here, so all
# 100 rows take part in every vote — confirm this is intended.
knn = KNeighborsClassifierFromScratch(
    n_neighbors=200,
    categorical_column_indexes=train_df_categorical_column_indexes,
)
knn.fit(
    X=train_df_no_attack_cat.values.tolist()[:100],
    y=attack_cat_df.values.tolist()[:100],
)
predictions = knn.predict(val_df_no_attack_cat.values.tolist())
print(predictions)

Progress: 35069/35069|1.09997148478713397
['Normal', 'Normal', 'Normal', 'Normal', 'Normal', 'Normal', 'Normal', 'Normal', 'Normal', 'Normal', 'Normal', 'Normal', 'Normal', 'Normal', 'Normal', 'Normal', 'Normal', 'Normal', 'Normal', 'Normal', 'Normal', 'Normal', 'Normal', 'Normal', 'Normal', 'Normal', 'Normal', 'Normal', 'Normal', 'Normal', 'Normal', 'Normal', 'Normal', 'Normal', 'Normal', 'Normal', 'Normal', 'Normal', 'Normal', 'Normal', 'Normal', 'Normal', 'Normal', 'Normal', 'Normal', 'Normal', 'Normal', 'Normal', 'Normal', 'Normal', 'Normal', 'Normal', 'Normal', 'Normal', 'Normal', 'Normal', 'Normal', 'Normal', 'Normal', 'Normal', 'Normal', 'Normal', 'Normal', 'Normal', 'Normal', 'Normal', 'Normal', 'Normal', 'Normal', 'Normal', 'Normal', 'Normal', 'Normal', 'Normal', 'Normal', 'Normal', 'Normal', 'Normal', 'Normal', 'Normal', 'Normal', 'Normal', 'Normal', 'Normal', 'Normal', 'Normal', 'Normal', 'Normal', 'Normal', 'Normal', 'Normal', 'Normal', 'Normal', 'Normal', 'Normal', 'Normal

In [18]:
# knn = KNeighborsClassifierFromScratch(n_neighbors=3, categorical_column_indexes=train_df_categorical_column_indexes)
# knn.fit(X=train_df_no_attack_cat.values.tolist()[:10], y=attack_cat_df.values.tolist()[:10])
# predictions = knn.predict_multiprocessing(val_df_no_attack_cat.values.tolist()[:10])
# print(predictions)

In [23]:
# sklearn_prediction = get_sklearn_prediction(
#     X_data=train_df_no_attack_cat.values.tolist(),
#     y_data=attack_cat_df.values.tolist(),
#     new_data=val_df_no_attack_cat.values.tolist(),
#     categorical_column_indexes=train_df_categorical_column_indexes
# )
# print(sklearn_prediction)

# Fit the one-hot scikit-learn pipeline on the training features/labels and predict
# the validation rows. X_data is passed as a DataFrame, new_data as a list of row lists;
# the categorical columns are identified by position.
sklearn_prediction = get_sklearn_prediction_onehot(
    X_data=train_df_no_attack_cat,
    y_data=attack_cat_df,
    new_data=val_df_no_attack_cat.values.tolist(),
    categorical_column_indexes=train_df_categorical_column_indexes
)


# import numpy as np
# from sklearn.neighbors import KNeighborsClassifier
# import numbers

# def get_sklearn_prediction_custom_distance(X_data, y_data, new_data, categorical_column_indexes, n_neighbors=5):
#     # Split the data into numerical and categorical features
#     X_data_numeric = X_data.drop(columns=categorical_column_indexes)
#     X_data_categorical = X_data.iloc[:, categorical_column_indexes]
    
#     new_data_numeric = new_data.drop(columns=categorical_column_indexes)
#     new_data_categorical = new_data.iloc[:, categorical_column_indexes]
    
#     # Define the custom distance function
#     def metric(a, b):
#         distance = 0
#         for i in range(len(a)):
#             if isinstance(a[i], numbers.Number) and isinstance(b[i], numbers.Number):
#                 # Euclidean distance for numerical values
#                 distance += (a[i] - b[i]) ** 2
#             else:
#                 # For categorical values, return 1 if different, 0 if the same
#                 distance += 0 if a[i] == b[i] else 1
#         return np.sqrt(distance)  # Return the Euclidean distance as a scalar

#     # Combine both numerical and categorical data
#     X_combined = np.hstack((X_data_numeric.values, X_data_categorical.values))
#     new_data_combined = np.hstack((new_data_numeric.values, new_data_categorical.values))
    
#     # Create a KNeighborsClassifier with the custom distance function
#     knn_sklearn = KNeighborsClassifier(n_neighbors=n_neighbors, metric=metric)
#     knn_sklearn.fit(X_combined, y_data)
#     predictions = knn_sklearn.predict(new_data_combined)

#     return predictions

# Example usage:
# Assuming train_df_no_attack_cat, attack_cat_df, and val_df_no_attack_cat are your datasets
# Ensure categorical_column_indexes is a list of indices of categorical columns

# sklearn_prediction = get_sklearn_prediction_custom_distance(
#     X_data=train_df_no_attack_cat,
#     y_data=attack_cat_df,
#     new_data=val_df_no_attack_cat.values.tolist()[:50],
#     categorical_column_indexes=train_df_categorical_column_indexes
# )

# Print the labels predicted by the scikit-learn pipeline for the validation rows.
print(sklearn_prediction)





['Exploits' 'Normal' 'Normal' ... 'Exploits' 'Exploits' 'Generic']


In [20]:
# from sklearn.metrics import f1_score
# from sklearn.preprocessing import LabelEncoder

# def get_score(predictions: List[any], reality: List[any]) -> float:
#     predictions = list(predictions)
#     reality = list(reality)
#     label_encoder = LabelEncoder()
#     all_labels = list(set(reality + predictions))
#     label_encoder.fit(all_labels)

#     encoded_predictions = label_encoder.transform(predictions)
#     encoded_reality = label_encoder.transform(reality)

#     macro_f1 = f1_score(encoded_reality, encoded_predictions, average='macro')
#     return macro_f1

# def count_differences(predictions: List[any], reality: List[any]) -> int:
#     count = 0
#     for i in range(len(predictions)):
#         if predictions[i] != reality[i]:
#             count += 1
#     return count, (1 - count / len(predictions))

# print(get_score(predictions=predictions, reality=sklearn_prediction))
# print(count_differences(predictions=predictions, reality=sklearn_prediction))