In [1]:
import pandas as pd 
import numpy as np 
import arff
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix


In [2]:
with open("../datasets/NSL-KDD/KDDTrain+.arff") as arch:
    KDD_Train = arff.load(arch)

KDD_Train.keys()

dict_keys(['description', 'relation', 'attributes', 'data'])

In [3]:
data = KDD_Train["data"]

In [4]:
val_columns = [val[0] for val in KDD_Train["attributes"]]

In [5]:
columns = KDD_Train["attributes"]

In [6]:
df = pd.DataFrame(data=data,columns=val_columns)

In [7]:
y = df[["class"]]
df_num = df.select_dtypes(include=['float64', 'int64'])
df_obj = df.drop(columns="class").select_dtypes(include="object")

num_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

obj_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

full_pipeline = ColumnTransformer(
    transformers=[
        ('num', num_pipeline,df_num.columns),
        ('cat', obj_pipeline,df_obj.columns)
    ])

In [8]:
def data_processing(df_num1,df_obj1):
    df = pd.concat((df_num1,df_obj1,y),axis=1)
    train_set, test_set = train_test_split(df,train_size=0.6,random_state=42)
    x_train = train_set.iloc[:,:-1]
    y_train = train_set.iloc[:,-1]
    x_test = test_set.iloc[:,:-1]
    y_test = test_set.iloc[:,-1]

    x_train_transf = full_pipeline.fit_transform(x_train)
    x_test_transf = full_pipeline.transform(x_test)
    #### EN ESTE CASO PORQUE QUERIA DIFERENCIAR ANOMALY CON 1 Y NORMAL CON 0
    y_train_transf = y_train.replace({"anomaly": 1, "normal": 0})
    y_test_transf = y_test.replace({"anomaly": 1, "normal": 0})
    return (x_train_transf,x_test_transf,y_train_transf,y_test_transf)

In [9]:
x_train_transf,x_test_transf,y_train_transf,y_test_transf = data_processing(df_num,df_obj)

In [10]:
clf = LogisticRegression()
clf.fit(x_train_transf,y_train_transf)
y_output = clf.predict(x_test_transf)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [11]:
def calcular_precicion(y_test,y_predict):
    # Calcular métricas
    precision = precision_score(y_test, y_predict)
    recall = recall_score(y_test, y_predict)
    f1 = f1_score(y_test, y_predict)
    accuracy = accuracy_score(y_test, y_predict)

    # Imprimir métricas
    print("Precision:", precision)
    print("Recall:", recall)
    print("F1 Score:", f1)
    print("Accuracy:", accuracy)

In [12]:
calcular_precicion(y_test_transf,y_output)

Precision: 0.9767181649954814
Recall: 0.961817180150019
F1 Score: 0.9692104026988939
Accuracy: 0.9713832109545545
