# Evaluación de Resultados

In [1]:
import arff
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import RobustScaler
from sklearn.base import BaseEstimator, TransformerMixin

In [2]:
def load_kdd_dataset(data_path):
    """Read the ARFF file at ``data_path`` (NSL-KDD dataset) into a DataFrame.

    Column names are taken from the first element of each entry in the parsed
    file's ``attributes`` list; rows come from its ``data`` entry.
    """
    with open(data_path, 'r') as arff_file:
        parsed = arff.load(arff_file)

    column_names = []
    for attribute in parsed['attributes']:
        column_names.append(attribute[0])

    return pd.DataFrame(parsed['data'], columns=column_names)

In [3]:
def train_val_test_split(df, rstate=42, shuffle=True, stratify=None):
    """Split ``df`` into train, validation and test subsets (60% / 20% / 20%).

    The frame is first split 60/40, then the 40% remainder is halved into
    validation and test sets.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to split.
    rstate : int
        ``random_state`` forwarded to both ``train_test_split`` calls.
    shuffle : bool
        ``shuffle`` flag forwarded to both ``train_test_split`` calls.
    stratify : str or None
        Name of the column to stratify on; no stratification when falsy.

    Returns
    -------
    tuple
        ``(train_set, val_set, test_set)``.
    """
    def _strat_labels(frame):
        # The labels must be re-read from whichever frame is being split.
        return frame[stratify] if stratify else None

    train_set, holdout = train_test_split(
        df,
        test_size=0.4,
        random_state=rstate,
        shuffle=shuffle,
        stratify=_strat_labels(df),
    )
    val_set, test_set = train_test_split(
        holdout,
        test_size=0.5,
        random_state=rstate,
        shuffle=shuffle,
        stratify=_strat_labels(holdout),
    )
    return train_set, val_set, test_set

In [4]:
# Pipeline for the numeric attributes: fill missing values with the column
# median, then scale with RobustScaler.
# NOTE(review): the imputer step is labelled 'Category', which reads like a
# leftover name — confirm before renaming, since step names are part of the
# pipeline's parameter keys.
num_pipeline = Pipeline(steps=[
    ('Category', SimpleImputer(strategy="median")),
    ('rbst_scaler', RobustScaler()),
])

In [5]:
# Transformer that one-hot encodes only the categorical (object-dtype) columns
# and returns a DataFrame.
class CustomOneHotEncoder(BaseEstimator, TransformerMixin):
    """One-hot encode the object-dtype columns of a DataFrame.

    Columns of any other dtype are kept unchanged; the encoded columns are
    joined after them, named ``<column>_<category>`` and indexed like the input.
    """

    def __init__(self):
        # Dense output so the encoded matrix can be wrapped in a DataFrame.
        self._oh = OneHotEncoder(sparse_output=False)
        # Names of the encoded output columns; populated by fit().
        self._columns = None

    def fit(self, X, y=None):
        """Learn the categories of every object-dtype column of ``X``.

        Parameters
        ----------
        X : pandas.DataFrame
            Input data; only its object-dtype columns are used.
        y : ignored

        Returns
        -------
        CustomOneHotEncoder
            ``self``, fitted.
        """
        X_cat = X.select_dtypes(include=['object'])
        self._oh.fit(X_cat)
        # Ask the fitted encoder for its output names instead of rebuilding
        # them with pd.get_dummies: get_dummies drops NaN while the encoder
        # keeps it as a category, which made the name count disagree with the
        # encoded matrix (and it materialised a full dummy frame just for names).
        self._columns = self._oh.get_feature_names_out(
            [str(column) for column in X_cat.columns])
        return self

    def transform(self, X, y=None):
        """Return ``X`` with its object-dtype columns replaced by one-hot columns.

        Parameters
        ----------
        X : pandas.DataFrame
            Data with the same object-dtype columns seen in ``fit``.
        y : ignored

        Returns
        -------
        pandas.DataFrame
            Non-object columns of ``X`` followed by the encoded columns.
            ``X`` itself is not modified.
        """
        X_cat = X.select_dtypes(include=['object'])
        X_cat_oh = pd.DataFrame(self._oh.transform(X_cat),
                                columns=self._columns,
                                index=X.index)
        # drop() returns a new frame, so the caller's DataFrame is left intact.
        return X.drop(columns=list(X_cat.columns)).join(X_cat_oh)

In [6]:
# Transformer that prepares the whole dataset by combining the numeric
# pipeline with the custom one-hot encoder.
class DataFramePreparer(BaseEstimator, TransformerMixin):
    """Apply ``num_pipeline`` to non-object columns and one-hot encode the rest.

    The result is a DataFrame whose columns are the numeric columns (in their
    original order) followed by the one-hot columns, indexed like the input.
    """

    def __init__(self):
        # Fitted ColumnTransformer; populated by fit().
        self._full_pipeline = None
        # Output column names; populated by fit().
        self._columns = None

    def fit(self, X, y=None):
        """Fit the numeric pipeline and the categorical encoder on ``X``.

        Parameters
        ----------
        X : pandas.DataFrame
            Data used to learn imputation/scaling statistics and categories.
        y : ignored

        Returns
        -------
        DataFramePreparer
            ``self``, fitted.
        """
        num_attribs = list(X.select_dtypes(exclude=['object']))
        cat_attribs = list(X.select_dtypes(include=['object']))
        self._full_pipeline = ColumnTransformer([
            ("num", num_pipeline, num_attribs),
            ("cat", CustomOneHotEncoder(), cat_attribs),
        ])
        self._full_pipeline.fit(X)

        # Build the output names from what was actually fitted rather than
        # from a separate pd.get_dummies(X) pass, so the names always line up
        # with the encoder's output and no throw-away dummy frame is created.
        columns = list(num_attribs)
        if cat_attribs:
            # Only read the encoder when it was given columns to fit on.
            fitted_encoder = self._full_pipeline.named_transformers_["cat"]
            columns.extend(fitted_encoder._columns)
        self._columns = columns
        return self

    def transform(self, X, y=None):
        """Transform ``X`` with the fitted pipeline and return a DataFrame.

        Parameters
        ----------
        X : pandas.DataFrame
            Data containing the columns seen in ``fit``.
        y : ignored

        Returns
        -------
        pandas.DataFrame
            Prepared data with the column names computed in ``fit`` and the
            index of ``X``.
        """
        X_prep = self._full_pipeline.transform(X)
        return pd.DataFrame(X_prep,
                            columns=self._columns,
                            index=X.index)

In [7]:
# Read the dataset.
# NOTE(review): this is an absolute, machine-specific path — consider a
# configurable data directory so the notebook runs elsewhere.
# NOTE(review): the displayed frame has an "Unnamed: 0" column, which suggests
# the CSV stores its row index as the first column — confirm whether
# index_col=0 is wanted.
file_path = '/home/abril/anaconda3/envs/Simulacion/datasets/datasets/sales_dataset_50000.csv'  
data = pd.read_csv(file_path)

In [8]:
# Display the loaded DataFrame.
data

Unnamed: 0,Transaction_ID,Date,Product,Category,Quantity_Sold,Unit_Price,Discount,Region,Payment_Method,Total
0,4c98f095-7aa9-4e73-a94b-a79e84c83cc7,2024-12-07,Smartphone,Electronics,9,958.94,0.15,South,Cash,7335.89
1,68ff3320-6da2-4747-8851-b6aa687445ab,2023-05-10,Monitor,Peripherals,4,1045.36,0.39,North,Credit Card,2550.68
2,b48d6d6a-fdf3-4f0d-9da5-6c511bf016d5,2024-03-02,Laptop,Electronics,7,1929.63,0.49,West,Credit Card,6888.78
3,ac017db5-a08e-471c-a685-bec091e29345,2023-12-01,Headphones,Accessories,9,710.64,0.41,West,Credit Card,3773.50
4,571c3ded-3a36-451b-940a-dd489d1c303b,2024-12-31,Tablet,Electronics,2,14.76,0.01,East,Cash,29.22
...,...,...,...,...,...,...,...,...,...,...
49995,8593e5c4-ebda-4003-bf66-3f256fd915de,2024-11-07,Keyboard,Accessories,7,318.21,0.12,West,Cash,1960.17
49996,ee6933c4-c88f-4618-8486-721a3bcc0ad0,2023-12-14,Laptop,Accessories,8,1225.89,0.33,South,Bank Transfer,6570.77
49997,28a576ad-b0b8-4958-b17c-20f8bbabf2d4,2023-07-25,Headphones,Accessories,4,1308.80,0.26,West,Bank Transfer,3874.05
49998,ee8f8bab-4e4d-47c7-9fc7-4a0d54481a19,2024-09-07,Headphones,Electronics,4,1447.92,0.03,West,Bank Transfer,5617.93


In [9]:
# Split the full frame into train / validation / test subsets using the
# helper's default arguments.
train_set, val_set, test_set = train_val_test_split(data)

In [10]:
# Report how many rows ended up in each split.
for split_label, split_frame in (
    ('Longitud del Training Set', train_set),
    ('Longitud del Validation', val_set),
    ('Longitud del Test Set', test_set),
):
    print(split_label, len(split_frame))

Longitud del Training Set 30000
Longitud del Validation 10000
Longitud del Test Set 10000


In [11]:
# Full dataset: target is the "Product" column, features are everything else.
# NOTE(review): the features still include "Unnamed: 0", "Transaction_ID" and
# "Date"; the displayed values suggest the last two are near-unique strings
# that would be one-hot encoded downstream — confirm they should be features.
y_data = data["Product"].copy()
X_data = data.drop(columns="Product")

In [12]:
# Training set: target is the "Product" column, features are everything else.
y_train = train_set["Product"].copy()
X_train = train_set.drop(columns="Product")

In [13]:
# Validation set: target is the "Product" column, features are everything else.
y_val = val_set["Product"].copy()
X_val = val_set.drop(columns="Product")

In [14]:
# Test set: target is the "Product" column, features are everything else.
y_test = test_set["Product"].copy()
X_test = test_set.drop(columns="Product")

In [15]:
# Instantiate the custom preparation transformer.
data_preparer = DataFramePreparer()

In [None]:
# Fit on the full dataset so the encoder sees every possible category value.
# NOTE(review): this also fits the median imputer and the robust scaler on
# rows that later land in the validation and test sets — confirm that this
# leakage is acceptable for the evaluation.
data_preparer.fit(X_data)

In [None]:
# Transform the training set with the fitted preparer.
X_train_prep = data_preparer.transform(X_train)

In [None]:
# Preview the first five rows of the prepared training set.
X_train_prep.head(5)

In [None]:
from sklearn.linear_model import LogisticRegression

# Train a logistic-regression classifier on the prepared training set.
# max_iter is raised to 5000 (presumably so the solver can converge — confirm).
clf = LogisticRegression(max_iter = 5000)
clf.fit(X_train_prep, y_train)

In [None]:
# Prepare the validation set with the fitted preparer. X_val_prep was never
# defined before this point, so predicting on it raised NameError on a fresh
# kernel; the later evaluation cells reuse it as well.
X_val_prep = data_preparer.transform(X_val)

# Predicted labels for the validation set.
y_prep = clf.predict(X_val_prep)

In [None]:
from sklearn.metrics import confusion_matrix
# Raw confusion-matrix counts for the validation predictions.
# (A commented-out call to the legacy plot_confusion_matrix helper used to sit
# here; the plotted version is produced with ConfusionMatrixDisplay instead.)
confusion_matrix(y_val, y_prep)

In [None]:
from sklearn.metrics import ConfusionMatrixDisplay

# Plot the confusion matrix of clf on the validation set; values_format="3g"
# is the format spec applied to the cell counts.
ConfusionMatrixDisplay.from_estimator(clf, X_val_prep, y_val, values_format="3g")

In [None]:
from sklearn.metrics import precision_score

# "Product" has more than two classes, so the default average="binary" with a
# pos_label raises; "Total" is also a column name, not a class label.
# Report the unweighted mean of the per-class precision instead.
print("Presición:", precision_score(y_val, y_prep, average="macro"))

In [None]:
from sklearn.metrics import recall_score

# "Product" has more than two classes, so the default average="binary" with a
# pos_label raises; "Total" is also a column name, not a class label.
# Report the unweighted mean of the per-class recall instead.
print("Recall:", recall_score(y_val, y_prep, average="macro"))

In [None]:
from sklearn.metrics import RocCurveDisplay

# RocCurveDisplay.from_estimator only supports binary targets and "Product" is
# multiclass, so draw one one-vs-rest ROC curve per class on shared axes.
y_val_scores = clf.predict_proba(X_val_prep)
roc_ax = None  # the first call creates the axes; later calls reuse them
for class_idx, class_label in enumerate(clf.classes_):
    roc_display = RocCurveDisplay.from_predictions(
        (y_val == class_label).astype(int),  # 1 for this class, 0 for the rest
        y_val_scores[:, class_idx],          # predicted probability of this class
        name=f"{class_label} vs rest",
        ax=roc_ax,
    )
    roc_ax = roc_display.ax_

In [None]:
from sklearn.metrics import PrecisionRecallDisplay

# PrecisionRecallDisplay.from_estimator only supports binary targets and
# "Product" is multiclass, so draw one one-vs-rest curve per class on shared axes.
y_val_proba = clf.predict_proba(X_val_prep)
pr_ax = None  # the first call creates the axes; later calls reuse them
for class_idx, class_label in enumerate(clf.classes_):
    pr_display = PrecisionRecallDisplay.from_predictions(
        (y_val == class_label).astype(int),  # 1 for this class, 0 for the rest
        y_val_proba[:, class_idx],           # predicted probability of this class
        name=f"{class_label} vs rest",
        ax=pr_ax,
    )
    pr_ax = pr_display.ax_

In [None]:
# Prepare the test set with the already-fitted preparer and predict its labels.
X_test_prep = data_preparer.transform(X_test)
y_pred = clf.predict(X_test_prep)

In [None]:
from sklearn.metrics import ConfusionMatrixDisplay

# Confusion matrix on the held-out test set. This cell follows the test-set
# prediction but previously re-plotted the validation data.
ConfusionMatrixDisplay.from_estimator(clf, X_test_prep, y_test, values_format="3g")

In [None]:
from sklearn.metrics import f1_score

# "Product" has more than two classes, so the default average="binary" with a
# pos_label raises; "Total" is also a column name, not a class label.
# Report the unweighted mean of the per-class F1 on the test set instead.
print("F1 score:", f1_score(y_test, y_pred, average="macro"))