# Creación de Transformadores y Pipelines Personalizados

In [4]:
import arff
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, RobustScaler
from sklearn.utils.validation import check_is_fitted

In [5]:
from sklearn.model_selection import train_test_split

# División en entrenamiento, validación y prueba
def train_val_test_split(df, random_state=42, stratify=None):
    """Split a DataFrame into train (60%), validation (20%) and test (20%) sets.

    The split is done in two passes: 40% of the rows are held out first, and
    that hold-out is then halved into the validation and test sets.

    Args:
        df (DataFrame): Dataset to split.
        random_state (int): Seed passed to both ``train_test_split`` calls.
        stratify (str): Optional name of the column to stratify on; a falsy
            value disables stratification.

    Returns:
        tuple: ``(train_set, val_set, test_set)`` DataFrames.
    """
    def _labels(frame):
        # Stratification labels for ``frame``, or None when not stratifying.
        return frame[stratify] if stratify else None

    train_set, holdout = train_test_split(
        df,
        test_size=0.4,
        random_state=random_state,
        stratify=_labels(df),
    )
    val_set, test_set = train_test_split(
        holdout,
        test_size=0.5,
        random_state=random_state,
        stratify=_labels(holdout),
    )
    return train_set, val_set, test_set

In [6]:
# Load the sales dataset from CSV.
# NOTE(review): this is an absolute, machine-specific path; it will only
# resolve on the author's machine. Consider a configurable data directory.
file_path = '/home/abril/anaconda3/envs/Simulacion/datasets/datasets/sales_dataset_50000.csv'  
data = pd.read_csv(file_path)

In [7]:
data

Unnamed: 0,Transaction_ID,Date,Product,Category,Quantity_Sold,Unit_Price,Discount,Region,Payment_Method,Total
0,4c98f095-7aa9-4e73-a94b-a79e84c83cc7,2024-12-07,Smartphone,Electronics,9,958.94,0.15,South,Cash,7335.89
1,68ff3320-6da2-4747-8851-b6aa687445ab,2023-05-10,Monitor,Peripherals,4,1045.36,0.39,North,Credit Card,2550.68
2,b48d6d6a-fdf3-4f0d-9da5-6c511bf016d5,2024-03-02,Laptop,Electronics,7,1929.63,0.49,West,Credit Card,6888.78
3,ac017db5-a08e-471c-a685-bec091e29345,2023-12-01,Headphones,Accessories,9,710.64,0.41,West,Credit Card,3773.50
4,571c3ded-3a36-451b-940a-dd489d1c303b,2024-12-31,Tablet,Electronics,2,14.76,0.01,East,Cash,29.22
...,...,...,...,...,...,...,...,...,...,...
49995,8593e5c4-ebda-4003-bf66-3f256fd915de,2024-11-07,Keyboard,Accessories,7,318.21,0.12,West,Cash,1960.17
49996,ee6933c4-c88f-4618-8486-721a3bcc0ad0,2023-12-14,Laptop,Accessories,8,1225.89,0.33,South,Bank Transfer,6570.77
49997,28a576ad-b0b8-4958-b17c-20f8bbabf2d4,2023-07-25,Headphones,Accessories,4,1308.80,0.26,West,Bank Transfer,3874.05
49998,ee8f8bab-4e4d-47c7-9fc7-4a0d54481a19,2024-09-07,Headphones,Electronics,4,1447.92,0.03,West,Bank Transfer,5617.93


In [9]:
# Split into train/validation/test, stratifying on the 'Product' column.
train_set, val_set, test_set = train_val_test_split(data, stratify = 'Product')

In [10]:
# Report the number of rows in each of the three splits.
print('Longitud del Training Set', len(train_set))
print('Longitud del Validation', len(val_set))
print('Longitud del Test Set', len(test_set))

Longitud del Training Set 30000
Longitud del Validation 10000
Longitud del Test Set 10000


In [11]:
# Split the training set into the feature frame (everything except 'Total')
# and an independent copy of the 'Total' column.
X_train = train_set.drop("Total", axis=1)
y_train = train_set["Total"].copy()

In [14]:
# Transformador creado para eliminar las filas con valores nulos
from sklearn.base import BaseEstimator, TransformerMixin

class DeleteNanRows(BaseEstimator, TransformerMixin):
    """Stateless transformer that drops every row containing a missing value.

    Only ``X`` is filtered; a separately held target is not touched, so the
    caller has to re-align it (e.g. through the surviving index).
    """

    def __init__(self):
        # No hyperparameters to configure.
        pass

    def fit(self, X, y=None):
        """Learn nothing; present only to satisfy the fit/transform protocol.

        Returns:
            DeleteNanRows: ``self``.
        """
        return self

    def transform(self, X, y=None):
        """Return ``X`` without the rows that hold at least one missing value.

        Args:
            X: pandas object exposing ``dropna``.
            y: Ignored.

        Returns:
            A new object with the incomplete rows removed.
        """
        # Spell out the dropna defaults: drop along rows when any value is missing.
        return X.dropna(axis=0, how="any")

In [15]:
# Drop the training rows that contain missing values.
delete_nan = DeleteNanRows()
X_train_prep = delete_nan.fit_transform(X_train)

In [16]:
X_train_prep

Unnamed: 0,Transaction_ID,Date,Product,Category,Quantity_Sold,Unit_Price,Discount,Region,Payment_Method
28687,7a1857bb-a8b8-4b18-9fb0-3278ad966513,2024-11-09,Tablet,Peripherals,10,304.94,0.30,North,Cash
7504,5ef38507-c758-47ee-ad61-49f88d4045fe,2023-08-03,Smartphone,Electronics,3,328.47,0.03,South,Cash
23254,d990cdb0-3105-4930-bd47-6f7172d68e73,2024-06-23,Tablet,Electronics,10,1497.27,0.06,South,Bank Transfer
11291,d7a5b9ed-7cf0-4b92-b6fb-5df17f89723d,2024-07-16,Monitor,Accessories,9,166.62,0.40,West,Digital Wallet
11467,95b65b6b-8918-47ad-8e86-5ee32fd25bfd,2024-01-23,Headphones,Peripherals,9,707.99,0.33,West,Cash
...,...,...,...,...,...,...,...,...,...
13094,29a838b7-cee9-4968-88d4-8b72a269aa77,2023-07-01,Smartphone,Accessories,7,1066.51,0.08,West,Cash
30502,700014b9-479b-48bd-86f6-5578816bc96f,2024-11-05,Keyboard,Accessories,3,510.36,0.40,North,Digital Wallet
33544,83535590-307d-4183-b998-7e7c6c7339d3,2023-08-09,Smartphone,Accessories,8,969.19,0.35,West,Credit Card
11888,3f7793a8-a5a2-4167-a002-40b40a46ee50,2024-11-18,Monitor,Electronics,1,1416.93,0.43,South,Bank Transfer


In [17]:
# Transformador diseñado para escalar de manera sencilla únicamente algunas columnas seleccionadas
class CustomScaler(BaseEstimator, TransformerMixin):
    """Robust-scale a chosen subset of columns, leaving the others untouched.

    Args:
        attributes (list of str): Names of the columns to scale.

    Attributes:
        scaler_ (RobustScaler): Scaler fitted on ``attributes`` during ``fit``.
    """

    def __init__(self, attributes):
        self.attributes = attributes

    def fit(self, X, y=None):
        """Learn the scaling statistics of the selected columns from ``X``.

        Args:
            X (DataFrame): Data containing every column in ``attributes``.
            y: Ignored.

        Returns:
            CustomScaler: ``self``.
        """
        # Fit here rather than in transform(): validation/test data must be
        # scaled with the statistics learned from the data given to fit(),
        # not with statistics recomputed from the data being transformed.
        self.scaler_ = RobustScaler().fit(X[list(self.attributes)])
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` whose selected columns are robust-scaled.

        Args:
            X (DataFrame): Data containing every column in ``attributes``.
            y: Ignored.

        Returns:
            DataFrame: Copy of ``X`` with the selected columns replaced by
            their scaled values; index and other columns are preserved.

        Raises:
            sklearn.exceptions.NotFittedError: If ``fit`` was not called first.
        """
        check_is_fitted(self, "scaler_")
        columns = list(self.attributes)
        X_copy = X.copy()
        # Rebuild a DataFrame on the original index so the per-column
        # assignment below aligns row by row.
        scaled = pd.DataFrame(
            self.scaler_.transform(X_copy[columns]),
            columns=columns,
            index=X_copy.index,
        )
        for column in columns:
            X_copy[column] = scaled[column]
        return X_copy

In [22]:
# Scale only the 'Unit_Price' and 'Discount' columns of the NaN-free training frame.
custom_scaler = CustomScaler(["Unit_Price", "Discount"])
X_train_prep = custom_scaler.fit_transform(X_train_prep)

In [23]:
X_train.head(10)

Unnamed: 0,Transaction_ID,Date,Product,Category,Quantity_Sold,Unit_Price,Discount,Region,Payment_Method
28687,7a1857bb-a8b8-4b18-9fb0-3278ad966513,2024-11-09,Tablet,Peripherals,10,304.94,0.3,North,Cash
7504,5ef38507-c758-47ee-ad61-49f88d4045fe,2023-08-03,Smartphone,Electronics,3,328.47,0.03,South,Cash
23254,d990cdb0-3105-4930-bd47-6f7172d68e73,2024-06-23,Tablet,Electronics,10,1497.27,0.06,South,Bank Transfer
11291,d7a5b9ed-7cf0-4b92-b6fb-5df17f89723d,2024-07-16,Monitor,Accessories,9,166.62,0.4,West,Digital Wallet
11467,95b65b6b-8918-47ad-8e86-5ee32fd25bfd,2024-01-23,Headphones,Peripherals,9,707.99,0.33,West,Cash
44461,5a8c1af7-34a3-4f22-9cfe-4e254440ee44,2023-09-18,Headphones,Accessories,1,1392.44,0.07,East,Credit Card
39170,1fb84f24-0cfc-492f-ab25-a5e593dd1434,2024-10-27,Headphones,Peripherals,4,755.23,0.01,South,Cash
10566,b9adaa4d-0edd-46b0-9459-328d6c194671,2025-01-06,Mouse,Peripherals,10,748.96,0.19,East,Bank Transfer
13254,6f769894-6123-49c5-8fbc-3c114795c0e8,2023-02-19,Tablet,Accessories,7,79.86,0.05,South,Digital Wallet
35847,bc8adaf5-b253-47b0-895a-3b0905f3d4bd,2023-07-13,Laptop,Peripherals,9,1368.04,0.4,North,Digital Wallet


In [24]:
X_train_prep.head(10)

Unnamed: 0,Transaction_ID,Date,Product,Category,Quantity_Sold,Unit_Price,Discount,Region,Payment_Method
28687,7a1857bb-a8b8-4b18-9fb0-3278ad966513,2024-11-09,Tablet,Peripherals,10,-0.704365,0.192308,North,Cash
7504,5ef38507-c758-47ee-ad61-49f88d4045fe,2023-08-03,Smartphone,Electronics,3,-0.680488,-0.846154,South,Cash
23254,d990cdb0-3105-4930-bd47-6f7172d68e73,2024-06-23,Tablet,Electronics,10,0.505554,-0.730769,South,Bank Transfer
11291,d7a5b9ed-7cf0-4b92-b6fb-5df17f89723d,2024-07-16,Monitor,Accessories,9,-0.844725,0.576923,West,Digital Wallet
11467,95b65b6b-8918-47ad-8e86-5ee32fd25bfd,2024-01-23,Headphones,Peripherals,9,-0.295369,0.307692,West,Cash
44461,5a8c1af7-34a3-4f22-9cfe-4e254440ee44,2023-09-18,Headphones,Accessories,1,0.399178,-0.692308,East,Credit Card
39170,1fb84f24-0cfc-492f-ab25-a5e593dd1434,2024-10-27,Headphones,Peripherals,4,-0.247432,-0.923077,South,Cash
10566,b9adaa4d-0edd-46b0-9459-328d6c194671,2025-01-06,Mouse,Peripherals,10,-0.253795,-0.230769,East,Bank Transfer
13254,6f769894-6123-49c5-8fbc-3c114795c0e8,2023-02-19,Tablet,Accessories,7,-0.932765,-0.769231,South,Digital Wallet
35847,bc8adaf5-b253-47b0-895a-3b0905f3d4bd,2023-07-13,Laptop,Peripherals,9,0.374418,0.576923,North,Digital Wallet


In [25]:
# Transformador para codificar únicamente las columnas categóricas y devolver un DataFrame.
class CustomOneHotEncoding(BaseEstimator, TransformerMixin):
    """One-hot encode the object-dtype columns of a DataFrame, returning a DataFrame.

    Non-object columns pass through unchanged; each object column seen during
    ``fit`` is replaced by one indicator column per category.

    Note:
        Every object-dtype column is encoded, so high-cardinality text columns
        (identifiers, raw date strings) should be dropped or converted first.

    Attributes:
        categorical_columns_ (list of str): Object-dtype columns found by ``fit``.
    """

    def __init__(self):
        # Both are populated by fit(); kept here so the attributes always exist.
        self._oh = None
        self._columns = None

    def fit(self, X, y=None):
        """Learn the categories of every object-dtype column in ``X``.

        Args:
            X (DataFrame): Training data.
            y: Ignored.

        Returns:
            CustomOneHotEncoding: ``self``.
        """
        X_cat = X.select_dtypes(include=['object'])
        self.categorical_columns_ = list(X_cat.columns)
        if self.categorical_columns_:
            # A fresh encoder per fit() so that refitting never reuses old state.
            self._oh = OneHotEncoder().fit(X_cat)
            # Take the names from the encoder itself so they always match the
            # number and order of the columns it emits.
            self._columns = self._oh.get_feature_names_out()
        else:
            # Nothing to encode: transform() will pass the data through.
            self._oh = None
            self._columns = []
        return self

    def transform(self, X, y=None):
        """Replace the categorical columns of ``X`` with their one-hot encoding.

        Args:
            X (DataFrame): Data containing the categorical columns seen in ``fit``.
            y: Ignored.

        Returns:
            DataFrame: Copy of ``X`` without the categorical columns, joined
            with the indicator columns on the original index.

        Raises:
            sklearn.exceptions.NotFittedError: If ``fit`` was not called first.
        """
        check_is_fitted(self, "categorical_columns_")
        X_copy = X.copy()
        if not self.categorical_columns_:
            return X_copy
        # Use the columns recorded at fit time rather than re-selecting by
        # dtype, so the encoder always receives the layout it was fitted on.
        encoded = self._oh.transform(X_copy[self.categorical_columns_])
        # The encoder's default output is a sparse matrix; densify it here
        # instead of relying on the `sparse` / `sparse_output` constructor
        # flag, whose name differs between scikit-learn versions.
        X_cat_oh = pd.DataFrame(encoded.toarray(),
                                columns=self._columns,
                                index=X_copy.index)
        return X_copy.drop(columns=self.categorical_columns_).join(X_cat_oh)

In [26]:
# Construcción de un Pipeline para los atributos numéricos
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.impute import SimpleImputer

# Numeric preprocessing: fill missing values with the column median, then
# apply robust scaling.
num_pipeline = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy="median")),
        ('rbst_scaler', RobustScaler()),
    ]
)

In [27]:
# The imputer does not accept categorical values, so keep only the non-object columns.
X_train_num = X_train.select_dtypes(exclude=['object'])
# NOTE(review): this rebinds X_train_prep, discarding the frame produced by the
# custom transformers in the earlier cells.
X_train_prep = num_pipeline.fit_transform(X_train_num)
# Wrap the pipeline output back into a DataFrame carrying the original column names and index.
X_train_prep = pd.DataFrame(X_train_prep, columns=X_train_num.columns, index=X_train_num.index)

In [28]:
X_train_num.head(10)

Unnamed: 0,Quantity_Sold,Unit_Price,Discount
28687,10,304.94,0.3
7504,3,328.47,0.03
23254,10,1497.27,0.06
11291,9,166.62,0.4
11467,9,707.99,0.33
44461,1,1392.44,0.07
39170,4,755.23,0.01
10566,10,748.96,0.19
13254,7,79.86,0.05
35847,9,1368.04,0.4


In [29]:
X_train_prep.head(10)

Unnamed: 0,Quantity_Sold,Unit_Price,Discount
28687,0.8,-0.704365,0.192308
7504,-0.6,-0.680488,-0.846154
23254,0.8,0.505554,-0.730769
11291,0.6,-0.844725,0.576923
11467,0.6,-0.295369,0.307692
44461,-1.0,0.399178,-0.692308
39170,-0.4,-0.247432,-0.923077
10566,0.8,-0.253795,-0.230769
13254,0.2,-0.932765,-0.769231
35847,0.6,0.374418,0.576923


In [36]:
X_train.head(10)

Unnamed: 0,Transaction_ID,Date,Product,Category,Quantity_Sold,Unit_Price,Discount,Region,Payment_Method
28687,7a1857bb-a8b8-4b18-9fb0-3278ad966513,2024-11-09,Tablet,Peripherals,10,304.94,0.3,North,Cash
7504,5ef38507-c758-47ee-ad61-49f88d4045fe,2023-08-03,Smartphone,Electronics,3,328.47,0.03,South,Cash
23254,d990cdb0-3105-4930-bd47-6f7172d68e73,2024-06-23,Tablet,Electronics,10,1497.27,0.06,South,Bank Transfer
11291,d7a5b9ed-7cf0-4b92-b6fb-5df17f89723d,2024-07-16,Monitor,Accessories,9,166.62,0.4,West,Digital Wallet
11467,95b65b6b-8918-47ad-8e86-5ee32fd25bfd,2024-01-23,Headphones,Peripherals,9,707.99,0.33,West,Cash
44461,5a8c1af7-34a3-4f22-9cfe-4e254440ee44,2023-09-18,Headphones,Accessories,1,1392.44,0.07,East,Credit Card
39170,1fb84f24-0cfc-492f-ab25-a5e593dd1434,2024-10-27,Headphones,Peripherals,4,755.23,0.01,South,Cash
10566,b9adaa4d-0edd-46b0-9459-328d6c194671,2025-01-06,Mouse,Peripherals,10,748.96,0.19,East,Bank Transfer
13254,6f769894-6123-49c5-8fbc-3c114795c0e8,2023-02-19,Tablet,Accessories,7,79.86,0.05,South,Digital Wallet
35847,bc8adaf5-b253-47b0-895a-3b0905f3d4bd,2023-07-13,Laptop,Peripherals,9,1368.04,0.4,North,Digital Wallet
