# Librerías

In [20]:
# Importar librerías
import pandas as pd
import numpy as np
import configparser
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from collections import Counter
import os

# Cargar y Validar los Datos

In [21]:
# Load the processed data from the interim data folder
file_path = '../data/interim/creditcard_balanced.csv'
data = pd.read_csv(filepath_or_buffer=file_path)

In [22]:
# Initial inspection: report the (rows, columns) shape, then preview the first rows
print("Dimensiones del dataset:", data.shape)
print("Primeras filas del dataset:")
display(data.head(5))

Dimensiones del dataset: (1476, 31)
Primeras filas del dataset:


Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,19556.0,-1.567497,0.169245,2.371525,-1.77484,0.003882,-0.491739,-0.07248,-0.006021,2.6873,...,-0.06149,0.154483,-0.316773,0.035385,0.425683,-0.857536,-0.358959,0.251124,11.85,0
1,68207.0,-13.192671,12.785971,-9.90665,3.320337,-4.801176,5.760059,-18.750889,-37.353443,-0.39154,...,27.202839,-8.887017,5.303607,-0.639435,0.263203,-0.108877,1.269566,0.939407,1.0,1
2,93856.0,-6.750509,5.367416,-10.054635,9.064478,-7.968118,-2.263798,-10.317566,4.237666,-5.324109,...,1.909032,-0.34874,0.425001,0.674909,-0.784208,-0.247422,1.159581,0.197818,209.65,1
3,42181.0,-2.714728,-2.550618,3.023638,0.401269,2.274544,-1.215843,-1.73261,0.561313,1.009438,...,0.063049,-0.309606,0.345098,0.061864,0.100496,0.308751,-0.21316,-0.112034,0.76,0
4,102671.0,-4.991758,5.21334,-9.111326,8.431986,-3.435516,-1.827565,-7.114303,3.431207,-3.875643,...,1.189423,0.247858,0.294448,-0.548504,-0.174617,0.406703,-0.402339,-0.882886,0.0,1


In [23]:
# Report the relative frequency of each value of the target column,
# or a notice when the column is absent from the loaded data
print("Distribución de la variable objetivo (Class):")
if 'Class' not in data.columns:
    print("La columna 'Class' no está presente en los datos.")
else:
    print(data['Class'].value_counts(normalize=True))

Distribución de la variable objetivo (Class):
Class
0    0.666667
1    0.333333
Name: proportion, dtype: float64


# Manejo de Valores Faltantes

In [24]:
# Count missing values per column and print only the columns that have any
print("\nValores faltantes por columna:")
missing_values = data.isna().sum()
print(missing_values.loc[missing_values.gt(0)])


Valores faltantes por columna:
Series([], dtype: int64)


In [25]:
# Express the per-column missing counts as a percentage of the number of rows
# and print only the columns with a non-zero percentage
print("\nPorcentaje de valores faltantes por columna:")
missing_percentage = missing_values.div(len(data)).mul(100)
print(missing_percentage.loc[missing_percentage.gt(0)])


Porcentaje de valores faltantes por columna:
Series([], dtype: float64)


In [26]:
# Impute missing values with the mean of each numeric column.
# numeric_only=True keeps this working when the frame holds non-numeric
# columns (DataFrame.mean() raises on them in pandas >= 2.0); such columns
# are left untouched by this step.
# The result is assigned back instead of using inplace=True so the cell does
# not silently mutate the object that earlier cells displayed.
if missing_values.sum() > 0:
    data = data.fillna(data.mean(numeric_only=True))
    print("\nSe imputaron valores faltantes con la media.")
else:
    print("No se encontraron valores faltantes.")

No se encontraron valores faltantes.


# Identificación de tipos de variables

In [27]:
# Collect the names of the columns whose dtype is object or category
categorical_cols = list(data.select_dtypes(include=['object', 'category']).columns)
print("\nColumnas categóricas identificadas:", categorical_cols)



Columnas categóricas identificadas: []


In [28]:
# Collect the names of the columns with a numeric dtype
numeric_cols = list(data.select_dtypes(include='number').columns)
print("\nColumnas numéricas identificadas:", numeric_cols)


Columnas numéricas identificadas: ['Time', 'V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10', 'V11', 'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20', 'V21', 'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28', 'Amount', 'Class']


# Generar archivo de configuración

In [29]:
def generate_config(data, target_column, output_path, corr_threshold=0.8):
    """
    Generate a pipeline configuration file from an analysis of the dataset.

    The file is written in INI format with three sections:

    * ``[GENERAL]``: ``VARS_TO_DROP`` (redundant numeric features) and
      ``TARGET`` (the target column name).
    * ``[CONTINUES]``: ``VARS_TO_IMPUTE`` (numeric columns with missing values).
    * ``[CATEGORICAL]``: ``VARS_TO_IMPUTE`` (categorical columns with missing
      values), ``OHE_VARS`` (categorical columns with at most 10 distinct
      values) and ``FREQUENCY_ENC_VARS`` (categorical columns with more than
      10 distinct values).

    Column groups are derived from ``data`` itself, so the function does not
    depend on any variable defined elsewhere in the notebook.

    Args:
        data (pd.DataFrame): Processed dataset.
        target_column (str): Name of the target column. It is never listed
            in ``VARS_TO_DROP``.
        output_path (str): Path where the configuration file is written.
        corr_threshold (float): Absolute Pearson correlation above which a
            numeric feature is considered redundant with an earlier one.
            Defaults to 0.8.
    """
    config = configparser.ConfigParser()

    # Derive the column groups from the frame that was actually passed in.
    numeric_columns = data.select_dtypes(include=np.number).columns.tolist()
    categorical_columns = data.select_dtypes(include=['object', 'category']).columns.tolist()

    # [GENERAL]
    # Only numeric features take part in the redundancy analysis; the target
    # is excluded so it can never be scheduled for removal.
    feature_columns = [col for col in numeric_columns if col != target_column]
    redundant_features = []
    if len(feature_columns) > 1:
        # Strong negative correlation is as redundant as strong positive
        # correlation, hence the absolute value.
        abs_corr = data[feature_columns].corr().abs()
        # Keep only the strictly-upper triangle: each column is compared with
        # the columns that precede it, so exactly one member of a correlated
        # pair (the later one) is flagged and the other is retained.
        upper_mask = np.triu(np.ones(abs_corr.shape, dtype=bool), k=1)
        upper_corr = abs_corr.where(upper_mask)
        redundant_features = [
            col for col in upper_corr.columns
            if (upper_corr[col] > corr_threshold).any()
        ]
    config['GENERAL'] = {
        'VARS_TO_DROP': ', '.join(redundant_features),
        'TARGET': target_column
    }

    # [CONTINUES]
    vars_to_impute_continues = [col for col in numeric_columns if data[col].isnull().sum() > 0]
    config['CONTINUES'] = {
        'VARS_TO_IMPUTE': ', '.join(vars_to_impute_continues)
    }

    # [CATEGORICAL]
    vars_to_impute_categorical = [col for col in categorical_columns if data[col].isnull().sum() > 0]
    # Low-cardinality columns go to one-hot encoding, the rest to frequency encoding.
    ohe_vars = [col for col in categorical_columns if data[col].nunique() <= 10]
    freq_enc_vars = [col for col in categorical_columns if data[col].nunique() > 10]

    config['CATEGORICAL'] = {
        'VARS_TO_IMPUTE': ', '.join(vars_to_impute_categorical),
        'OHE_VARS': ', '.join(ohe_vars),
        'FREQUENCY_ENC_VARS': ', '.join(freq_enc_vars)
    }

    # Explicit encoding so non-ASCII column names are written consistently
    # regardless of the platform default.
    with open(output_path, 'w', encoding='utf-8') as configfile:
        config.write(configfile)

    print(f"Archivo de configuración generado en: {output_path}")

In [30]:
# Generate the configuration file at ../pipeline.cfg with 'Class' as the target column
config_path = '../pipeline.cfg'
generate_config(data, 'Class', config_path)

Archivo de configuración generado en: ../pipeline.cfg
