## Imports

In [146]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import (
    AdaBoostRegressor,
    BaggingRegressor,
    GradientBoostingRegressor,
    HistGradientBoostingRegressor,
    RandomForestRegressor,
    VotingRegressor,
)
from sklearn.feature_selection import r_regression
from sklearn.impute import SimpleImputer
from sklearn.linear_model import ElasticNet, Lasso, LinearRegression, Ridge
from sklearn.metrics import mean_absolute_error, r2_score, root_mean_squared_error
from sklearn.model_selection import KFold, RandomizedSearchCV, cross_val_score, train_test_split
from sklearn.neural_network import MLPRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import (
    OneHotEncoder,
    PolynomialFeatures,
    PowerTransformer,
    QuantileTransformer,
    RobustScaler,
)

## Preparación de Datos para Regresión

In [147]:
# Load the training CSV.
data = pd.read_csv('../../data/EstudioCrediticio_TrainP.csv')

# Drop the first column (selected by position).
data = data.iloc[:, 1:]
# Drop the second-to-last column while keeping the last one.
data = data.iloc[:, :-2].join(data.iloc[:, -1])
# Drop the columns with high correlation (above 0.9).
data = data.drop(columns=['IngresoBrutoAnual', 'Experiencia', 'TotalActivos'])

# Split into features (X) and the variable to predict (y).
X = data.iloc[:, :-1]  # every column except the last
y = data.iloc[:, -1]   # the last column

# Identify categorical and numeric columns.
# 'number' matches every numeric dtype rather than only int64/float64, so a
# column stored as e.g. int32 or float32 is not left out of both lists (a
# column in neither list is not passed to the transformers built from them).
categorical_cols = X.select_dtypes(include=['object', 'category']).columns
numeric_cols = X.select_dtypes(include='number').columns

# Split into training and test sets (20% held out, fixed seed).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# POLINOMIAL

In [15]:
class PearsonCorrelationFilter(BaseEstimator, TransformerMixin):
    """Drop features that are highly correlated with an earlier feature.

    ``fit`` computes the absolute pairwise correlation matrix of the columns
    (``DataFrame.corr``) and records, in ``to_drop_``, every column whose
    absolute correlation with any column positioned before it is strictly
    greater than ``threshold``. ``transform`` removes those columns.

    Accepted inputs are pandas DataFrames, NumPy arrays and sparse matrices
    (any object exposing ``toarray``); the latter two are converted to a
    DataFrame with positional integer column labels.

    Parameters
    ----------
    threshold : float, default=0.9
        Absolute correlation above which the later column of a pair is dropped.

    Attributes
    ----------
    to_drop_ : list
        Labels of the columns selected for removal during ``fit``.
    """

    def __init__(self, threshold=0.9):
        self.threshold = threshold

    def fit(self, X, y=None):
        """Determine which columns to drop; ``y`` is ignored. Returns ``self``."""
        self.to_drop_ = self._get_correlated_features(X)
        return self

    def transform(self, X):
        """Return ``X`` as a DataFrame without the columns in ``to_drop_``.

        Labels in ``to_drop_`` that are absent from ``X`` are skipped silently
        (``errors='ignore'``).
        """
        return self._as_frame(X).drop(columns=self.to_drop_, errors='ignore')

    @staticmethod
    def _as_frame(X):
        """Coerce ndarray / sparse input to a DataFrame; pass DataFrames through."""
        if isinstance(X, pd.DataFrame):
            return X
        if hasattr(X, 'toarray'):
            # Sparse matrices have neither .corr() nor .drop(); the correlation
            # matrix needs dense data anyway, so densify up front.
            X = X.toarray()
        if isinstance(X, np.ndarray):
            X = pd.DataFrame(X)
        return X

    def _get_correlated_features(self, X):
        """Return the labels of columns correlated above ``threshold`` with an earlier column."""
        corr_matrix = self._as_frame(X).corr().abs()
        # Keep only the strict upper triangle so each pair is inspected once and
        # the diagonal (self-correlation) is ignored.
        upper_mask = np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1)
        upper = corr_matrix.where(upper_mask)
        # NaN entries (masked cells, undefined correlations) compare as False.
        exceeds = (upper > self.threshold).any(axis=0)
        return exceeds[exceeds].index.tolist()

In [16]:
# Numeric branch: fill missing values with the column mean, then apply a
# Yeo-Johnson power transform (original note: scaling because of outliers).
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', PowerTransformer(method='yeo-johnson')),
])

# Categorical branch: fill missing values with the most frequent value, then
# one-hot encode; categories unseen during fit are ignored at transform time.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Route each group of columns through its own branch.
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_cols),
    ('cat', categorical_transformer, categorical_cols),
])

# X_enc = preprocessor.fit_transform(X)

In [10]:
# Polynomial regression pipeline: preprocess, expand to degree-2 polynomial
# features, drop features correlated above 0.9, then fit a linear regression.
model = Pipeline([
    ('preprocessor', preprocessor),
    ('polfeatures', PolynomialFeatures(degree=2)),
    ('pearson_filter', PearsonCorrelationFilter(threshold=0.9)),
    ('regressor', LinearRegression()),
])

In [11]:
# Fit every step of the pipeline (preprocessing included) on the training split.
model.fit(X_train, y_train)

In [12]:
# Predict on the held-out split and report the coefficient of determination.
y_pred = model.predict(X_test)
r2 = r2_score(y_true=y_test, y_pred=y_pred)
print(f'Coeficiente de determinación (R^2): {r2}')

Coeficiente de determinación (R^2): 0.8500930061326782


# Bagging Boosting

In [153]:
# Categorical columns: impute with the most frequent value, then one-hot encode
# (unknown categories are ignored when transforming).
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Numeric columns: impute with the mean, then apply a Yeo-Johnson power
# transform (original note: scaling because of outliers).
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', PowerTransformer(method='yeo-johnson')),
])

# Combine both branches, each applied to its own set of columns.
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_cols),
    ('cat', categorical_transformer, categorical_cols),
])

In [173]:
# Base learner for the ensemble: an MLP with two hidden layers (64 and 32 units),
# tanh activation, the Adam solver and early stopping.
# NOTE: the misspelled name `regerssion` is kept as-is in case other cells reference it.
regerssion = MLPRegressor(
    solver='adam',
    hidden_layer_sizes=(64, 32),
    activation='tanh',
    max_iter=1000,
    early_stopping=True,
)

# Final pipeline: preprocessing followed by a bagging ensemble of 25 copies of
# the base learner. BaggingRegressor is imported in the notebook's import cell.
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', BaggingRegressor(estimator=regerssion, n_estimators=25, random_state=42)),
])

In [174]:
# Fit the pipeline on the training split (a plain fit; no hyperparameter search is run here).
model.fit(X_train, y_train)

In [176]:
# Evaluate the fitted model: score both splits and predict on the held-out one.
y_pred = model.predict(X_test)
train_score = model.score(X_train, y_train)
test_score = model.score(X_test, y_test)

# Regression error metrics on the held-out split.
rmse = root_mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)

# Report everything together.
print(f"R² del conjunto de entrenamiento: {train_score:.4f}")
print(f"R² del conjunto de prueba: {test_score:.4f}")
print(f"RMSE: {rmse}")
print(f"MAE: {mae}")

R² del conjunto de entrenamiento: 0.9601
R² del conjunto de prueba: 0.9262
RMSE: 2.1506711806607126
MAE: 1.262247458184249
