In [None]:
import pandas as pd
import numpy as np
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.preprocessing import StandardScaler
from lazypredict.Supervised import LazyRegressor
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns
from imblearn.over_sampling import RandomOverSampler
import xgboost as xgb

# Load the training data; the first CSV column is used as the row index.
data = pd.read_csv('../data/FS_final_train.csv', index_col=0)

# Split the frame into the target series and the feature matrix.
y = data['log_SalePrice']
X = data.drop(columns=['log_SalePrice'])

# Standardize every feature, rebuild a DataFrame carrying the original row and
# column labels, and discard the rows that still contain NaN after scaling.
# NOTE(review): the scaler is fitted on all rows, before the train/test split
# performed later in the notebook, so held-out rows influence the scaling
# statistics.
scaler = StandardScaler()
X_scaled = pd.DataFrame(
    scaler.fit_transform(X),
    index=X.index,
    columns=X.columns,
).dropna()

# Remove from the target the rows that were dropped from the features so both
# objects keep the same row labels.
dropped_rows = y.index.difference(X_scaled.index)
y_scaled = y.drop(index=dropped_rows)

In [None]:
# # De las correlaciones altas nos quedamos con las que tienen mas correlacion con la target
# corr_matrix = X_scaled.corr()

# #Eliminar las diagonales
# corr_matrix = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
# corr_matrix = corr_matrix.stack().reset_index()
# corr_matrix.columns = ['Feature 1', 'Feature 2', 'Correlation']

# # For feature pairs with abs(correlation) > 0.9, keep the one that is more correlated with the target
# corr_matrix = corr_matrix[corr_matrix['Correlation'].abs() > 0.9]
# corr_matrix = corr_matrix.sort_values(by='Correlation', ascending=False)

# # de la pareja eliminar la que menos corr tenga con la target y, usar corrwith
# features_to_drop = []
# for index, row in corr_matrix.iterrows():
#     feature1 = row['Feature 1']
#     feature2 = row['Feature 2']
#     if y.corr(X[feature1]) < y.corr(X[feature2]):
#         features_to_drop.append(feature1)
#     else:
#         features_to_drop.append(feature2)

# X_scaled = X_scaled.drop(columns=features_to_drop)

In [None]:
# Sanity checks after loading and preprocessing the data.
print("X shape:", X_scaled.shape)
print("y shape:", y_scaled.shape)
print("X info:")
# DataFrame.info() writes its report itself and returns None, so it must not be
# wrapped in print(): doing so appended a stray "None" line to the output.
X.info()
print("\nX describe:")
print(X.describe())
print("\ny describe:")
print(y.describe())

# Count NaN and infinite values in the objects that are actually fed to the models.
print("\nNaN in X:", X_scaled.isna().sum().sum())
print("NaN in y:", y_scaled.isna().sum())
print("Inf in X:", np.isinf(X_scaled).sum().sum())
print("Inf in y:", np.isinf(y_scaled).sum())

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score, cross_validate, KFold
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.base import BaseEstimator, RegressorMixin
from sklearn.utils.validation import check_X_y, check_array, check_is_fitted
from sklearn.metrics import mean_squared_error, make_scorer
from sklearn.ensemble import GradientBoostingRegressor
from imblearn.over_sampling import RandomOverSampler
import optuna

In [None]:
def custom_rmse_scorer(y_true, y_pred):
    """Compute the root mean squared error between targets and predictions.

    When the intermediate MSE is NaN or infinite, the MSE and the min/max/mean
    of both inputs are printed before the square root is returned. If anything
    raises during the computation, the error plus the shapes and first five
    elements of the inputs are printed and NaN is returned instead of
    propagating the exception.

    Parameters
    ----------
    y_true : array-like
        Ground-truth target values.
    y_pred : array-like
        Predicted target values.

    Returns
    -------
    float
        ``sqrt(MSE)``, or ``np.nan`` when the computation raised.
    """
    try:
        mse = mean_squared_error(y_true, y_pred)
        # Anything that is not a finite number (NaN, +inf, -inf) is reported.
        if not np.isfinite(mse):
            print(f"MSE is {mse}")
            print(f"y_true: min={np.min(y_true)}, max={np.max(y_true)}, mean={np.mean(y_true)}")
            print(f"y_pred: min={np.min(y_pred)}, max={np.max(y_pred)}, mean={np.mean(y_pred)}")
        return np.sqrt(mse)
    except Exception as exc:
        # Best-effort diagnostics; the caller receives NaN rather than an exception.
        print(f"Error in RMSE calculation: {exc}")
        print(f"y_true shape: {y_true.shape}, y_pred shape: {y_pred.shape}")
        print(f"y_true: {y_true[:5]}, y_pred: {y_pred[:5]}")
        return np.nan


# Scorer wrapper around the metric above; greater_is_better=False declares it
# as a loss (lower is better) to scikit-learn.
rmse_scorer = make_scorer(custom_rmse_scorer, greater_is_better=False)

In [None]:
def custom_oversample_data(X, y, percentage=0.2, noise_level=0.01, random_state=42):
    """Append randomly re-drawn rows, with Gaussian noise on the target, to a dataset.

    ``int(len(y) * percentage)`` row labels are drawn with replacement from
    ``y.index``. The matching rows of ``X`` are copied unchanged, the matching
    values of ``y`` receive additive ``N(0, noise_level)`` noise, and both are
    concatenated after the original data.

    Parameters
    ----------
    X : pd.DataFrame
        Feature matrix; it is looked up with the labels drawn from ``y.index``.
    y : pd.Series
        Target values. Index labels are assumed to be unique (label-based
        lookup is used for the sampled rows).
    percentage : float, default 0.2
        Size of the extra sample as a fraction of ``len(y)``.
    noise_level : float, default 0.01
        Standard deviation of the noise added to the sampled targets.
    random_state : int or None, default 42
        Seed of the private random generator. ``None`` gives non-reproducible
        draws.

    Returns
    -------
    tuple
        ``(X_resampled, y_resampled)``: the original rows followed by the
        sampled rows. Sampled rows keep their original index labels, so the
        result contains duplicated labels.
    """
    # A private generator keeps the draws reproducible without resetting
    # NumPy's global random state on every call, which np.random.seed() did.
    rng = np.random.RandomState(random_state)

    # Randomly select a percentage of the rows (with replacement).
    sample_size = int(len(y) * percentage)
    random_indices = rng.choice(y.index, size=sample_size, replace=True)

    # Build the extra samples; only the target is perturbed.
    X_sampled = X.loc[random_indices]
    y_sampled = y.loc[random_indices] + rng.normal(0, noise_level, size=sample_size)

    # Original rows first, oversampled rows after them.
    X_resampled = pd.concat([X, X_sampled], axis=0)
    y_resampled = pd.concat([y, y_sampled], axis=0)

    return X_resampled, y_resampled

In [None]:
import optuna
import xgboost as xgb

# Hold out 20% of the rows as the final test set, using a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled,
    y_scaled,
    test_size=0.2,
    random_state=42,
)

# Shared Optuna configuration
def create_study(name, version=None, seed=42):
    """Create, or reopen if it already exists, a two-objective Optuna study.

    The study is named ``{name}_housing_{version}`` and persisted in the SQLite
    file ``../models/{name}_housing_{version}.db``. Both objectives are
    minimized.

    Parameters
    ----------
    name : str
        Prefix of the study name and of the database file.
    version : str or None, default None
        Version tag interpolated into the study name; ``None`` is rendered
        literally as ``"None"``.
    seed : int or None, default 42
        Seed of the TPE sampler, so that a search started from an empty study
        is reproducible. Pass ``None`` for an unseeded sampler.

    Returns
    -------
    optuna.study.Study
        The newly created or reloaded study.
    """
    study_id = f'{name}_housing_{version}'
    return optuna.create_study(
        study_name=study_id,
        directions=['minimize', 'minimize'],
        sampler=optuna.samplers.TPESampler(seed=seed),
        # NOTE(review): pruning is driven by trial.report()/trial.should_prune(),
        # which the objective in this notebook never calls, so this pruner
        # appears to have no effect; kept to leave the study setup unchanged.
        pruner=optuna.pruners.MedianPruner(n_startup_trials=10, n_min_trials=2),
        load_if_exists=True,
        storage=f'sqlite:///../models/{study_id}.db',
    )
def objective_gb(trial):
    """Optuna objective: 5-fold cross-validation of a GradientBoostingRegressor.

    Reads the module-level ``X_train`` / ``y_train``. For every fold the model
    is fitted on the training part (optionally enlarged with
    ``custom_oversample_data``) and scored with ``custom_rmse_scorer`` on both
    the training part and the validation part.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the hyperparameters.

    Returns
    -------
    tuple of float
        ``(val_rmse, rmse_diff)``: the mean validation RMSE over the folds and
        the mean validation RMSE minus the mean training RMSE.
    """
    # Whether the training part of each fold is oversampled.
    do_oversampling = trial.suggest_categorical('do_oversampling', [True, False])

    # Search space for GradientBoostingRegressor. suggest_float replaces the
    # deprecated suggest_uniform / suggest_loguniform; names and ranges are
    # unchanged.
    params = {
        'loss': trial.suggest_categorical('loss', ['squared_error']),
        'learning_rate': trial.suggest_float('learning_rate', 1e-3, 1.0, log=True),
        'n_estimators': trial.suggest_int('n_estimators', 50, 500),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'criterion': trial.suggest_categorical('criterion', ['friedman_mse', 'squared_error']),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 20),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 20),
        'min_weight_fraction_leaf': trial.suggest_float('min_weight_fraction_leaf', 0.0, 0.5),
        'max_depth': trial.suggest_int('max_depth', 3, 40),
        'min_impurity_decrease': trial.suggest_float('min_impurity_decrease', 0.0, 1.0),
        'init': None,
        'random_state': 42,
        'max_features': trial.suggest_categorical('max_features', ['sqrt', 'log2', None]),
        'alpha': trial.suggest_float('alpha', 0.0, 1.0),
        'verbose': 0,
        'max_leaf_nodes': trial.suggest_int('max_leaf_nodes', 2, 20),
        'warm_start': False,
        # Fixed (non-searched) early-stopping settings.
        'validation_fraction': 0.2,
        'n_iter_no_change': 100,
        'tol': 1e-4,
        'ccp_alpha': trial.suggest_float('ccp_alpha', 0.0, 0.1)
    }

    model = GradientBoostingRegressor(**params)

    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    val_rmses = []
    train_rmses = []

    for train_index, val_index in kf.split(X_train):
        X_train_val, X_val = X_train.iloc[train_index], X_train.iloc[val_index]
        y_train_val, y_val = y_train.iloc[train_index], y_train.iloc[val_index]

        # Oversample only the training part of the fold, never the validation part.
        if do_oversampling:
            X_train_val, y_train_val = custom_oversample_data(X_train_val, y_train_val)

        # warm_start is False, so every fit starts from scratch.
        model.fit(X_train_val, y_train_val)
        val_pred = model.predict(X_val)
        train_pred = model.predict(X_train_val)

        # NOTE(review): with oversampling enabled the training RMSE is measured
        # on the oversampled (noisy) training part, not on the original rows.
        val_rmse = custom_rmse_scorer(y_val, val_pred)
        train_rmse = custom_rmse_scorer(y_train_val, train_pred)

        val_rmses.append(val_rmse)
        train_rmses.append(train_rmse)

    val_rmse = np.mean(val_rmses)
    train_rmse = np.mean(train_rmses)

    print(f"Val RMSE: {val_rmse}, Train RMSE: {train_rmse}")

    # Second objective: gap between validation and training error.
    rmse_diff = val_rmse - train_rmse

    return val_rmse, rmse_diff

# Create (or reopen) the persisted study, then run 100 trials with a progress
# bar and a garbage-collection pass after each trial.
study_gb = create_study('gb_FS_Oversampling', version='2.3')
study_gb.optimize(
    objective_gb,
    n_trials=100,
    show_progress_bar=True,
    gc_after_trial=True,
)

In [None]:
# Summary of the multi-objective search.
print(f'Direcciones de optimización: {study_gb.directions}')
print(f'Nmero de ensayos: {len(study_gb.trials)}')
# best_trial, best_value and best_params are only available for
# single-objective studies, so they are not printed here.

# Show the five trials with the lowest first objective value.
study_gb_df = study_gb.trials_dataframe()
study_gb_df.sort_values('values_0').head()

In [None]:
def obtain_best_params(study_gb, weights: np.ndarray) -> tuple:
    '''
    Select the trial that minimizes a weighted average of the objective values.
    -------
    Parameters:
    - study_gb: optuna.Study, multi-objective study. Only ``trials_dataframe()``
      and ``trials`` are used.
    - weights: np.ndarray, one weight per objective, in the order of the
      ``values_*`` columns of the trials dataframe.
    -------
    Returns:
    - (best_params, best_values): tuple with the hyperparameter dict of the
      selected trial and its list of objective values.
    -------
    Raises:
    - ValueError: if no trial has a value for every objective.
    '''
    study_df = study_gb.trials_dataframe()
    value_cols = [col for col in study_df.columns if col.startswith('values_')]
    if not value_cols:
        raise ValueError('The study has no objective value columns to rank.')

    # Trials without a full set of objective values (NaN) cannot be ranked.
    scored = study_df.dropna(subset=value_cols)
    if scored.empty:
        raise ValueError('The study has no trial with a complete set of objective values.')

    weighted_average = np.average(
        scored[value_cols].to_numpy(dtype=float), weights=weights, axis=1
    )

    # Identify the winner by its trial number rather than by the dataframe row
    # label, and look it up among all trials so the lookup cannot come back
    # empty when the winner is missing from the Pareto front.
    best_number = int(scored['number'].iloc[int(np.argmin(weighted_average))])
    best_trial = next(trial for trial in study_gb.trials if trial.number == best_number)
    return best_trial.params, best_trial.values

In [None]:
# Rank the trials with weight 0.75 on the first objective (mean validation RMSE)
# and 0.25 on the second one (validation minus training RMSE).
best_params, best_values = obtain_best_params(
    study_gb,
    weights=np.array([0.75, 0.25]),
)
print(f'Mejores hiperparámetros: {best_params}')
print(f'Mejores valores: Mean rmse Validation: {best_values[0]}, Mean Diff (Val - Train) rmse: {best_values[1]}')

In [None]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.ensemble import GradientBoostingRegressor

# Train the final model with the selected hyperparameters.
# Work on a copy so `best_params` keeps its 'do_oversampling' entry and this
# cell can be re-run without raising KeyError.
final_params = dict(best_params)
do_oversampling = final_params.pop('do_oversampling')

# Fixed (non-searched) settings that objective_gb used for every trial. They are
# not stored in the trial parameters, so they are re-applied here; otherwise the
# final model would be fitted without the early-stopping configuration the
# trials were evaluated with. Searched values take precedence on a name clash.
fixed_params = {
    'random_state': 42,
    'validation_fraction': 0.2,
    'n_iter_no_change': 100,
    'tol': 1e-4,
}
model = GradientBoostingRegressor(**{**fixed_params, **final_params})

if do_oversampling:
    X_train_over, y_train_over = custom_oversample_data(X_train, y_train)
    model.fit(X_train_over, y_train_over)
else:
    model.fit(X_train, y_train)

# Predict on the held-out test set.
y_pred = model.predict(X_test)

# Regression metrics. The RMSE is computed as sqrt(MSE) because the `squared`
# argument of mean_squared_error is deprecated and removed in recent
# scikit-learn releases.
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f'RMSE: {rmse}')
print(f'MAE: {mae}')
print(f'R^2 Score: {r2}')
