In [None]:
# 1. Importar librerías y cargar el dataset
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, learning_curve, GridSearchCV
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import warnings
warnings.filterwarnings('ignore')
try:
    from xgboost import XGBRegressor
    xgb_available = True
except ImportError:
    xgb_available = False

# Load the dataset, preview the first rows and list every column name
df = pd.read_csv('../Models/meps_ml_dataset.csv')
display(df.head())
print('Columnas:', list(df.columns))

# 2. Helper that computes the category limits (premium quartiles) for one person
def limites_personalizados(row, df, edad_col='edad', sexo_col='sexo_Male',
                           prima_col='prima_out_of_pocket_editada',
                           ventana_edad=3, min_similares=10):
    """Return the premium quartiles of the people most similar to ``row``.

    "Similar" means: same value in ``sexo_col`` and an age within
    ``ventana_edad`` years (inclusive) of the age found in ``row``.

    Parameters
    ----------
    row : mapping-like (e.g. a pandas Series)
        Must expose ``row[edad_col]`` and ``row[sexo_col]``.
    df : pandas.DataFrame
        Reference population; must contain ``edad_col``, ``sexo_col`` and
        ``prima_col``.
    edad_col, sexo_col, prima_col : str
        Column names for age, sex indicator and premium.
    ventana_edad : number, default 3
        Half-width of the age window used to select similar people.
    min_similares : int, default 10
        Minimum size of the similar group; below it the quartiles are taken
        over the whole ``df[prima_col]`` column instead.

    Returns
    -------
    list
        ``[q1, q2, q3]``: the 25th, 50th and 75th percentiles of the premium.
    """
    edad = row[edad_col]
    sexo = row[sexo_col]
    mask = (
        df[edad_col].between(edad - ventana_edad, edad + ventana_edad)
        & (df[sexo_col] == sexo)
    )
    # Select rows and column in a single .loc call rather than chained indexing.
    similares = df.loc[mask, prima_col]
    if len(similares) < min_similares:
        # Too few similar people: fall back to the whole sample.
        similares = df[prima_col]
    # One quantile call computes the three cut points together.
    return similares.quantile([0.25, 0.5, 0.75]).tolist()

# 3. Add the three limit columns for every person.
# The limits returned by limites_personalizados depend only on the person's
# (edad, sexo_Male) pair and on the full dataset, so they are computed once per
# distinct pair and mapped back to the rows instead of re-scanning the whole
# frame for every single row.
columnas_limites = ['limite_excelente', 'limite_bueno', 'limite_regular']
columnas_grupo = ['edad', 'sexo_Male']

grupos = df[columnas_grupo].drop_duplicates().reset_index(drop=True)
limites_grupo = grupos.apply(
    lambda row: pd.Series(
        limites_personalizados(row, df, edad_col='edad', sexo_col='sexo_Male'),
        index=columnas_limites,
    ),
    axis=1,
)
tabla_limites = pd.concat([grupos, limites_grupo], axis=1)

# Left merge keeps one output row per input row, in the original row order.
limites_por_fila = df[columnas_grupo].merge(tabla_limites, on=columnas_grupo, how='left')
assert len(limites_por_fila) == len(df), 'the group lookup must not add or drop rows'
# Re-attach the original index so the assignment aligns row by row.
limites_por_fila.index = df.index

df[columnas_limites] = limites_por_fila[columnas_limites]
display(df[columnas_limites].head())

# 4. Separate the three limit targets (y) from the predictors (X)
columnas_objetivo = ['limite_excelente', 'limite_bueno', 'limite_regular']
y = df[columnas_objetivo]
# Predictors are every column except the raw premium and the derived limits.
# NOTE(review): the limits were derived from the whole dataset before this
# split, so test rows contributed to the training targets - confirm this is
# intended.
X = df.drop(columns=['prima_out_of_pocket_editada', *columnas_objetivo])

# 5. Hold out 30% of the rows for testing, with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# 6. Candidate models, each paired with the hyperparameter grid to search
modelos_dict = {
    'LinearRegression': (LinearRegression(), {
        # copy_X is deliberately not searched: it only controls whether the
        # input matrix is copied or may be overwritten, so including it
        # doubled the number of fits without offering a different model.
        'fit_intercept': [True, False],
        'positive': [False, True]
    }),
    'GradientBoosting': (GradientBoostingRegressor(random_state=42), {
        'n_estimators': [100, 200],
        'learning_rate': [0.05, 0.1],
        'max_depth': [3, 5]
    }),
    'RandomForest': (RandomForestRegressor(random_state=42), {
        'n_estimators': [100, 200],
        'max_depth': [5, 10]
    })
}
# XGBoost is optional: it is only added when the import at the top succeeded.
if xgb_available:
    modelos_dict['XGBoost'] = (XGBRegressor(random_state=42, objective='reg:squarederror'), {
        'n_estimators': [100, 200],
        'learning_rate': [0.05, 0.1],
        'max_depth': [3, 5]
    })

# 7. Tune, evaluate and plot a learning curve for every model on each limit
resultados = {}
for target in y.columns:
    print(f'\n--- Predicción de: {target} ---')
    resultados[target] = {}
    for nombre, (modelo, param_grid) in modelos_dict.items():
        print(f'Entrenando y ajustando: {nombre}')
        if param_grid is not None:
            # 3-fold grid search scored by (negated) RMSE
            grid = GridSearchCV(modelo, param_grid, cv=3, scoring='neg_root_mean_squared_error', n_jobs=-1)
            grid.fit(X_train, y_train[target])
            best_model = grid.best_estimator_
            best_params = grid.best_params_
        else:
            # No grid supplied: fit the estimator with its current settings
            modelo.fit(X_train, y_train[target])
            best_model = modelo
            best_params = {}

        # Hold-out metrics on the test split
        y_pred = best_model.predict(X_test)
        mae = mean_absolute_error(y_test[target], y_pred)
        # RMSE is computed as sqrt(MSE): the `squared=False` argument was
        # deprecated in scikit-learn 1.4 and removed in 1.6, while this form
        # works on every version.
        rmse = np.sqrt(mean_squared_error(y_test[target], y_pred))
        r2 = r2_score(y_test[target], y_pred)
        resultados[target][nombre] = {'MAE': mae, 'RMSE': rmse, 'R2': r2, 'best_params': best_params}

        # Learning curve for every model, using the tuned estimator
        train_sizes, train_scores, test_scores = learning_curve(
            best_model, X_train, y_train[target], cv=3, scoring='neg_root_mean_squared_error',
            train_sizes=np.linspace(0.1, 1.0, 5), n_jobs=-1
        )
        # The scorer returns negated RMSE; flip the sign to plot plain RMSE
        train_scores_mean = -np.mean(train_scores, axis=1)
        test_scores_mean = -np.mean(test_scores, axis=1)

        fig, ax = plt.subplots(figsize=(6, 4))
        ax.plot(train_sizes, train_scores_mean, 'o-', label='Entrenamiento')
        ax.plot(train_sizes, test_scores_mean, 'o-', label='Validación')
        ax.set_title(f'Curva de aprendizaje - {target} - {nombre}')
        ax.set_xlabel('Tamaño de muestra')
        ax.set_ylabel('RMSE')
        ax.legend()
        ax.grid(True)
        plt.show()
        # One figure is created per (target, model) pair; release it once shown
        plt.close(fig)

# 8. For each limit, show a table with every model's metrics and tuned hyperparameters
pd.set_option('display.max_colwidth', None)  # do not truncate the best_params column
columnas_resumen = ['MAE', 'RMSE', 'R2', 'best_params']
for target in y.columns:
    print(f'\nResumen para {target}:')
    # Models become rows and metrics become columns after transposing
    tabla_por_modelo = pd.DataFrame(resultados[target]).transpose()
    df_res = tabla_por_modelo[columnas_resumen]
    display(df_res)

Unnamed: 0,edad,estado_salud_percibido,ccsr_num_total,ccsr_otra_condicion,sexo_Male,raza_etnicidad_Non-Hispanic Asian only,raza_etnicidad_Non-Hispanic Black only,raza_etnicidad_Non-Hispanic Other race or multi-race,raza_etnicidad_Non-Hispanic White only,estado_civil_Married,...,ccsr_Osteoporosis,ccsr_Other specified bone disease and musculoskeletal deformities,ccsr_Abnormal findings without diagnosis,ccsr_Other and ill-defined heart disease,ccsr_Neurodevelopmental disorders,ccsr_Nutritional deficiencies,ccsr_Other specified upper respiratory infections,ccsr_Other specified inflammatory condition of skin,ccsr_Acquired foot deformities,prima_out_of_pocket_editada
0,29.0,1,7,5,1,0,0,1,0,0,...,0,0,0,0,1,0,0,0,0,357.5
1,51.0,4,4,4,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,215.88
2,53.0,4,10,5,0,0,0,0,1,1,...,0,0,1,0,0,1,0,0,0,315.0
3,69.0,4,6,4,1,0,1,0,0,1,...,0,0,0,0,0,0,0,0,0,315.0
4,37.0,4,1,1,1,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,140.44


Columnas: ['edad', 'estado_salud_percibido', 'ccsr_num_total', 'ccsr_otra_condicion', 'sexo_Male', 'raza_etnicidad_Non-Hispanic Asian only', 'raza_etnicidad_Non-Hispanic Black only', 'raza_etnicidad_Non-Hispanic Other race or multi-race', 'raza_etnicidad_Non-Hispanic White only', 'estado_civil_Married', 'estado_civil_Never married', 'estado_civil_Separated', 'estado_civil_Under 16 - not applicable', 'estado_civil_Widowed', 'region_Midwest', 'region_Northeast', 'region_South', 'region_West', 'ccsr_Essential hypertension', 'ccsr_Disorders of lipid metabolism', 'ccsr_Diabetes mellitus without complication', 'ccsr_Bacterial infections', 'ccsr_Osteoarthritis', 'ccsr_Cataract and other lens disorders', 'ccsr_Esophageal disorders', 'ccsr_Retinal and vitreous conditions', 'ccsr_Other general signs and symptoms', 'ccsr_Thyroid disorders', 'ccsr_Otitis media', 'ccsr_Osteoporosis', 'ccsr_Other specified bone disease and musculoskeletal deformities', 'ccsr_Abnormal findings without diagnosis', 'cc

Unnamed: 0,limite_excelente,limite_bueno,limite_regular
0,100.0,170.585,300.0
1,143.49,281.67,492.75
2,150.0,280.0,478.5
3,113.6675,186.5,337.25
4,113.33625,214.795,433.33



--- Predicción de: limite_excelente ---
Entrenando y ajustando: LinearRegression


ValueError: Invalid parameter 'normalize' for estimator LinearRegression(). Valid parameters are: ['copy_X', 'fit_intercept', 'n_jobs', 'positive'].