In [32]:
import pandas as pd
import numpy as np

In [33]:
df = pd.read_csv('insurance.csv')
df

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.900,0,yes,southwest,16884.92400
1,18,male,33.770,1,no,southeast,1725.55230
2,28,male,33.000,3,no,southeast,4449.46200
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.880,0,no,northwest,3866.85520
...,...,...,...,...,...,...,...
1333,50,male,30.970,3,no,northwest,10600.54830
1334,18,female,31.920,0,no,northeast,2205.98080
1335,18,female,36.850,0,no,southeast,1629.83350
1336,21,female,25.800,0,no,southwest,2007.94500


In [34]:
print(df.isnull().sum())

age         0
sex         0
bmi         0
children    0
smoker      0
region      0
charges     0
dtype: int64


## Prep / Feature Engineering

In [35]:
## TRANSFORMADOR AUXILIAR
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.impute import SimpleImputer

In [36]:
# Funciones de Preprocesamiento:
def remove_duplicates(df):
    df.drop_duplicates(inplace=True)
    return df

def impute_missing_values(df):
    # Variables categóricas
    categorical_cols = df.select_dtypes(include='object').columns
    #categorical_cols = categorical_cols.drop('churn')  # Excluir 'churn'
    categorical_imputer = SimpleImputer(strategy='most_frequent')
    df[categorical_cols] = categorical_imputer.fit_transform(df[categorical_cols])

    # Variables numéricas
    numeric_cols = df.select_dtypes(include=['float64', 'int64']).columns
    numeric_imputer = SimpleImputer(strategy='mean')
    df[numeric_cols] = numeric_imputer.fit_transform(df[numeric_cols])

    return df


def map_categorical_features(df):
    mapping = {
        'sex': {'female': 1, 'male':0},
        'smoker': {'yes': 1, 'no':0},
        'region': {'southwest': 0, 'southeast': 0.3, 'northwest':0.6, 'northeast': 1}
    }
    
    df.replace(mapping, inplace=True)

    return df

def normalize_numeric_features(df):
    numeric_cols = ['bmi', 'age', 'children']
    scaler = MinMaxScaler()
    df[numeric_cols] = scaler.fit_transform(df[numeric_cols])

    return df


In [37]:
df = impute_missing_values(df)
df = remove_duplicates(df)
df = normalize_numeric_features(df)
df = map_categorical_features(df)

In [38]:
data = pd.DataFrame(df)

In [39]:
print(data.isnull().sum())

age         0
sex         0
bmi         0
children    0
smoker      0
region      0
charges     0
dtype: int64


In [40]:
data = data.fillna(0)  # Reemplazar NaN por 0 u otro valor válido

### Optimización de Hiperparametros con Optuna

In [None]:
%pip install optuna xgboost

In [45]:
from sklearn.model_selection import StratifiedKFold #prb classificacion
from sklearn.model_selection import KFold #prb regression
from sklearn.metrics import mean_squared_error
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
import lightgbm
import optuna
from sklearn.model_selection import train_test_split

In [46]:
# Función de optimización de hiperparámetros
def objective(trial):
    params ={
        'learning_rate' : trial.suggest_loguniform('learning_rate', 0.01, 3),
        'max_depth' : trial.suggest_int('max_depth', 3, 10),
        'n_estimators' : trial.suggest_int('n_estimators', 100, 1000, step=100),
    }
   
    X = data.drop('charges', axis=1)
    y = data['charges']
    
    # División de datos en entrenamiento y prueba
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    #model = XGBRegressor(**params)
    model = LGBMRegressor(**params)
    model.fit(X_train, y_train)
        
    y_pred = model.predict(X_test)
        
    mse = mean_squared_error(y_test, y_pred)

    return mse

study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=60)

print("Mejores hiperparámetros:")
print(study.best_params)

best_mse = study.best_value
print("Mejor MSE obtenido:", best_mse)

[I 2023-07-19 12:42:47,191] A new study created in memory with name: no-name-e5cf61f8-dbc1-412f-bf2c-1cac10d13fc1
  'learning_rate' : trial.suggest_loguniform('learning_rate', 0.01, 3),
[I 2023-07-19 12:42:47,580] Trial 0 finished with value: 18973092.274195425 and parameters: {'learning_rate': 0.016225891904172722, 'max_depth': 6, 'n_estimators': 400}. Best is trial 0 with value: 18973092.274195425.
  'learning_rate' : trial.suggest_loguniform('learning_rate', 0.01, 3),
[I 2023-07-19 12:42:47,796] Trial 1 finished with value: 23686699.375961512 and parameters: {'learning_rate': 0.133263208291248, 'max_depth': 4, 'n_estimators': 800}. Best is trial 0 with value: 18973092.274195425.
  'learning_rate' : trial.suggest_loguniform('learning_rate', 0.01, 3),
[I 2023-07-19 12:42:48,001] Trial 2 finished with value: 30173222.437516164 and parameters: {'learning_rate': 0.35949064975155265, 'max_depth': 8, 'n_estimators': 300}. Best is trial 0 with value: 18973092.274195425.
  'learning_rate' : 

[I 2023-07-19 12:42:51,563] Trial 14 finished with value: 18598151.602337696 and parameters: {'learning_rate': 0.03378231654202372, 'max_depth': 3, 'n_estimators': 600}. Best is trial 12 with value: 17739190.59993008.
  'learning_rate' : trial.suggest_loguniform('learning_rate', 0.01, 3),
[I 2023-07-19 12:42:52,002] Trial 15 finished with value: 18402106.382307265 and parameters: {'learning_rate': 0.01067595618097364, 'max_depth': 4, 'n_estimators': 700}. Best is trial 12 with value: 17739190.59993008.
  'learning_rate' : trial.suggest_loguniform('learning_rate', 0.01, 3),
[I 2023-07-19 12:42:52,494] Trial 16 finished with value: 22008760.322187264 and parameters: {'learning_rate': 0.05186524617285732, 'max_depth': 9, 'n_estimators': 500}. Best is trial 12 with value: 17739190.59993008.
  'learning_rate' : trial.suggest_loguniform('learning_rate', 0.01, 3),
[I 2023-07-19 12:42:52,845] Trial 17 finished with value: 19147433.889891505 and parameters: {'learning_rate': 0.02061764863334883

  'learning_rate' : trial.suggest_loguniform('learning_rate', 0.01, 3),
[I 2023-07-19 12:42:54,427] Trial 30 finished with value: 18179935.30083134 and parameters: {'learning_rate': 0.019948367375249937, 'max_depth': 4, 'n_estimators': 300}. Best is trial 12 with value: 17739190.59993008.
  'learning_rate' : trial.suggest_loguniform('learning_rate', 0.01, 3),
[I 2023-07-19 12:42:54,561] Trial 31 finished with value: 17991108.107074037 and parameters: {'learning_rate': 0.016275507129028168, 'max_depth': 4, 'n_estimators': 300}. Best is trial 12 with value: 17739190.59993008.
  'learning_rate' : trial.suggest_loguniform('learning_rate', 0.01, 3),
[I 2023-07-19 12:42:54,691] Trial 32 finished with value: 17968931.545351356 and parameters: {'learning_rate': 0.028845187080202932, 'max_depth': 3, 'n_estimators': 400}. Best is trial 12 with value: 17739190.59993008.
  'learning_rate' : trial.suggest_loguniform('learning_rate', 0.01, 3),
[I 2023-07-19 12:42:54,909] Trial 33 finished with value

[I 2023-07-19 12:42:57,988] Trial 44 finished with value: 17930077.60995489 and parameters: {'learning_rate': 0.013402140313532013, 'max_depth': 3, 'n_estimators': 300}. Best is trial 41 with value: 17734530.148934007.
  'learning_rate' : trial.suggest_loguniform('learning_rate', 0.01, 3),
[I 2023-07-19 12:42:58,126] Trial 45 finished with value: 18004716.968947776 and parameters: {'learning_rate': 0.013406825321474886, 'max_depth': 4, 'n_estimators': 300}. Best is trial 41 with value: 17734530.148934007.
  'learning_rate' : trial.suggest_loguniform('learning_rate', 0.01, 3),
[I 2023-07-19 12:42:58,191] Trial 46 finished with value: 23293324.55491856 and parameters: {'learning_rate': 0.019160765578095297, 'max_depth': 3, 'n_estimators': 100}. Best is trial 41 with value: 17734530.148934007.
  'learning_rate' : trial.suggest_loguniform('learning_rate', 0.01, 3),
[I 2023-07-19 12:42:58,274] Trial 47 finished with value: 17782890.00188679 and parameters: {'learning_rate': 0.02244727983273

[I 2023-07-19 12:43:00,234] Trial 59 finished with value: 18282409.530477643 and parameters: {'learning_rate': 0.03303868583940881, 'max_depth': 4, 'n_estimators': 100}. Best is trial 53 with value: 17684256.864174906.


Mejores hiperparámetros:
{'learning_rate': 0.01737353195796662, 'max_depth': 3, 'n_estimators': 300}
Mejor MSE obtenido: 17684256.864174906


In [48]:
#dividir conjunto de datos
X = data.drop('charges', axis = 1)
y = data['charges']

# División de datos en entrenamiento y prueba
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

params = study.best_params
model = LGBMRegressor(**params)

model.fit(X_train, y_train)
        
y_pred = model.predict(X_test)
        
mse = mean_squared_error(y_test, y_pred)


In [49]:
mse

17684256.864174906

In [14]:
import os
from joblib import dump, load

# Ruta del directorio actual (ajustar según la ubicación del script en el repositorio)
dir_path = os.path.join(os.getcwd(), "src")

# Ruta de la carpeta "models"
models_dir = os.path.join(dir_path, "..", "models")

# Crear la carpeta "models" si no existe
if not os.path.exists(models_dir):
    os.makedirs(models_dir)

# Ruta del archivo de modelo dentro de la carpeta "models"
model_path = os.path.join(models_dir, "xgb_predictor.joblib")

# Guardar el modelo en disco
dump(xgb_model, model_path)

['C:\\Users\\diego\\OneDrive\\Escritorio\\mlops_projects\\mlops\\insurance_online_api\\src\\..\\models\\xgb_predictor.joblib']