In [1]:
import pandas as pd
import numpy as np

In [4]:
df = pd.read_csv('insurance.csv')
df

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.900,0,yes,southwest,16884.92400
1,18,male,33.770,1,no,southeast,1725.55230
2,28,male,33.000,3,no,southeast,4449.46200
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.880,0,no,northwest,3866.85520
...,...,...,...,...,...,...,...
1333,50,male,30.970,3,no,northwest,10600.54830
1334,18,female,31.920,0,no,northeast,2205.98080
1335,18,female,36.850,0,no,southeast,1629.83350
1336,21,female,25.800,0,no,southwest,2007.94500


In [5]:
print(df.isnull().sum())

age         0
sex         0
bmi         0
children    0
smoker      0
region      0
charges     0
dtype: int64


## Prep / Feature Engineering

In [6]:
## TRANSFORMADOR AUXILIAR
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.impute import SimpleImputer

In [7]:
# Funciones de Preprocesamiento:
def remove_duplicates(df):
    df.drop_duplicates(inplace=True)
    return df

def impute_missing_values(df):
    # Variables categóricas
    categorical_cols = df.select_dtypes(include='object').columns
    #categorical_cols = categorical_cols.drop('churn')  # Excluir 'churn'
    categorical_imputer = SimpleImputer(strategy='most_frequent')
    df[categorical_cols] = categorical_imputer.fit_transform(df[categorical_cols])

    # Variables numéricas
    numeric_cols = df.select_dtypes(include=['float64', 'int64']).columns
    numeric_imputer = SimpleImputer(strategy='mean')
    df[numeric_cols] = numeric_imputer.fit_transform(df[numeric_cols])

    return df


def map_categorical_features(df):
    mapping = {
        'sex': {'female': 1, 'male':0},
        'smoker': {'yes': 1, 'no':0},
        'region': {'southwest': 0, 'southeast': 0.3, 'northwest':0.6, 'northeast': 1}
    }
    
    df.replace(mapping, inplace=True)

    return df

def normalize_numeric_features(df):
    numeric_cols = ['bmi', 'age', 'children']
    scaler = MinMaxScaler()
    df[numeric_cols] = scaler.fit_transform(df[numeric_cols])

    return df


In [8]:
df = impute_missing_values(df)
df = remove_duplicates(df)
df = normalize_numeric_features(df)
df = map_categorical_features(df)

In [9]:
data = pd.DataFrame(df)

In [10]:
print(data.isnull().sum())

age         0
sex         0
bmi         0
children    0
smoker      0
region      0
charges     0
dtype: int64


In [11]:
data = data.fillna(0)  # Reemplazar NaN por 0 u otro valor válido

### Optimización de Hiperparametros con Optuna

In [None]:
%pip install optuna xgboost

In [12]:
from sklearn.model_selection import StratifiedKFold #prb classificacion
from sklearn.model_selection import KFold #prb regression
from sklearn.metrics import mean_squared_error
from xgboost import XGBRegressor
import optuna
from sklearn.model_selection import train_test_split

In [16]:
# Función de optimización de hiperparámetros
def objective(trial):
    params ={
        'learning_rate' : trial.suggest_loguniform('learning_rate', 0.01, 3),
        'max_depth' : trial.suggest_int('max_depth', 3, 10),
        'n_estimators' : trial.suggest_int('n_estimators', 100, 1000, step=100),
    }
    
    df = impute_missing_values(data)
    df = remove_duplicates(df)
    df = normalize_numeric_features(df)
    fin = map_categorical_features(df)
    
    X = fin.drop('charges', axis=1)
    y = fin['charges']
    
    # División de datos en entrenamiento y prueba
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    model = XGBRegressor(**params)
    model.fit(X_train, y_train)
        
    y_pred = model.predict(X_test)
        
    mse = mean_squared_error(y_test, y_pred)

    return mse

study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=60)

print("Mejores hiperparámetros:")
print(study.best_params)

best_mse = study.best_value
print("Mejor MSE obtenido:", best_mse)

[I 2023-07-19 12:14:10,456] A new study created in memory with name: no-name-c19f3b14-68c0-4da4-9f1d-d2b5c8fa63da
  'learning_rate' : trial.suggest_loguniform('learning_rate', 0.01, 3),
[W 2023-07-19 12:14:10,541] Trial 0 failed with parameters: {'learning_rate': 0.11848705466487075, 'max_depth': 9, 'n_estimators': 500} because of the following error: ValueError('at least one array or dtype is required').
Traceback (most recent call last):
  File "C:\Users\diego\anaconda3\envs\curso\lib\site-packages\optuna\study\_optimize.py", line 200, in _run_trial
    value_or_values = func(trial)
  File "C:\Users\diego\AppData\Local\Temp\ipykernel_15776\3315664571.py", line 9, in objective
    df = impute_missing_values(data)
  File "C:\Users\diego\AppData\Local\Temp\ipykernel_15776\2937052664.py", line 11, in impute_missing_values
    df[categorical_cols] = categorical_imputer.fit_transform(df[categorical_cols])
  File "C:\Users\diego\anaconda3\envs\curso\lib\site-packages\sklearn\utils\_set_outp

ValueError: at least one array or dtype is required

In [13]:
#dividir conjunto de datos
X = data.drop('charges', axis = 1)
y = data['charges']

# División de datos en entrenamiento y prueba
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

params = study.best_params
xgb_model = XGBRegressor(**params)

xgb_model.fit(X_train, y_train)
        
y_pred = xgb_model.predict(X_test)
        
mse = mean_squared_error(y_test, y_pred)


In [14]:
import os
from joblib import dump, load

# Ruta del directorio actual (ajustar según la ubicación del script en el repositorio)
dir_path = os.path.join(os.getcwd(), "src")

# Ruta de la carpeta "models"
models_dir = os.path.join(dir_path, "..", "models")

# Crear la carpeta "models" si no existe
if not os.path.exists(models_dir):
    os.makedirs(models_dir)

# Ruta del archivo de modelo dentro de la carpeta "models"
model_path = os.path.join(models_dir, "xgb_predictor.joblib")

# Guardar el modelo en disco
dump(xgb_model, model_path)

['C:\\Users\\diego\\OneDrive\\Escritorio\\mlops_projects\\mlops\\insurance_online_api\\src\\..\\models\\xgb_predictor.joblib']