Proyecto: Predicción de esperanza de vida usando ML y MLflow

In [1]:
# 🔁 1. Imports y configuración inicial
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import mlflow
import mlflow.sklearn
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
import joblib
import warnings
import kagglehub
from kagglehub import KaggleDatasetAdapter
import random
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import mlflow
import mlflow.sklearn
import joblib
# Install a global "ignore" filter: Python warnings are silenced for the rest of the session.
# NOTE(review): this also hides deprecation / FutureWarning messages raised by the
# libraries used in the cells below — consider narrowing it to specific categories.
warnings.filterwarnings('ignore')

  from google.protobuf import service as _service
  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# 2. Load the dataset and preview it
# The CSV path is relative to the notebook's current working directory.
df = pd.read_csv(filepath_or_buffer="Life Expectancy Data.csv")
# Keep the preview as the last expression so the notebook renders the first five rows.
df.head(n=5)

Unnamed: 0,Country,Year,Status,Life expectancy,Adult Mortality,infant deaths,Alcohol,percentage expenditure,Hepatitis B,Measles,...,Polio,Total expenditure,Diphtheria,HIV/AIDS,GDP,Population,thinness 1-19 years,thinness 5-9 years,Income composition of resources,Schooling
0,Afghanistan,2015,Developing,65.0,263.0,62,0.01,71.279624,65.0,1154,...,6.0,8.16,65.0,0.1,584.25921,33736494.0,17.2,17.3,0.479,10.1
1,Afghanistan,2014,Developing,59.9,271.0,64,0.01,73.523582,62.0,492,...,58.0,8.18,62.0,0.1,612.696514,327582.0,17.5,17.5,0.476,10.0
2,Afghanistan,2013,Developing,59.9,268.0,66,0.01,73.219243,64.0,430,...,62.0,8.13,64.0,0.1,631.744976,31731688.0,17.7,17.7,0.47,9.9
3,Afghanistan,2012,Developing,59.5,272.0,69,0.01,78.184215,67.0,2787,...,67.0,8.52,67.0,0.1,669.959,3696958.0,17.9,18.0,0.463,9.8
4,Afghanistan,2011,Developing,59.2,275.0,71,0.01,7.097109,68.0,3013,...,68.0,7.87,68.0,0.1,63.537231,2978599.0,18.2,18.2,0.454,9.5


In [3]:
# Inspect the schema, then count the missing values in every column
df.info()  # prints column dtypes and non-null counts
# Last expression of the cell: rendered as the per-column tally of nulls.
df.isna().sum()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2938 entries, 0 to 2937
Data columns (total 22 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   Country                          2938 non-null   object 
 1   Year                             2938 non-null   int64  
 2   Status                           2938 non-null   object 
 3   Life expectancy                  2928 non-null   float64
 4   Adult Mortality                  2928 non-null   float64
 5   infant deaths                    2938 non-null   int64  
 6   Alcohol                          2744 non-null   float64
 7   percentage expenditure           2938 non-null   float64
 8   Hepatitis B                      2385 non-null   float64
 9   Measles                          2938 non-null   int64  
 10   BMI                             2904 non-null   float64
 11  under-five deaths                2938 non-null   int64  
 12  Polio               

Country                              0
Year                                 0
Status                               0
Life expectancy                     10
Adult Mortality                     10
infant deaths                        0
Alcohol                            194
percentage expenditure               0
Hepatitis B                        553
Measles                              0
 BMI                                34
under-five deaths                    0
Polio                               19
Total expenditure                  226
Diphtheria                          19
 HIV/AIDS                            0
GDP                                448
Population                         652
 thinness  1-19 years               34
 thinness 5-9 years                 34
Income composition of resources    167
Schooling                          163
dtype: int64

3. Preprocesamiento de datos

In [4]:
# Drop non-numeric / unneeded columns.
# `df` is rebound instead of mutated with inplace=True, and errors="ignore" makes the
# cell idempotent: running it again once "Country" is gone no longer raises KeyError.
df = df.drop(columns=["Country"], errors="ignore")

In [5]:
# Fill missing values in the numeric columns with each column's own mean.
# The result is assigned back to `df` rather than calling
# `df[col].fillna(..., inplace=True)`: that form is an in-place update on a column
# selection, which pandas deprecates and which does not modify `df` under
# Copy-on-Write.
# NOTE(review): the target column "Life expectancy " is numeric as well, so rows with a
# missing target receive the mean target value — consider dropping those rows instead.
numeric_cols = df.select_dtypes(include='number').columns
df[numeric_cols] = df[numeric_cols].fillna(df[numeric_cols].mean())

In [6]:
# Encode the textual "Status" column as integers (Developing -> 0, Developed -> 1).
# `replace` leaves already-encoded values untouched, so the cell stays safe to re-run.
# The explicit cast guarantees a numeric dtype instead of relying on pandas' implicit
# downcasting of the replaced object column, and fails loudly if an unmapped label
# is still present.
df["Status"] = df["Status"].replace({"Developing": 0, "Developed": 1}).astype("int64")

In [7]:
# Split the frame into the target and the predictor columns.
# The column key "Life expectancy " is written with its trailing space, exactly as
# it is referenced throughout this notebook.
y = df.loc[:, "Life expectancy "]
X = df.drop(columns=["Life expectancy "])

In [8]:
# Standardize the feature columns
# NOTE(review): the scaler is fitted on every row of X, i.e. before the train/test
# split performed later — confirm this is intended.
scaler = StandardScaler()
scaler.fit(X)
X_scaled = scaler.transform(X)

In [9]:
# Hold out 20% of the rows as a test set; the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled,
    y,
    test_size=0.2,
    random_state=42,
)

4. Entrenamiento con MLflow

In [10]:
# Point the MLflow client at the tracking server and select the experiment that the
# runs below are recorded under.
tracking_uri = "http://localhost:9090"
experiment_name = "life-expectancy-random-search"
mlflow.set_tracking_uri(tracking_uri)
# Kept as the last expression so the notebook displays the returned Experiment object.
mlflow.set_experiment(experiment_name)

<Experiment: artifact_location='mlflow-artifacts:/424213572648594018', creation_time=1747629948064, experiment_id='424213572648594018', last_update_time=1747629948064, lifecycle_stage='active', name='life-expectancy-random-search', tags={}>

In [11]:
# Random search over RandomForestRegressor hyperparameters; every trial is tracked as
# its own MLflow run and the model with the lowest test RMSE is kept in `best_model`.
#
# The sampler is seeded so that Restart & Run All draws the same candidates each time;
# without it every execution explored a different, unrepeatable set of configurations.
random.seed(42)

# Candidate values per hyperparameter; one value per key is drawn for each trial.
search_space = {
    "n_estimators": [50, 100, 150, 200],
    "max_depth": [5, 10, 15, None],
    "min_samples_split": [2, 4, 6],
    "min_samples_leaf": [1, 2, 3],
    "bootstrap": [True, False],
}

best_rmse = float("inf")
best_model = None
for i in range(5):
    # Draw one random configuration from the search space.
    params = {name: random.choice(options) for name, options in search_space.items()}

    # Train, evaluate and log inside a dedicated MLflow run.
    with mlflow.start_run():
        model = RandomForestRegressor(random_state=42, **params)
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        rmse = float(np.sqrt(mean_squared_error(y_test, y_pred)))
        r2 = float(r2_score(y_test, y_pred))

        # Batch calls log the same keys/values as the individual log_param/log_metric calls.
        mlflow.log_params(params)
        mlflow.log_metrics({"rmse": rmse, "r2_score": r2})
        mlflow.sklearn.log_model(model, "model")

        print(f"Run {i+1}: RMSE={rmse:.2f}, R²={r2:.2f} | Params: n={params['n_estimators']}, depth={params['max_depth']}, split={params['min_samples_split']}, leaf={params['min_samples_leaf']}, boot={params['bootstrap']}")
        # NOTE(review): the winner is chosen on the same test split used for reporting,
        # so `best_rmse` is an optimistic estimate — consider a separate validation split.
        if rmse < best_rmse:
            best_rmse = rmse
            best_model = model

Run 1: RMSE=2.44, R²=0.93 | Params: n=150, depth=5, split=2, leaf=2, boot=True
Run 2: RMSE=2.46, R²=0.93 | Params: n=150, depth=10, split=2, leaf=1, boot=False
Run 3: RMSE=2.43, R²=0.93 | Params: n=50, depth=5, split=2, leaf=2, boot=True
Run 4: RMSE=1.65, R²=0.97 | Params: n=100, depth=None, split=2, leaf=2, boot=True
Run 5: RMSE=1.65, R²=0.97 | Params: n=100, depth=None, split=4, leaf=1, boot=True


In [12]:
# Persist the best model found by the search.
# Guard against silently pickling `None` when no trial produced a usable model.
if best_model is None:
    raise RuntimeError("No model was selected by the search; nothing to save.")
joblib.dump(best_model, "model_life_expectancy.pkl")
# The model was trained on features transformed by `scaler`, so the fitted scaler is
# stored next to it; without it the saved model cannot be applied to raw inputs.
joblib.dump(scaler, "scaler_life_expectancy.pkl")
print(f"✅ Mejor modelo guardado con RMSE={best_rmse:.2f}")

✅ Mejor modelo guardado con RMSE=1.65
