In [1]:
import wbgapi as wb
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [2]:
# Obtener información de indicadores
wb.series.info(q = "life")

id,value
SH.MMR.RISK,Lifetime risk of maternal death (1 in: rate varies by country)
SH.MMR.RISK.ZS,Lifetime risk of maternal death (%)
SP.DYN.LE00.FE.IN,"Life expectancy at birth, female (years)"
SP.DYN.LE00.IN,"Life expectancy at birth, total (years)"
SP.DYN.LE00.MA.IN,"Life expectancy at birth, male (years)"
,5 elements


In [3]:
## Indicadores: PIB, Población y Tasa de mortalidad infantil
indicadores = ["NY.GDP.MKTP.CD",  # PIB (USD)
               "SP.POP.TOTL",      # Población total
               "SP.DYN.IMRT.IN"   # Tasa de mortalidad infantil (por 1,000 nacidos vivos)
               "NY.GDP.PCAP.CD",     # PIB per cápita (USD)
               "SP.URB.TOTL.IN.ZS",  # Población urbana (% del total)
               "SP.DYN.LE00.IN"]      # Esperanza de vida al nacer (años)

## Descargar datos para el periodo 2010-2020
datos_wb = wb.data.DataFrame(indicadores, time=range(2010, 2021), labels=True)

In [4]:
## Pivotear la tabla para tener los años como filas en lugar de columnas
datos_wb = datos_wb.melt(id_vars=["Country", "Series"], var_name="year", value_name="value")

In [5]:
## Pivotear las series para que cada indicador sea una columna
datos_wb_pivot = datos_wb.pivot_table(index=["Country", "year"], columns="Series", values="value").reset_index()

In [6]:
## Inspeccionar
datos_wb_pivot.head()

Series,Country,year,GDP (current US$),"Life expectancy at birth, total (years)","Population, total",Urban population (% of total population)
0,Afghanistan,YR2010,15856670000.0,60.851,28189672.0,23.737
1,Afghanistan,YR2011,17805100000.0,61.419,29249157.0,23.948
2,Afghanistan,YR2012,19907330000.0,61.923,30466479.0,24.16
3,Afghanistan,YR2013,20146420000.0,62.417,31541209.0,24.373
4,Afghanistan,YR2014,20497130000.0,62.545,32716210.0,24.587


In [7]:
## Renombrar las columnas para que sean más legibles
datos_wb_pivot.columns = ["country", "year", "gdp", "life_exp", "population", "urban_pop"]

In [8]:
## Inspeccionar
datos_wb_pivot.head()

Unnamed: 0,country,year,gdp,life_exp,population,urban_pop
0,Afghanistan,YR2010,15856670000.0,60.851,28189672.0,23.737
1,Afghanistan,YR2011,17805100000.0,61.419,29249157.0,23.948
2,Afghanistan,YR2012,19907330000.0,61.923,30466479.0,24.16
3,Afghanistan,YR2013,20146420000.0,62.417,31541209.0,24.373
4,Afghanistan,YR2014,20497130000.0,62.545,32716210.0,24.587


In [9]:
X = datos_wb_pivot.filter(items=[
    "gdp", 
    "population", 
    "urban_pop", 
])
y = datos_wb_pivot["life_exp"]  ## Outcome

In [10]:
## Dividir los datos en conjuntos de entrenamiento y prueba
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=86)

In [11]:
## Crear el pipeline de preprocesamiento y modelo
pipeline = make_pipeline(StandardScaler(), RandomForestRegressor(n_estimators=500, random_state=86))
pipeline

In [12]:
# Ajustar el modelo
pipeline.fit(X_train, y_train)

ValueError: Input y contains NaN.