In [313]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.metrics import confusion_matrix, make_scorer, accuracy_score, mean_squared_error, mean_absolute_error, mean_squared_log_error
from sklearn.linear_model import LinearRegression, ElasticNet, HuberRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor
from sklearn.model_selection import GridSearchCV, cross_val_score, train_test_split

from mlxtend.regressor import StackingCVRegressor

import warnings
warnings.simplefilter("ignore")


In [314]:
df_population = pd.read_csv('./Data/population.csv')
df_population.head()

Unnamed: 0,country_name,country_code,year,population
0,Africa Eastern and Southern,AFE,2021,694665117.0
1,Africa Eastern and Southern,AFE,2020,677243299.0
2,Africa Eastern and Southern,AFE,2019,660046272.0
3,Africa Eastern and Southern,AFE,2018,643090131.0
4,Africa Eastern and Southern,AFE,2017,626392880.0


## Extraigo los códigos únicos de cada país

In [315]:
paises = df_population.country_code.unique()

## Creo el array con los años objetivo a predecir

In [316]:
objetivo = np.array([2023, 2024, 2025, 2026]).reshape(-1,1)
objetivo

array([[2023],
       [2024],
       [2025],
       [2026]])

## Creo un DataFrame preliminar en el que almacenaré todas las métricas y resultados

In [317]:
resultado = pd.DataFrame(columns = ["modelo", "country_name", "country_code", "2023", "2024", "2025", "2026", "SCORE", "MSE", "MAE", "MAPE", "RMSLE"])

## Creo instancias de los modelos de ML a evaluar

In [318]:
linear = LinearRegression()
elastic_net = ElasticNet()
random_forest = RandomForestRegressor()
gradient_boost = GradientBoostingRegressor()
ada_boost = AdaBoostRegressor()

# Linear Regression

In [319]:
for pais in paises:
    # Divido el dataframe por países, ya que se observó un mejor desempeño de los modelos en el contexto de cada país y su propia tendencia
    df_pais = df_population.loc[(df_population['country_code'] == pais)]
    # Separo las variables
    X = df_pais.year.values.reshape(-1,1)
    y = np.log1p(df_pais.population.values.reshape(-1,1)) # np.log1p()
    # Divido en datos de train y de test
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.4)
    # Se entrena el modelo
    linear.fit(X_train,y_train)
    # Se obtiene la predicción con los datos de train y test
    y_train_pred = linear.predict(X_train)
    y_test_pred = linear.predict(X_test)
    # Se obtienen las metricas
    SCORE = linear.score(X_test, y_test)
    RMSLE = mean_squared_log_error(y_test, y_test_pred, squared = False)
    MSE = mean_squared_error(y_test, y_test_pred)
    MAE = mean_absolute_error(y_test, y_test_pred)
    MAPE = np.mean(np.abs((y_test - y_test_pred) / y_test)) * 100
    # Se realiza la predicción para los proximos 4 años
    y_test_pred = linear.predict(objetivo)
    # Se ingresan las métricas y resultados al DataFrame
    to_append = ["Linear_Regression", df_pais.country_name, pais, linear.predict(objetivo)[0][0], linear.predict(objetivo)[1][0], linear.predict(objetivo)[2][0], linear.predict(objetivo)[3][0], SCORE, MSE, MAE, MAPE, RMSLE]
    a_series = pd.Series(to_append, index = resultado.columns)
    resultado = resultado.append(a_series, ignore_index = True)

In [320]:
resultado[resultado.modelo == 'Linear_Regression'].head()

Unnamed: 0,modelo,country_name,country_code,2023,2024,2025,2026,SCORE,MSE,MAE,MAPE,RMSLE
0,Linear_Regression,0 Africa Eastern and Southern 1 Africa...,AFE,20.429453,20.457055,20.484658,20.512261,0.999591,0.0001,0.00876,0.045273,0.000494
1,Linear_Regression,62 Africa Eastern and Southern 63 Afri...,AFW,20.013641,20.040075,20.06651,20.092944,0.999523,0.000107,0.007993,0.042086,0.000523
2,Linear_Regression,124 Africa Western and Central 125 Afric...,ARB,20.046416,20.07316,20.099904,20.126648,0.988596,0.001513,0.032986,0.169856,0.001897
3,Linear_Regression,186 Arab World 187 ...,CSS,15.895023,15.90431,15.913598,15.922886,0.975256,0.000661,0.021425,0.137075,0.001546
4,Linear_Regression,248 Caribbean small states 249 ...,CEB,18.51243,18.513838,18.515246,18.516654,0.15,0.002394,0.04453,0.24145,0.002516


# Elastic Net

In [321]:
for pais in paises:
    # Divido el dataframe por países, ya que se observó un mejor desempeño de los modelos en el contexto de cada país y su propia tendencia
    df_pais = df_population.loc[(df_population['country_code'] == pais)]
    # Separo las variables
    X = df_pais.year.values.reshape(-1,1)
    y = np.log1p(df_pais.population.values.reshape(-1,1)) # np.log1p()
    # Divido en datos de train y de test
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.4)
    # Se entrena el modelo
    elastic_net.fit(X_train,y_train)
    # Se obtiene la predicción con los datos de train y test
    y_train_pred = elastic_net.predict(X_train)
    y_test_pred = elastic_net.predict(X_test)
    # Se obtienen las metricas
    SCORE = elastic_net.score(X_test, y_test)
    RMSLE = mean_squared_log_error(y_test, y_test_pred, squared = False)
    MSE = mean_squared_error(y_test, y_test_pred)
    MAE = mean_absolute_error(y_test, y_test_pred)
    MAPE = np.mean(np.abs((y_test - y_test_pred) / y_test)) * 100
    # Se realiza la predicción para los proximos 4 años
    y_test_pred = elastic_net.predict(objetivo)
    # Se ingresan las métricas y resultados al DataFrame
    to_append = ["Elastic_Net", df_pais.country_name, pais, elastic_net.predict(objetivo)[0], elastic_net.predict(objetivo)[1], elastic_net.predict(objetivo)[2], elastic_net.predict(objetivo)[3], SCORE, MSE, MAE, MAPE, RMSLE]
    a_series = pd.Series(to_append, index = resultado.columns)
    resultado = resultado.append(a_series, ignore_index = True)

In [322]:
resultado[resultado.modelo == 'Elastic_Net'].head()

Unnamed: 0,modelo,country_name,country_code,2023,2024,2025,2026,SCORE,MSE,MAE,MAPE,RMSLE
261,Elastic_Net,0 Africa Eastern and Southern 1 Africa...,AFE,20.383016,20.409186,20.435356,20.461526,0.996769,0.000741,0.023799,2.715375,0.001341
262,Elastic_Net,62 Africa Eastern and Southern 63 Afri...,AFW,19.96475,19.989657,20.014565,20.039472,0.996582,0.000834,0.024922,2.897538,0.001429
263,Elastic_Net,124 Africa Western and Central 125 Afric...,ARB,19.991256,20.016075,20.040894,20.065712,0.992084,0.001785,0.036343,2.777086,0.002135
264,Elastic_Net,186 Arab World 187 ...,CSS,15.841084,15.848898,15.856711,15.864525,0.978185,0.000448,0.019349,1.019226,0.001276
265,Elastic_Net,248 Caribbean small states 249 ...,CEB,18.465372,18.465372,18.465372,18.465372,-0.042273,0.002785,0.040503,0.219758,0.002715


# Random Forest

In [323]:
for pais in paises:
    # Divido el dataframe por países, ya que se observó un mejor desempeño de los modelos en el contexto de cada país y su propia tendencia
    df_pais = df_population.loc[(df_population['country_code'] == pais)]
    # Separo las variables
    X = df_pais.year.values.reshape(-1,1)
    y = np.log1p(df_pais.population.values.reshape(-1,1)) # np.log1p()
    # Divido en datos de train y de test
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.4)
    # Se entrena el modelo
    random_forest.fit(X_train,y_train)
    # Se obtiene la predicción con los datos de train y test
    y_train_pred = random_forest.predict(X_train)
    y_test_pred = random_forest.predict(X_test)
    # Se obtienen las metricas
    SCORE = random_forest.score(X_test, y_test)
    RMSLE = mean_squared_log_error(y_test, y_test_pred, squared = False)
    MSE = mean_squared_error(y_test, y_test_pred)
    MAE = mean_absolute_error(y_test, y_test_pred)
    MAPE = np.mean(np.abs((y_test - y_test_pred) / y_test)) * 100
    # Se realiza la predicción para los proximos 4 años
    y_test_pred = random_forest.predict(objetivo)
    # Se ingresan las métricas y resultados al DataFrame
    to_append = ["Random_Forest", df_pais.country_name, pais, random_forest.predict(objetivo)[0], random_forest.predict(objetivo)[1], random_forest.predict(objetivo)[2], random_forest.predict(objetivo)[3], SCORE, MSE, MAE, MAPE, RMSLE]
    a_series = pd.Series(to_append, index = resultado.columns)
    resultado = resultado.append(a_series, ignore_index = True)

In [324]:
resultado[resultado.modelo == 'Random_Forest'].head()

Unnamed: 0,modelo,country_name,country_code,2023,2024,2025,2026,SCORE,MSE,MAE,MAPE,RMSLE
522,Random_Forest,0 Africa Eastern and Southern 1 Africa...,AFE,20.316398,20.316398,20.316398,20.316398,0.997745,0.000435,0.01753,2.569245,0.00101
523,Random_Forest,62 Africa Eastern and Southern 63 Afri...,AFW,19.880393,19.880393,19.880393,19.880393,0.996967,0.000758,0.019583,2.932064,0.001331
524,Random_Forest,124 Africa Western and Central 125 Afric...,ARB,19.889079,19.889079,19.889079,19.889079,0.997758,0.00048,0.018784,2.787911,0.001096
525,Random_Forest,186 Arab World 187 ...,CSS,15.814012,15.814012,15.814012,15.814012,0.996802,7.4e-05,0.007327,1.120379,0.000519
526,Random_Forest,248 Caribbean small states 249 ...,CEB,18.439568,18.439568,18.439568,18.439568,0.991348,2.3e-05,0.003397,0.321088,0.000247


# Gradient Boost

In [325]:
for pais in paises:
    # Divido el dataframe por países, ya que se observó un mejor desempeño de los modelos en el contexto de cada país y su propia tendencia
    df_pais = df_population.loc[(df_population['country_code'] == pais)]
    # Separo las variables
    X = df_pais.year.values.reshape(-1,1)
    y = np.log1p(df_pais.population.values.reshape(-1,1)) # np.log1p()
    # Divido en datos de train y de test
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.4)
    # Se entrena el modelo
    gradient_boost.fit(X_train,y_train)
    # Se obtiene la predicción con los datos de train y test
    y_train_pred = gradient_boost.predict(X_train)
    y_test_pred = gradient_boost.predict(X_test)
    # Se obtienen las metricas
    SCORE = gradient_boost.score(X_test, y_test)
    RMSLE = mean_squared_log_error(y_test, y_test_pred, squared = False)
    MSE = mean_squared_error(y_test, y_test_pred)
    MAE = mean_absolute_error(y_test, y_test_pred)
    MAPE = np.mean(np.abs((y_test - y_test_pred) / y_test)) * 100
    # Se realiza la predicción para los proximos 4 años
    y_test_pred = gradient_boost.predict(objetivo)
    # Se ingresan las métricas y resultados al DataFrame
    to_append = ["Gradient_Boost", df_pais.country_name, pais, gradient_boost.predict(objetivo)[0], gradient_boost.predict(objetivo)[1], gradient_boost.predict(objetivo)[2], gradient_boost.predict(objetivo)[3], SCORE, MSE, MAE, MAPE, RMSLE]
    a_series = pd.Series(to_append, index = resultado.columns)
    resultado = resultado.append(a_series, ignore_index = True)

In [326]:
resultado[resultado.modelo == 'Gradient_Boost'].head()

Unnamed: 0,modelo,country_name,country_code,2023,2024,2025,2026,SCORE,MSE,MAE,MAPE,RMSLE
783,Gradient_Boost,0 Africa Eastern and Southern 1 Africa...,AFE,20.358897,20.358897,20.358897,20.358897,0.993158,0.001404,0.034361,2.673541,0.00184
784,Gradient_Boost,62 Africa Eastern and Southern 63 Afri...,AFW,19.970112,19.970112,19.970112,19.970112,0.995457,0.001052,0.030727,2.876364,0.001603
785,Gradient_Boost,124 Africa Western and Central 125 Afric...,ARB,19.912462,19.912462,19.912462,19.912462,0.994679,0.00127,0.032379,2.91396,0.001798
786,Gradient_Boost,186 Arab World 187 ...,CSS,15.827947,15.827947,15.827947,15.827947,0.995334,0.000112,0.010025,1.140987,0.000641
787,Gradient_Boost,248 Caribbean small states 249 ...,CEB,18.437257,18.437257,18.437257,18.437257,0.967952,4.6e-05,0.005517,0.234605,0.00035


# Ada Boost

In [327]:
for pais in paises:
    # Divido el dataframe por países, ya que se observó un mejor desempeño de los modelos en el contexto de cada país y su propia tendencia
    df_pais = df_population.loc[(df_population['country_code'] == pais)]
    # Separo las variables
    X = df_pais.year.values.reshape(-1,1)
    y = np.log1p(df_pais.population.values.reshape(-1,1)) # np.log1p()
    # Divido en datos de train y de test
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.4)
    # Se entrena el modelo
    ada_boost.fit(X_train,y_train)
    # Se obtiene la predicción con los datos de train y test
    y_train_pred = ada_boost.predict(X_train)
    y_test_pred = ada_boost.predict(X_test)
    # Se obtienen las metricas
    SCORE = ada_boost.score(X_test, y_test)
    RMSLE = mean_squared_log_error(y_test, y_test_pred, squared = False)
    MSE = mean_squared_error(y_test, y_test_pred)
    MAE = mean_absolute_error(y_test, y_test_pred)
    # Se realiza la predicción para los proximos 4 años
    MAPE = np.mean(np.abs((y_test - y_test_pred) / y_test)) * 100
    y_test_pred = ada_boost.predict(objetivo)
    # Se ingresan las métricas y resultados al DataFrame
    to_append = ["Ada_Boost", df_pais.country_name, pais, ada_boost.predict(objetivo)[0], ada_boost.predict(objetivo)[1], ada_boost.predict(objetivo)[2], ada_boost.predict(objetivo)[3], SCORE, MSE, MAE, MAPE, RMSLE]
    a_series = pd.Series(to_append, index = resultado.columns)
    resultado = resultado.append(a_series, ignore_index = True)

In [328]:
resultado[resultado.modelo == 'Ada_boost'].head()

Unnamed: 0,modelo,country_name,country_code,2023,2024,2025,2026,SCORE,MSE,MAE,MAPE,RMSLE


# Stacking CV Regressor

In [329]:
model = StackingCVRegressor( regressors = [linear, elastic_net, random_forest, gradient_boost, ada_boost],
    meta_regressor = elastic_net
)

##### Documentación mlxtend StackingCVRegressor 
##### https://rasbt.github.io/mlxtend/user_guide/regressor/StackingCVRegressor/


In [330]:
for pais in paises:
    # Divido el dataframe por países, ya que se observó un mejor desempeño de los modelos en el contexto de cada país y su propia tendencia
    df_pais = df_population.loc[(df_population['country_code'] == pais)]
    # Separo las variables
    X = df_pais.year.values.reshape(-1,1)
    y = df_pais.population.values.reshape(-1,1) # np.log1p()
    # Divido en datos de train y de test
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.4)
    # Se entrena el modelo
    model.fit(X_train,y_train)
    # Se obtiene la predicción con los datos de train y test
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)
    # Se obtienen las metricas
    SCORE = model.score(X_test, y_test)
    RMSLE = mean_squared_log_error(y_test, y_test_pred, squared = False)
    MSE = mean_squared_error(y_test, y_test_pred)
    MAE = mean_absolute_error(y_test, y_test_pred)
    MAPE = np.mean(np.abs((y_test - y_test_pred) / y_test)) * 100
    # Se realiza la predicción para los proximos 4 años
    y_test_pred = model.predict(objetivo)
    # Se ingresan las métricas y resultados al DataFrame
    to_append = ["Stacking", df_pais.country_name, pais, model.predict(objetivo)[0], model.predict(objetivo)[1], model.predict(objetivo)[2], model.predict(objetivo)[3], SCORE, MSE, MAE, MAPE, RMSLE]
    a_series = pd.Series(to_append, index = resultado.columns)
    resultado = resultado.append(a_series, ignore_index = True)

In [331]:
resultado[resultado.modelo == 'Stacking'].head()

Unnamed: 0,modelo,country_name,country_code,2023,2024,2025,2026,SCORE,MSE,MAE,MAPE,RMSLE
1305,Stacking,0 Africa Eastern and Southern 1 Africa...,AFE,685025000.0,685956400.0,686887900.0,687819300.0,0.999088,28251390000000.0,3987200.0,73.600319,0.02187
1306,Stacking,62 Africa Eastern and Southern 63 Afri...,AFW,458701100.0,459354300.0,460007500.0,460660700.0,0.992299,96413410000000.0,6805522.0,65.837047,0.031486
1307,Stacking,124 Africa Western and Central 125 Afric...,ARB,442538400.0,443314400.0,444090400.0,444866300.0,0.99804,15965230000000.0,2905662.0,46.413874,0.017152
1308,Stacking,186 Arab World 187 ...,CSS,7543738.0,7562220.0,7580702.0,7599183.0,0.997616,2611105000.0,39989.19,21.272008,0.010088
1309,Stacking,248 Caribbean small states 249 ...,CEB,101981200.0,101970800.0,101960400.0,101949900.0,0.9839,476097700000.0,476529.5,6.019395,0.007244


## Convierto los datos a CSV

In [332]:
resultado[['modelo', 'country_name', 'country_code', '2023', '2024', '2025', '2026']].to_csv('./Data/Output/forecasts.csv', index = False)

In [333]:
resultado[['modelo', 'country_name', 'country_code', 'MSE', 'MAE', 'MAPE']].to_csv('./Data/Output/errors.csv', index = False)

# Metricas

In [334]:
errors = pd.read_csv('./Data/Output/errors.csv')
forecast = pd.read_csv('./Data/Output/forecasts.csv')

In [335]:
modelos = errors.modelo.unique()
last = pd.DataFrame(columns = ['modelo', 'MSE', 'MAE', 'MAPE'])
for modelo in modelos:
    df_modelo = errors[errors['modelo'] == modelo]
    MSE = df_modelo['MSE'].mean()
    MAE = df_modelo['MAE'].mean()
    MAPE = df_modelo['MAPE'].mean()
    to_append = [modelo, MSE, MAE, MAPE]
    a_series = pd.Series(to_append, index = last.columns)
    last = last.append(a_series, ignore_index = True)

# Modelo con MSE promedio más bajo

In [337]:
last.sort_values(by = ['MSE'])

Unnamed: 0,modelo,MSE,MAE,MAPE
2,Random_Forest,0.0005765948,0.01505896,2.427654
3,Gradient_Boost,0.0009373702,0.02259523,2.441043
4,Ada_Boost,0.001379359,0.02525302,2.45746
0,Linear_Regression,0.005103373,0.04570541,0.325165
1,Elastic_Net,0.006072235,0.05293193,2.363349
5,Stacking,50596830000000.0,1788236.0,42.687701


In [338]:
print(6.670620e-04)

0.000667062


# Modelo con MAE promedio más bajo

In [340]:
last.sort_values(by = ['MAE'])

Unnamed: 0,modelo,MSE,MAE,MAPE
2,Random_Forest,0.0005765948,0.01505896,2.427654
3,Gradient_Boost,0.0009373702,0.02259523,2.441043
4,Ada_Boost,0.001379359,0.02525302,2.45746
0,Linear_Regression,0.005103373,0.04570541,0.325165
1,Elastic_Net,0.006072235,0.05293193,2.363349
5,Stacking,50596830000000.0,1788236.0,42.687701


In [341]:
print(1.540190e-02)

0.0154019


# Modelo con MAPE promedio más bajo

In [343]:
last.sort_values(by = ['MAPE'])

Unnamed: 0,modelo,MSE,MAE,MAPE
0,Linear_Regression,0.005103373,0.04570541,0.325165
1,Elastic_Net,0.006072235,0.05293193,2.363349
2,Random_Forest,0.0005765948,0.01505896,2.427654
3,Gradient_Boost,0.0009373702,0.02259523,2.441043
4,Ada_Boost,0.001379359,0.02525302,2.45746
5,Stacking,50596830000000.0,1788236.0,42.687701
