In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.metrics import confusion_matrix, make_scorer, accuracy_score, mean_squared_error, mean_absolute_error, mean_squared_log_error
from sklearn.linear_model import LinearRegression, ElasticNet, HuberRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor
from sklearn.model_selection import GridSearchCV, cross_val_score, train_test_split

from mlxtend.regressor import StackingCVRegressor

import warnings
warnings.simplefilter("ignore")


In [2]:
df_population = pd.read_csv('./Data/population.csv')
df_population.head()

Unnamed: 0,country_name,country_code,year,population
0,Africa Eastern and Southern,AFE,2021,694665117.0
1,Africa Eastern and Southern,AFE,2020,677243299.0
2,Africa Eastern and Southern,AFE,2019,660046272.0
3,Africa Eastern and Southern,AFE,2018,643090131.0
4,Africa Eastern and Southern,AFE,2017,626392880.0


## Extraigo los códigos únicos de cada país

In [3]:
paises = df_population.country_code.unique()

## Creo el array con los años objetivo a predecir

In [4]:
objetivo = np.array([2023, 2024, 2025, 2026]).reshape(-1,1)
objetivo

array([[2023],
       [2024],
       [2025],
       [2026]])

## Creo un DataFrame preliminar en el que almacenaré todas las métricas y resultados

In [5]:
resultado = pd.DataFrame(columns = ["modelo", "country_name", "country_code", "2023", "2024", "2025", "2026", "SCORE", "MSE", "MAE", "MAPE", "RMSLE"])

## Creo instancias de los modelos de ML a evaluar

In [6]:
linear = LinearRegression()
elastic_net = ElasticNet()
random_forest = RandomForestRegressor()
gradient_boost = GradientBoostingRegressor()
ada_boost = AdaBoostRegressor()

# Linear Regression

In [7]:
for pais in paises:
    # Divido el dataframe por países, ya que se observó un mejor desempeño de los modelos en el contexto de cada país y su propia tendencia
    df_pais = df_population.loc[(df_population['country_code'] == pais)]
    # Separo las variables
    X = df_pais.year.values.reshape(-1,1)
    y = np.log1p(df_pais.population.values.reshape(-1,1)) # np.log1p()
    # Divido en datos de train y de test
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.4)
    # Se entrena el modelo
    linear.fit(X_train,y_train)
    # Se obtiene la predicción con los datos de train y test
    y_train_pred = linear.predict(X_train)
    y_test_pred = linear.predict(X_test)
    # Se obtienen las metricas
    SCORE = linear.score(X_test, y_test)
    RMSLE = mean_squared_log_error(y_test, y_test_pred, squared = False)
    MSE = mean_squared_error(y_test, y_test_pred)
    MAE = mean_absolute_error(y_test, y_test_pred)
    MAPE = np.mean(np.abs((y_test - y_test_pred) / y_test)) * 100
    # Se realiza la predicción para los proximos 4 años
    y_test_pred = linear.predict(objetivo)
    # Se ingresan las métricas y resultados al DataFrame
    to_append = ["Linear_Regression", df_pais.country_name, pais, linear.predict(objetivo)[0][0], linear.predict(objetivo)[1][0], linear.predict(objetivo)[2][0], linear.predict(objetivo)[3][0], SCORE, MSE, MAE, MAPE, RMSLE]
    a_series = pd.Series(to_append, index = resultado.columns)
    resultado = resultado.append(a_series, ignore_index = True)

In [8]:
resultado[resultado.modelo == 'Linear_Regression'].head()

Unnamed: 0,modelo,country_name,country_code,2023,2024,2025,2026,SCORE,MSE,MAE,MAPE,RMSLE
0,Linear_Regression,0 Africa Eastern and Southern 1 Africa...,AFE,20.430954,20.458632,20.48631,20.513987,0.999587,0.000111,0.0092,0.046921,0.000511
1,Linear_Regression,62 Africa Eastern and Southern 63 Afri...,AFW,20.017781,20.044354,20.070927,20.0975,0.999106,0.000194,0.009921,0.052929,0.000714
2,Linear_Regression,124 Africa Western and Central 125 Afric...,ARB,20.025483,20.051749,20.078015,20.104281,0.991376,0.001271,0.029452,0.153394,0.001765
3,Linear_Regression,186 Arab World 187 ...,CSS,15.878549,15.88729,15.896031,15.904772,0.9708,0.000889,0.021603,0.139772,0.00182
4,Linear_Regression,248 Caribbean small states 249 ...,CEB,18.513176,18.514743,18.51631,18.517876,0.128735,0.001965,0.039057,0.211582,0.002277


# Elastic Net

In [9]:
for pais in paises:
    # Divido el dataframe por países, ya que se observó un mejor desempeño de los modelos en el contexto de cada país y su propia tendencia
    df_pais = df_population.loc[(df_population['country_code'] == pais)]
    # Separo las variables
    X = df_pais.year.values.reshape(-1,1)
    y = np.log1p(df_pais.population.values.reshape(-1,1)) # np.log1p()
    # Divido en datos de train y de test
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.4)
    # Se entrena el modelo
    elastic_net.fit(X_train,y_train)
    # Se obtiene la predicción con los datos de train y test
    y_train_pred = elastic_net.predict(X_train)
    y_test_pred = elastic_net.predict(X_test)
    # Se obtienen las metricas
    SCORE = elastic_net.score(X_test, y_test)
    RMSLE = mean_squared_log_error(y_test, y_test_pred, squared = False)
    MSE = mean_squared_error(y_test, y_test_pred)
    MAE = mean_absolute_error(y_test, y_test_pred)
    MAPE = np.mean(np.abs((y_test - y_test_pred) / y_test)) * 100
    # Se realiza la predicción para los proximos 4 años
    y_test_pred = elastic_net.predict(objetivo)
    # Se ingresan las métricas y resultados al DataFrame
    to_append = ["Elastic_Net", df_pais.country_name, pais, elastic_net.predict(objetivo)[0], elastic_net.predict(objetivo)[1], elastic_net.predict(objetivo)[2], elastic_net.predict(objetivo)[3], SCORE, MSE, MAE, MAPE, RMSLE]
    a_series = pd.Series(to_append, index = resultado.columns)
    resultado = resultado.append(a_series, ignore_index = True)

In [10]:
resultado[resultado.modelo == 'Elastic_Net'].head()

Unnamed: 0,modelo,country_name,country_code,2023,2024,2025,2026,SCORE,MSE,MAE,MAPE,RMSLE
261,Elastic_Net,0 Africa Eastern and Southern 1 Africa...,AFE,20.377242,20.403257,20.429273,20.455289,0.995814,0.001068,0.02941,2.880396,0.001617
262,Elastic_Net,62 Africa Eastern and Southern 63 Afri...,AFW,19.963234,19.988144,20.013053,20.037963,0.996701,0.000827,0.023464,2.926118,0.001413
263,Elastic_Net,124 Africa Western and Central 125 Afric...,ARB,19.984961,20.009991,20.035021,20.06005,0.988504,0.0019,0.039708,2.366569,0.002159
264,Elastic_Net,186 Arab World 187 ...,CSS,15.831024,15.838152,15.845279,15.852407,0.915625,0.002253,0.035902,1.089351,0.002896
265,Elastic_Net,248 Caribbean small states 249 ...,CEB,18.468601,18.468601,18.468601,18.468601,-0.114033,0.003393,0.043296,0.23514,0.002999


# Random Forest

In [11]:
for pais in paises:
    # Divido el dataframe por países, ya que se observó un mejor desempeño de los modelos en el contexto de cada país y su propia tendencia
    df_pais = df_population.loc[(df_population['country_code'] == pais)]
    # Separo las variables
    X = df_pais.year.values.reshape(-1,1)
    y = np.log1p(df_pais.population.values.reshape(-1,1)) # np.log1p()
    # Divido en datos de train y de test
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.4)
    # Se entrena el modelo
    random_forest.fit(X_train,y_train)
    # Se obtiene la predicción con los datos de train y test
    y_train_pred = random_forest.predict(X_train)
    y_test_pred = random_forest.predict(X_test)
    # Se obtienen las metricas
    SCORE = random_forest.score(X_test, y_test)
    RMSLE = mean_squared_log_error(y_test, y_test_pred, squared = False)
    MSE = mean_squared_error(y_test, y_test_pred)
    MAE = mean_absolute_error(y_test, y_test_pred)
    MAPE = np.mean(np.abs((y_test - y_test_pred) / y_test)) * 100
    # Se realiza la predicción para los proximos 4 años
    y_test_pred = random_forest.predict(objetivo)
    # Se ingresan las métricas y resultados al DataFrame
    to_append = ["Random_Forest", df_pais.country_name, pais, random_forest.predict(objetivo)[0], random_forest.predict(objetivo)[1], random_forest.predict(objetivo)[2], random_forest.predict(objetivo)[3], SCORE, MSE, MAE, MAPE, RMSLE]
    a_series = pd.Series(to_append, index = resultado.columns)
    resultado = resultado.append(a_series, ignore_index = True)

In [12]:
resultado[resultado.modelo == 'Random_Forest'].head()

Unnamed: 0,modelo,country_name,country_code,2023,2024,2025,2026,SCORE,MSE,MAE,MAPE,RMSLE
522,Random_Forest,0 Africa Eastern and Southern 1 Africa...,AFE,20.322877,20.322877,20.322877,20.322877,0.99558,0.001115,0.027526,2.906829,0.001653
523,Random_Forest,62 Africa Eastern and Southern 63 Afri...,AFW,19.897034,19.897034,19.897034,19.897034,0.994309,0.001175,0.027809,2.691591,0.001684
524,Random_Forest,124 Africa Western and Central 125 Afric...,ARB,19.877085,19.877085,19.877085,19.877085,0.997317,0.000685,0.022809,3.019932,0.001312
525,Random_Forest,186 Arab World 187 ...,CSS,15.8189,15.8189,15.8189,15.8189,0.993567,0.000168,0.009596,1.200256,0.000789
526,Random_Forest,248 Caribbean small states 249 ...,CEB,18.439844,18.439844,18.439844,18.439844,0.9898,2.3e-05,0.003897,0.289853,0.000245


# Gradient Boost

In [13]:
for pais in paises:
    # Divido el dataframe por países, ya que se observó un mejor desempeño de los modelos en el contexto de cada país y su propia tendencia
    df_pais = df_population.loc[(df_population['country_code'] == pais)]
    # Separo las variables
    X = df_pais.year.values.reshape(-1,1)
    y = np.log1p(df_pais.population.values.reshape(-1,1)) # np.log1p()
    # Divido en datos de train y de test
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.4)
    # Se entrena el modelo
    gradient_boost.fit(X_train,y_train)
    # Se obtiene la predicción con los datos de train y test
    y_train_pred = gradient_boost.predict(X_train)
    y_test_pred = gradient_boost.predict(X_test)
    # Se obtienen las metricas
    SCORE = gradient_boost.score(X_test, y_test)
    RMSLE = mean_squared_log_error(y_test, y_test_pred, squared = False)
    MSE = mean_squared_error(y_test, y_test_pred)
    MAE = mean_absolute_error(y_test, y_test_pred)
    MAPE = np.mean(np.abs((y_test - y_test_pred) / y_test)) * 100
    # Se realiza la predicción para los proximos 4 años
    y_test_pred = gradient_boost.predict(objetivo)
    # Se ingresan las métricas y resultados al DataFrame
    to_append = ["Gradient_Boost", df_pais.country_name, pais, gradient_boost.predict(objetivo)[0], gradient_boost.predict(objetivo)[1], gradient_boost.predict(objetivo)[2], gradient_boost.predict(objetivo)[3], SCORE, MSE, MAE, MAPE, RMSLE]
    a_series = pd.Series(to_append, index = resultado.columns)
    resultado = resultado.append(a_series, ignore_index = True)

In [14]:
resultado[resultado.modelo == 'Gradient_Boost'].head()

Unnamed: 0,modelo,country_name,country_code,2023,2024,2025,2026,SCORE,MSE,MAE,MAPE,RMSLE
783,Gradient_Boost,0 Africa Eastern and Southern 1 Africa...,AFE,20.358896,20.358896,20.358896,20.358896,0.994487,0.00135,0.033754,2.903616,0.001794
784,Gradient_Boost,62 Africa Eastern and Southern 63 Afri...,AFW,19.94409,19.94409,19.94409,19.94409,0.996578,0.000826,0.027887,2.953184,0.001424
785,Gradient_Boost,124 Africa Western and Central 125 Afric...,ARB,19.912463,19.912463,19.912463,19.912463,0.991551,0.002043,0.038694,2.926035,0.002294
786,Gradient_Boost,186 Arab World 187 ...,CSS,15.822675,15.822675,15.822675,15.822675,0.993418,0.000179,0.011701,1.202215,0.00081
787,Gradient_Boost,248 Caribbean small states 249 ...,CEB,18.444411,18.444411,18.444411,18.444411,0.974088,3.7e-05,0.004992,0.219641,0.000312


# Ada Boost

In [15]:
for pais in paises:
    # Divido el dataframe por países, ya que se observó un mejor desempeño de los modelos en el contexto de cada país y su propia tendencia
    df_pais = df_population.loc[(df_population['country_code'] == pais)]
    # Separo las variables
    X = df_pais.year.values.reshape(-1,1)
    y = np.log1p(df_pais.population.values.reshape(-1,1)) # np.log1p()
    # Divido en datos de train y de test
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.4)
    # Se entrena el modelo
    ada_boost.fit(X_train,y_train)
    # Se obtiene la predicción con los datos de train y test
    y_train_pred = ada_boost.predict(X_train)
    y_test_pred = ada_boost.predict(X_test)
    # Se obtienen las metricas
    SCORE = ada_boost.score(X_test, y_test)
    RMSLE = mean_squared_log_error(y_test, y_test_pred, squared = False)
    MSE = mean_squared_error(y_test, y_test_pred)
    MAE = mean_absolute_error(y_test, y_test_pred)
    # Se realiza la predicción para los proximos 4 años
    MAPE = np.mean(np.abs((y_test - y_test_pred) / y_test)) * 100
    y_test_pred = ada_boost.predict(objetivo)
    # Se ingresan las métricas y resultados al DataFrame
    to_append = ["Ada_Boost", df_pais.country_name, pais, ada_boost.predict(objetivo)[0], ada_boost.predict(objetivo)[1], ada_boost.predict(objetivo)[2], ada_boost.predict(objetivo)[3], SCORE, MSE, MAE, MAPE, RMSLE]
    a_series = pd.Series(to_append, index = resultado.columns)
    resultado = resultado.append(a_series, ignore_index = True)

In [16]:
resultado[resultado.modelo == 'Ada_Boost'].head()

Unnamed: 0,modelo,country_name,country_code,2023,2024,2025,2026,SCORE,MSE,MAE,MAPE,RMSLE
1044,Ada_Boost,0 Africa Eastern and Southern 1 Africa...,AFE,20.307214,20.307214,20.307214,20.307214,0.994227,0.001148,0.026054,2.609573,0.001652
1045,Ada_Boost,62 Africa Eastern and Southern 63 Afri...,AFW,19.911144,19.911144,19.911144,19.911144,0.992,0.002143,0.037807,3.048615,0.002331
1046,Ada_Boost,124 Africa Western and Central 125 Afric...,ARB,19.874375,19.874375,19.874375,19.874375,0.994669,0.001401,0.032314,2.966344,0.001829
1047,Ada_Boost,186 Arab World 187 ...,CSS,15.81239,15.81239,15.81239,15.81239,0.987177,0.00031,0.012662,1.153763,0.001074
1048,Ada_Boost,248 Caribbean small states 249 ...,CEB,18.446178,18.446178,18.446178,18.446178,0.959519,7.9e-05,0.007682,0.262204,0.000455


# Stacking CV Regressor

In [17]:
model = StackingCVRegressor( regressors = [linear, elastic_net, random_forest, gradient_boost, ada_boost],
    meta_regressor = elastic_net
)

##### Documentación mlxtend StackingCVRegressor 
##### https://rasbt.github.io/mlxtend/user_guide/regressor/StackingCVRegressor/


In [18]:
for pais in paises:
    # Divido el dataframe por países, ya que se observó un mejor desempeño de los modelos en el contexto de cada país y su propia tendencia
    df_pais = df_population.loc[(df_population['country_code'] == pais)]
    # Separo las variables
    X = df_pais.year.values.reshape(-1,1)
    y = df_pais.population.values.reshape(-1,1) # np.log1p()
    # Divido en datos de train y de test
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.4)
    # Se entrena el modelo
    model.fit(X_train,y_train)
    # Se obtiene la predicción con los datos de train y test
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)
    # Se obtienen las metricas
    SCORE = model.score(X_test, y_test)
    RMSLE = mean_squared_log_error(y_test, y_test_pred, squared = False)
    MSE = mean_squared_error(y_test, y_test_pred)
    MAE = mean_absolute_error(y_test, y_test_pred)
    MAPE = np.mean(np.abs((y_test - y_test_pred) / y_test)) * 100
    # Se realiza la predicción para los proximos 4 años
    y_test_pred = model.predict(objetivo)
    # Se ingresan las métricas y resultados al DataFrame
    to_append = ["Stacking", df_pais.country_name, pais, model.predict(objetivo)[0], model.predict(objetivo)[1], model.predict(objetivo)[2], model.predict(objetivo)[3], SCORE, MSE, MAE, MAPE, RMSLE]
    a_series = pd.Series(to_append, index = resultado.columns)
    resultado = resultado.append(a_series, ignore_index = True)

In [19]:
resultado[resultado.modelo == 'Stacking'].head()

Unnamed: 0,modelo,country_name,country_code,2023,2024,2025,2026,SCORE,MSE,MAE,MAPE,RMSLE
1305,Stacking,0 Africa Eastern and Southern 1 Africa...,AFE,688380200.0,689350200.0,690320100.0,691290100.0,0.99881,28187690000000.0,3737374.0,65.618397,0.017527
1306,Stacking,62 Africa Eastern and Southern 63 Afri...,AFW,473018800.0,473281500.0,473544100.0,473806800.0,0.998529,15861620000000.0,3273539.0,60.824093,0.02081
1307,Stacking,124 Africa Western and Central 125 Afric...,ARB,443195300.0,443329700.0,443464000.0,443598400.0,0.998957,13556120000000.0,2873736.0,65.923482,0.021238
1308,Stacking,186 Arab World 187 ...,CSS,7521504.0,7547265.0,7573025.0,7598786.0,0.998538,1236009000.0,28671.83,18.143607,0.006306
1309,Stacking,248 Caribbean small states 249 ...,CEB,101750500.0,101740700.0,101730900.0,101721100.0,0.980978,356053200000.0,479870.4,4.596707,0.005615


## Convierto los datos a CSV

In [20]:
resultado[['modelo', 'country_code', '2023', '2024', '2025', '2026']].to_csv('./Data/Output/forecasts.csv', index = False)

In [21]:
resultado[['modelo', 'country_code', 'MSE', 'MAE', 'MAPE']].to_csv('./Data/Output/errors.csv', index = False)

# Metricas

In [22]:
errors = pd.read_csv('./Data/Output/errors.csv')
forecast = pd.read_csv('./Data/Output/forecasts.csv')

In [23]:
modelos = errors.modelo.unique()
last = pd.DataFrame(columns = ['modelo', 'MSE', 'MAE', 'MAPE'])
for modelo in modelos:
    df_modelo = errors[errors['modelo'] == modelo]
    MSE = df_modelo['MSE'].mean()
    MAE = df_modelo['MAE'].mean()
    MAPE = df_modelo['MAPE'].mean()
    to_append = [modelo, MSE, MAE, MAPE]
    a_series = pd.Series(to_append, index = last.columns)
    last = last.append(a_series, ignore_index = True)

# Modelo con MSE promedio más bajo

In [24]:
last.sort_values(by = ['MSE'])

Unnamed: 0,modelo,MSE,MAE,MAPE
2,Random_Forest,0.0005594633,0.01486148,2.44682
3,Gradient_Boost,0.0009951317,0.02253479,2.436499
4,Ada_Boost,0.001315186,0.02463602,2.394345
0,Linear_Regression,0.005290901,0.04594359,0.328182
1,Elastic_Net,0.005829152,0.05321726,2.325831
5,Stacking,42718250000000.0,1663095.0,42.676646


In [25]:
print()




# Modelo con MAE promedio más bajo

In [26]:
last.sort_values(by = ['MAE'])

Unnamed: 0,modelo,MSE,MAE,MAPE
2,Random_Forest,0.0005594633,0.01486148,2.44682
3,Gradient_Boost,0.0009951317,0.02253479,2.436499
4,Ada_Boost,0.001315186,0.02463602,2.394345
0,Linear_Regression,0.005290901,0.04594359,0.328182
1,Elastic_Net,0.005829152,0.05321726,2.325831
5,Stacking,42718250000000.0,1663095.0,42.676646


In [27]:
print()




# Modelo con MAPE promedio más bajo

In [28]:
last.sort_values(by = ['MAPE'])

Unnamed: 0,modelo,MSE,MAE,MAPE
0,Linear_Regression,0.005290901,0.04594359,0.328182
1,Elastic_Net,0.005829152,0.05321726,2.325831
4,Ada_Boost,0.001315186,0.02463602,2.394345
3,Gradient_Boost,0.0009951317,0.02253479,2.436499
2,Random_Forest,0.0005594633,0.01486148,2.44682
5,Stacking,42718250000000.0,1663095.0,42.676646
