In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.metrics import confusion_matrix, make_scorer, accuracy_score, mean_squared_error, mean_absolute_error, mean_squared_log_error
from sklearn.linear_model import LinearRegression, ElasticNet, HuberRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor
from sklearn.model_selection import GridSearchCV, cross_val_score, train_test_split

from mlxtend.regressor import StackingCVRegressor

import warnings
warnings.simplefilter("ignore")


In [3]:
df_population = pd.read_csv('./Data/population.csv')
df_population.head()

Unnamed: 0,country_name,country_code,year,population
0,Africa Eastern and Southern,AFE,2021,694665117.0
1,Africa Eastern and Southern,AFE,2020,677243299.0
2,Africa Eastern and Southern,AFE,2019,660046272.0
3,Africa Eastern and Southern,AFE,2018,643090131.0
4,Africa Eastern and Southern,AFE,2017,626392880.0


## Extraigo los códigos únicos de cada país

In [4]:
paises = df_population.country_code.unique()

## Creo el array con los años objetivo a predecir

In [5]:
objetivo = np.array([2023, 2024, 2025, 2026]).reshape(-1,1)
objetivo

array([[2023],
       [2024],
       [2025],
       [2026]])

## Creo un DataFrame preliminar en el que almacenaré todas las métricas y resultados

In [6]:
resultado = pd.DataFrame(columns = ["modelo", "country_name", "country_code", "2023", "2024", "2025", "2026", "SCORE", "MSE", "MAE", "MAPE", "RMSLE"])

## Creo instancias de los modelos de ML a evaluar

In [7]:
linear = LinearRegression()
elastic_net = ElasticNet()
random_forest = RandomForestRegressor()
gradient_boost = GradientBoostingRegressor()
ada_boost = AdaBoostRegressor()

# Linear Regression

In [8]:
for pais in paises:
    # Divido el dataframe por países, ya que se observó un mejor desempeño de los modelos en el contexto de cada país y su propia tendencia
    df_pais = df_population.loc[(df_population['country_code'] == pais)]
    # Separo las variables
    X = df_pais.year.values.reshape(-1,1)
    y = np.log1p(df_pais.population.values.reshape(-1,1)) # np.log1p()
    # Divido en datos de train y de test
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.4)
    # Se entrena el modelo
    linear.fit(X_train,y_train)
    # Se obtiene la predicción con los datos de train y test
    y_train_pred = linear.predict(X_train)
    y_test_pred = linear.predict(X_test)
    # Se obtienen las metricas
    SCORE = linear.score(X_test, y_test)
    RMSLE = mean_squared_log_error(y_test, y_test_pred, squared = False)
    MSE = mean_squared_error(y_test, y_test_pred)
    MAE = mean_absolute_error(y_test, y_test_pred)
    MAPE = np.mean(np.abs((y_test - y_test_pred) / y_test)) * 100
    # Se realiza la predicción para los proximos 4 años
    y_test_pred = linear.predict(objetivo)
    # Se ingresan las métricas y resultados al DataFrame
    to_append = ["Linear_Regression", df_pais.country_name, pais, linear.predict(objetivo)[0][0], linear.predict(objetivo)[1][0], linear.predict(objetivo)[2][0], linear.predict(objetivo)[3][0], SCORE, MSE, MAE, MAPE, RMSLE]
    a_series = pd.Series(to_append, index = resultado.columns)
    resultado = resultado.append(a_series, ignore_index = True)

In [9]:
resultado[resultado.modelo == 'Linear_Regression'].head()

Unnamed: 0,modelo,country_name,country_code,2023,2024,2025,2026,SCORE,MSE,MAE,MAPE,RMSLE
0,Linear_Regression,0 Africa Eastern and Southern 1 Africa...,AFE,20.432668,20.460388,20.488108,20.515828,0.999567,0.000102,0.008625,0.043845,0.000487
1,Linear_Regression,62 Africa Eastern and Southern 63 Afri...,AFW,20.014577,20.04101,20.067442,20.093875,0.999554,9.9e-05,0.00792,0.041812,0.000503
2,Linear_Regression,124 Africa Western and Central 125 Afric...,ARB,20.032033,20.058289,20.084545,20.110801,0.994507,0.001061,0.028012,0.146092,0.00161
3,Linear_Regression,186 Arab World 187 ...,CSS,15.887579,15.896595,15.905612,15.914629,0.979551,0.00065,0.02061,0.13257,0.001544
4,Linear_Regression,248 Caribbean small states 249 ...,CEB,18.517666,18.519248,18.52083,18.522411,0.103167,0.001888,0.035883,0.194502,0.002233


# Elastic Net

In [10]:
for pais in paises:
    # Divido el dataframe por países, ya que se observó un mejor desempeño de los modelos en el contexto de cada país y su propia tendencia
    df_pais = df_population.loc[(df_population['country_code'] == pais)]
    # Separo las variables
    X = df_pais.year.values.reshape(-1,1)
    y = np.log1p(df_pais.population.values.reshape(-1,1)) # np.log1p()
    # Divido en datos de train y de test
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.4)
    # Se entrena el modelo
    elastic_net.fit(X_train,y_train)
    # Se obtiene la predicción con los datos de train y test
    y_train_pred = elastic_net.predict(X_train)
    y_test_pred = elastic_net.predict(X_test)
    # Se obtienen las metricas
    SCORE = elastic_net.score(X_test, y_test)
    RMSLE = mean_squared_log_error(y_test, y_test_pred, squared = False)
    MSE = mean_squared_error(y_test, y_test_pred)
    MAE = mean_absolute_error(y_test, y_test_pred)
    MAPE = np.mean(np.abs((y_test - y_test_pred) / y_test)) * 100
    # Se realiza la predicción para los proximos 4 años
    y_test_pred = elastic_net.predict(objetivo)
    # Se ingresan las métricas y resultados al DataFrame
    to_append = ["Elastic_Net", df_pais.country_name, pais, elastic_net.predict(objetivo)[0], elastic_net.predict(objetivo)[1], elastic_net.predict(objetivo)[2], elastic_net.predict(objetivo)[3], SCORE, MSE, MAE, MAPE, RMSLE]
    a_series = pd.Series(to_append, index = resultado.columns)
    resultado = resultado.append(a_series, ignore_index = True)

In [11]:
resultado[resultado.modelo == 'Elastic_Net'].head()

Unnamed: 0,modelo,country_name,country_code,2023,2024,2025,2026,SCORE,MSE,MAE,MAPE,RMSLE
261,Elastic_Net,0 Africa Eastern and Southern 1 Africa...,AFE,20.378352,20.404465,20.430577,20.456689,0.996634,0.000812,0.025963,2.803665,0.001393
262,Elastic_Net,62 Africa Eastern and Southern 63 Afri...,AFW,19.97342,19.99852,20.023621,20.048722,0.996702,0.000595,0.020415,2.503679,0.001218
263,Elastic_Net,124 Africa Western and Central 125 Afric...,ARB,19.983819,20.008677,20.033534,20.058391,0.990187,0.001871,0.03894,2.536029,0.002166
264,Elastic_Net,186 Arab World 187 ...,CSS,15.83241,15.839911,15.847412,15.854913,0.945869,0.001252,0.029742,1.015098,0.00215
265,Elastic_Net,248 Caribbean small states 249 ...,CEB,18.457497,18.457497,18.457497,18.457497,-0.027493,0.002962,0.047124,0.255307,0.002799


# Random Forest

In [12]:
for pais in paises:
    # Divido el dataframe por países, ya que se observó un mejor desempeño de los modelos en el contexto de cada país y su propia tendencia
    df_pais = df_population.loc[(df_population['country_code'] == pais)]
    # Separo las variables
    X = df_pais.year.values.reshape(-1,1)
    y = np.log1p(df_pais.population.values.reshape(-1,1)) # np.log1p()
    # Divido en datos de train y de test
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.4)
    # Se entrena el modelo
    random_forest.fit(X_train,y_train)
    # Se obtiene la predicción con los datos de train y test
    y_train_pred = random_forest.predict(X_train)
    y_test_pred = random_forest.predict(X_test)
    # Se obtienen las metricas
    SCORE = random_forest.score(X_test, y_test)
    RMSLE = mean_squared_log_error(y_test, y_test_pred, squared = False)
    MSE = mean_squared_error(y_test, y_test_pred)
    MAE = mean_absolute_error(y_test, y_test_pred)
    MAPE = np.mean(np.abs((y_test - y_test_pred) / y_test)) * 100
    # Se realiza la predicción para los proximos 4 años
    y_test_pred = random_forest.predict(objetivo)
    # Se ingresan las métricas y resultados al DataFrame
    to_append = ["Random_Forest", df_pais.country_name, pais, random_forest.predict(objetivo)[0], random_forest.predict(objetivo)[1], random_forest.predict(objetivo)[2], random_forest.predict(objetivo)[3], SCORE, MSE, MAE, MAPE, RMSLE]
    a_series = pd.Series(to_append, index = resultado.columns)
    resultado = resultado.append(a_series, ignore_index = True)

In [13]:
resultado[resultado.modelo == 'Random_Forest'].head()

Unnamed: 0,modelo,country_name,country_code,2023,2024,2025,2026,SCORE,MSE,MAE,MAPE,RMSLE
522,Random_Forest,0 Africa Eastern and Southern 1 Africa...,AFE,20.265636,20.265636,20.265636,20.265636,0.99466,0.001275,0.028307,2.81423,0.001699
523,Random_Forest,62 Africa Eastern and Southern 63 Afri...,AFW,19.940931,19.940931,19.940931,19.940931,0.997992,0.000401,0.016797,2.671672,0.000996
524,Random_Forest,124 Africa Western and Central 125 Afric...,ARB,19.863738,19.863738,19.863738,19.863738,0.994609,0.000797,0.02346,2.29018,0.00139
525,Random_Forest,186 Arab World 187 ...,CSS,15.819065,15.819065,15.819065,15.819065,0.995747,0.000124,0.007678,1.248869,0.000679
526,Random_Forest,248 Caribbean small states 249 ...,CEB,18.442347,18.442347,18.442347,18.442347,0.98764,2e-05,0.003425,0.236523,0.000231


# Gradient Boost

In [14]:
for pais in paises:
    # Divido el dataframe por países, ya que se observó un mejor desempeño de los modelos en el contexto de cada país y su propia tendencia
    df_pais = df_population.loc[(df_population['country_code'] == pais)]
    # Separo las variables
    X = df_pais.year.values.reshape(-1,1)
    y = np.log1p(df_pais.population.values.reshape(-1,1)) # np.log1p()
    # Divido en datos de train y de test
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.4)
    # Se entrena el modelo
    gradient_boost.fit(X_train,y_train)
    # Se obtiene la predicción con los datos de train y test
    y_train_pred = gradient_boost.predict(X_train)
    y_test_pred = gradient_boost.predict(X_test)
    # Se obtienen las metricas
    SCORE = gradient_boost.score(X_test, y_test)
    RMSLE = mean_squared_log_error(y_test, y_test_pred, squared = False)
    MSE = mean_squared_error(y_test, y_test_pred)
    MAE = mean_absolute_error(y_test, y_test_pred)
    MAPE = np.mean(np.abs((y_test - y_test_pred) / y_test)) * 100
    # Se realiza la predicción para los proximos 4 años
    y_test_pred = gradient_boost.predict(objetivo)
    # Se ingresan las métricas y resultados al DataFrame
    to_append = ["Gradient_Boost", df_pais.country_name, pais, gradient_boost.predict(objetivo)[0], gradient_boost.predict(objetivo)[1], gradient_boost.predict(objetivo)[2], gradient_boost.predict(objetivo)[3], SCORE, MSE, MAE, MAPE, RMSLE]
    a_series = pd.Series(to_append, index = resultado.columns)
    resultado = resultado.append(a_series, ignore_index = True)

In [15]:
resultado[resultado.modelo == 'Gradient_Boost'].head()

Unnamed: 0,modelo,country_name,country_code,2023,2024,2025,2026,SCORE,MSE,MAE,MAPE,RMSLE
783,Gradient_Boost,0 Africa Eastern and Southern 1 Africa...,AFE,20.358876,20.358876,20.358876,20.358876,0.986365,0.001974,0.039848,2.255768,0.002168
784,Gradient_Boost,62 Africa Eastern and Southern 63 Afri...,AFW,19.944093,19.944093,19.944093,19.944093,0.996722,0.000923,0.029278,3.184096,0.001511
785,Gradient_Boost,124 Africa Western and Central 125 Afric...,ARB,19.912459,19.912459,19.912459,19.912459,0.995309,0.000981,0.029531,2.730412,0.001552
786,Gradient_Boost,186 Arab World 187 ...,CSS,15.827948,15.827948,15.827948,15.827948,0.992407,0.000224,0.012525,1.277733,0.000912
787,Gradient_Boost,248 Caribbean small states 249 ...,CEB,18.445774,18.445774,18.445774,18.445774,0.984426,4.4e-05,0.00542,0.328832,0.000341


# Ada Boost

In [16]:
for pais in paises:
    # Divido el dataframe por países, ya que se observó un mejor desempeño de los modelos en el contexto de cada país y su propia tendencia
    df_pais = df_population.loc[(df_population['country_code'] == pais)]
    # Separo las variables
    X = df_pais.year.values.reshape(-1,1)
    y = np.log1p(df_pais.population.values.reshape(-1,1)) # np.log1p()
    # Divido en datos de train y de test
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.4)
    # Se entrena el modelo
    ada_boost.fit(X_train,y_train)
    # Se obtiene la predicción con los datos de train y test
    y_train_pred = ada_boost.predict(X_train)
    y_test_pred = ada_boost.predict(X_test)
    # Se obtienen las metricas
    SCORE = ada_boost.score(X_test, y_test)
    RMSLE = mean_squared_log_error(y_test, y_test_pred, squared = False)
    MSE = mean_squared_error(y_test, y_test_pred)
    MAE = mean_absolute_error(y_test, y_test_pred)
    # Se realiza la predicción para los proximos 4 años
    MAPE = np.mean(np.abs((y_test - y_test_pred) / y_test)) * 100
    y_test_pred = ada_boost.predict(objetivo)
    # Se ingresan las métricas y resultados al DataFrame
    to_append = ["Ada_Boost", df_pais.country_name, pais, ada_boost.predict(objetivo)[0], ada_boost.predict(objetivo)[1], ada_boost.predict(objetivo)[2], ada_boost.predict(objetivo)[3], SCORE, MSE, MAE, MAPE, RMSLE]
    a_series = pd.Series(to_append, index = resultado.columns)
    resultado = resultado.append(a_series, ignore_index = True)

In [17]:
resultado[resultado.modelo == 'Ada_Boost'].head()

Unnamed: 0,modelo,country_name,country_code,2023,2024,2025,2026,SCORE,MSE,MAE,MAPE,RMSLE
1044,Ada_Boost,0 Africa Eastern and Southern 1 Africa...,AFE,20.33338,20.33338,20.33338,20.33338,0.997162,0.000818,0.023384,3.145197,0.001387
1045,Ada_Boost,62 Africa Eastern and Southern 63 Afri...,AFW,19.911193,19.911193,19.911193,19.911193,0.995293,0.001193,0.028173,2.997588,0.001696
1046,Ada_Boost,124 Africa Western and Central 125 Afric...,ARB,19.855219,19.855219,19.855219,19.855219,0.991575,0.001539,0.032483,2.579474,0.001955
1047,Ada_Boost,186 Arab World 187 ...,CSS,15.802943,15.802943,15.802943,15.802943,0.990932,0.00022,0.01172,1.139606,0.000897
1048,Ada_Boost,248 Caribbean small states 249 ...,CEB,18.450217,18.450217,18.450217,18.450217,0.981109,5.3e-05,0.006382,0.302265,0.000373


# Stacking CV Regressor

In [18]:
model = StackingCVRegressor( regressors = [linear, elastic_net, random_forest, gradient_boost, ada_boost],
    meta_regressor = elastic_net
)

##### Documentación mlxtend StackingCVRegressor 
##### https://rasbt.github.io/mlxtend/user_guide/regressor/StackingCVRegressor/


In [19]:
for pais in paises:
    # Divido el dataframe por países, ya que se observó un mejor desempeño de los modelos en el contexto de cada país y su propia tendencia
    df_pais = df_population.loc[(df_population['country_code'] == pais)]
    # Separo las variables
    X = df_pais.year.values.reshape(-1,1)
    y = df_pais.population.values.reshape(-1,1) # np.log1p()
    # Divido en datos de train y de test
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.4)
    # Se entrena el modelo
    model.fit(X_train,y_train)
    # Se obtiene la predicción con los datos de train y test
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)
    # Se obtienen las metricas
    SCORE = model.score(X_test, y_test)
    RMSLE = mean_squared_log_error(y_test, y_test_pred, squared = False)
    MSE = mean_squared_error(y_test, y_test_pred)
    MAE = mean_absolute_error(y_test, y_test_pred)
    MAPE = np.mean(np.abs((y_test - y_test_pred) / y_test)) * 100
    # Se realiza la predicción para los proximos 4 años
    y_test_pred = model.predict(objetivo)
    # Se ingresan las métricas y resultados al DataFrame
    to_append = ["Stacking", df_pais.country_name, pais, model.predict(objetivo)[0], model.predict(objetivo)[1], model.predict(objetivo)[2], model.predict(objetivo)[3], SCORE, MSE, MAE, MAPE, RMSLE]
    a_series = pd.Series(to_append, index = resultado.columns)
    resultado = resultado.append(a_series, ignore_index = True)

In [20]:
resultado[resultado.modelo == 'Stacking'].head()

Unnamed: 0,modelo,country_name,country_code,2023,2024,2025,2026,SCORE,MSE,MAE,MAPE,RMSLE
1305,Stacking,0 Africa Eastern and Southern 1 Africa...,AFE,696908600.0,697451700.0,697994900.0,698538100.0,0.997888,46456200000000.0,5546492.0,60.909685,0.024872
1306,Stacking,62 Africa Eastern and Southern 63 Afri...,AFW,466276200.0,466437400.0,466598600.0,466759800.0,0.998224,22186140000000.0,4003627.0,61.804326,0.022417
1307,Stacking,124 Africa Western and Central 125 Afric...,ARB,437302700.0,438411200.0,439519800.0,440628300.0,0.996415,40179380000000.0,4998191.0,64.835822,0.031821
1308,Stacking,186 Arab World 187 ...,CSS,7569246.0,7591521.0,7613796.0,7636071.0,0.9982,1650620000.0,31677.62,18.895541,0.006239
1309,Stacking,248 Caribbean small states 249 ...,CEB,101366100.0,101354600.0,101343100.0,101331600.0,0.991563,238565200000.0,384613.4,5.734996,0.0049


## Convierto los datos a CSV

In [21]:
resultado[['modelo', 'country_name', 'country_code', '2023', '2024', '2025', '2026']].to_csv('./Data/Output/forecasts.csv', index = False)

In [22]:
resultado[['modelo', 'country_name', 'country_code', 'MSE', 'MAE', 'MAPE']].to_csv('./Data/Output/errors.csv', index = False)

# Metricas

In [23]:
errors = pd.read_csv('./Data/Output/errors.csv')
forecast = pd.read_csv('./Data/Output/forecasts.csv')

In [24]:
modelos = errors.modelo.unique()
last = pd.DataFrame(columns = ['modelo', 'MSE', 'MAE', 'MAPE'])
for modelo in modelos:
    df_modelo = errors[errors['modelo'] == modelo]
    MSE = df_modelo['MSE'].mean()
    MAE = df_modelo['MAE'].mean()
    MAPE = df_modelo['MAPE'].mean()
    to_append = [modelo, MSE, MAE, MAPE]
    a_series = pd.Series(to_append, index = last.columns)
    last = last.append(a_series, ignore_index = True)

# Modelo con MSE promedio más bajo

In [25]:
last.sort_values(by = ['MSE'])

Unnamed: 0,modelo,MSE,MAE,MAPE
2,Random_Forest,0.0005740362,0.01501141,2.407344
3,Gradient_Boost,0.0009374367,0.02217585,2.433993
4,Ada_Boost,0.001386085,0.02510303,2.440961
0,Linear_Regression,0.005453131,0.04657248,0.332426
1,Elastic_Net,0.00584354,0.05310291,2.33842
5,Stacking,45281920000000.0,1761898.0,44.497938


In [26]:
print()




# Modelo con MAE promedio más bajo

In [27]:
last.sort_values(by = ['MAE'])

Unnamed: 0,modelo,MSE,MAE,MAPE
2,Random_Forest,0.0005740362,0.01501141,2.407344
3,Gradient_Boost,0.0009374367,0.02217585,2.433993
4,Ada_Boost,0.001386085,0.02510303,2.440961
0,Linear_Regression,0.005453131,0.04657248,0.332426
1,Elastic_Net,0.00584354,0.05310291,2.33842
5,Stacking,45281920000000.0,1761898.0,44.497938


In [28]:
print()




# Modelo con MAPE promedio más bajo

In [29]:
last.sort_values(by = ['MAPE'])

Unnamed: 0,modelo,MSE,MAE,MAPE
0,Linear_Regression,0.005453131,0.04657248,0.332426
1,Elastic_Net,0.00584354,0.05310291,2.33842
2,Random_Forest,0.0005740362,0.01501141,2.407344
3,Gradient_Boost,0.0009374367,0.02217585,2.433993
4,Ada_Boost,0.001386085,0.02510303,2.440961
5,Stacking,45281920000000.0,1761898.0,44.497938
