In [180]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix, make_scorer, accuracy_score
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LinearRegression
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import HuberRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import AdaBoostRegressor
from mlxtend.regressor import StackingCVRegressor

from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error

import warnings
warnings.simplefilter("ignore")


In [181]:
df_population = pd.read_csv('./Data/population.csv')
df_population.head()

Unnamed: 0,country_name,country_code,year,population
0,Africa Eastern and Southern,AFE,2021,694665117.0
1,Africa Eastern and Southern,AFE,2020,677243299.0
2,Africa Eastern and Southern,AFE,2019,660046272.0
3,Africa Eastern and Southern,AFE,2018,643090131.0
4,Africa Eastern and Southern,AFE,2017,626392880.0


# Regresion

In [182]:
linear = LinearRegression()
elastic_net = ElasticNet()
random_forest = RandomForestRegressor()
gradient_boost = GradientBoostingRegressor()
ada_boost = AdaBoostRegressor()

In [183]:
model = StackingCVRegressor( regressors = [linear, elastic_net, random_forest, gradient_boost, ada_boost],
    meta_regressor = elastic_net
)

In [184]:
paises = df_population.country_code.unique()

In [185]:
resultado = pd.DataFrame(columns = ["country_code", "2023", "2024", "2025", "2026", "MSE", "MAE", "MAPE"])

In [186]:
prueba = np.array([2023, 2024, 2025, 2026]).reshape(-1,1)
prueba

array([[2023],
       [2024],
       [2025],
       [2026]])

In [187]:
for pais in paises:
    df_pais = df_population.loc[(df_population['country_code'] == pais)]
    X = df_pais.year.values.reshape(-1,1)
    y = df_pais.population.values.reshape(-1,1) # np.log1p()
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.4)
    linear.fit(X_train,y_train)
    y_train_pred = linear.predict(X_train)
    y_test_pred = linear.predict(X_test)
    mse = mean_squared_error(y_test, y_test_pred)

In [188]:
for pais in paises:
    df_pais = df_population.loc[(df_population['country_code'] == pais)]
    X = df_pais.year.values.reshape(-1,1)
    y = df_pais.population.values.reshape(-1,1) # np.log1p()
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.4)
    model.fit(X_train,y_train)
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)
    MSE = mean_squared_error(y_test, y_test_pred)
    MAE = mean_absolute_error(y_test, y_test_pred)
    MAPE = np.mean(np.abs((y_test - y_test_pred) / y_test)) * 100
    y_test_pred = model.predict(prueba)
    to_append = [pais, model.predict(prueba)[0], model.predict(prueba)[1], model.predict(prueba)[2], model.predict(prueba)[3], MSE, MAE, MAPE]
    a_series = pd.Series(to_append, index = resultado.columns)
    resultado = resultado.append(a_series, ignore_index = True)

In [189]:
resultado.head()

Unnamed: 0,country_code,2023,2024,2025,2026,MSE,MAE,MAPE
0,AFE,693300800.0,693701200.0,694101600.0,694502000.0,122386400000000.0,8465734.0,65.207951
1,AFW,470276300.0,470444300.0,470612300.0,470780300.0,16640720000000.0,2801082.0,62.232932
2,ARB,437900600.0,438896300.0,439892100.0,440887800.0,17118560000000.0,3081217.0,64.865245
3,CSS,7582967.0,7608034.0,7633101.0,7658167.0,1431380000.0,30064.05,18.296338
4,CEB,101911300.0,101895900.0,101880500.0,101865100.0,142716100000.0,307357.3,5.547222


In [201]:
# resultado[['country_code', '2023', '2024', '2025', '2026']].to_csv('./Data/Output/forecasts.csv', index = False)

In [202]:
# resultado[['country_code', 'MSE', 'MAE', 'MAPE']].to_csv('./Data/Output/errors.csv', index = False)