In [444]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix, make_scorer, accuracy_score
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LinearRegression
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import HuberRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import AdaBoostRegressor
from mlxtend.regressor import StackingCVRegressor

from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_log_error
from sklearn.model_selection import GridSearchCV

import warnings
warnings.simplefilter("ignore")


In [445]:
df_population = pd.read_csv('./Data/population.csv')
df_population.head()

Unnamed: 0,country_name,country_code,year,population
0,Africa Eastern and Southern,AFE,2021,694665117.0
1,Africa Eastern and Southern,AFE,2020,677243299.0
2,Africa Eastern and Southern,AFE,2019,660046272.0
3,Africa Eastern and Southern,AFE,2018,643090131.0
4,Africa Eastern and Southern,AFE,2017,626392880.0


# Regresion

In [446]:
linear = LinearRegression()
elastic_net = ElasticNet()
random_forest = RandomForestRegressor()
gradient_boost = GradientBoostingRegressor()
ada_boost = AdaBoostRegressor()

In [447]:
model = StackingCVRegressor( regressors = [linear, elastic_net, random_forest, gradient_boost, ada_boost],
    meta_regressor = elastic_net
)

##### Documentación mlxtend StackingCVRegressor 
##### https://rasbt.github.io/mlxtend/user_guide/regressor/StackingCVRegressor/


In [448]:
paises = df_population.country_code.unique()

In [449]:
resultado = pd.DataFrame(columns = ["country_code", "2023", "2024", "2025", "2026", "SCORE", "MSE", "MAE", "MAPE", "RMSLE"])

In [450]:
prueba = np.array([2023, 2024, 2025, 2026]).reshape(-1,1)
prueba

array([[2023],
       [2024],
       [2025],
       [2026]])

In [451]:
for pais in paises:
    df_pais = df_population.loc[(df_population['country_code'] == pais)]
    X = df_pais.year.values.reshape(-1,1)
    y = df_pais.population.values.reshape(-1,1) # np.log1p()
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.4)
    model.fit(X_train,y_train)
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)
    SCORE = model.score(X_test, y_test)
    RMSLE = mean_squared_log_error(y_test, y_test_pred, squared = False)
    MSE = mean_squared_error(y_test, y_test_pred)
    MAE = mean_absolute_error(y_test, y_test_pred)
    MAPE = np.mean(np.abs((y_test - y_test_pred) / y_test)) * 100
    y_test_pred = model.predict(prueba)
    to_append = [pais, model.predict(prueba)[0], model.predict(prueba)[1], model.predict(prueba)[2], model.predict(prueba)[3], SCORE, MSE, MAE, MAPE, RMSLE]
    a_series = pd.Series(to_append, index = resultado.columns)
    resultado = resultado.append(a_series, ignore_index = True)

In [456]:
resultado.tail()

Unnamed: 0,country_code,2023,2024,2025,2026,SCORE,MSE,MAE,MAPE,RMSLE
256,VIR,107423.7,107416.4,107409.2,107401.9,0.976473,11209150.0,1885.24007,31.224467,0.045805
257,PSE,5014317.0,5113738.0,5213159.0,5312580.0,0.997065,1723059000.0,33460.522172,28.508759,0.013367
258,YEM,29827680.0,29831010.0,29834340.0,29837670.0,0.995889,269094600000.0,380696.440674,75.230213,0.026181
259,ZMB,18586660.0,18617410.0,18648150.0,18678900.0,0.996813,53357060000.0,155670.000097,62.677563,0.022606
260,ZWE,14913260.0,14956170.0,14999090.0,15042000.0,0.995527,54454110000.0,184095.616216,53.102324,0.028011


In [453]:
resultado['MSE'][0]

88614210865231.06

In [454]:
resultado['2023'][0], resultado['2024'][0], resultado['2025'][0], resultado['2026'][0]

(688322789.9836468, 689049276.2830535, 689775762.5824603, 690502248.881867)

In [455]:
break

SyntaxError: 'break' outside loop (<ipython-input-455-6aaf1f276005>, line 1)

In [None]:
resultado[['country_code', '2023', '2024', '2025', '2026']].to_csv('./Data/Output/forecasts.csv', index = False)

In [None]:
resultado[['country_code', 'MSE', 'MAE', 'MAPE']].to_csv('./Data/Output/errors.csv', index = False)