In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix, make_scorer, accuracy_score
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LinearRegression
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import HuberRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import AdaBoostRegressor
from mlxtend.regressor import StackingCVRegressor

from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_log_error
from sklearn.model_selection import GridSearchCV

import warnings
warnings.simplefilter("ignore")

In [2]:
df_population = pd.read_csv('./Data/population.csv')
df_population.head()

Unnamed: 0,country_name,country_code,year,population
0,Africa Eastern and Southern,AFE,2021,694665117.0
1,Africa Eastern and Southern,AFE,2020,677243299.0
2,Africa Eastern and Southern,AFE,2019,660046272.0
3,Africa Eastern and Southern,AFE,2018,643090131.0
4,Africa Eastern and Southern,AFE,2017,626392880.0


In [3]:
linear = LinearRegression()
elastic_net = ElasticNet()
random_forest = RandomForestRegressor()
gradient_boost = GradientBoostingRegressor()
ada_boost = AdaBoostRegressor()

In [4]:
random_forest.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'criterion': 'squared_error',
 'max_depth': None,
 'max_features': 1.0,
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

In [5]:
# Cantidad de arboles
n_estimators = [10, 20, 40, 80]
# Numero maximo de niveles en el arbol
max_depth = [2, 4]

min_samples_split = [2, 4, 10]

min_samples_leaf = [1, 2]

bootstrap = [False, True]

In [6]:
parametros = {
    "n_estimators" : n_estimators,
    "max_depth" : max_depth,
    "min_samples_split" : min_samples_split,
    "min_samples_leaf" : min_samples_leaf,
    "bootstrap" : bootstrap
}

rejilla = GridSearchCV(random_forest, parametros, n_jobs = -1)

In [7]:
df_population2 = df_population[df_population['country_code'] == 'AFE']

In [8]:
X = df_population2.year.values.reshape(-1,1)
y = df_population2.population.values.reshape(-1,1)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.4)
rejilla.fit(X_train,y_train)

In [9]:
sorted(rejilla.cv_results_.keys())

['mean_fit_time',
 'mean_score_time',
 'mean_test_score',
 'param_bootstrap',
 'param_max_depth',
 'param_min_samples_leaf',
 'param_min_samples_split',
 'param_n_estimators',
 'params',
 'rank_test_score',
 'split0_test_score',
 'split1_test_score',
 'split2_test_score',
 'split3_test_score',
 'split4_test_score',
 'std_fit_time',
 'std_score_time',
 'std_test_score']

In [10]:
print(rejilla.cv_results_['rank_test_score'])
print(rejilla.cv_results_['mean_test_score'])
print(rejilla.best_score_)
print(rejilla.best_params_)

[73 81 85 73 73 81 85 73 89 89 95 89 73 81 85 73 73 81 85 73 89 89 95 89
 17 17 17 17 29 29 29 29 67 67 71 65 21 21 21 21 21 21 21 21 67 67 71 65
 44 40 35 34 48 36 43 37 49 64 60 58 41 42 33 39 46 47 45 38 63 62 61 59
  7  3  2  1 14 12  5  9 51 55 57 52 16 15  6  8 13 11 10  4 54 50 56 53]
[0.74097691 0.74097691 0.74097691 0.74097691 0.74097691 0.74097691
 0.74097691 0.74097691 0.71087565 0.71087565 0.71087565 0.71087565
 0.74097691 0.74097691 0.74097691 0.74097691 0.74097691 0.74097691
 0.74097691 0.74097691 0.71087565 0.71087565 0.71087565 0.71087565
 0.94899379 0.94899379 0.94899379 0.94899379 0.94244576 0.94244576
 0.94244576 0.94244576 0.79835688 0.79835688 0.79835688 0.79835688
 0.94470718 0.94470718 0.94470718 0.94470718 0.94470718 0.94470718
 0.94470718 0.94470718 0.79835688 0.79835688 0.79835688 0.79835688
 0.91557661 0.92766499 0.93930145 0.94027289 0.89490659 0.93640496
 0.9172641  0.9363615  0.87904838 0.8429127  0.85500386 0.86413251
 0.92447294 0.92035534 0.94053457 0.9

In [11]:
rejilla.best_params_['min_samples_leaf']

1

In [12]:
random_forest = RandomForestRegressor()

In [13]:
random_forest.fit(X_train,y_train)

In [14]:
# 0.9979928168103364
random_forest.score(X_test, y_test)

0.9958384914357521

In [15]:
y_test_pred = random_forest.predict(X_test)

In [16]:
# RMSLE 0.02247129207606141
mean_squared_log_error(y_test, y_test_pred, squared = False)

0.02150067133228575

In [17]:
# MSE 55479745262245.03
mean_squared_error(y_test, y_test_pred)

110348145699502.77

In [18]:
# MAE 5994656.62185001
mean_absolute_error(y_test, y_test_pred)

7622865.730800002

In [19]:
# MAPE 66.40142858967484
np.mean(np.abs((y_test - y_test_pred) / y_test)) * 100

61.99297434060136