In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import BaggingRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor

In [2]:
cleared_data = pd.read_excel(r'dataset/cleared_dataset/BRICS_all_areas_cleared_dataset.xlsx')
cleared_data = cleared_data.iloc[:, 1:]
cleared_data

Unnamed: 0,Adjusted net national income (annual % growth),Adjusted net national income (constant 2015 US$),Adjusted net national income (current US$),Adjusted net national income per capita (annual % growth),Adjusted net national income per capita (constant 2015 US$),Adjusted net national income per capita (current US$),"Adjusted net savings, excluding particulate emission damage (current US$)","Adjusted net savings, including particulate emission damage (current US$)",Adjusted savings: carbon dioxide damage (current US$),Adjusted savings: consumption of fixed capital (current US$),...,"Unemployment, youth male (% of male labor force ages 15-24) (modeled ILO estimate)","Unemployment, youth male (% of male labor force ages 15-24) (national estimate)","Unemployment, youth total (% of total labor force ages 15-24) (modeled ILO estimate)","Unemployment, youth total (% of total labor force ages 15-24) (national estimate)","Vulnerable employment, female (% of female employment) (modeled ILO estimate)","Vulnerable employment, male (% of male employment) (modeled ILO estimate)","Vulnerable employment, total (% of total employment) (modeled ILO estimate)","Wage and salaried workers, female (% of female employment) (modeled ILO estimate)","Wage and salaried workers, male (% of male employment) (modeled ILO estimate)","Wage and salaried workers, total (% of total employment) (modeled ILO estimate)"
0,0.466468,0.023250,0.002405,0.457415,0.404815,0.029051,0.032175,0.044063,0.000099,0.000515,...,0.097264,0.034784,0.128246,0.031900,0.353787,0.417618,0.399133,0.630320,0.535293,0.565233
1,0.474331,0.026944,0.003136,0.465886,0.447581,0.035074,0.032175,0.044063,0.000224,0.000844,...,0.097264,0.034784,0.128246,0.031900,0.353787,0.417618,0.399133,0.630320,0.535293,0.565233
2,0.470348,0.030977,0.004704,0.462350,0.492994,0.048522,0.032175,0.044063,0.000421,0.001627,...,0.097264,0.034784,0.128246,0.031900,0.353787,0.417618,0.399133,0.630320,0.535293,0.565233
3,0.392740,0.032909,0.006695,0.385312,0.507183,0.065131,0.032175,0.044063,0.000625,0.002527,...,0.097264,0.034784,0.128246,0.031900,0.353787,0.417618,0.399133,0.630320,0.535293,0.565233
4,0.370858,0.034175,0.008052,0.363500,0.511439,0.075355,0.032175,0.044063,0.000836,0.003249,...,0.097264,0.034784,0.128246,0.031900,0.353787,0.417618,0.399133,0.630320,0.535293,0.565233
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
249,0.331415,0.016029,0.019158,0.334386,0.544406,0.337768,0.027293,0.027860,0.038400,0.016573,...,0.867180,0.844326,0.879691,0.860416,0.102488,0.107444,0.107727,0.869680,0.802576,0.830064
250,0.364604,0.016629,0.022937,0.368325,0.551393,0.395443,0.028351,0.028878,0.039675,0.019051,...,0.871438,0.848698,0.876980,0.857772,0.108237,0.109801,0.111820,0.862659,0.800510,0.825928
251,0.330632,0.016546,0.024108,0.334825,0.543768,0.409095,0.027044,0.027525,0.040959,0.020352,...,0.877922,0.854971,0.880779,0.861473,0.108458,0.118486,0.116514,0.867486,0.793950,0.824746
252,0.339407,0.016643,0.022847,0.344179,0.537904,0.383322,0.026512,0.027018,0.041808,0.019620,...,0.958079,0.932332,0.945598,0.925626,0.112327,0.122705,0.120727,0.859039,0.784473,0.815646


Division into training and test samples

In [3]:
test_size = 0.2
seed = 7
target_feature_name = 'GDP per capita (current US$)'

data_X = cleared_data.drop([target_feature_name], axis=1)
data_Y = cleared_data[target_feature_name]

data_X_train, data_X_test, data_Y_train, data_Y_test = train_test_split(data_X, data_Y, test_size=test_size, random_state=seed)

SVR model

Define grid search params for SVR

In [4]:
SVR_greed_search_params = {
    'C': [i for i in np.linspace(0.1, 50, num=100)],
    'kernel': ['linear', 'poly'],
    'degree': [i for i in range(2, 20)],
    'gamma' : ['scale', 'auto']
}

In [5]:
svr_model = SVR()

grid_search = GridSearchCV(svr_model, SVR_greed_search_params, n_jobs=4)
grid_search.fit(data_X_train, data_Y_train)

best_params = grid_search.best_params_
print(f'Best hyper parameters: {best_params}')

best_svr_model = SVR(
    C=best_params.get('C'),
    kernel=best_params.get('kernel')
)
best_svr_model.fit(data_X_train, data_Y_train)

test_predict = best_svr_model.predict(data_X_test)
rmse = np.sqrt(mean_squared_error(data_Y_test, test_predict))
r2 = r2_score(data_Y_test, test_predict)
print(f'RMSE for SVR: ', rmse)
print(f'R2_score for SVR: ', r2)

Best hyper parameters: {'C': 0.1, 'degree': 2, 'gamma': 'scale', 'kernel': 'linear'}
RMSE for SVR:  0.05613233798100877
R2_score for SVR:  0.8805023350012436


Random forest model

Define grid search params for Random forest model

In [6]:
random_forest_greed_search_params = {
    'n_estimators': [i for i in range(100, 1001, 100)],
    'criterion' : ["squared_error", "absolute_error", "friedman_mse", "poisson"],
}

In [7]:
random_forest_model = RandomForestRegressor()

grid_search = GridSearchCV(random_forest_model, random_forest_greed_search_params, n_jobs=4)
grid_search.fit(data_X_train, data_Y_train)

best_params = grid_search.best_params_
print(f'Best hyper parameters: {best_params}')

best_random_forest_model = RandomForestRegressor(
    n_estimators=best_params.get('n_estimators'),
    criterion=best_params.get('criterion')
)
best_random_forest_model.fit(data_X_train, data_Y_train)

test_predict = best_random_forest_model.predict(data_X_test)
rmse = np.sqrt(mean_squared_error(data_Y_test, test_predict))
r2 = r2_score(data_Y_test, test_predict)
print(f'RMSE for SVR: ', rmse)
print(f'R2_score for SVR: ', r2)

Best hyper parameters: {'criterion': 'friedman_mse', 'n_estimators': 100}
RMSE for SVR:  0.01543556635792298
R2_score for SVR:  0.990963956884972
