In [11]:
from collections import namedtuple
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor, BaggingRegressor, AdaBoostRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor

Load all datasets

In [6]:
base_path = Path('dataset/dataset_versions')

datasets = {}
datasets_names = (
    'bfill_ffill',
    'linear_interpolation',
    'cubic_interpolation',
    'quadratic_interpolation',
    'polynomial_5_interpolation',
    'polynomial_7_interpolation',
    'polynomial_9_interpolation',
    'polynomial_11_interpolation',
)
for dataset_name in datasets_names:
    dataset = pd.read_excel(base_path / f'{dataset_name}_rescaled_dataset.xlsx')
    datasets[dataset_name] = dataset.iloc[:, 1:]

Division into training and test samples

In [7]:
test_size = 0.2
seed = 7
target_feature_name = 'GDP per capita (current US$)'

SplittedDataset = namedtuple('SplittedDataset', ['name', 'x_train', 'y_train', 'x_test', 'y_test'])
splited_datasets = []

for dataset_name, dataset in datasets.items():
    model = dict()
    model['name'] = dataset_name
    data_x = dataset.drop([target_feature_name], axis=1)
    data_y = dataset[target_feature_name]
    model['x_train'], model['x_test'], model['y_train'], model['y_test'] = train_test_split(data_x, data_y, test_size=test_size, random_state=seed)
    splited_datasets.append(SplittedDataset(model['name'], model['x_train'],  model['y_train'], model['x_test'], model['y_test']))

RandomForestRegressor

In [4]:
random_forest_greed_search_params = {
    'n_estimators': [i for i in range(501, 1002, 250)],
    'criterion' : ["squared_error", "friedman_mse"],
}

In [5]:
print(f'{"dataset_name".center(30)} | {"r^2".center(20)} | {"rmse".center(20)} | {"best_params".center(40)}')
for dataset in splited_datasets:
    random_forest_model = RandomForestRegressor()
    grid_search = GridSearchCV(random_forest_model, random_forest_greed_search_params, n_jobs=7)
    grid_search.fit(dataset.x_train, dataset.y_train)
    best_params = grid_search.best_params_
    random_forest_model = RandomForestRegressor(
        n_estimators=best_params.get('n_estimators'),
        criterion=best_params.get('criterion')
    )
    random_forest_model.fit(dataset.x_train, dataset.y_train)
    test_predict = random_forest_model.predict(dataset.x_test)
    rmse = np.sqrt(mean_squared_error(dataset.y_test, test_predict))
    r2 = r2_score(dataset.y_test, test_predict)
    print(f'{str(dataset.name).ljust(30)} | {str(r2).center(20)} | {str(rmse).center(20)} | {str(best_params).center(40)}')

         dataset_name          |         r^2          |         rmse         |               best_params               
bfill_ffill                    |  0.9927085764628596  | 0.013865629291803025 | {'criterion': 'friedman_mse', 'n_estimators': 751}
linear_interpolation           |  0.9891716820162963  | 0.016770829744922066 | {'criterion': 'friedman_mse', 'n_estimators': 1001}
cubic_interpolation            |  0.9907306283572973  | 0.015689853655225728 | {'criterion': 'friedman_mse', 'n_estimators': 501}
quadratic_interpolation        |  0.992124233671234   | 0.014484178111930632 | {'criterion': 'friedman_mse', 'n_estimators': 501}
polynomial_5_interpolation     |  0.981381955926966   | 0.022759747568716633 | {'criterion': 'squared_error', 'n_estimators': 501}
polynomial_7_interpolation     |  0.9802373153895785  | 0.013625492485882486 | {'criterion': 'squared_error', 'n_estimators': 751}
polynomial_9_interpolation     |  0.9511117257833476  | 0.012795656617821264 | {'criterion': 'squ

SVR

In [6]:
SVR_greed_search_params = {
    'C': [i for i in np.linspace(0.1, 100, num=20)],
    'kernel': ['linear', 'poly'],
    'degree': [i for i in range(2, 20)],
    'gamma' : ['scale', 'auto']
}

In [7]:
print(f'{"dataset_name".center(30)} | {"r^2".center(20)} | {"rmse".center(20)} | {"best_params".center(40)}')
for dataset in splited_datasets:
    svr_model = SVR()
    grid_search = GridSearchCV(svr_model, SVR_greed_search_params, n_jobs=7)
    grid_search.fit(dataset.x_train, dataset.y_train)
    best_params = grid_search.best_params_
    best_svr_model = SVR(
        C=best_params.get('C'),
        kernel=best_params.get('kernel')
    )
    best_svr_model.fit(dataset.x_train, dataset.y_train)

    test_predict = best_svr_model.predict(dataset.x_test)
    rmse = np.sqrt(mean_squared_error(dataset.y_test, test_predict))
    r2 = r2_score(dataset.y_test, test_predict)
    print(f'{str(dataset.name).ljust(30)} | {str(r2).center(20)} | {str(rmse).center(20)} | {str(best_params).center(40)}')

         dataset_name          |         r^2          |         rmse         |               best_params               
bfill_ffill                    |  0.8805023350012436  | 0.05613233798100877  | {'C': 0.1, 'degree': 2, 'gamma': 'scale', 'kernel': 'linear'}
linear_interpolation           |  0.8673024882754615  | 0.05870910223827949  | {'C': 5.3578947368421055, 'degree': 2, 'gamma': 'scale', 'kernel': 'poly'}
cubic_interpolation            |  0.8783937624782114  | 0.056829216847103896 | {'C': 5.3578947368421055, 'degree': 3, 'gamma': 'scale', 'kernel': 'poly'}
quadratic_interpolation        |  0.8829528955563133  | 0.05583773364949231  | {'C': 5.3578947368421055, 'degree': 3, 'gamma': 'scale', 'kernel': 'poly'}
polynomial_5_interpolation     |  0.8832722083266353  | 0.05698854232875017  | {'C': 5.3578947368421055, 'degree': 3, 'gamma': 'scale', 'kernel': 'poly'}
polynomial_7_interpolation     |  0.7112227479249504  | 0.05208476731977888  | {'C': 0.1, 'degree': 4, 'gamma': 'scale', 'k

Linear regression

In [8]:
print(f'{"dataset_name".center(30)} | {"r^2".center(20)} | {"rmse".center(20)} | {"best_params".center(40)}')
for dataset in splited_datasets:
    linear_model = LinearRegression()
    linear_model.fit(dataset.x_train, dataset.y_train)
    test_predict = linear_model.predict(dataset.x_test)
    rmse = np.sqrt(mean_squared_error(dataset.y_test, test_predict))
    r2 = r2_score(dataset.y_test, test_predict)
    print(f'{str(dataset.name).ljust(30)} | {str(r2).center(20)} | {str(rmse).center(20)} | {"-".center(40)}')

         dataset_name          |         r^2          |         rmse         |               best_params               
bfill_ffill                    |  0.9907301382096394  | 0.015633997977031588 |                    -                    
linear_interpolation           |  0.9936182915645295  | 0.012874860405056891 |                    -                    
cubic_interpolation            |  0.9933571470215885  | 0.013282232117914659 |                    -                    
quadratic_interpolation        |  0.9928675371474754  | 0.013783744197748153 |                    -                    
polynomial_5_interpolation     |  0.9904417993650888  | 0.01630754122810385  |                    -                    
polynomial_7_interpolation     |  0.975561579469896   | 0.015151854997984233 |                    -                    
polynomial_9_interpolation     |  0.989144554267107   | 0.006029538240341453 |                    -                    
polynomial_11_interpolation    |  0.9974

DecisionTreeRegressor

In [9]:
tree_greed_search_params = {
    'splitter': ["best", "random"],
    'criterion' : ["squared_error", "friedman_mse", "absolute_error", "poisson"],
}

In [10]:
print(f'{"dataset_name".center(30)} | {"r^2".center(20)} | {"rmse".center(20)} | {"best_params".center(40)}')
for dataset in splited_datasets:
    tree_model = DecisionTreeRegressor()
    grid_search = GridSearchCV(tree_model, tree_greed_search_params, n_jobs=7)
    grid_search.fit(dataset.x_train, dataset.y_train)
    best_params = grid_search.best_params_
    best_tree_model = DecisionTreeRegressor(
        splitter=best_params.get('splitter'),
        criterion=best_params.get('criterion')
    )
    best_tree_model.fit(dataset.x_train, dataset.y_train)
    test_predict = best_tree_model.predict(dataset.x_test)
    rmse = np.sqrt(mean_squared_error(dataset.y_test, test_predict))
    r2 = r2_score(dataset.y_test, test_predict)
    print(f'{str(dataset.name).ljust(30)} | {str(r2).center(20)} | {str(rmse).center(20)} | {str(best_params).center(40)}')

         dataset_name          |         r^2          |         rmse         |               best_params               
bfill_ffill                    |  0.9757073522556675  | 0.02530875888086164  | {'criterion': 'poisson', 'splitter': 'best'}
linear_interpolation           |  0.9351957363164844  | 0.041027574231159734 | {'criterion': 'absolute_error', 'splitter': 'best'}
cubic_interpolation            |  0.9783995534258159  | 0.023951080493251734 | {'criterion': 'absolute_error', 'splitter': 'best'}
quadratic_interpolation        |  0.9748538354846877  | 0.02588113522106923  | {'criterion': 'absolute_error', 'splitter': 'best'}
polynomial_5_interpolation     |  0.8330751012617197  | 0.06814923336085207  | {'criterion': 'poisson', 'splitter': 'random'}
polynomial_7_interpolation     |  0.9249456231984227  | 0.026553223456380774 | {'criterion': 'friedman_mse', 'splitter': 'best'}
polynomial_9_interpolation     |  0.7301518064058589  | 0.030062146102732296 | {'criterion': 'squared_error'

In [12]:
best_models_per_dataset = {
    splited_datasets[0].name:  [
        {'model': SVR(C=0.1, degree=2, gamma='scale', kernel='linear'), 'name': 'svr_model'},
        {'model': LinearRegression(), 'name': 'linear_model'},
        {'model': DecisionTreeRegressor(criterion='poisson', splitter='best'), 'name': 'decision_tree'}
    ],

    splited_datasets[1].name:  [
        {'model': SVR(C=5.3578947368421055, degree=2, gamma='scale', kernel='poly'), 'name': 'svr_model'},
        {'model': LinearRegression(), 'name': 'linear_model'},
        {'model': DecisionTreeRegressor(criterion='absolute_error', splitter='best'), 'name': 'decision_tree'}
    ],

    splited_datasets[2].name:  [
        {'model': SVR(C=5.3578947368421055, degree=3, gamma='scale', kernel='poly'), 'name': 'svr_model'},
        {'model': LinearRegression(), 'name': 'linear_model'},
        {'model': DecisionTreeRegressor(criterion='absolute_error', splitter='best'), 'name': 'decision_tree'}
    ],

    splited_datasets[3].name:  [
        {'model': SVR(C=5.3578947368421055, degree=3, gamma='scale', kernel='poly'), 'name': 'svr_model'},
        {'model': LinearRegression(), 'name': 'linear_model'},
        {'model': DecisionTreeRegressor(criterion='absolute_error', splitter='best'), 'name': 'decision_tree'}
    ],

    splited_datasets[4].name:  [
        {'model': SVR(C=5.3578947368421055, degree=3, gamma='scale', kernel='poly'), 'name': 'svr_model'},
        {'model': LinearRegression(), 'name': 'linear_model'},
        {'model': DecisionTreeRegressor(criterion='poisson', splitter='random'), 'name': 'decision_tree'}
    ],

    splited_datasets[5].name:  [
        {'model': SVR(C=0.1, degree=4, gamma='scale', kernel='poly'), 'name': 'svr_model'},
        {'model': LinearRegression(), 'name': 'linear_model'},
        {'model': DecisionTreeRegressor(criterion="friedman_mse", splitter='best'), 'name': 'decision_tree'}
    ],

    splited_datasets[6].name:  [
        {'model': SVR(C=0.1, degree=2, gamma='scale', kernel='linear'), 'name': 'svr_model'},
        {'model': LinearRegression(), 'name': 'linear_model'},
        {'model': DecisionTreeRegressor(criterion='squared_error', splitter='random'), 'name': 'decision_tree'}
    ],

    splited_datasets[7].name:  [
        {'model': SVR(C=0.1, degree=2, gamma='scale', kernel='linear'), 'name': 'svr_model'},
        {'model': LinearRegression(), 'name': 'linear_model'},
        {'model': DecisionTreeRegressor(criterion='absolute_error', splitter='best'), 'name': 'decision_tree'}
    ]
}

Bagging

In [14]:
print(f'{"dataset_name".center(30)} | {"The model on which bagging is based".center(40)} | {"r^2".center(20)} | {"rmse".center(20)}')
for dataset in splited_datasets:
    models = best_models_per_dataset[dataset.name]
    for model in models:
        bagging_model = BaggingRegressor(model.get('model'), max_samples=200, n_estimators=300)
        bagging_model.fit(dataset.x_train, dataset.y_train)
        test_predict = bagging_model.predict(dataset.x_test)
        rmse = np.sqrt(mean_squared_error(dataset.y_test, test_predict))
        r2 = r2_score(dataset.y_test, test_predict)
        print(f'{str(dataset.name).ljust(30)} | {model.get("name").center(40)} | {str(r2).center(20)} | {str(rmse).center(20)}')

         dataset_name          |   The model on which bagging is based    |         r^2          |         rmse        
bfill_ffill                    |                svr_model                 |  0.8677539562228549  | 0.05905066139206543 
bfill_ffill                    |               linear_model               |  0.9882000947793678  | 0.01763894805953474 
bfill_ffill                    |              decision_tree               |  0.9933199941783071  | 0.013271555952638627
linear_interpolation           |                svr_model                 |  0.8607492392914218  | 0.060141303654964284
linear_interpolation           |               linear_model               |  0.993941476482994   | 0.012544618221731339
linear_interpolation           |              decision_tree               |  0.986705890559078   | 0.01858248002077375 
cubic_interpolation            |                svr_model                 |  0.8644326861524738  | 0.06000276184064881 
cubic_interpolation            |        

Boosting

GradientBoostingRegressor

In [15]:
gradient_boosting_search_params = {
    'loss': ['squared_error', 'absolute_error', 'huber'],
    'learning_rate' : [i for i in np.linspace(0.1, 2, num=5)],
    'n_estimators' : [100, 200, 300],
    'criterion': ['friedman_mse', 'squared_error']
}

In [16]:
print(f'{"dataset_name".center(30)} | {"r^2".center(20)} | {"rmse".center(20)} | {"best_params".center(40)}')
for dataset in splited_datasets:
    model = GradientBoostingRegressor()
    grid_search = GridSearchCV(model, gradient_boosting_search_params, n_jobs=7)
    grid_search.fit(dataset.x_train, dataset.y_train)
    best_params = grid_search.best_params_
    best_model = GradientBoostingRegressor(
        loss=best_params.get('loss'),
        learning_rate=best_params.get('learning_rate'),
        n_estimators=best_params.get('n_estimators'),
        criterion=best_params.get('criterion')
    )
    best_model.fit(dataset.x_train, dataset.y_train)
    test_predict = best_model.predict(dataset.x_test)
    rmse = np.sqrt(mean_squared_error(dataset.y_test, test_predict))
    r2 = r2_score(dataset.y_test, test_predict)
    print(f'{str(dataset.name).ljust(30)} | {str(r2).center(20)} | {str(rmse).center(20)} | {str(best_params).center(40)}')

         dataset_name          |         r^2          |         rmse         |               best_params               
bfill_ffill                    |  0.9950358018360845  | 0.01144084170342771  | {'criterion': 'squared_error', 'learning_rate': 0.1, 'loss': 'squared_error', 'n_estimators': 200}
linear_interpolation           |  0.9808573445216046  | 0.022298473908945026 | {'criterion': 'squared_error', 'learning_rate': 0.1, 'loss': 'squared_error', 'n_estimators': 300}
cubic_interpolation            |  0.9913013595601345  | 0.015199154690839578 | {'criterion': 'friedman_mse', 'learning_rate': 0.1, 'loss': 'squared_error', 'n_estimators': 200}
quadratic_interpolation        |  0.9908228756602306  | 0.015635103249040744 | {'criterion': 'friedman_mse', 'learning_rate': 0.1, 'loss': 'squared_error', 'n_estimators': 100}
polynomial_5_interpolation     |  0.9911741542294307  | 0.01567034582488927  | {'criterion': 'squared_error', 'learning_rate': 0.1, 'loss': 'squared_error', 'n_estimators

AdaBoostRegressor

In [19]:
gradient_boosting_search_params = {
    'loss': ['linear', 'square', 'exponential'],
    'learning_rate' : [i for i in np.linspace(0.1, 2, num=5)],
    'n_estimators' : [100, 200, 300],
}

In [18]:
print(f'{"dataset_name".center(30)} | {"r^2".center(20)} | {"rmse".center(20)} | {"best_params".center(40)}')
for dataset in splited_datasets:
    ensemble_models = best_models_per_dataset[dataset.name]
    gradient_boosting_search_params['estimator'] = [ensemble_model['model'] for ensemble_model in ensemble_models]

    model = AdaBoostRegressor()
    grid_search = GridSearchCV(model, gradient_boosting_search_params, n_jobs=7)
    grid_search.fit(dataset.x_train, dataset.y_train)
    best_params = grid_search.best_params_
    best_model = AdaBoostRegressor(
        loss=best_params.get('loss'),
        learning_rate=best_params.get('learning_rate'),
        n_estimators=best_params.get('n_estimators'),
        estimator=best_params.get('estimator')
    )
    best_model.fit(dataset.x_train, dataset.y_train)
    test_predict = best_model.predict(dataset.x_test)
    rmse = np.sqrt(mean_squared_error(dataset.y_test, test_predict))
    r2 = r2_score(dataset.y_test, test_predict)
    print(f'{str(dataset.name).ljust(30)} | {str(r2).center(20)} | {str(rmse).center(20)} | {str(best_params).center(40)}')

         dataset_name          |         r^2          |         rmse         |               best_params               
bfill_ffill                    |  0.9883532696967754  |  0.017524088239988   | {'estimator': LinearRegression(), 'learning_rate': 2.0, 'loss': 'exponential', 'n_estimators': 100}
linear_interpolation           |  0.9943279088224458  | 0.012137958586833782 | {'estimator': LinearRegression(), 'learning_rate': 0.1, 'loss': 'linear', 'n_estimators': 100}
cubic_interpolation            |  0.9947564525259688  | 0.011800661353918081 | {'estimator': LinearRegression(), 'learning_rate': 0.1, 'loss': 'linear', 'n_estimators': 200}
quadratic_interpolation        |  0.9936775334792448  | 0.012977490352826216 | {'estimator': LinearRegression(), 'learning_rate': 0.1, 'loss': 'linear', 'n_estimators': 200}


6 fits failed out of a total of 675.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
6 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\Daniil_Alenushkin\Desktop\SUAI\Magistracy_09.04.04\predicting_a_country_economic_potential\env\lib\site-packages\sklearn\model_selection\_validation.py", line 729, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\Daniil_Alenushkin\Desktop\SUAI\Magistracy_09.04.04\predicting_a_country_economic_potential\env\lib\site-packages\sklearn\base.py", line 1152, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "C:\Users\Daniil_Alenushkin\Desktop\SUAI\Magistracy_09.04.04\predicting_a_country_economic_potential\env\lib\si

polynomial_5_interpolation     |  0.9932893442167926  | 0.013664158631770376 | {'estimator': LinearRegression(), 'learning_rate': 0.575, 'loss': 'linear', 'n_estimators': 200}
polynomial_7_interpolation     |  0.9796548170957783  |  0.0138248375854265  | {'estimator': LinearRegression(), 'learning_rate': 1.525, 'loss': 'exponential', 'n_estimators': 300}
polynomial_9_interpolation     |  0.9956791498520575  | 0.0038040378187853326 | {'estimator': LinearRegression(), 'learning_rate': 2.0, 'loss': 'exponential', 'n_estimators': 200}
polynomial_11_interpolation    |  0.998481328691613   | 0.002345872992157921 | {'estimator': LinearRegression(), 'learning_rate': 2.0, 'loss': 'exponential', 'n_estimators': 100}
         dataset_name          |         r^2          |         rmse         |               best_params               
bfill_ffill                    |  0.990022173252303   | 0.016220020592671112 | {'estimator': LinearRegression(), 'learning_rate': 0.1, 'loss': 'square', 'n_estimato