In [1]:
from collections import namedtuple
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor

Load all datasets

In [2]:
base_path = Path('dataset/dataset_versions')

datasets = {}
datasets_names = (
    'bfill_ffill',
    'linear_interpolation',
    'cubic_interpolation',
    'quadratic_interpolation',
    'polynomial_5_interpolation',
    'polynomial_7_interpolation',
    'polynomial_9_interpolation',
    'polynomial_11_interpolation',
)
for dataset_name in datasets_names:
    dataset = pd.read_excel(base_path / f'{dataset_name}_rescaled_dataset.xlsx')
    datasets[dataset_name] = dataset.iloc[:, 1:]

Division into training and test samples

In [3]:
test_size = 0.2
seed = 7
target_feature_name = 'GDP per capita (current US$)'

SplittedDataset = namedtuple('SplittedDataset', ['name', 'x_train', 'y_train', 'x_test', 'y_test'])
splited_datasets = []

for dataset_name, dataset in datasets.items():
    model = dict()
    model['name'] = dataset_name
    data_x = dataset.drop([target_feature_name], axis=1)
    data_y = dataset[target_feature_name]
    model['x_train'], model['x_test'], model['y_train'], model['y_test'] = train_test_split(data_x, data_y, test_size=test_size, random_state=seed)
    splited_datasets.append(SplittedDataset(model['name'], model['x_train'],  model['y_train'], model['x_test'], model['y_test']))

RandomForestRegressor

In [4]:
random_forest_greed_search_params = {
    'n_estimators': [i for i in range(501, 1002, 250)],
    'criterion' : ["squared_error", "friedman_mse"],
}

In [5]:
print(f'{"dataset_name".center(30)} | {"r^2".center(20)} | {"rmse".center(20)} | {"best_params".center(40)}')
for dataset in splited_datasets:
    random_forest_model = RandomForestRegressor()
    grid_search = GridSearchCV(random_forest_model, random_forest_greed_search_params, n_jobs=7)
    grid_search.fit(dataset.x_train, dataset.y_train)
    best_params = grid_search.best_params_
    random_forest_model = RandomForestRegressor(
        n_estimators=best_params.get('n_estimators'),
        criterion=best_params.get('criterion')
    )
    random_forest_model.fit(dataset.x_train, dataset.y_train)
    test_predict = random_forest_model.predict(dataset.x_test)
    rmse = np.sqrt(mean_squared_error(dataset.y_test, test_predict))
    r2 = r2_score(dataset.y_test, test_predict)
    print(f'{str(dataset.name).ljust(30)} | {str(r2).center(20)} | {str(rmse).center(20)} | {str(best_params).center(40)}')

         dataset_name          |         r^2          |         rmse         |               best_params               
bfill_ffill                    |  0.9927085764628596  | 0.013865629291803025 | {'criterion': 'friedman_mse', 'n_estimators': 751}
linear_interpolation           |  0.9891716820162963  | 0.016770829744922066 | {'criterion': 'friedman_mse', 'n_estimators': 1001}
cubic_interpolation            |  0.9907306283572973  | 0.015689853655225728 | {'criterion': 'friedman_mse', 'n_estimators': 501}
quadratic_interpolation        |  0.992124233671234   | 0.014484178111930632 | {'criterion': 'friedman_mse', 'n_estimators': 501}
polynomial_5_interpolation     |  0.981381955926966   | 0.022759747568716633 | {'criterion': 'squared_error', 'n_estimators': 501}
polynomial_7_interpolation     |  0.9802373153895785  | 0.013625492485882486 | {'criterion': 'squared_error', 'n_estimators': 751}
polynomial_9_interpolation     |  0.9511117257833476  | 0.012795656617821264 | {'criterion': 'squ

SVR

In [6]:
SVR_greed_search_params = {
    'C': [i for i in np.linspace(0.1, 100, num=20)],
    'kernel': ['linear', 'poly'],
    'degree': [i for i in range(2, 20)],
    'gamma' : ['scale', 'auto']
}

In [7]:
print(f'{"dataset_name".center(30)} | {"r^2".center(20)} | {"rmse".center(20)} | {"best_params".center(40)}')
for dataset in splited_datasets:
    svr_model = SVR()
    grid_search = GridSearchCV(svr_model, SVR_greed_search_params, n_jobs=7)
    grid_search.fit(dataset.x_train, dataset.y_train)
    best_params = grid_search.best_params_
    best_svr_model = SVR(
        C=best_params.get('C'),
        kernel=best_params.get('kernel')
    )
    best_svr_model.fit(dataset.x_train, dataset.y_train)

    test_predict = best_svr_model.predict(dataset.x_test)
    rmse = np.sqrt(mean_squared_error(dataset.y_test, test_predict))
    r2 = r2_score(dataset.y_test, test_predict)
    print(f'{str(dataset.name).ljust(30)} | {str(r2).center(20)} | {str(rmse).center(20)} | {str(best_params).center(40)}')

         dataset_name          |         r^2          |         rmse         |               best_params               
bfill_ffill                    |  0.8805023350012436  | 0.05613233798100877  | {'C': 0.1, 'degree': 2, 'gamma': 'scale', 'kernel': 'linear'}
linear_interpolation           |  0.8673024882754615  | 0.05870910223827949  | {'C': 5.3578947368421055, 'degree': 2, 'gamma': 'scale', 'kernel': 'poly'}
cubic_interpolation            |  0.8783937624782114  | 0.056829216847103896 | {'C': 5.3578947368421055, 'degree': 3, 'gamma': 'scale', 'kernel': 'poly'}
quadratic_interpolation        |  0.8829528955563133  | 0.05583773364949231  | {'C': 5.3578947368421055, 'degree': 3, 'gamma': 'scale', 'kernel': 'poly'}
polynomial_5_interpolation     |  0.8832722083266353  | 0.05698854232875017  | {'C': 5.3578947368421055, 'degree': 3, 'gamma': 'scale', 'kernel': 'poly'}
polynomial_7_interpolation     |  0.7112227479249504  | 0.05208476731977888  | {'C': 0.1, 'degree': 4, 'gamma': 'scale', 'k

Linear regression

In [8]:
print(f'{"dataset_name".center(30)} | {"r^2".center(20)} | {"rmse".center(20)} | {"best_params".center(40)}')
for dataset in splited_datasets:
    linear_model = LinearRegression()
    linear_model.fit(dataset.x_train, dataset.y_train)
    test_predict = linear_model.predict(dataset.x_test)
    rmse = np.sqrt(mean_squared_error(dataset.y_test, test_predict))
    r2 = r2_score(dataset.y_test, test_predict)
    print(f'{str(dataset.name).ljust(30)} | {str(r2).center(20)} | {str(rmse).center(20)} | {"-".center(40)}')

         dataset_name          |         r^2          |         rmse         |               best_params               
bfill_ffill                    |  0.9907301382096394  | 0.015633997977031588 |                    -                    
linear_interpolation           |  0.9936182915645295  | 0.012874860405056891 |                    -                    
cubic_interpolation            |  0.9933571470215885  | 0.013282232117914659 |                    -                    
quadratic_interpolation        |  0.9928675371474754  | 0.013783744197748153 |                    -                    
polynomial_5_interpolation     |  0.9904417993650888  | 0.01630754122810385  |                    -                    
polynomial_7_interpolation     |  0.975561579469896   | 0.015151854997984233 |                    -                    
polynomial_9_interpolation     |  0.989144554267107   | 0.006029538240341453 |                    -                    
polynomial_11_interpolation    |  0.9974

DecisionTreeRegressor

In [9]:
tree_greed_search_params = {
    'splitter': ["best", "random"],
    'criterion' : ["squared_error", "friedman_mse", "absolute_error", "poisson"],
}

In [10]:
print(f'{"dataset_name".center(30)} | {"r^2".center(20)} | {"rmse".center(20)} | {"best_params".center(40)}')
for dataset in splited_datasets:
    tree_model = DecisionTreeRegressor()
    grid_search = GridSearchCV(tree_model, tree_greed_search_params, n_jobs=7)
    grid_search.fit(dataset.x_train, dataset.y_train)
    best_params = grid_search.best_params_
    best_tree_model = DecisionTreeRegressor(
        splitter=best_params.get('splitter'),
        criterion=best_params.get('criterion')
    )
    best_tree_model.fit(dataset.x_train, dataset.y_train)
    test_predict = best_tree_model.predict(dataset.x_test)
    rmse = np.sqrt(mean_squared_error(dataset.y_test, test_predict))
    r2 = r2_score(dataset.y_test, test_predict)
    print(f'{str(dataset.name).ljust(30)} | {str(r2).center(20)} | {str(rmse).center(20)} | {str(best_params).center(40)}')

         dataset_name          |         r^2          |         rmse         |               best_params               
bfill_ffill                    |  0.9757073522556675  | 0.02530875888086164  | {'criterion': 'poisson', 'splitter': 'best'}
linear_interpolation           |  0.9351957363164844  | 0.041027574231159734 | {'criterion': 'absolute_error', 'splitter': 'best'}
cubic_interpolation            |  0.9783995534258159  | 0.023951080493251734 | {'criterion': 'absolute_error', 'splitter': 'best'}
quadratic_interpolation        |  0.9748538354846877  | 0.02588113522106923  | {'criterion': 'absolute_error', 'splitter': 'best'}
polynomial_5_interpolation     |  0.8330751012617197  | 0.06814923336085207  | {'criterion': 'poisson', 'splitter': 'random'}
polynomial_7_interpolation     |  0.9249456231984227  | 0.026553223456380774 | {'criterion': 'friedman_mse', 'splitter': 'best'}
polynomial_9_interpolation     |  0.7301518064058589  | 0.030062146102732296 | {'criterion': 'squared_error'