### Library Import

In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import pandas as pd
import numpy as np
import statistics
import math
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import MinMaxScaler, StandardScaler, Normalizer, Binarizer, PowerTransformer, RobustScaler
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.feature_selection import SelectPercentile, SelectFromModel, RFE, VarianceThreshold
from sklearn.model_selection import train_test_split, KFold, cross_val_score, GridSearchCV
from sklearn.metrics import mean_squared_error

### Functions

In [52]:
# summarizes the output of a cross-validated mse array
def MSE_summarizer(array):
    return math.sqrt(statistics.mean(abs(array)))

# Create a pandas dataframe with: modelName, modelParameters, ModelFeatures, Score, Scaler, Selector
def TableCreator(grids, model_names, feature_set, mae=False):
    modelParameters, modelFeatures, modelScores, modelScalers, modelSelectors = [], [], [], [], []
    for grid in grids:
        best_iteration = grid.best_estimator_
        names = list(grid.best_estimator_.named_steps.keys())
        modelParameters.append(grid.best_params_)
        modelFeatures.append(list(feature_set[best_iteration.named_steps[names[1]].get_support()]))
        modelScores.append(math.sqrt(abs(grid.best_score_)))
        modelScalers.append(names[0])
        modelSelectors.append(names[1])
        
    pd.set_option('display.max_colwidth', -1)
    if mae:
        return pd.DataFrame({"modelName":model_names, "modelParameter":modelParameters, "features": modelFeatures,
                        "MAE": modelScores, "Scaler": modelScalers, "Selector": modelSelectors})
    else:
        return pd.DataFrame({"modelName":model_names, "modelParameter":modelParameters, "features": modelFeatures,
                        "RMSE": modelScores, "Scaler": modelScalers, "Selector": modelSelectors})

# Function to extract model insights of the best estimator of each grid
def modelExtractor(model, grids):
    if model == 'lr':
        # Linear Regression
        names = list(grids[0].best_estimator_.named_steps.keys())
        best_iteration = grids[0].best_estimator_
        features = list(MeX_NF.columns[best_iteration.named_steps[names[1]].get_support()])
        coefficients = list(grids[0].best_estimator_.named_steps["lr"].coef_)
        return "The features are:\n {} \n\n With coefficients: \n {}".format(features, coefficients)
    
    elif model == "ridge":
        names = list(grids[1].best_estimator_.named_steps.keys())
        best_iteration = grids[1].best_estimator_
        features = list(MeX_NF.columns[best_iteration.named_steps[names[1]].get_support()])
        coefficients = list(grids[1].best_estimator_.named_steps["ridge"].coef_)
        return "The features are:\n {} \n\n With coefficients: \n {}".format(features, coefficients)

    elif model == "svm":
        names = list(grids[3].best_estimator_.named_steps.keys())
        best_iteration = grids[3].best_estimator_
        features = list(MeX_NF.columns[best_iteration.named_steps[names[1]].get_support()])
        coefficients = list(grids[3].best_estimator_.named_steps["svr"].coef_)
        return "The features are:\n {} \n\n With coefficients: \n {}".format(features, coefficients)
    
    elif model == "rf":
        names = list(grids[4].best_estimator_.named_steps.keys())
        best_iteration = grids[4].best_estimator_
        features = list(MeX_NF.columns[best_iteration.named_steps[names[1]].get_support()])
        coefficients = list(grids[4].best_estimator_.named_steps["rf"].feature_importances_)
        return "The features are:\n {} \n\n With importances: \n {}".format(features, coefficients)

### Data

**Data import**

In [4]:
path = r"C:\Users\BrechtDewilde\Documents\Github\Predicting-the-olympic-games\marathon\Data c&c\R\Data\output"
Me = pd.read_csv(path + "\marathonM_extended.csv")
Fe = pd.read_csv(path + "\marathonF_extended.csv")

**Subset & dummy coding**

In [5]:
amountFeatures = Me.shape[1]

# Feature division
numeric_features = ['Age', 'Height', 'Weight', 'bmi', 'Year', 'Population', 'GDP',
       'countryWins', 'subRegionWins', 'marathons', 'Gmarathons', 'Smarathons', 'Bmarathons', 'avgTime', 'bestTime',
       'avgTimeScore', 'bestTimeScore', 'avgPosition', 'bestPosition', 'avgPositionScore', 'bestPositionScore',
                    'avgPerformanceScore', 'bestPerformanceScore', 'monthsBestTime']

categoric_features = ['Nationality', 'SubRegion', 'continent', 'HomeAdvantage']

target_features = ["Position", "TimeMins", "TimeSecs"]

# X subsetting
MeX_NF = Me.loc[:, numeric_features]
MeX_CF = Me.loc[:, categoric_features]
Mey_F = Me.loc[:, target_features]

# Dummy coding
temp = pd.get_dummies(MeX_CF, columns = categoric_features)
MeX_dummy_CF = temp.rename(columns = {'continent_Africa':"Africa", 'continent_Asia':'Asia', 'continent_Europe':"Europe",
       'continent_Latin America and the Caribbean':"Latin America", 'continent_Northern America': "Northern America",
        'continent_Oceania':"Oceania", 'HomeAdvantage_False':"NO homeadvantage", 'HomeAdvantage_True':"Home advantage"})

### Model comparison

**Numeric features**

In [26]:
kf = KFold(n_splits=5, random_state=1234, shuffle=True)

models = [LinearRegression(), Ridge(), KNeighborsRegressor(), SVR(kernel="linear"), RandomForestRegressor()]
model_names = ["lr", "ridge", "knn", "svr", "rf"]

ridge_grid = {"ridge__alpha":[0.001, 0.01, 0.1, 1, 10, 100]}
knn_grid = {"knn__n_neighbors":[1,5,20,30,50,100,200]}
svr_grid = {"svr__C": [0.001, 0.01, 0.1, 1, 10, 100], "svr__gamma":[0.001, 0.01, 0.1, 1, 10, 100]}
RF_grid = {"rf__n_estimators":[100,200,300]}
param_grids = [{}, ridge_grid, knn_grid, svr_grid, RF_grid]

scalers = [MinMaxScaler(), StandardScaler(), RobustScaler(), PowerTransformer(), Normalizer(), Binarizer()] 
scaler_names = ["MinMax","standard", "robust", "power", "normalizer", "binarizer"]

selectors = [SelectPercentile(), SelectFromModel(RandomForestRegressor(random_state=1234)), VarianceThreshold()]
selectors_name = ["PercentileSelector", "RFE_RF_score", "varianceTH"]

In [28]:
import warnings
warnings.filterwarnings('ignore')

MAE_grids = []
grid_selector = 0

for model, model_name in zip(models, model_names):
    best_score = 1000000000000
    for selector, selector_name in zip(selectors, selectors_name):
        for scaler, scaler_name in zip(scalers, scaler_names):
                
            pipe = Pipeline([(scaler_name,scaler), (selector_name,selector),(model_name, model)])
            grid = GridSearchCV(pipe, param_grid=param_grids[grid_selector],cv=kf,scoring= "neg_mean_absolute_error")
            grid.fit(MeX_NF, Mey_F.loc[:,"TimeMins"])
            output = abs(grid.best_score_ )

            if output < best_score:
                best_score = output
                best_grid = grid

    MAE_grids.append(best_grid)
    grid_selector += 1
TableCreator(MAE_grids, model_names, MeX_NF.columns)

Unnamed: 0,modelName,modelParameter,features,RMSE,Scaler,Selector
0,lr,{},"[Age, bmi, Population, GDP, avgTime, bestTime, avgTimeScore, bestTimeScore]",2.178338,robust,RFE_RF_score
1,ridge,{'ridge__alpha': 0.001},"[Age, bmi, Population, GDP, avgTime, bestTime, avgTimeScore, bestTimeScore]",2.174662,MinMax,RFE_RF_score
2,knn,{'knn__n_neighbors': 30},"[Age, Weight, bmi, Population, GDP, avgTime, bestTime, avgTimeScore, bestTimeScore, avgPerformanceScore, bestPerformanceScore, monthsBestTime]",2.179857,robust,RFE_RF_Score
3,svr,"{'svr__C': 0.1, 'svr__gamma': 0.001}","[Age, Height, Weight, bmi, Year, Population, GDP, countryWins, subRegionWins, marathons, Gmarathons, Smarathons, Bmarathons, avgTime, bestTime, avgTimeScore, bestTimeScore, avgPosition, bestPosition, avgPositionScore, bestPositionScore, avgPerformanceScore, bestPerformanceScore, monthsBestTime]",2.160111,standard,varianceTH
4,rf,{'rf__n_estimators': 100},"[Age, Weight, bmi, Population, GDP, avgTime, bestTime, avgTimeScore, bestTimeScore, avgPerformanceScore, bestPerformanceScore, monthsBestTime]",2.200763,standard,RFE_RF_Score


**Numeric features with log transformed target variable**

In [29]:
import warnings
warnings.filterwarnings('ignore')

y = np.log(Mey_F.loc[:,"TimeMins"])
MAE_log_grids = []
grid_selector = 0

for model, model_name in zip(models, model_names):
    best_score = 1000000000000
    for selector, selector_name in zip(selectors, selectors_name):
        for scaler, scaler_name in zip(scalers, scaler_names):
                
            pipe = Pipeline([(scaler_name,scaler), (selector_name,selector),(model_name, model)])
            grid = GridSearchCV(pipe, param_grid=param_grids[grid_selector],cv=kf,scoring= "neg_mean_absolute_error")
            grid.fit(MeX_NF, y)
            output = abs(grid.best_score_ )

            if output < best_score:
                best_score = output
                best_grid = grid

    MAE_log_grids.append(best_grid)
    grid_selector += 1
TableCreator(log_grids, model_names, MeX_NF.columns)

Unnamed: 0,modelName,modelParameter,features,RMSE,Scaler,Selector
0,lr,{},"[Age, Weight, bmi, Year, Population, countryWins, avgTimeScore, bestTimeScore, avgPosition, avgPerformanceScore, bestPerformanceScore, monthsBestTime]",0.042332,power,RFE_RF_Score
1,ridge,{'ridge__alpha': 100},"[Age, Height, Weight, bmi, Year, Population, countryWins, subRegionWins, marathons, Gmarathons, Smarathons, Bmarathons, avgTime, bestTime, avgTimeScore, bestTimeScore, avgPosition, bestPosition, avgPositionScore, bestPositionScore, avgPerformanceScore, bestPerformanceScore, monthsBestTime]",0.041841,power,varianceTH
2,knn,{'knn__n_neighbors': 30},"[Age, bmi, Population, GDP, avgTime, bestTime, avgTimeScore, bestTimeScore]",0.04268,robust,RFE_RF_score
3,svr,"{'svr__C': 0.001, 'svr__gamma': 0.001}","[Age, bmi, Population, GDP, avgTime, bestTime, avgTimeScore, bestTimeScore]",0.054135,robust,RFE_RF_score
4,rf,{'rf__n_estimators': 300},"[Age, Height, Weight, bmi, Year, Population, GDP, countryWins, subRegionWins, marathons, Gmarathons, Smarathons, Bmarathons, avgTime, bestTime, avgTimeScore, bestTimeScore, avgPosition, bestPosition, avgPositionScore, bestPositionScore, avgPerformanceScore, bestPerformanceScore, monthsBestTime]",0.043549,robust,varianceTH


**Numeric features and categoric features** <br/> Continent and homeadvantage

In [30]:
import warnings
warnings.filterwarnings('ignore')

MAE_categoric_grids = []
grid_selector = 0

MEX_continent_homeAdvantage = MeX_dummy_CF.iloc[:, -8:]
full_data = pd.concat([MeX_NF, MEX_continent_homeAdvantage], axis=1)
categorical_columns = full_data.columns[-8:]

for model, model_name in zip(models, model_names):
    best_score = 1000000000000
    for selector, selector_name in zip(selectors, selectors_name):
        for scaler, scaler_name in zip(scalers, scaler_names):
            # pre-process PipeLines for numeric and categorical features
            numeric_transformer = Pipeline(steps = [(scaler_name, scaler)])
            categorical_transformer = Pipeline(steps = [("onehot", None)])
            preprocessor = ColumnTransformer(transformers=[('num', numeric_transformer, numeric_features),
                                                           ('cat', "passthrough", categorical_columns)])

            # GridSearch
            pipe = Pipeline(steps=[('preprocessor', preprocessor), (selector_name,selector), (model_name, model)])
            grid = GridSearchCV(pipe, param_grid=param_grids[grid_selector], cv=kf, scoring= "neg_mean_squared_error")
            grid.fit(full_data, Mey_F.loc[:,"TimeMins"])
            output = abs(grid.best_score_ )
            
            if output < best_score:
                best_score = output
                best_grid = grid
                
    MAE_categoric_grids.append(best_grid)
    grid_selector += 1
TableCreator(MAE_categoric_grids, model_names, full_data.columns)

Unnamed: 0,modelName,modelParameter,features,RMSE,Scaler,Selector
0,lr,{},"[Age, Weight, bmi, Population, GDP, avgTime, bestTime, avgTimeScore, bestTimeScore, avgPerformanceScore, bestPerformanceScore, monthsBestTime]",2.177477,preprocessor,RFE_RF_score
1,ridge,{'ridge__alpha': 100},"[Age, Height, Weight, bmi, Year, Population, countryWins, subRegionWins, marathons, Gmarathons, Smarathons, Bmarathons, avgTime, bestTime, avgTimeScore, bestTimeScore, avgPosition, bestPosition, avgPositionScore, bestPositionScore, avgPerformanceScore, bestPerformanceScore, monthsBestTime, Africa, Asia, Europe, Latin America, Northern America, Oceania, NO homeadvantage, Home advantage]",2.17512,preprocessor,varianceTH
2,knn,{'knn__n_neighbors': 30},"[Age, Height, Weight, bmi, Year, Population, GDP, marathons, avgTime, bestTime, avgTimeScore, bestTimeScore, avgPosition, avgPerformanceScore, bestPerformanceScore, monthsBestTime]",2.186278,preprocessor,RFE_RF_Score
3,svr,"{'svr__C': 1, 'svr__gamma': 0.001}","[Age, Height, Weight, bmi, Year, Population, countryWins, subRegionWins, marathons, Gmarathons, Smarathons, Bmarathons, avgTime, bestTime, avgTimeScore, bestTimeScore, avgPosition, bestPosition, avgPositionScore, bestPositionScore, avgPerformanceScore, bestPerformanceScore, monthsBestTime, Africa, Asia, Europe, Latin America, Northern America, Oceania, NO homeadvantage, Home advantage]",2.158697,preprocessor,varianceTH
4,rf,{'rf__n_estimators': 300},"[Age, Height, Weight, bmi, Year, Population, GDP, countryWins, subRegionWins, marathons, Gmarathons, Smarathons, Bmarathons, avgTime, bestTime, avgTimeScore, bestTimeScore, avgPosition, bestPosition, avgPositionScore, bestPositionScore, avgPerformanceScore, bestPerformanceScore, monthsBestTime, Africa, Asia, Europe, Latin America, Northern America, Oceania, NO homeadvantage, Home advantage]",2.207084,preprocessor,varianceTH


**Numeric and categoric features with log transformed target variable**

In [87]:
import warnings
warnings.filterwarnings('ignore')

categoric_log_grids = []
grid_selector = 0
y = np.log(Mey_F.loc[:,"TimeMins"])

MEX_continent_homeAdvantage = MeX_dummy_CF.iloc[:, -8:]
full_data = pd.concat([MeX_NF, MEX_continent_homeAdvantage], axis=1)
categorical_columns = full_data.columns[-8:]

for model, model_name in zip(models, model_names):
    best_score = 1000000000000
    for selector, selector_name in zip(selectors, selectors_name):
        for scaler, scaler_name in zip(scalers, scaler_names):
            # pre-process PipeLines for numeric and categorical features
            numeric_transformer = Pipeline(steps = [(scaler_name, scaler)])
            categorical_transformer = Pipeline(steps = [("onehot", None)])
            preprocessor = ColumnTransformer(transformers=[('num', numeric_transformer, numeric_features),
                                                           ('cat', "passthrough", categorical_columns)])

            # GridSearch
            pipe = Pipeline(steps=[('preprocessor', preprocessor), (selector_name,selector), (model_name, model)])
            grid = GridSearchCV(pipe, param_grid=param_grids[grid_selector], cv=kf, scoring= "neg_mean_squared_error")
            grid.fit(full_data, y)
            output = abs(grid.best_score_ )
            
            if output < best_score:
                best_score = output
                best_grid = grid
                
    categoric_log_grids.append(best_grid)
    grid_selector += 1
TableCreator(categoric_log_grids, model_names, full_data.columns)

Unnamed: 0,modelName,modelParameter,features,RMSE,Scaler,Selector
0,lr,{},"[Age, Weight, bmi, Year, Population, avgTimeScore, bestTimeScore, avgPerformanceScore, bestPerformanceScore, monthsBestTime]",0.042598,preprocessor,RFE_RF_score
1,ridge,{'ridge__alpha': 100},"[Age, Height, Weight, bmi, Year, Population, countryWins, subRegionWins, marathons, Gmarathons, Smarathons, Bmarathons, avgTime, bestTime, avgTimeScore, bestTimeScore, avgPosition, bestPosition, avgPositionScore, bestPositionScore, avgPerformanceScore, bestPerformanceScore, monthsBestTime, Africa, Asia, Europe, Latin America, Northern America, Oceania, NO homeadvantage, Home advantage]",0.041699,preprocessor,varianceTH
2,knn,{'knn__n_neighbors': 30},"[Age, Height, Weight, bmi, Year, Population, GDP, marathons, avgTime, bestTime, avgTimeScore, bestTimeScore, avgPosition, avgPerformanceScore, bestPerformanceScore, monthsBestTime]",0.042362,preprocessor,RFE_RF_Score
3,svr,"{'svr__C': 0.001, 'svr__gamma': 0.001}","[avgTime, bestTime, avgTimeScore, Oceania]",0.054187,preprocessor,PercentileSelector
4,rf,{'rf__n_estimators': 200},"[Age, Height, Weight, bmi, Year, Population, GDP, countryWins, subRegionWins, marathons, Gmarathons, Smarathons, Bmarathons, avgTime, bestTime, avgTimeScore, bestTimeScore, avgPosition, bestPosition, avgPositionScore, bestPositionScore, avgPerformanceScore, bestPerformanceScore, monthsBestTime, Africa, Asia, Europe, Latin America, Northern America, Oceania, NO homeadvantage, Home advantage]",0.043624,preprocessor,varianceTH


In [90]:
import warnings
warnings.filterwarnings('ignore')

MAE_categoric_log_grids = []
grid_selector = 0
y = np.log(Mey_F.loc[:,"TimeMins"])

MEX_continent_homeAdvantage = MeX_dummy_CF.iloc[:, -8:]
full_data = pd.concat([MeX_NF, MEX_continent_homeAdvantage], axis=1)
categorical_columns = full_data.columns[-8:]

for model, model_name in zip(models, model_names):
    best_score = 1000000000000
    for selector, selector_name in zip(selectors, selectors_name):
        for scaler, scaler_name in zip(scalers, scaler_names):
            # pre-process PipeLines for numeric and categorical features
            numeric_transformer = Pipeline(steps = [(scaler_name, scaler)])
            categorical_transformer = Pipeline(steps = [("onehot", None)])
            preprocessor = ColumnTransformer(transformers=[('num', numeric_transformer, numeric_features),
                                                           ('cat', "passthrough", categorical_columns)])

            # GridSearch
            pipe = Pipeline(steps=[('preprocessor', preprocessor), (selector_name,selector), (model_name, model)])
            grid = GridSearchCV(pipe, param_grid=param_grids[grid_selector], cv=kf, scoring= "neg_mean_absolute_error")
            grid.fit(full_data, y)
            output = abs(grid.best_score_ )
            
            if output < best_score:
                best_score = output
                best_grid = grid
                
    MAE_categoric_log_grids.append(best_grid)
    grid_selector += 1
TableCreator(MAE_categoric_log_grids, model_names, full_data.columns, True)

Unnamed: 0,modelName,modelParameter,features,MAE,Scaler,Selector
0,lr,{},"[Age, Weight, bmi, Population, GDP, avgTime, bestTime, avgTimeScore, bestTimeScore, avgPerformanceScore, monthsBestTime]",0.181487,preprocessor,RFE_RF_score
1,ridge,{'ridge__alpha': 100},"[Age, Height, Weight, bmi, Year, Population, countryWins, subRegionWins, marathons, Gmarathons, Smarathons, Bmarathons, avgTime, bestTime, avgTimeScore, bestTimeScore, avgPosition, bestPosition, avgPositionScore, bestPositionScore, avgPerformanceScore, bestPerformanceScore, monthsBestTime, Africa, Asia, Europe, Latin America, Northern America, Oceania, NO homeadvantage, Home advantage]",0.181286,preprocessor,varianceTH
2,knn,{'knn__n_neighbors': 30},"[Age, Height, Weight, bmi, Year, Population, GDP, marathons, avgTime, bestTime, avgTimeScore, bestTimeScore, avgPosition, avgPerformanceScore, bestPerformanceScore, monthsBestTime]",0.182216,preprocessor,RFE_RF_Score
3,svr,"{'svr__C': 0.001, 'svr__gamma': 0.001}","[avgTime, bestTime, avgTimeScore, Oceania]",0.21637,preprocessor,PercentileSelector
4,rf,{'rf__n_estimators': 200},"[Age, Height, Weight, bmi, Year, Population, GDP, countryWins, subRegionWins, marathons, Gmarathons, Smarathons, Bmarathons, avgTime, bestTime, avgTimeScore, bestTimeScore, avgPosition, bestPosition, avgPositionScore, bestPositionScore, avgPerformanceScore, bestPerformanceScore, monthsBestTime, Africa, Asia, Europe, Latin America, Northern America, Oceania, NO homeadvantage, Home advantage]",0.184066,preprocessor,varianceTH


### Model Insights

**Coefficients and intercept of the Linear regression**

In [10]:
print(modelExtractor("lr", grids))

The features are:
 ['Age', 'Weight', 'bmi', 'Population', 'avgTimeScore', 'bestTimeScore', 'avgPerformanceScore'] 

 With coefficients: 
 [0.32578712337853544, -0.5062895122670763, 0.8132165684096506, -1.1060797110429672, -3.6549292882239404, -1.5327230778970486, 1.0621887518904616]


In [11]:
print(modelExtractor("lr", log_grids))

The features are:
 ['Age', 'Weight', 'bmi', 'Year', 'Population', 'countryWins', 'avgTimeScore', 'bestTimeScore', 'avgPosition', 'avgPerformanceScore', 'bestPerformanceScore', 'monthsBestTime'] 

 With coefficients: 
 [0.002245084194894853, -0.00471612144089575, 0.0054525361703397866, -0.0012263292057315513, -0.0067098905709428456, -0.0060789857051904836, -0.0062574743088226, 0.0074597508307014835, -0.012970758363556873, -0.019492754341703653, -0.015227148982297899, 0.002518207120368468]


In [91]:
# Categoric
cg = categoric_log_grids[0]
names = list(cg.best_estimator_.named_steps.keys())
best_iteration = cg.best_estimator_
features = list(full_data.columns[best_iteration.named_steps[names[1]].get_support()])
coefficients = list(cg.best_estimator_.named_steps["lr"].coef_)
print("The features are:\n {} \n\n With coefficients: \n {}".format(features, coefficients))

The features are:
 ['Age', 'Weight', 'bmi', 'Year', 'Population', 'avgTimeScore', 'bestTimeScore', 'avgPerformanceScore', 'bestPerformanceScore', 'monthsBestTime'] 

 With coefficients: 
 [0.002256911184350107, -0.003489999095509746, 0.005574748725250358, 0.000947768077274942, -0.0075145055291419354, -0.03532128485784231, 0.005499793380291795, 0.01833903215588408, -0.018123150706539495, 0.0017285566133097862]


In [69]:
# Categoric
cg = categoric_grids[0]
names = list(cg.best_estimator_.named_steps.keys())
best_iteration = cg.best_estimator_
features = list(full_data.columns[best_iteration.named_steps[names[1]].get_support()])
coefficients = list(cg.best_estimator_.named_steps["lr"].coef_)
print("The features are:\n {} \n\n With coefficients: \n {}".format(features, coefficients))

**Coefficients and intercept of the Ridge regression**

In [13]:
print(modelExtractor("ridge", grids))

The features are:
 ['Age', 'Height', 'Weight', 'bmi', 'Year', 'Population', 'avgTimeScore', 'bestTimeScore', 'avgPositionScore', 'avgPerformanceScore', 'bestPerformanceScore', 'monthsBestTime'] 

 With coefficients: 
 [0.24952138167885415, -0.41351940205957943, -0.0167744303835423, 0.4164076834395507, 0.053047057203550935, -0.983322191228252, -1.386774615139222, -0.9931764429755323, 0.7114615340383571, -0.9878637507212711, -1.0097819429581583, 0.20945271455729803]


In [14]:
print(modelExtractor("ridge", log_grids))

The features are:
 ['Age', 'Height', 'Weight', 'bmi', 'Year', 'Population', 'countryWins', 'subRegionWins', 'marathons', 'Gmarathons', 'Smarathons', 'Bmarathons', 'avgTime', 'bestTime', 'avgTimeScore', 'bestTimeScore', 'avgPosition', 'bestPosition', 'avgPositionScore', 'bestPositionScore', 'avgPerformanceScore', 'bestPerformanceScore', 'monthsBestTime'] 

 With coefficients: 
 [0.001794817849450127, -0.002755565469214032, -0.0002815216829743332, 0.0025633615338008908, -0.0007797692582232402, -0.005717633217489188, -0.0045066884273503695, -0.0018865383275021947, -0.0023743334530421695, 0.0011686585776160276, 0.0006402341471002008, 0.0028856111632202083, 0.005728197658243275, 0.004224321725273834, -0.005840437382984729, -0.0027128269605560104, -0.00253258612537226, -0.00023591478655244898, 0.001971573580852371, 0.0003711357929315318, -0.0051273461332068514, -0.004751715672843222, 0.0015446328981850684]


In [72]:
# Categoric
cg = categoric_grids[1]
names = list(cg.best_estimator_.named_steps.keys())
best_iteration = cg.best_estimator_
features = list(full_data.columns[best_iteration.named_steps[names[1]].get_support()])
coefficients = list(cg.best_estimator_.named_steps["ridge"].coef_)
print("The features are:\n {} \n\n With coefficients: \n {}".format(features, coefficients))

The features are:
 ['Age', 'Height', 'Weight', 'bmi', 'Year', 'Population', 'countryWins', 'subRegionWins', 'marathons', 'Gmarathons', 'Smarathons', 'Bmarathons', 'avgTime', 'bestTime', 'avgTimeScore', 'bestTimeScore', 'avgPosition', 'bestPosition', 'avgPositionScore', 'bestPositionScore', 'avgPerformanceScore', 'bestPerformanceScore', 'monthsBestTime', 'Africa', 'Asia', 'Europe', 'Latin America', 'Northern America', 'Oceania', 'NO homeadvantage', 'Home advantage'] 

 With coefficients: 
 [0.2647367949740957, -0.33385204681604996, 0.004031341022590188, 0.35309608761882283, -0.12312227608138097, -0.9353863696409553, -0.6136854782051676, -0.19467762733084273, -0.3455370826926318, 0.190743415319644, 0.042077436505928344, 0.397066656137098, 0.8224795778056704, 0.6474979279686708, -0.8000299883129451, -0.38291303337885507, -0.36730750060203327, -0.03946621842068503, 0.2829209130787791, 0.05749480694941483, -0.6948769034407449, -0.6616447600947495, 0.2242122061832858, -0.1855815574787667, 0.

In [93]:
cg = categoric_log_grids[1]
names = list(cg.best_estimator_.named_steps.keys())
best_iteration = cg.best_estimator_
features = list(full_data.columns[best_iteration.named_steps[names[1]].get_support()])
coefficients = list(cg.best_estimator_.named_steps["ridge"].coef_)
print("The features are:\n {} \n\n With coefficients: \n {}".format(features, coefficients))

The features are:
 ['Age', 'Height', 'Weight', 'bmi', 'Year', 'Population', 'countryWins', 'subRegionWins', 'marathons', 'Gmarathons', 'Smarathons', 'Bmarathons', 'avgTime', 'bestTime', 'avgTimeScore', 'bestTimeScore', 'avgPosition', 'bestPosition', 'avgPositionScore', 'bestPositionScore', 'avgPerformanceScore', 'bestPerformanceScore', 'monthsBestTime', 'Africa', 'Asia', 'Europe', 'Latin America', 'Northern America', 'Oceania', 'NO homeadvantage', 'Home advantage'] 

 With coefficients: 
 [0.0019460246157064981, -0.0022294327452238773, 1.6480775384949476e-05, 0.0023745467552013545, -0.0009864494596029281, -0.006273515352390775, -0.0045544456734816045, -0.0013071823690244382, -0.0025017529585050083, 0.0013971508342219875, 0.00026424450204833825, 0.002884443221769774, 0.005653735508671901, 0.004235426540439217, -0.005708826948673002, -0.002729901668561018, -0.0024165534762681166, -0.0002967628253925889, 0.0019183182446849223, 0.0004300468649860906, -0.004981852736474426, -0.0046160149944

**Coefficients of the SVM-regressor**

In [16]:
print(modelExtractor("svm", grids))

The features are:
 ['Age', 'Height', 'Weight', 'bmi', 'Year', 'Population', 'avgTimeScore', 'bestTimeScore', 'avgPositionScore', 'avgPerformanceScore', 'bestPerformanceScore', 'monthsBestTime'] 

 With coefficients: 
 [array([ 0.32549039, -0.244238  , -0.10853989,  0.14613133,  0.02005745,
       -0.88222621, -1.37078808, -0.92050198,  0.47445736, -1.0095212 ,
       -0.98603977, -0.02855429])]


In [17]:
print(modelExtractor("svm", log_grids))

The features are:
 ['Age', 'bmi', 'Population', 'GDP', 'avgTime', 'bestTime', 'avgTimeScore', 'bestTimeScore'] 

 With coefficients: 
 [array([ 0.00152703,  0.0027892 , -0.00199421,  0.00050906,  0.00364417,
        0.00326675, -0.0045798 , -0.00427982])]


In [80]:
cg = categoric_grids[3]
names = list(cg.best_estimator_.named_steps.keys())
best_iteration = cg.best_estimator_
features = list(full_data.columns[best_iteration.named_steps[names[1]].get_support()])
coefficients = list(cg.best_estimator_.named_steps["svr"].coef_)
print("The features are:\n {} \n\n With coefficients: \n {}".format(features, coefficients))

The features are:
 ['Age', 'Height', 'Weight', 'bmi', 'Year', 'Population', 'countryWins', 'subRegionWins', 'marathons', 'Gmarathons', 'Smarathons', 'Bmarathons', 'avgTime', 'bestTime', 'avgTimeScore', 'bestTimeScore', 'avgPosition', 'bestPosition', 'avgPositionScore', 'bestPositionScore', 'avgPerformanceScore', 'bestPerformanceScore', 'monthsBestTime', 'Africa', 'Asia', 'Europe', 'Latin America', 'Northern America', 'Oceania', 'NO homeadvantage', 'Home advantage'] 

 With coefficients: 
 [array([ 0.65563282,  0.24282226, -0.00933568,  0.05845766, -0.13933201,
       -1.03492143, -0.57924365, -0.01877973, -1.38243368,  0.79681817,
       -0.23963916,  0.72574001,  1.8353584 ,  2.13949889, -0.76621735,
        2.12315154, -1.11663538,  0.36269954, -0.33583602,  0.50012102,
       -1.18075792, -0.504496  ,  0.24840878,  0.32018463,  1.58937267,
       -0.608027  ,  0.90448368, -0.82031965, -1.38569434,  0.83525039,
       -0.83525039])]


In [94]:
cg = categoric_log_grids[3]
names = list(cg.best_estimator_.named_steps.keys())
best_iteration = cg.best_estimator_
features = list(full_data.columns[best_iteration.named_steps[names[1]].get_support()])
coefficients = list(cg.best_estimator_.named_steps["svr"].coef_)
print("The features are:\n {} \n\n With coefficients: \n {}".format(features, coefficients))

The features are:
 ['avgTime', 'bestTime', 'avgTimeScore', 'Oceania'] 

 With coefficients: 
 [array([ 0.00636101,  0.0051177 , -0.00709445,  0.        ])]


**Feature Importance of the RF**

In [19]:
print(modelExtractor("rf", grids))

The features are:
 ['Age', 'Weight', 'bmi', 'Population', 'GDP', 'avgTime', 'bestTime', 'avgTimeScore', 'bestTimeScore', 'avgPerformanceScore', 'bestPerformanceScore', 'monthsBestTime'] 

 With importances: 
 [0.0578544594392983, 0.03953347927347259, 0.060682866291493116, 0.07778564575461953, 0.0747109467130845, 0.10782560105069294, 0.18822793140032243, 0.11402366828828342, 0.11838991341084838, 0.06452616199041376, 0.046635992314861346, 0.049803334072609534]


In [20]:
print(modelExtractor("rf", log_grids))

The features are:
 ['Age', 'Height', 'Weight', 'bmi', 'Year', 'Population', 'GDP', 'countryWins', 'subRegionWins', 'marathons', 'Gmarathons', 'Smarathons', 'Bmarathons', 'avgTime', 'bestTime', 'avgTimeScore', 'bestTimeScore', 'avgPosition', 'bestPosition', 'avgPositionScore', 'bestPositionScore', 'avgPerformanceScore', 'bestPerformanceScore', 'monthsBestTime'] 

 With importances: 
 [0.035490888400011626, 0.028467109764621572, 0.02719842191933838, 0.04030158042411059, 0.027606076866852045, 0.05101184960214191, 0.05410019710948614, 0.020614404268244135, 0.012345978300299754, 0.02451773791626808, 0.012817581836952087, 0.01293545803910774, 0.005691315504323929, 0.11426767477822192, 0.16016709467674373, 0.10297903074777398, 0.10120748898853212, 0.02515196738426797, 0.00286605111350028, 0.019584152209304158, 0.002959175904068853, 0.04040832427580774, 0.037503298098964584, 0.03980714187105664]


In [95]:
cg = categoric_log_grids[4]
names = list(cg.best_estimator_.named_steps.keys())
best_iteration = cg.best_estimator_
features = list(full_data.columns[best_iteration.named_steps[names[1]].get_support()])
coefficients = list(cg.best_estimator_.named_steps["rf"].feature_importances_)
print("The features are:\n {} \n\n With coefficients: \n {}".format(features, coefficients))

The features are:
 ['Age', 'Height', 'Weight', 'bmi', 'Year', 'Population', 'GDP', 'countryWins', 'subRegionWins', 'marathons', 'Gmarathons', 'Smarathons', 'Bmarathons', 'avgTime', 'bestTime', 'avgTimeScore', 'bestTimeScore', 'avgPosition', 'bestPosition', 'avgPositionScore', 'bestPositionScore', 'avgPerformanceScore', 'bestPerformanceScore', 'monthsBestTime', 'Africa', 'Asia', 'Europe', 'Latin America', 'Northern America', 'Oceania', 'NO homeadvantage', 'Home advantage'] 

 With coefficients: 
 [0.037093363751849545, 0.02833758823288325, 0.025003800559943494, 0.038786988904410416, 0.02874874836203259, 0.05047876830439417, 0.05074615238986329, 0.021840891959146642, 0.010830392014492882, 0.02399673139156584, 0.011348963439921162, 0.015377340004620498, 0.0059406830451219726, 0.10544594577009377, 0.18854836735606273, 0.09980624994025597, 0.08361506649906912, 0.019120080368367823, 0.0028163525592127575, 0.018337055481453653, 0.0026553599951350463, 0.03975562126257547, 0.035910467239616266,

In [None]:
cg = categoric_grids[4]
names = list(cg.best_estimator_.named_steps.keys())
best_iteration = cg.best_estimator_
features = list(full_data.columns[best_iteration.named_steps[names[1]].get_support()])
coefficients = list(cg.best_estimator_.named_steps["rf"].feature_importances_)
print("The features are:\n {} \n\n With coefficients: \n {}".format(features, coefficients))

### Grids and tables

In [55]:
RMSE_grids = [grids, log_grids, categoric_grids]
MAE_gridss = [MAE_grids, MAE_log_grids, MAE_categoric_grids]

In [25]:
#rmse tables
grids_table = TableCreator(grids, model_names, MeX_NF.columns)
log_table = TableCreator(log_grids, model_names, MeX_NF.columns)
categoric_table = TableCreator(categoric_grids, model_names, full_data.columns)

RMSE_tables = [grids_table, log_table, categoric_table]

In [56]:
#MAE tables
MAE_grids_table = TableCreator(MAE_grids, model_names, MeX_NF.columns, True)
MAE_log_table = TableCreator(MAE_log_grids, model_names, MeX_NF.columns, True)
MAE_categoric_table = TableCreator(MAE_categoric_grids, model_names, full_data.columns, True)