### Library import

In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import pandas as pd
import numpy as np
import statistics
import math
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import MinMaxScaler, StandardScaler, Normalizer, Binarizer, PowerTransformer, RobustScaler
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression, LassoCV
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import SelectPercentile, SelectFromModel, RFE
from sklearn.model_selection import train_test_split, KFold, cross_val_score, GridSearchCV
from sklearn.metrics import mean_squared_error

### Functions

In [3]:
# summarizes the output of a cross-validated mse array
def MSE_summarizer(array):
    return math.sqrt(statistics.mean(abs(array)))

###  Data 

**Import the data**

In [4]:
path = r"C:\Users\BrechtDewilde\Documents\Github\Predicting-the-olympic-games\marathon\Data c&c\R\Data\output"
Me = pd.read_csv(path + "\marathonM_extended.csv")
Fe = pd.read_csv(path + "\marathonF_extended.csv")

**Subsetting the features and dummy coding the categoric features**

In [5]:
amountFeatures = Me.shape[1]

# Feature division
numeric_features = ['Age', 'Height', 'Weight', 'bmi', 'Year', 'Population', 'GDP',
       'countryWins', 'subRegionWins', 'marathons', 'Gmarathons', 'Smarathons', 'Bmarathons', 'avgTime', 'bestTime',
       'avgTimeScore', 'bestTimeScore', 'avgPosition', 'bestPosition', 'avgPositionScore', 'bestPositionScore', 'avgPerformanceScore',
       'bestPerformanceScore', 'monthsBestTime']

categoric_features = ['Nationality', 'SubRegion', 'continent', 'HomeAdvantage']

target_features = ["Position", "TimeMins", "TimeSecs"]

# X subsetting
MeX_NF = Me.loc[:, numeric_features]
MeX_CF = Me.loc[:, categoric_features]
Mey_F = Me.loc[:, target_features]

# Dummy coding
temp = pd.get_dummies(MeX_CF, columns = categoric_features)
MeX_dummy_CF = temp.rename(columns = {'continent_Africa':"Africa", 'continent_Asia':'Asia', 'continent_Europe':"Europe",
       'continent_Latin America and the Caribbean':"Latin America", 'continent_Northern America': "Northern America",
        'continent_Oceania':"Oceania", 'HomeAdvantage_False':"NO homeadvantage", 'HomeAdvantage_True':"Home advantage"})

### Model comparison

In [6]:
models = [LinearRegression(), KNeighborsRegressor(), RandomForestRegressor()]
kf = KFold(n_splits=5, random_state=1234, shuffle=True)

**Only numeric features** <br/> No pre-preprocessing/FeatureEngineering/Parameter tuning

In [None]:
lr = MSE_summarizer(cross_val_score(LinearRegression(), MeX_NF, Mey_F.loc[:,"TimeMins"], cv=kf, scoring = "neg_mean_squared_error"))
knn = MSE_summarizer(cross_val_score(KNeighborsRegressor(), MeX_NF, Mey_F.loc[:,"TimeMins"], cv=kf, scoring = "neg_mean_squared_error"))
rf = MSE_summarizer(cross_val_score(RandomForestRegressor(), MeX_NF, Mey_F.loc[:,"TimeMins"], cv=kf, scoring = "neg_mean_squared_error"))
results = pd.DataFrame({"Model":["LR", "KNN", "RF"], "score": [lr, knn, rf]})
results

**pre-processing**

In [15]:
scalers = [MinMaxScaler(), StandardScaler(), RobustScaler(), PowerTransformer(), Normalizer(), Binarizer()] 
scaler_names = ["MinMax","standard", "robust", "power", "normalizer", "binarizer"]

In [None]:
best_scores = []
best_scalers = []

for model in models:
    best_score = 1000000000000
    best_scaler = ""
    for scaler, scaler_name in zip(scalers, scaler_names):
        pipe = make_pipeline(scaler, model)
        output = MSE_summarizer(cross_val_score(pipe, MeX_NF, Mey_F.loc[:,"TimeMins"], cv=kf,
                                                scoring= "neg_mean_squared_error"))
        if output < best_score:
            best_score = output
            best_scaler = scaler_name
    best_scores.append(best_score)
    best_scalers.append(best_scaler)
results = pd.DataFrame({"Model":["LR", "KNN", "RF"], "Score": best_scores, "Scaler": best_scalers})
results

**FeatureEngineering** <br/> Polynomial 

In [None]:
best_scores = []
best_scalers = []
for model in models:
    best_score = 1000000000000
    best_scaler = ""
    for scaler, scaler_name in zip(scalers, scaler_names):
        pipe = make_pipeline(scaler, PolynomialFeatures(), model)
        output = MSE_summarizer(cross_val_score(pipe, MeX_NF, Mey_F.loc[:,"TimeMins"], 
                                                cv=kf, scoring= "neg_mean_squared_error"))
        if output < best_score:
            best_score = output
            best_scaler = scaler_name
    best_scores.append(best_score)
    best_scalers.append(best_scaler)
results = pd.DataFrame({"Model":["LR", "KNN", "RF"], "Score": best_scores, "Scaler": best_scalers})
results  

Interaction Terms

In [None]:
best_scores = []
best_scalers = []
for model in models:
    best_score = 1000000000000
    best_scaler = ""
    for scaler, scaler_name in zip(scalers, scaler_names):
        pipe = make_pipeline(scaler, PolynomialFeatures(interaction_only=True,include_bias = False), model)
        output = MSE_summarizer(cross_val_score(pipe, MeX_NF, Mey_F.loc[:,"TimeMins"], 
                                                cv=kf, scoring= "neg_mean_squared_error"))
        if output < best_score:
            best_score = output
            best_scaler = scaler_name
    best_scores.append(best_score)
    best_scalers.append(best_scaler)
results = pd.DataFrame({"Model":["LR", "KNN", "RF"], "Score": best_scores, "Scaler": best_scalers})
results  

**ParameterTuning**

In [8]:
knn_grid = {"knn__n_neighbors":[1,5,20,30,50,100,200]} 
RF_grid = {"rf__n_estimators":[100,200,300]}
param_grids = [None, knn_grid, RF_grid]
model_names = ["lr", "knn", "rf"]

In [None]:
import warnings
warnings.filterwarnings('ignore')

In [None]:
best_scores = []
best_scalers = []
best_parameters = []
grid_selector = 0

for model, model_name in zip(models, model_names):
    best_score = 1000000000000
    
    for scaler, scaler_name in zip(scalers, scaler_names):
        if grid_selector == 0:
            pipe = make_pipeline(scaler, model)
            output = MSE_summarizer(cross_val_score(pipe, MeX_NF, Mey_F.loc[:,"TimeMins"], cv=kf, 
                                                    scoring= "neg_mean_squared_error"))
            if output < best_score:
                best_score = output
                best_scaler = scaler_name
                best_param = ""
                
        else:
            pipe = Pipeline([(scaler_name,scaler), (model_name, model)])
            grid = GridSearchCV(pipe, param_grid=param_grids[grid_selector],  cv=kf, 
                                scoring= "neg_mean_squared_error")
            grid.fit(MeX_NF, Mey_F.loc[:,"TimeMins"])
            output = math.sqrt(abs(grid.best_score_ ))
            if output < best_score:
                best_score = output
                best_scaler = scaler_name
                best_param = grid.best_params_
    
    best_scores.append(best_score)
    best_scalers.append(best_scaler)
    best_parameters.append(best_param)
    
    grid_selector += 1
    
results = pd.DataFrame({"Model":["LR", "KNN", "RF"], "score": best_scores, "Scaler": best_scalers, "params":best_parameters})
results  

**Feature Selection** <br/> °FeatureSelection: SelectPercentile <br/> °ModelSelection:RandomForestClassifier <br/> °IterativeFeatureSelection: RFE

In [None]:
# linear regression -- misschien gebruik maken van de fit_params attribute?
# We zullen eerst de scaler en feature selection uitvoeren om zo de features te weten
# dan pipeline om uitkomst te weten als die uitkomst beter is dan geven we die features mee voor de finale uitkomst

In [10]:
selectors = [SelectPercentile(), SelectFromModel(RandomForestRegressor(random_state=1234)), 
             RFE(RandomForestRegressor(random_state=1234))]
selectors_name = ["PercentileSelector", "RFE_RF_score", "RFE_RF_Score"]

In [16]:
import warnings
warnings.filterwarnings('ignore')

In [17]:
best_scores = []
best_scalers = []
best_parameters = []
selector_algorit = []
selected_features = []
grid_selector = 0

for model, model_name in zip(models, model_names):
    best_score = 1000000000000
    for selector, selector_name in zip(selectors, selectors_name):
        for scaler, scaler_name in zip(scalers, scaler_names):
            # obtain estimate of the selected columns
            sc = selector.fit_transform(scaler.fit_transform(MeX_NF),  Mey_F.loc[:,"TimeMins"])
            sf = list(MeX_NF.columns[selector.get_support()])
            
            if grid_selector == 0:
                pipe = make_pipeline(scaler, selector, model)
                output = MSE_summarizer(cross_val_score(pipe, MeX_NF, Mey_F.loc[:,"TimeMins"], cv=kf,
                                                        scoring= "neg_mean_squared_error"))
                if output < best_score:
                    best_score = output
                    best_scaler = scaler_name
                    selector_alg = selector_name
                    selected_f = sf
                    best_param = ""
                    

            else:
                pipe = Pipeline([(scaler_name,scaler),(selector_name, selector),(model_name, model)])
                grid = GridSearchCV(pipe, param_grid=param_grids[grid_selector],  cv=kf, 
                                    scoring= "neg_mean_squared_error")
                grid.fit(MeX_NF, Mey_F.loc[:,"TimeMins"])
                output = math.sqrt(abs(grid.best_score_ ))
                if output < best_score:
                    best_score = output
                    best_scaler = scaler_name
                    selector_alg = selector_name
                    selected_f = sf
                    best_param = grid.best_params_
    
    best_scores.append(best_score)
    best_scalers.append(best_scaler)
    best_parameters.append(best_param)
    selected_features.append(selected_f)
    selector_algorit.append(selector_alg)
    
    grid_selector += 1
    
results = pd.DataFrame({"Model":["LR", "KNN", "RF"], "score": best_scores, "Scaler": best_scalers, 
                        "Selector": selector_algorit, "features":selected_features, "params": best_parameters})
results  

Unnamed: 0,Model,score,Scaler,Selector,features,params
0,LR,6.136214,power,RFE_RF_score,"[Age, Weight, bmi, Population, avgTimeScore, b...",
1,KNN,6.163878,robust,RFE_RF_Score,"[Age, Weight, bmi, Population, GDP, avgTime, b...",{'knn__n_neighbors': 30}
2,RF,6.353966,robust,RFE_RF_Score,"[Age, Weight, bmi, Population, GDP, avgTime, b...",{'rf__n_estimators': 200}


In [19]:
pd.set_option('display.max_colwidth', -1)
results

Unnamed: 0,Model,score,Scaler,Selector,features,params
0,LR,6.136214,power,RFE_RF_score,"[Age, Weight, bmi, Population, avgTimeScore, bestTimeScore, avgPerformanceScore]",
1,KNN,6.163878,robust,RFE_RF_Score,"[Age, Weight, bmi, Population, GDP, avgTime, bestTime, avgTimeScore, bestTimeScore, avgPerformanceScore, bestPerformanceScore, monthsBestTime]",{'knn__n_neighbors': 30}
2,RF,6.353966,robust,RFE_RF_Score,"[Age, Weight, bmi, Population, GDP, avgTime, bestTime, avgTimeScore, bestTimeScore, avgPerformanceScore, bestPerformanceScore, monthsBestTime]",{'rf__n_estimators': 200}
