### Library import

In [1]:
import pandas as pd
import numpy as np
import statistics
import math
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import MinMaxScaler, StandardScaler, Normalizer, Binarizer, PowerTransformer, RobustScaler
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.feature_selection import SelectPercentile, SelectFromModel, RFE, VarianceThreshold
from sklearn.model_selection import train_test_split, KFold, cross_val_score, GridSearchCV
from sklearn.metrics import mean_squared_error

### Data 

**Data import**

In [2]:
# Folder that holds the extended marathon CSV exports.
# NOTE(review): absolute, machine-specific Windows path - the notebook only
# runs on a machine with this exact layout; consider a configurable data dir.
path = r"C:\Users\BrechtDewilde\Documents\Github\Predicting-the-olympic-games\marathon\Data c&c\R\Data\output"

# The file-name suffixes are raw strings so the leading backslash stays a
# literal path separator; "\m" in a normal string is an invalid escape
# sequence that recent Python versions warn about.
# Me / Fe: presumably the men's and women's result tables - TODO confirm.
Me = pd.read_csv(path + r"\marathonM_extended.csv")
Fe = pd.read_csv(path + r"\marathonF_extended.csv")

**Data subsetting**

In [3]:
# Number of columns in the Me frame.
amountFeatures = Me.shape[1]

# Feature division
# Numeric columns; trainTestCreator selects exactly these as model inputs.
numeric_features = [
    'Age', 'Height', 'Weight', 'bmi',
    'Year', 'Population', 'GDP',
    'countryWins', 'subRegionWins',
    'marathons', 'Gmarathons', 'Smarathons', 'Bmarathons',
    'avgTime', 'bestTime',
    'avgTimeScore', 'bestTimeScore',
    'avgPosition', 'bestPosition',
    'avgPositionScore', 'bestPositionScore',
    'avgPerformanceScore', 'bestPerformanceScore',
    'monthsBestTime',
]

# Non-numeric columns, kept separate from the numeric inputs.
categoric_features = [
    'Nationality',
    'SubRegion',
    'continent',
    'HomeAdvantage',
]

# Columns trainTestCreator returns as the targets (y) frame.
target_features = [
    "Position",
    "TimeMins",
    "TimeSecs",
]

### Functions

In [4]:
# Make time predictions for the 2016 dataframe 
# output is a dictionary, given index (lr, ...) right predictions can be obtained
# Use the predicted times to assign positions to each time
def time2position(timeList):
    """Rank every value of ``timeList`` within the list itself (1 = smallest).

    Parameters
    ----------
    timeList : iterable of mutually comparable, hashable values
        The values to rank, e.g. the predictions of one model.

    Returns
    -------
    list of int
        For each element, in input order, its 1-based position in the
        ascending sort of ``timeList``. Equal values share the position of
        their first occurrence in the sorted order, so the next distinct
        value skips ahead (e.g. ``[2, 1, 2, 5] -> [2, 1, 2, 4]``).
    """
    # Record the first (lowest) slot each distinct value takes in sorted
    # order. A dict lookup replaces the repeated linear ``list.index`` scan,
    # turning the quadratic ranking into a single sort plus O(1) lookups.
    first_rank = {}
    for rank, value in enumerate(sorted(timeList), start=1):
        first_rank.setdefault(value, rank)
    return [first_rank[value] for value in timeList]

def positionComparizer(predicted_positions, actual_positions):
    """Compare predicted positions with actual positions, pair by pair.

    Parameters
    ----------
    predicted_positions : sized iterable of numbers
        Predicted positions; its length is the denominator of ``"procent"``.
    actual_positions : iterable of numbers
        Actual positions, paired with the predictions in iteration order.

    Returns
    -------
    dict
        ``"procent"``   - fraction of predictions equal to the actual value
                          (0.0 for empty input).
        ``"deviation"`` - mean absolute difference over the wrong predictions
                          only (0.0 when nothing was predicted wrongly).
        ``"corrected"`` - list of the predicted values that were correct.
    """
    total = len(predicted_positions)
    correct_predicted = []
    misses = 0
    miss_distance = 0

    for predicted, actual in zip(predicted_positions, actual_positions):
        if predicted == actual:
            correct_predicted.append(predicted)
        else:
            misses += 1
            miss_distance += abs(predicted - actual)

    # Guard both divisions: a perfect prediction has no misses to average
    # over and an empty input has no predictions at all; either case used to
    # raise ZeroDivisionError.
    return {
        "procent": len(correct_predicted) / total if total else 0.0,
        "deviation": miss_distance / misses if misses else 0.0,
        "corrected": correct_predicted,
    }

def positionTableCreator(positionList, actuals):
    """Build a one-row-per-model summary of position-prediction quality.

    Parameters
    ----------
    positionList : iterable
        Predicted positions per model, in the order lr, ridge, knn, svr, rf.
    actuals : iterable
        Actual positions every model is compared against.

    Returns
    -------
    pandas.DataFrame
        Columns ``models``, ``%`` (fraction correct), ``deviation`` (rounded
        to 2 decimals) and ``correct`` (the correctly predicted positions).
    """
    labels = ["lr", "ridge", "knn", "svr", "rf"]

    # Score each model's predictions; zip pairs them with the fixed labels.
    scores = [
        positionComparizer(model_positions, actuals)
        for _label, model_positions in zip(labels, positionList)
    ]

    return pd.DataFrame({
        "models": labels,
        "%": [score["procent"] for score in scores],
        "deviation": [round(score["deviation"], 2) for score in scores],
        "correct": [score["corrected"] for score in scores],
    })

def trainTestCreator(year):
    """Split the global ``Me`` frame into a train and a test part by ``Year``.

    Rows whose ``Year`` equals ``year`` form the test set; all other rows
    form the training set.

    Returns
    -------
    tuple
        ``(trainX, trainY, testX, testY)`` where the X frames hold the
        ``numeric_features`` columns and the Y frames hold the
        ``target_features`` columns.
    """
    def _inputs_and_targets(rows):
        # Same column selection for both halves of the split.
        return rows.loc[:, numeric_features], rows.loc[:, target_features]

    year_column = Me["Year"]
    testX, testY = _inputs_and_targets(Me[year_column == year])
    trainX, trainY = _inputs_and_targets(Me[year_column != year])

    return trainX, trainY, testX, testY

### Train best regression models on MeNo2016

These estimators are trained and tested on `MeNo2016`, i.e. the rows of `Me` that fall outside the held-out year.

In [5]:
# In this cell we initialize, per model, the pipeline that gave the best results.
# Every stochastic component is seeded with 1234 so that refitting the
# pipelines reproduces the same numbers.
kf = KFold(n_splits=5, random_state=1234, shuffle=True)

# Note: the steps named "RFE"/"rfe" are SelectFromModel selectors backed by a
# random forest, not sklearn's RFE; the step names are kept unchanged.
lr_pipe = Pipeline([("power", PowerTransformer()), ("RFE", SelectFromModel(RandomForestRegressor(random_state=1234))),
                    ("lr", LinearRegression())])
ridge_pipe = Pipeline([("power", PowerTransformer()), ("var",VarianceThreshold()), ("ridge",Ridge())])
knn_pipe = Pipeline([("robust",RobustScaler()),("RFE",SelectFromModel(RandomForestRegressor(random_state=1234))),
                    ("knn", KNeighborsRegressor())])
svr_pipe = Pipeline([("robust", RobustScaler()), ("rfe",SelectFromModel(RandomForestRegressor(random_state=1234))),
                     ("svr",SVR())])
# The forest used as the final estimator is seeded like the selector forests
# above; unseeded, the "rf" results changed on every run.
rf_pipe = Pipeline([("robust", RobustScaler()), ("var", VarianceThreshold()),
                    ("rf", RandomForestRegressor(random_state=1234))])

# Order matters: positionTableCreator labels the results lr, ridge, knn, svr, rf.
pipes = [lr_pipe, ridge_pipe, knn_pipe, svr_pipe, rf_pipe]

In [6]:
def positionListCreator(year):
    """Fit every pipeline in ``pipes`` and rank its predictions for ``year``.

    The rows of ``year`` are held out via ``trainTestCreator``; each pipeline
    is fitted on the remaining rows with ``"Position"`` as the target, and its
    predictions for the held-out rows are converted to ranks with
    ``time2position``.

    Note: the module-level pipelines in ``pipes`` are refitted in place.

    Returns
    -------
    tuple
        ``(positionList, actual)`` - one list of predicted ranks per pipeline
        (in ``pipes`` order) and the actual ``"Position"`` column of the
        held-out rows.
    """
    import warnings

    positionList = []

    # Suppress warnings only for the duration of this call; the previous
    # filter state is restored on exit instead of silencing every later
    # warning in the session.
    with warnings.catch_warnings():
        warnings.filterwarnings('ignore')

        trainX, trainY, testX, testY = trainTestCreator(year)

        for pipe in pipes:
            pipe.fit(trainX, trainY["Position"])
            positionList.append(time2position(pipe.predict(testX)))

    return positionList, testY["Position"]

### PositionPredictionSummary table for each year

In [26]:
# Hold out the rows with Year == 1972, fit every pipeline on the remaining
# rows and rank each pipeline's predictions for the held-out rows.
pList, actualPos = positionListCreator(1972)
# Last expression of the cell: displays the per-model summary table.
positionTableCreator(pList, actualPos)

Unnamed: 0,models,%,deviation,correct
0,lr,0.111111,20.12,[2]
1,ridge,0.111111,20.12,[2]
2,knn,0.111111,19.88,[2]
3,svr,0.111111,20.12,[2]
4,rf,0.0,18.33,[]
