In [49]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.utils import shuffle
from sklearn.metrics import mean_squared_error
import pickle

# Import Data and Prepare

In [2]:
data_files = ['../data/london_marathon_2019_'+str(x)+'.csv' for x in range(1, 7)]
dfs = [pd.read_csv(f) for f in data_files]
data = pd.concat(dfs)
data = shuffle(data, random_state=0).reset_index(drop=True)

In [3]:
def time_to_seconds(time):
    if ':' in time:
        time_parts = time.split(':')
        seconds = int(time_parts[-1]) + (int(time_parts[-2]) * 60)
        if len(time_parts) > 2:
            seconds += int(time_parts[-3]) * 3600
        return int(seconds)
    else:
        return np.nan

In [4]:
def seconds_to_time(time):
    hours = time // 3600
    time -= hours*3600
    hours = str(int(hours))
    if len(hours) < 2:
        hours = '0' + hours
    
    minutes = time // 60
    time -= minutes*60
    minutes = str(int(minutes))
    if len(minutes) < 2:
        minutes = '0' + minutes
        
    seconds = str(int(time))
    if len(seconds) < 2:
        seconds = '0' + seconds
    
    if hours != '00':
        return hours + ':' + minutes + ':' + seconds
    return minutes + ':' + seconds

In [5]:
for column in data.columns:
    data[column] = data[column].apply(lambda x: time_to_seconds(x))

## Remove Incomplete Rows

In [6]:
data.dropna(axis=0, inplace=True)
data.reset_index(drop=True, inplace=True)

## Split Into X and Y

In [7]:
X = data[['5', '10', '15', '20', '25', '30', '35']]
y = data['40'] + data['Finish']

In [8]:
X_train = X.loc[:4000]
X_test = X.loc[4000:]
y_train = y.loc[:4000]
y_test = y.loc[4000:]

# Build Model

In [62]:
def get_model():
    #return KNeighborsRegressor(n_neighbors=17)
    return MLPRegressor(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(15,5,), random_state=1, max_iter=1000)
    #return LinearRegression()

In [10]:
def build_and_save_model(model, X, y, file_name='../models/test.pkl'):
    size_80 = int(X.shape[0] * 0.8)
    X_train = X.loc[:size_80]
    X_test = X.loc[size_80:]
    y_train = y.loc[:size_80]
    y_test = y.loc[size_80:]
    
    model.fit(X_train, y_train)
    
    print(mean_squared_error(y_test, model.predict(X_test)))
    
    with open(file_name, 'wb') as f:
        pickle.dump(model, f)
    
    return model

## After 5km

In [63]:
model = build_and_save_model(get_model(),
                             data[['5']],
                             data['10']+data['15']+data['20']+data['25']+data['30']+data['35']+data['40']+data['Finish'],
                             file_name='../models/main_model_5.pkl')

1898901.2044839964


## After 10km

In [64]:
model = build_and_save_model(get_model(),
                             data[['5', '10']],
                             data['15']+data['20']+data['25']+data['30']+data['35']+data['40']+data['Finish'],
                             file_name='../models/main_model_10.pkl')

1507424.5007138993


## After 15km

In [65]:
model = build_and_save_model(get_model(),
                             data[['5', '10', '15']],
                             data['20']+data['25']+data['30']+data['35']+data['40']+data['Finish'],
                             file_name='../models/main_model_15.pkl')

975267.0622247908


## After 20km

In [66]:
model = build_and_save_model(get_model(),
                             data[['5', '10', '15', '20']],
                             data['25']+data['30']+data['35']+data['40']+data['Finish'],
                             file_name='../models/main_model_20.pkl')

732092.335038439


## After 25km

In [67]:
model = build_and_save_model(get_model(),
                             data[['5', '10', '15', '20', '25']],
                             data['30']+data['35']+data['40']+data['Finish'],
                             file_name='../models/main_model_25.pkl')

481364.94403586065


## After 30km

In [68]:
model = build_and_save_model(get_model(),
                             data[['5', '10', '15', '20', '25', '30']],
                             data['35']+data['40']+data['Finish'],
                             file_name='../models/main_model_30.pkl')

159085.81899650797


## After 35km

In [69]:
model = build_and_save_model(get_model(),
                             data[['5', '10', '15', '20', '25', '30', '35']],
                             data['40']+data['Finish'],
                             file_name='../models/main_model_35.pkl')

47929.12765610776


## After 40km

In [70]:
model = build_and_save_model(get_model(),
                             data[['5', '10', '15', '20', '25', '30', '35', '40']],
                             data['Finish'],
                             file_name='../models/main_model_40.pkl')

10746.750166502066


In [90]:
#model = LinearRegression().fit(X_train, y_train)
#model = KNeighborsRegressor(n_neighbors=17).fit(X_train, y_train)

## Performance?

In [19]:
#mean_squared_error(y_test, model.predict(X_test))

51723.17451394709

## Predictions

In [12]:
def pred_linear_reg(splits, model=model):
    splits = [time_to_seconds(x) for x in splits]
    pred = model.predict([splits])[0]
    return seconds_to_time(pred)

In [13]:
def total_time_from_splits(splits, model=model):
    splits = [time_to_seconds(x) for x in splits]
    pred = model.predict([splits])[0]
    total_time = sum(splits) + pred
    return seconds_to_time(total_time)

In [14]:
pred_linear_reg(['14:10', '14:10', '14:14', '14:13', '14:12', '14:12', '14:12'])

'26:48'

In [15]:
total_time_from_splits(['14:10', '14:10', '14:14', '14:13', '14:12', '14:12', '14:12'])

'02:06:11'

## Output To Pickle File

In [96]:
#with open('../models/knn_test.pkl', 'wb') as f:
#    pickle.dump(model, f)

# Load Model and Make Predictions

In [97]:
#with open('../models/knn_test.pkl', 'rb') as f:
#    loaded_model = pickle.load(f)

In [17]:
my_splits = ['28:02', '54:34', '01:20:58', '01:48:19', '01:54:12', '02:18:42', '02:54:03', '03:31:06', '04:06:54', '04:20:24']
my_splits_short = my_splits[0:4] + my_splits[5:]

def total_splits_to_split(splits):
    seconds = [time_to_seconds(x) - time_to_seconds(splits[i-1]) if i > 0 else time_to_seconds(x) for i, x in enumerate(splits)]
    splits = [seconds_to_time(x) for x in seconds]
    return splits

my_splits_split = total_splits_to_split(my_splits_short)

In [18]:
total_time_from_splits(my_splits_split[:7], model=model)

'04:25:17'