In [1]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
import pickle

# Import Data and Prepare

In [2]:
# The 2019 London Marathon results are spread over six numbered CSV files.
data_files = ['../data/london_marathon_2019_{}.csv'.format(part) for part in range(1, 7)]
dfs = [pd.read_csv(path) for path in data_files]
# ignore_index=True gives the combined frame one fresh 0..n-1 index. Without it
# every file contributes its own 0..k row labels, so labels repeat across files
# and label-based lookups or alignment on `data` become ambiguous.
data = pd.concat(dfs, ignore_index=True)

In [3]:
def time_to_seconds(time):
    """Convert a 'MM:SS' or 'HH:MM:SS' string into a number of seconds.

    Parameters
    ----------
    time : str, float NaN or None
        The time to convert. Only the last three colon-separated fields are
        read (hours, minutes, seconds, counted from the right).

    Returns
    -------
    int or float
        The total number of seconds as an int, or ``np.nan`` when the value is
        missing (NaN / None) or is a string without a ':' separator.

    Raises
    ------
    ValueError
        If a colon-separated field is not an integer literal.
    """
    # Empty CSV cells are read by pandas as float NaN; `':' in time` would
    # raise TypeError on those, so report them as missing like any other
    # unparseable entry.
    if time is None or (isinstance(time, float) and np.isnan(time)):
        return np.nan
    if ':' not in time:
        return np.nan

    parts = time.split(':')
    seconds = int(parts[-1]) + int(parts[-2]) * 60
    if len(parts) > 2:
        seconds += int(parts[-3]) * 3600
    return seconds

In [4]:
def seconds_to_time(time):
    """Format a duration in seconds as 'MM:SS', or 'HH:MM:SS' when it has hours.

    Parameters
    ----------
    time : int or float
        Duration in seconds. Fractional seconds are truncated, not rounded.

    Returns
    -------
    str
        Zero-padded 'MM:SS' when the hour field is zero, otherwise
        'HH:MM:SS'. Negative durations are formatted from their magnitude
        and prefixed with '-'.
    """
    # Work on the magnitude: floor division on a negative value would borrow
    # from the next field (e.g. -5 would come out as '-1:59:55').
    remaining = abs(time)

    hours = remaining // 3600
    remaining -= hours * 3600
    minutes = remaining // 60
    remaining -= minutes * 60

    # int() drops the fractional part, so 10.6 seconds is shown as 10.
    fields = [int(hours), int(minutes), int(remaining)]

    # Only show a minus sign when something non-zero survives truncation,
    # which avoids printing '-00:00'.
    sign = '-' if time < 0 and any(fields) else ''

    # The hour field is omitted entirely when it is zero.
    if fields[0] == 0:
        fields = fields[1:]
    return sign + ':'.join(str(value).zfill(2) for value in fields)

In [5]:
# Replace every column's time strings with second counts; entries that
# time_to_seconds cannot interpret as a time become NaN.
# Note: this overwrites `data`, so the cell is meant to run once per load.
for column in data.columns:
    data[column] = data[column].apply(time_to_seconds)

## Remove Incomplete Rows

In [6]:
# Drop every row that has a NaN in any column. Reassigning the result instead
# of passing inplace=True follows pandas guidance: inplace gives no speed-up
# and hides the mutation from anyone reading the cell.
data = data.dropna(axis=0)

## Split Into X and Y

In [7]:
# Features: the seven columns '5' through '35'.
X = data.loc[:, ['5', '10', '15', '20', '25', '30', '35']]
# Target: the '40' and 'Finish' columns added together
# (presumably the time from the 35 km mark to the finish line; confirm).
y = data['40'].add(data['Finish'])

# Build Model

In [8]:
# Fit a linear regression (scikit-learn defaults) of the target y on the
# split columns in X. fit() returns the estimator itself, so `model` is fitted.
model = LinearRegression().fit(X, y)

## Predictions

In [9]:
def pred_linear_reg(splits, model=model):
    """Predict the model's target time from a list of split-time strings.

    Parameters
    ----------
    splits : sequence of str
        Split times as 'MM:SS' or 'HH:MM:SS' strings, one per feature the
        model was fitted on and in the same order.
    model : fitted estimator, optional
        Object exposing ``predict``. The default is the global ``model`` as it
        existed when this cell was executed (Python evaluates defaults at
        definition time), so re-run this cell after refitting.

    Returns
    -------
    str
        The predicted duration, formatted by ``seconds_to_time``.
    """
    feature_row = list(map(time_to_seconds, splits))
    # predict() expects a 2-D input; wrap the single row and unwrap the result.
    predicted_seconds = model.predict([feature_row])[0]
    return seconds_to_time(predicted_seconds)

In [10]:
def total_time_from_splits(splits, model=model):
    """Estimate an overall time: the given splits plus the model's prediction.

    Parameters
    ----------
    splits : sequence of str
        Split times as 'MM:SS' or 'HH:MM:SS' strings, one per feature the
        model was fitted on and in the same order.
    model : fitted estimator, optional
        Object exposing ``predict``. The default is the global ``model`` as it
        existed when this cell was executed, so re-run this cell after
        refitting.

    Returns
    -------
    str
        Sum of the supplied splits and the predicted remainder, formatted by
        ``seconds_to_time``.
    """
    elapsed = list(map(time_to_seconds, splits))
    # predict() expects a 2-D input; wrap the single row and unwrap the result.
    predicted_remainder = model.predict([elapsed])[0]
    return seconds_to_time(sum(elapsed) + predicted_remainder)

In [11]:
# Example: predict the model's target (the '40' + 'Finish' time) for seven
# evenly paced splits.
pred_linear_reg(['14:10', '14:10', '14:14', '14:13', '14:12', '14:12', '14:12'])

'23:10'

In [12]:
# Example: the same seven splits, this time adding them to the model's
# prediction to get an overall time.
total_time_from_splits(['14:10', '14:10', '14:14', '14:13', '14:12', '14:12', '14:12'])

'02:02:33'

## Output To Pickle File

In [13]:
# Serialise the fitted model to disk; 'wb' because pickle writes bytes.
with open('../models/linear_reg_test.pkl', 'wb') as f:
    pickle.dump(model, f)

# Load Model and Make Predictions

In [14]:
# NOTE: pickle.load can execute arbitrary code embedded in the file, so only
# load pickles from a trusted source (this path is the one written earlier in
# this notebook).
with open('../models/linear_reg_test.pkl', 'rb') as f:
    loaded_model = pickle.load(f)

In [26]:
# Recorded checkpoint times; the values increase, so they appear to be
# cumulative clock times rather than per-segment durations.
my_splits = [
    '28:02', '54:34', '01:20:58', '01:48:19', '01:54:12',
    '02:18:42', '02:54:03', '03:31:06', '04:06:54', '04:20:24',
]
# Same list without the entry at index 4 (presumably a checkpoint that has no
# matching model feature; confirm).
my_splits_short = [stamp for position, stamp in enumerate(my_splits) if position != 4]

def total_splits_to_split(splits):
    """Convert cumulative time strings into per-segment duration strings.

    Parameters
    ----------
    splits : list of str
        Cumulative times as 'MM:SS' or 'HH:MM:SS' strings, in order.

    Returns
    -------
    list of str
        The first entry re-formatted as-is, followed by the difference between
        each entry and the one before it, all formatted by ``seconds_to_time``.
    """
    cumulative = [time_to_seconds(stamp) for stamp in splits]
    # The first segment starts at zero, so its duration is its cumulative time.
    durations = cumulative[:1] + [
        later - earlier for earlier, later in zip(cumulative, cumulative[1:])
    ]
    return [seconds_to_time(duration) for duration in durations]

# Turn the shortened list of cumulative times into per-segment durations.
my_splits_split = total_splits_to_split(my_splits_short)

In [27]:
# Estimate an overall time from the first seven per-segment durations, using
# the model reloaded from the pickle file instead of the in-memory one.
total_time_from_splits(my_splits_split[:7], model=loaded_model)

'04:23:25'