# Place Predictions

In [1]:
import pymysql
import sqlalchemy
import pandas as pd
import numpy as np
import sys
from tqdm import tqdm

import urllib
import requests
import json
import time
import datetime

import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate
import statsmodels.api as sm

import sys
import threading
from multiprocessing import Queue, Pool
import extract_current_data_sporting_life_helper_functions as hf
import importlib
importlib.reload(hf)

<module 'extract_current_data_sporting_life_helper_functions' from '/home/angus/projects/betting/tote/extract_current_data_sporting_life_helper_functions.py'>

In [2]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

## Get data for today from Sporting Life

In [3]:
todays_date = datetime.date.today().strftime("%Y-%m-%d")
#yyyymmdd = todays_date.strftime('%Y')+'-'+todays_date.strftime('%m')+'-'+todays_date.strftime('%d')

In [4]:
todays_date

'2021-04-24'

In [5]:
# read data from SL
dateurl = 'https://www.sportinglife.com/api/horse-racing/racing/racecards/'+todays_date
data_json = urllib.request.urlopen(dateurl).read()
data_dict = json.loads(data_json)

In [6]:
# races data
races_data = hf.get_days_races(data_dict)
races_df = pd.DataFrame(races_data, columns=hf.races_data_columns)

In [7]:
races_df.head(3)

Unnamed: 0,race_date,course,country,feed_source,surface,going,weather,meeting_id,meeting_order,race_time,age,distance,has_handicap,name,off_time,race_class,race_id,runners,winning_time
0,2021-04-24,Sandown,England,RUK,Turf,Good (Good to Firm in places),Sunny,83353,1,12:55,4YO plus,1m 7f 216y,True,bet365 Novices' Championship Final Handicap Hu...,12:55:27,2,622909,14,3m 55.9s
1,2021-04-24,Sandown,England,RUK,Turf,Good (Good to Firm in places),Sunny,83353,2,13:30,5YO plus,2m 6f 164y,False,bet365 Oaksey Chase (Grade 2) (GBB Race),13:30:49,1,622910,4,5m 53.0s
2,2021-04-24,Sandown,England,RUK,Turf,Good (Good to Firm in places),Sunny,83353,3,14:05,5YO plus,1m 7f 119y,False,bet365 Celebration Chase (Grade 1) (GBB Race),14:06:54,1,622911,7,3m 51.3s


In [8]:
# read horses data for each race
raceids_to_get = list(races_df['race_id'])
raceurls = ['https://www.sportinglife.com/api/horse-racing/race/'+str(i) for i in raceids_to_get]
p = Pool(4)
racedicts = list(tqdm(p.imap(hf.read_url, raceurls), total=len(raceurls)))

horses_data = []
for i in tqdm(racedicts):
    horses_data = horses_data + hf.get_horses_from_race(i)
    
horses_df = pd.DataFrame(horses_data, columns=hf.horse_data_columns)

100%|██████████| 204/204 [00:11<00:00, 17.05it/s]
100%|██████████| 204/204 [00:00<00:00, 2547.01it/s]


In [9]:
horses_df.head(3)

Unnamed: 0,race_date,race_id,course,surface,going,race_time,age,distance,has_handicap,off_time,...,pr_6_going,pr_6_odds,pr_6_position,pr_6_race_class,pr_6_race_id,pr_6_course_name,pr_6_run_type,pr_6_runner_count,pr_6_time,pr_6_weight
0,2021-04-24,622909,Sandown,TURF,Good (Good to Firm in places),12:55,4YO plus,1m 7f 216y,True,12:55:27,...,Good,12/1,7.0,5,597033.0,Huntingdon,N_H_FLAT,14.0,15:50:00,11-5
1,2021-04-24,622909,Sandown,TURF,Good (Good to Firm in places),12:55,4YO plus,1m 7f 216y,True,12:55:27,...,Good,1/25,1.0,3,596704.0,Wincanton,HURDLE,4.0,13:15:00,11-8
2,2021-04-24,622909,Sandown,TURF,Good (Good to Firm in places),12:55,4YO plus,1m 7f 216y,True,12:55:27,...,Standard / Slow,9/2,7.0,2,598358.0,Kempton,FLAT,9.0,19:25:00,10-0


In [10]:
horses_df.shape

(1908, 126)

## Get data in shape

In [11]:
def odds_parser(odds_string):
    try:
        odds_split = odds_string.split('/')
        decimal_odds = (int(odds_split[0])+int(odds_split[1]))/int(odds_split[1])
        return decimal_odds
    except:
        return 1

horses_df['decimal_odds'] = [odds_parser(o) for o in horses_df['betting_odds']]

#### Get places data together

In [12]:
# placing rules
ew_rules = pd.DataFrame({
    'runners': [1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,
                1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22],
    'handicap': [1]*22+[0]*22,
    'ew_odds': [0,0,0,0,0.25,0.25,0.25,0.2,0.2,0.2,0.2,0.25,0.25,0.25,0.25,0.25,0.25,0.25,0.25,0.25,0.25,0.25,
                0,0,0,0,0.25,0.25,0.25,0.2,0.2,0.2,0.2,0.2,0.2,0.2,0.2,0.2,0.2,0.2,0.2,0.2,0.2,0.2],
    'places': [0,0,0,0,2,2,2,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,
               0,0,0,0,2,2,2,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3]
})

In [13]:
races_handicaps = horses_df.groupby('race_id').agg({
    'runners': min,
    'number_of_placed_rides': min,
    'has_handicap': max
}).reset_index().rename(columns={'has_handicap': 'handicap'})
#races_handicaps['handicap'] = (races_handicaps['max_handicap']>0)*1
races_handicaps = races_handicaps.merge(ew_rules, how='left', on=['runners', 'handicap'])

In [14]:
sum(races_handicaps['number_of_placed_rides']!=races_handicaps['places'])/len(races_handicaps)

0.058823529411764705

In [15]:
races_handicaps[races_handicaps['number_of_placed_rides']!=races_handicaps['places']].head(5)

Unnamed: 0,race_id,runners,number_of_placed_rides,handicap,ew_odds,places
2,622897,3,1,True,0.0,0
3,622898,3,1,True,0.0,0
6,622901,4,1,False,0.0,0
7,622902,4,1,False,0.0,0
9,622904,4,1,False,0.0,0


In [16]:
races_handicaps = races_handicaps[races_handicaps['number_of_placed_rides']==races_handicaps['places']]

In [17]:
len(races_handicaps)

192

#### Subset countries

In [18]:
countries = ['scotland', 'england', 'northern ireland', 'eire', 'wales']
country_race_ids = races_df.loc[races_df['country'].str.lower().isin(countries), 'race_id']

races_handicaps = races_handicaps[races_handicaps['race_id'].isin(country_race_ids)]

In [19]:
len(races_handicaps)

41

#### Subset to prediction data

In [20]:
horses_df.iloc[:, :20].head(3)

Unnamed: 0,race_date,race_id,course,surface,going,race_time,age,distance,has_handicap,off_time,race_class,runners,winning_time,prize1,prize2,prize3,stewards,number_of_placed_rides,tote_win,place_win
0,2021-04-24,622909,Sandown,TURF,Good (Good to Firm in places),12:55,4YO plus,1m 7f 216y,True,12:55:27,2,14,3m 55.9s,38580.0 GBP,17738.0 GBP,8872.0 GBP,NONE,3,7.6 GBP,"2.6 GBP,7.3 GBP,1.6 GBP"
1,2021-04-24,622909,Sandown,TURF,Good (Good to Firm in places),12:55,4YO plus,1m 7f 216y,True,12:55:27,2,14,3m 55.9s,38580.0 GBP,17738.0 GBP,8872.0 GBP,NONE,3,7.6 GBP,"2.6 GBP,7.3 GBP,1.6 GBP"
2,2021-04-24,622909,Sandown,TURF,Good (Good to Firm in places),12:55,4YO plus,1m 7f 216y,True,12:55:27,2,14,3m 55.9s,38580.0 GBP,17738.0 GBP,8872.0 GBP,NONE,3,7.6 GBP,"2.6 GBP,7.3 GBP,1.6 GBP"


In [21]:
prediction_data = horses_df.loc[
    (horses_df['race_id'].isin(races_handicaps['race_id'])) & (horses_df['betting_odds'].notnull()) & (horses_df['ride_status']=='RUNNER'),
    ['race_id', 'race_date', 'race_time', 'course', 'horse_name', 'horse_id', 'runners', 'decimal_odds', 'finish_position']].sort_values('decimal_odds')
prediction_data = prediction_data.merge(races_handicaps, how='left', on=['race_id', 'runners'])
prediction_data['placed'] = ((prediction_data['finish_position']<=prediction_data['places']) & (prediction_data['finish_position']>0))*1
prediction_data['pred_order'] = prediction_data.groupby('race_id').cumcount()
prediction_data_piv = prediction_data.pivot_table(values=['decimal_odds'], index=['race_id', 'runners'], columns='pred_order', aggfunc='mean', fill_value=None)

In [22]:
prediction_data_piv.columns = [str(c[0])+'_'+str(c[1]) for c in prediction_data_piv.columns]

In [23]:
# reshape
max_runners_in_data = max(prediction_data['pred_order'])
max_runners = min(22, max_runners_in_data)
odds_cols = ['decimal_odds_'+str(r) for r in range(max_runners)]
prediction_data_list = []
for i in range(max_runners):
    pred_tmp = prediction_data[prediction_data['pred_order']==i]
    pred_tmp = pred_tmp.merge(prediction_data_piv, how='left', on='race_id')
    odds_cols_tmp = odds_cols.copy()
    odds_cols_tmp.remove('decimal_odds_'+str(i))
    
    pred_tmp = pred_tmp.rename(columns={'decimal_odds_'+str(i): 'decimal_odds_horse'})
    pred_tmp = pred_tmp[['race_id', 'race_date', 'race_time', 'course', 'horse_name', 'horse_id', 'runners', 'finish_position', 'placed', 'pred_order', 'decimal_odds_horse'] + odds_cols_tmp]
    pred_tmp.columns = ['race_id', 'race_date', 'race_time', 'course', 'horse_name', 'horse_id', 'runners', 'finish_position', 'placed', 'pred_order', 'decimal_odds_horse'] + odds_cols[1:]
    prediction_data_list.append(pred_tmp)

prediction_data_df = pd.concat(prediction_data_list, axis=0)

In [24]:
prediction_data_df.head(3)

Unnamed: 0,race_id,race_date,race_time,course,horse_name,horse_id,runners,finish_position,placed,pred_order,...,decimal_odds_6,decimal_odds_7,decimal_odds_8,decimal_odds_9,decimal_odds_10,decimal_odds_11,decimal_odds_12,decimal_odds_13,decimal_odds_14,decimal_odds_15
0,622924,2021-04-24,16:45,Wolverhampton,Irish Admiral,985308,5,1,1,0,...,,,,,,,,,,
1,622928,2021-04-24,18:45,Wolverhampton,Equality,966509,6,0,0,0,...,,,,,,,,,,
2,622920,2021-04-24,15:35,Haydock,Finest Sound,946392,6,1,1,0,...,,,,,,,,,,


In [25]:
prediction_data_df['all_odds'] = 1
for i in range(1, max_runners):
    prediction_data_df.loc[(prediction_data_df['runners']==i+1) & (prediction_data_df['decimal_odds_'+str(i)].isnull()), 'all_odds'] = 0

In [26]:
sum(prediction_data_df['all_odds'])

385

In [27]:
min_runners = 8
max_runners = 12

prediction_data_subset = prediction_data_df[
    (prediction_data_df['runners'].between(min_runners, max_runners)) &
    (prediction_data_df['all_odds']==1)]

## Read model and get predictions

In [28]:
import pickle

In [29]:
filename = '/home/angus/projects/betting/tote/models/places_model_0.pkl'
with open(filename, 'rb') as file:
    places_model = pickle.load(file)

In [30]:
# create feature list and also set odds to 9999 where no runner
features = ['runners', 'decimal_odds_horse']
for i in range(max_runners):
    features += ['decimal_odds_'+str(i+1)]
    prediction_data_subset.loc[prediction_data_subset['decimal_odds_'+str(i+1)].isnull(), 'decimal_odds_'+str(i+1)] = 9999

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


In [31]:
dpred = xgb.DMatrix(prediction_data_subset[features])

prediction_data_subset['preds'] = places_model.predict(dpred)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [32]:
prediction_data_subset.head(3)

Unnamed: 0,race_id,race_date,race_time,course,horse_name,horse_id,runners,finish_position,placed,pred_order,...,decimal_odds_8,decimal_odds_9,decimal_odds_10,decimal_odds_11,decimal_odds_12,decimal_odds_13,decimal_odds_14,decimal_odds_15,all_odds,preds
5,622953,2021-04-24,16:20,Doncaster,Chief of Staff,1017023,9,5,0,0,...,151.0,9999.0,9999.0,9999.0,9999.0,,,,1,0.827092
7,622908,2021-04-24,15:30,Leicester,Kingmania,981580,10,1,1,0,...,29.0,34.0,9999.0,9999.0,9999.0,,,,1,0.737334
10,622918,2021-04-24,14:25,Haydock,Lady Rockstar,991775,8,2,1,0,...,9999.0,9999.0,9999.0,9999.0,9999.0,,,,1,0.710825


## Add place odds and compare to predictions

In [33]:
prediction_data_subset = prediction_data_subset.merge(races_handicaps, how='left', on=['race_id', 'runners'])
prediction_data_subset['ew_return'] = 1+(prediction_data_subset['decimal_odds_horse']-1)*prediction_data_subset['ew_odds']

In [34]:
prediction_data_subset['pred_odds'] = 1/prediction_data_subset['preds']

In [35]:
prediction_data_subset['actual_pred_ratio'] = prediction_data_subset['ew_return']/prediction_data_subset['pred_odds']

In [41]:
time_now = str(datetime.datetime.now().hour-1).zfill(2)+':'+str(datetime.datetime.now().minute).zfill(2)
#time_now = '10:00'

In [37]:
2.4/2.18

1.1009174311926604

In [42]:
# backs
cols_to_show = ['race_date', 'race_time', 'course', 'horse_name', 'runners', 'pred_order', 'decimal_odds_horse', 'preds',
                'number_of_placed_rides', 'handicap', 'ew_odds', 'places', 'ew_return', 'pred_odds', 'actual_pred_ratio']

prediction_data_subset.loc[prediction_data_subset['race_time']>=time_now, cols_to_show].sort_values('actual_pred_ratio', ascending=False).head(30)

Unnamed: 0,race_date,race_time,course,horse_name,runners,pred_order,decimal_odds_horse,preds,number_of_placed_rides,handicap,ew_odds,places,ew_return,pred_odds,actual_pred_ratio
8,2021-04-24,17:45,Wolverhampton,Estate House,10,0,3.5,0.602155,3,True,0.2,3,1.5,1.660701,0.903233
20,2021-04-24,17:45,Wolverhampton,Helian,10,1,3.5,0.602155,3,True,0.2,3,1.5,1.660701,0.903233
9,2021-04-24,19:15,Wolverhampton,Givepeaceachance,12,0,3.5,0.554617,3,True,0.25,3,1.625,1.803046,0.901253
111,2021-04-24,19:15,Wolverhampton,Lady Of York,12,6,12.0,0.235623,3,True,0.25,3,3.75,4.244076,0.883585
43,2021-04-24,17:45,Wolverhampton,Rue De La Gaite,10,2,6.0,0.433765,3,True,0.2,3,2.0,2.305395,0.86753
31,2021-04-24,19:15,Wolverhampton,Yagood,12,1,5.5,0.405965,3,True,0.25,3,2.125,2.463267,0.862675
61,2021-04-24,19:15,Wolverhampton,Clog Na Fola,12,3,8.0,0.313694,3,True,0.25,3,2.75,3.187817,0.862659
65,2021-04-24,17:45,Wolverhampton,Pope Gregory,10,3,8.5,0.33997,3,True,0.2,3,2.5,2.941434,0.849925
42,2021-04-24,19:15,Wolverhampton,Star Ascending,12,2,6.0,0.376919,3,True,0.25,3,2.25,2.65309,0.848068
81,2021-04-24,19:15,Wolverhampton,I'm Digby,12,4,10.0,0.259696,3,True,0.25,3,3.25,3.850652,0.844013


In [43]:
# lays?
max_ew_return = 5
prediction_data_subset.loc[(prediction_data_subset['race_time']>time_now) &
                           (prediction_data_subset['ew_return']<=max_ew_return), cols_to_show].sort_values('actual_pred_ratio', ascending=False).tail(30)

Unnamed: 0,race_date,race_time,course,horse_name,runners,pred_order,decimal_odds_horse,preds,number_of_placed_rides,handicap,ew_odds,places,ew_return,pred_odds,actual_pred_ratio
8,2021-04-24,17:45,Wolverhampton,Estate House,10,0,3.5,0.602155,3,True,0.2,3,1.5,1.660701,0.903233
20,2021-04-24,17:45,Wolverhampton,Helian,10,1,3.5,0.602155,3,True,0.2,3,1.5,1.660701,0.903233
9,2021-04-24,19:15,Wolverhampton,Givepeaceachance,12,0,3.5,0.554617,3,True,0.25,3,1.625,1.803046,0.901253
111,2021-04-24,19:15,Wolverhampton,Lady Of York,12,6,12.0,0.235623,3,True,0.25,3,3.75,4.244076,0.883585
43,2021-04-24,17:45,Wolverhampton,Rue De La Gaite,10,2,6.0,0.433765,3,True,0.2,3,2.0,2.305395,0.86753
31,2021-04-24,19:15,Wolverhampton,Yagood,12,1,5.5,0.405965,3,True,0.25,3,2.125,2.463267,0.862675
61,2021-04-24,19:15,Wolverhampton,Clog Na Fola,12,3,8.0,0.313694,3,True,0.25,3,2.75,3.187817,0.862659
65,2021-04-24,17:45,Wolverhampton,Pope Gregory,10,3,8.5,0.33997,3,True,0.2,3,2.5,2.941434,0.849925
42,2021-04-24,19:15,Wolverhampton,Star Ascending,12,2,6.0,0.376919,3,True,0.25,3,2.25,2.65309,0.848068
81,2021-04-24,19:15,Wolverhampton,I'm Digby,12,4,10.0,0.259696,3,True,0.25,3,3.25,3.850652,0.844013


In [40]:
ew_rules

Unnamed: 0,runners,handicap,ew_odds,places
0,1,1,0.0,0
1,2,1,0.0,0
2,3,1,0.0,0
3,4,1,0.0,0
4,5,1,0.25,2
5,6,1,0.25,2
6,7,1,0.25,2
7,8,1,0.2,3
8,9,1,0.2,3
9,10,1,0.2,3
