# `streamline_with_weather_augmentation.ipynb`

### Author: Anthony Hein

#### Last updated: 11/3/2021

# Overview:

Augment the racing dataset so that each row includes additional columns which capture meterological data for this race. This data is obtained from a variety of sources.

---

## Setup

In [1]:
from datetime import datetime, timedelta
import git
import os
import re
from typing import List
from tqdm import tqdm
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
BASE_DIR = git.Repo(os.getcwd(), search_parent_directories=True).working_dir
BASE_DIR

'/Users/anthonyhein/Desktop/SML310/project'

In [3]:
import sys

sys.path.append(f'{BASE_DIR}/src/')

import geocoding_client as gc

---

## Load `horses_selected.csv`

In [4]:
horses_selected = pd.read_csv(f"{BASE_DIR}/data/streamline/horses_selected.csv", low_memory=False) 
horses_selected.head()

Unnamed: 0,rid,horseName,age,saddle,decimalPrice,isFav,trainerName,jockeyName,position,positionL,...,TR,OR,father,mother,gfather,runners,margin,weight,res_win,res_place
0,302858,Kings Return,6.0,4.0,0.6,1,W P Mullins,D J Casey,1,,...,,,King's Ride,Browne's Return,Deep Run,6,1.219263,73,1.0,1.0
1,302858,Majestic Red I,6.0,5.0,0.047619,0,John Hackett,Conor O'Dwyer,2,8,...,,,Long Pond,Courtlough Lady,Giolla Mear,6,1.219263,73,0.0,1.0
2,302858,Clearly Canadian,6.0,2.0,0.166667,0,D T Hughes,G Cotter,3,1.5,...,,,Nordico,Over The Seas,North Summit,6,1.219263,71,0.0,0.0
3,302858,Bernestic Wonder,8.0,1.0,0.058824,0,E McNamara,J Old Jones,4,dist,...,,,Roselier,Miss Reindeer,Reindeer,6,1.219263,73,0.0,0.0
4,302858,Beauty's Pride,5.0,6.0,0.038462,0,J J Lennon,T Martin,5,dist,...,,,Noalto,Elena's Beauty,Tarqogan,6,1.219263,66,0.0,0.0


In [5]:
horses_selected.shape

(205138, 27)

---

## Load `races_selected_augment_non_weather.csv`

In [6]:
races_augment_non_weather = pd.read_csv(
    f"{BASE_DIR}/data/streamline/races_selected_augment_non_weather.csv",
    low_memory=False
) 
races_augment_non_weather.head()

Unnamed: 0,rid,course,time,date,title,rclass,band,ages,distance,condition,...,2nd_place_rank_in_odds,3rd_place_rank_in_odds,1st_rank_in_odds_place,2nd_rank_in_odds_place,3rd_rank_in_odds_place,placeAvailable,showAvailable,favoriteWon,favoritePlaced,favoriteShowed
0,302858,Thurles (IRE),01:15,97/01/09,Liffey Maiden Hurdle (Div 1),,,5yo+,2m3f,Good,...,5,3,1,6,3,1,0,1,1,0
1,291347,Punchestown (IRE),03:40,97/02/16,Ericsson G.S.M. Grand National Trial Handicap ...,,,5yo+,3m2f,Soft,...,2,8,4,2,4,1,1,0,0,0
2,75447,Listowel (IRE),03:00,97/03/01,Ballybunion E.B.F. Beginners S'chase,,,4yo+,2m2f,Soft,...,2,6,5,2,1,1,1,0,0,0
3,358038,Punchestown (IRE),02:40,97/04/24,Quinns Of Baltinglass Chase (La Touche) (Cross...,,,5yo+,4m1f,Good,...,3,5,1,4,2,1,1,1,1,1
4,78982,Dundalk (IRE),05:15,97/05/02,Carlingford Handicap Chase,,0-109,4yo+,3m,Firm,...,4,2,1,3,3,0,0,1,0,0


In [7]:
races_augment_non_weather.shape

(20574, 31)

In [8]:
races_selected_augment_with_weather = races_augment_non_weather.copy()
races_selected_augment_with_weather.head()

Unnamed: 0,rid,course,time,date,title,rclass,band,ages,distance,condition,...,2nd_place_rank_in_odds,3rd_place_rank_in_odds,1st_rank_in_odds_place,2nd_rank_in_odds_place,3rd_rank_in_odds_place,placeAvailable,showAvailable,favoriteWon,favoritePlaced,favoriteShowed
0,302858,Thurles (IRE),01:15,97/01/09,Liffey Maiden Hurdle (Div 1),,,5yo+,2m3f,Good,...,5,3,1,6,3,1,0,1,1,0
1,291347,Punchestown (IRE),03:40,97/02/16,Ericsson G.S.M. Grand National Trial Handicap ...,,,5yo+,3m2f,Soft,...,2,8,4,2,4,1,1,0,0,0
2,75447,Listowel (IRE),03:00,97/03/01,Ballybunion E.B.F. Beginners S'chase,,,4yo+,2m2f,Soft,...,2,6,5,2,1,1,1,0,0,0
3,358038,Punchestown (IRE),02:40,97/04/24,Quinns Of Baltinglass Chase (La Touche) (Cross...,,,5yo+,4m1f,Good,...,3,5,1,4,2,1,1,1,1,1
4,78982,Dundalk (IRE),05:15,97/05/02,Carlingford Handicap Chase,,0-109,4yo+,3m,Firm,...,4,2,1,3,3,0,0,1,0,0


---

## Load `ireland_stations_metadata.csv`

In [9]:
ireland_stations_metadata = pd.read_csv(f"{BASE_DIR}/data/csv/ireland_stations_metadata.csv", low_memory=False) 
ireland_stations_metadata.head()

Unnamed: 0,County,Station Number,name,Height(m),Easting,Northing,Latitude,Longitude,Open Year,Close Year
0,Westmeath,2222,MULLINGAR S.W.S.,111,242700,252700,53.312,-7.212,1943,1974
1,Monaghan,2437,CLONES,89,250000,326300,54.11,-7.14,1950,2008
2,Galway,2021,GALWAY S.W.S.,20,132700,225600,53.1634,-9.0034,1978,1990
3,Offaly,4919,BIRR,72,207400,204400,53.0525,-7.5325,1954,2009
4,Kilkenny,3613,KILKENNY,65,249400,157400,52.3955,-7.161,1957,2008


In [10]:
ireland_stations_metadata.shape

(33, 10)

---

## Load `weather_all.csv`

In [11]:
weather_all = pd.read_csv(f"{BASE_DIR}/data/csv/weather_all.csv", low_memory=False) 
weather_all.head()

Unnamed: 0,date,temp,msl,rain,rhum,Station number
0,8/13/03 9:00,17.3,1023.6,0.0,72,375
1,8/13/03 10:00,18.5,1023.7,0.0,65,375
2,8/13/03 11:00,19.3,1023.7,0.0,56,375
3,8/13/03 12:00,20.4,1023.3,0.0,53,375
4,8/13/03 13:00,20.9,1023.3,0.0,54,375


In [12]:
weather_all.shape

(4250224, 6)

---

## Augment w/ Location

Let's get a set of all distinct courses.

In [13]:
courses = set(races_selected_augment_with_weather['course'])
print(len(courses))
print(courses)

31
{'Wexford (RH) (IRE)', 'Ballinrobe (IRE)', 'Fairyhouse (IRE)', 'Tipperary (IRE)', 'Clonmel (IRE)', 'Curragh (IRE)', 'Tralee (IRE)', 'Cork (IRE)', 'Dundalk (IRE)', 'Phoenix Park (IRE)', 'Punchestown (IRE)', 'Mallow (IRE)', 'Dundalk (AW) (IRE)', 'Wexford (IRE)', 'Limerick (IRE)', 'Killarney (IRE)', 'Sligo (IRE)', 'Thurles (IRE)', 'Roscommon (IRE)', 'Galway (IRE)', 'Down Royal (IRE)', 'Navan (IRE)', 'Kilbeggan (IRE)', 'Leopardstown (IRE)', 'Listowel (IRE)', 'Bellewstown (IRE)', 'Downpatrick (IRE)', 'Gowran Park (IRE)', 'Tramore (IRE)', 'Naas (IRE)', 'Laytown (IRE)'}


Not quite good enough, since there are odd repeats due to the parentheses.

In [14]:
courses = set([course[:course.find(" (")] for course in races_selected_augment_with_weather['course']])
print(len(courses))
print(courses)

29
{'Punchestown', 'Navan', 'Down Royal', 'Mallow', 'Killarney', 'Kilbeggan', 'Roscommon', 'Bellewstown', 'Leopardstown', 'Tipperary', 'Clonmel', 'Listowel', 'Dundalk', 'Phoenix Park', 'Curragh', 'Wexford', 'Ballinrobe', 'Sligo', 'Fairyhouse', 'Tralee', 'Tramore', 'Laytown', 'Naas', 'Galway', 'Downpatrick', 'Cork', 'Thurles', 'Limerick', 'Gowran Park'}


This is better. We will quickly fix this on the dataframe.

In [15]:
races_selected_augment_with_weather['course'] = races_selected_augment_with_weather['course'].map(
    lambda x: x[:x.find(" (")]
)
races_selected_augment_with_weather.head()

Unnamed: 0,rid,course,time,date,title,rclass,band,ages,distance,condition,...,2nd_place_rank_in_odds,3rd_place_rank_in_odds,1st_rank_in_odds_place,2nd_rank_in_odds_place,3rd_rank_in_odds_place,placeAvailable,showAvailable,favoriteWon,favoritePlaced,favoriteShowed
0,302858,Thurles,01:15,97/01/09,Liffey Maiden Hurdle (Div 1),,,5yo+,2m3f,Good,...,5,3,1,6,3,1,0,1,1,0
1,291347,Punchestown,03:40,97/02/16,Ericsson G.S.M. Grand National Trial Handicap ...,,,5yo+,3m2f,Soft,...,2,8,4,2,4,1,1,0,0,0
2,75447,Listowel,03:00,97/03/01,Ballybunion E.B.F. Beginners S'chase,,,4yo+,2m2f,Soft,...,2,6,5,2,1,1,1,0,0,0
3,358038,Punchestown,02:40,97/04/24,Quinns Of Baltinglass Chase (La Touche) (Cross...,,,5yo+,4m1f,Good,...,3,5,1,4,2,1,1,1,1,1
4,78982,Dundalk,05:15,97/05/02,Carlingford Handicap Chase,,0-109,4yo+,3m,Firm,...,4,2,1,3,3,0,0,1,0,0


Now, we find the location for each course.

In [16]:
course_to_location = {}
unknown = set()

for course in tqdm(courses):
    location, err = gc.get_course_location(course, 'IE')
    if err != None:
        unknown.add(course)
    else:
        course_to_location[course] = location

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 29/29 [00:15<00:00,  1.89it/s]


In [17]:
len(course_to_location), len(unknown)

(28, 1)

In [18]:
# manually check these locations
for course, location in course_to_location.items():
    confirmation_str = input(f"{course} {location['lat']} {location['lng']}: ")
    if confirmation_str != 'y':
        lat, lng = [float(x) for x in confirmation_str.split(',')]
        location = {'lat': lat, 'lng': lng}
        course_to_location[course] = location

Punchestown 53.1858201 -6.6292925: y
Navan 53.683279 -6.6736824: y
Down Royal 54.4869619 -6.1277164: y
Mallow 52.132177 -8.685995: y
Killarney 52.0490085 -9.5146494: y
Kilbeggan 53.385273 -7.4918169: y
Roscommon 53.6470328 -8.2229038: y
Leopardstown 53.266888 -6.1954649: y
Tipperary 52.4997 -8.2068701: y
Clonmel 52.36489479999999 -7.682519899999999: y
Listowel 52.4369125 -9.4866352: y
Dundalk 54.020808 -6.3832501: y
Phoenix Park 40.3458945 -80.6882814: 53.3725119,-6.3463222
Curragh 53.167717 -6.839608: y
Wexford 52.3388569 -6.4919351: y
Ballinrobe 53.6382888 -9.244663599999999: y
Sligo 54.2620085 -8.4631764: y
Fairyhouse 53.4842453 -6.467504: y
Tralee 52.278648 -9.672246: y
Tramore 52.17149 -7.148737: y
Laytown 53.6851752 -6.2399165: y
Naas 53.2193372 -6.647107999999999: y
Galway 53.298151 -8.996836: y
Downpatrick 54.3155479 -5.728537999999999: y
Cork 52.132177 -8.685995: y
Thurles 52.6889221 -7.836620799999999: y
Limerick 52.5906552 -8.6956279: y
Gowran Park 52.6198333 -7.069867299999

In [19]:
# manually annotate the unknown courses
for course in unknown:
    location_str = input(f"{course}: ")
    lat, lng = [float(x) for x in location_str.split(',')]
    location = {'lat': lat, 'lng': lng}
    course_to_location[course] = location

Bellewstown: 53.6444121,-6.3486429


In [20]:
assert len(course_to_location) == len(courses)

Now that we calculated the location, append it to the races dataframe.

In [None]:
sys.path.append(f'{BASE_DIR}/utils/streamline/')

from course_to_location import COURSE_TO_LOCATION

course_to_location = COURSE_TO_LOCATION

In [21]:
rename_cols = {
    'index': 'course',
}

df_locations = pd.DataFrame.from_dict(course_to_location, orient='index').reset_index().rename(columns=rename_cols)
df_locations.head()

Unnamed: 0,course,lat,lng
0,Punchestown,53.18582,-6.629293
1,Navan,53.683279,-6.673682
2,Down Royal,54.486962,-6.127716
3,Mallow,52.132177,-8.685995
4,Killarney,52.049008,-9.514649


In [22]:
races_selected_augment_with_weather = races_selected_augment_with_weather.merge(df_locations, how='left', on='course')
races_selected_augment_with_weather.head()

Unnamed: 0,rid,course,time,date,title,rclass,band,ages,distance,condition,...,1st_rank_in_odds_place,2nd_rank_in_odds_place,3rd_rank_in_odds_place,placeAvailable,showAvailable,favoriteWon,favoritePlaced,favoriteShowed,lat,lng
0,302858,Thurles,01:15,97/01/09,Liffey Maiden Hurdle (Div 1),,,5yo+,2m3f,Good,...,1,6,3,1,0,1,1,0,52.688922,-7.836621
1,291347,Punchestown,03:40,97/02/16,Ericsson G.S.M. Grand National Trial Handicap ...,,,5yo+,3m2f,Soft,...,4,2,4,1,1,0,0,0,53.18582,-6.629293
2,75447,Listowel,03:00,97/03/01,Ballybunion E.B.F. Beginners S'chase,,,4yo+,2m2f,Soft,...,5,2,1,1,1,0,0,0,52.436912,-9.486635
3,358038,Punchestown,02:40,97/04/24,Quinns Of Baltinglass Chase (La Touche) (Cross...,,,5yo+,4m1f,Good,...,1,4,2,1,1,1,1,1,53.18582,-6.629293
4,78982,Dundalk,05:15,97/05/02,Carlingford Handicap Chase,,0-109,4yo+,3m,Firm,...,1,3,3,0,0,1,0,0,54.020808,-6.38325


Save the mapping of course to location to prevent computing this again.

In [23]:
s = f"COURSE_TO_LOCATION = {course_to_location}"
s[:100]

"COURSE_TO_LOCATION = {'Punchestown': {'lat': 53.1858201, 'lng': -6.6292925}, 'Navan': {'lat': 53.683"

In [24]:
with open(f"{BASE_DIR}/utils/streamline/course_to_location.py", 'w', encoding='utf-8') as f:
    f.write(s)

---

## Augment w/ Datetime

This station should satisfy some properties:
* the station should be open during the time of this race
* of all stations open during the time of this race, the station should be the closest

First, we clean the `date` column. The problem is that there appears extraneous "00:00" following the "yy/mm/dd" in the `date` column. **Note that we have not done a thorough cleaning of the dataset in its entirety yet. We are only cleaning _small_ amounts as needed until this step.**

In [25]:
def clean_date(date: str) -> str:
    return date[:date.find(' 00:00')] if date.find(' 00:00') >= 0 else date

In [26]:
races_selected_augment_with_weather['date'] = races_selected_augment_with_weather['date'].map(clean_date)
races_selected_augment_with_weather.head()

Unnamed: 0,rid,course,time,date,title,rclass,band,ages,distance,condition,...,1st_rank_in_odds_place,2nd_rank_in_odds_place,3rd_rank_in_odds_place,placeAvailable,showAvailable,favoriteWon,favoritePlaced,favoriteShowed,lat,lng
0,302858,Thurles,01:15,97/01/09,Liffey Maiden Hurdle (Div 1),,,5yo+,2m3f,Good,...,1,6,3,1,0,1,1,0,52.688922,-7.836621
1,291347,Punchestown,03:40,97/02/16,Ericsson G.S.M. Grand National Trial Handicap ...,,,5yo+,3m2f,Soft,...,4,2,4,1,1,0,0,0,53.18582,-6.629293
2,75447,Listowel,03:00,97/03/01,Ballybunion E.B.F. Beginners S'chase,,,4yo+,2m2f,Soft,...,5,2,1,1,1,0,0,0,52.436912,-9.486635
3,358038,Punchestown,02:40,97/04/24,Quinns Of Baltinglass Chase (La Touche) (Cross...,,,5yo+,4m1f,Good,...,1,4,2,1,1,1,1,1,53.18582,-6.629293
4,78982,Dundalk,05:15,97/05/02,Carlingford Handicap Chase,,0-109,4yo+,3m,Firm,...,1,3,3,0,0,1,0,0,54.020808,-6.38325


Now, we add a `datetime` column. Note that the `time` column in the original dataframe is in London TZ but we want UTC. 

In [27]:
def row_to_utc(row: pd.core.frame.DataFrame) -> datetime:
    return datetime.strptime(row['date'] + ' ' + row['time'] + ' PM', '%y/%m/%d %I:%M %p') - timedelta(hours=1)

In [28]:
races_selected_augment_with_weather['datetime'] = races_selected_augment_with_weather.apply(row_to_utc, axis=1)
races_selected_augment_with_weather.head()

Unnamed: 0,rid,course,time,date,title,rclass,band,ages,distance,condition,...,2nd_rank_in_odds_place,3rd_rank_in_odds_place,placeAvailable,showAvailable,favoriteWon,favoritePlaced,favoriteShowed,lat,lng,datetime
0,302858,Thurles,01:15,97/01/09,Liffey Maiden Hurdle (Div 1),,,5yo+,2m3f,Good,...,6,3,1,0,1,1,0,52.688922,-7.836621,1997-01-09 12:15:00
1,291347,Punchestown,03:40,97/02/16,Ericsson G.S.M. Grand National Trial Handicap ...,,,5yo+,3m2f,Soft,...,2,4,1,1,0,0,0,53.18582,-6.629293,1997-02-16 14:40:00
2,75447,Listowel,03:00,97/03/01,Ballybunion E.B.F. Beginners S'chase,,,4yo+,2m2f,Soft,...,2,1,1,1,0,0,0,52.436912,-9.486635,1997-03-01 14:00:00
3,358038,Punchestown,02:40,97/04/24,Quinns Of Baltinglass Chase (La Touche) (Cross...,,,5yo+,4m1f,Good,...,4,2,1,1,1,1,1,53.18582,-6.629293,1997-04-24 13:40:00
4,78982,Dundalk,05:15,97/05/02,Carlingford Handicap Chase,,0-109,4yo+,3m,Firm,...,3,3,0,0,1,0,0,54.020808,-6.38325,1997-05-02 16:15:00


---

## Augment w/ Closest Open Station

Precompute a list of stations sorted by distance to the station for each course.

In [29]:
# https://stackoverflow.com/questions/15736995/how-can-i-quickly-estimate-the-distance-between-two-latitude-longitude-points
from math import radians, cos, sin, asin, sqrt
def haversine(lat1, lon1, lat2, lon2):
    """
    Calculate the great circle distance between two points 
    on the earth (specified in decimal degrees)
    """
    # convert decimal degrees to radians 
    lon1, lat1, lon2, lat2 = map(radians, [lon1, lat1, lon2, lat2])
    # haversine formula 
    dlon = lon2 - lon1 
    dlat = lat2 - lat1 
    a = sin(dlat/2)**2 + cos(lat1) * cos(lat2) * sin(dlon/2)**2
    c = 2 * asin(sqrt(a)) 
    # Radius of earth in kilometers is 6371
    km = 6371* c
    return km

In [30]:
course_lat_lng_triple = races_selected_augment_with_weather[['course', 'lat', 'lng']].drop_duplicates()
course_lat_lng_triple.head()

Unnamed: 0,course,lat,lng
0,Thurles,52.688922,-7.836621
1,Punchestown,53.18582,-6.629293
2,Listowel,52.436912,-9.486635
4,Dundalk,54.020808,-6.38325
6,Tipperary,52.4997,-8.20687


In [31]:
course_to_nearest_stations = {}

for _, course in tqdm(course_lat_lng_triple.iterrows()):
    distances = []
    for _, station in ireland_stations_metadata.iterrows():
        distance = haversine(course['lat'], course['lng'], station['Latitude'], station['Longitude'])
        distances.append((station['Station Number'], distance))
    distances = sorted(distances, key=lambda x: x[1])
    course_to_nearest_stations[course['course']] = distances

29it [00:00, 401.15it/s]


In [32]:
course_to_nearest_stations['Thurles'][:3]

[(1475, 38.59079410593577),
 (4919, 45.288812876734994),
 (3613, 56.14289999137384)]

Another helper function, this time for seeing if a station is open at a given time.

In [33]:
def station_is_open(row: pd.core.frame.DataFrame, race_date: datetime) -> bool:
    return (datetime.strptime(str(row['Open Year']), '%Y') < race_date) and \
           (datetime.strptime(str(row['Close Year']), '%Y') > race_date)

Now, for each race, we can find its corresponding sorted list of distances and find the first in the list that is open and collecting data over an interval which contains the race date.

In [34]:
rid_to_station = {}

for _, row in tqdm(races_selected_augment_with_weather.iterrows()):
    
    nearest_stations = course_to_nearest_stations[row['course']]
    
    for station in nearest_stations:
        
        station_row = ireland_stations_metadata[ireland_stations_metadata['Station Number'] == station[0]].iloc[0]
        
        if station_is_open(station_row, row['datetime']):
            d = station_row.to_dict()
            d['dist to station'] = station[1]
            rid_to_station[row['rid']] = d
            break

20574it [00:14, 1419.99it/s]


In [35]:
rename_cols = {
    'index': 'rid',
    'Station Number': 'station no',
    'name': 'station name',
    'Latitude': 'station lat',
    'Longitude': 'station lng',
}

drop_cols = ['County', 'Height(m)', 'Easting', 'Northing', 'Open Year', 'Close Year']

df_stations = pd.DataFrame.from_dict(rid_to_station, orient='index') \
                          .reset_index() \
                          .rename(columns=rename_cols) \
                          .drop(columns=drop_cols)
df_stations.head()

Unnamed: 0,rid,station no,station name,station lat,station lng,dist to station
0,302858,4919,BIRR,53.0525,-7.5325,45.288813
1,291347,3723,CASEMENT,53.182,-6.262,24.477602
2,75447,518,SHANNON AIRPORT,52.4125,-8.5505,63.534139
3,358038,3723,CASEMENT,53.182,-6.262,24.477602
4,78982,2437,CLONES,54.11,-7.14,50.368275


In [36]:
races_selected_augment_with_weather = races_selected_augment_with_weather.merge(df_stations, how='inner', on='rid')
races_selected_augment_with_weather.head()

Unnamed: 0,rid,course,time,date,title,rclass,band,ages,distance,condition,...,favoritePlaced,favoriteShowed,lat,lng,datetime,station no,station name,station lat,station lng,dist to station
0,302858,Thurles,01:15,97/01/09,Liffey Maiden Hurdle (Div 1),,,5yo+,2m3f,Good,...,1,0,52.688922,-7.836621,1997-01-09 12:15:00,4919,BIRR,53.0525,-7.5325,45.288813
1,291347,Punchestown,03:40,97/02/16,Ericsson G.S.M. Grand National Trial Handicap ...,,,5yo+,3m2f,Soft,...,0,0,53.18582,-6.629293,1997-02-16 14:40:00,3723,CASEMENT,53.182,-6.262,24.477602
2,75447,Listowel,03:00,97/03/01,Ballybunion E.B.F. Beginners S'chase,,,4yo+,2m2f,Soft,...,0,0,52.436912,-9.486635,1997-03-01 14:00:00,518,SHANNON AIRPORT,52.4125,-8.5505,63.534139
3,358038,Punchestown,02:40,97/04/24,Quinns Of Baltinglass Chase (La Touche) (Cross...,,,5yo+,4m1f,Good,...,1,1,53.18582,-6.629293,1997-04-24 13:40:00,3723,CASEMENT,53.182,-6.262,24.477602
4,78982,Dundalk,05:15,97/05/02,Carlingford Handicap Chase,,0-109,4yo+,3m,Firm,...,0,0,54.020808,-6.38325,1997-05-02 16:15:00,2437,CLONES,54.11,-7.14,50.368275


---

## Augment w/ Closest Weather Reading

Here, we take advantage of the fact that the entries for a given station number are in time order.

In [37]:
def nearest_weather_reading(row: pd.core.frame.DataFrame):
    """
    Return index of the nearest weather reading and its distance from the query time.
    """
    
    rid = row['rid']
    dt = row['datetime']
    
    # we already know what station we are looking for
    weather_at_station = weather_all[weather_all['Station number'] == row['station no']]
    
    # binary search through the list
    # https://stackoverflow.com/questions/23681948/get-index-of-closest-value-with-binary-search
    
    lo = 0
    hi = len(weather_at_station) - 1
    
    best_ind = lo
    best_entry_time = datetime.strptime(weather_at_station.iloc[lo]['date'], '%m/%d/%y %H:%M')
    
    while lo <= hi:
        
        mid = lo + (hi - lo) // 2
        
        entry_time = datetime.strptime(weather_at_station.iloc[mid]['date'], '%m/%d/%y %H:%M')
        
        if entry_time < dt:
            lo = mid + 1
            
        elif entry_time > dt:
            hi = mid - 1
            
        else:
            best_ind = mid
            best_entry_time = datetime.strptime(weather_at_station.iloc[best_ind]['date'], '%m/%d/%y %H:%M')
            break
    
        if abs(entry_time - dt) < abs(best_entry_time - dt):
            best_ind = mid
            best_entry_time = datetime.strptime(weather_at_station.iloc[best_ind]['date'], '%m/%d/%y %H:%M')

    weather = weather_all.loc[weather_at_station.iloc[best_ind].name].to_dict() 
    weather['goodness'] = abs(best_entry_time - dt).total_seconds() / 60
    
    return weather

In [38]:
races_selected_augment_with_weather.iloc[0]

rid                                             302858
course                                         Thurles
time                                             01:15
date                                          97/01/09
title                     Liffey Maiden Hurdle (Div 1)
rclass                                             NaN
band                                               NaN
ages                                              5yo+
distance                                          2m3f
condition                                         Good
hurdles                                            NaN
prizes                                              []
winningTime                                      277.2
prize                                              NaN
metric                                          3821.0
countryCode                                         IE
ncond                                                1
class                                                0
runners   

In [39]:
nearest_weather_reading(races_selected_augment_with_weather.iloc[0])

{'date': '1/9/97 12:00',
 'temp': 1.6,
 'msl': 1012.4,
 'rain': 0.0,
 'rhum': 87,
 'Station number': 4919,
 'goodness': 15.0}

Proceed with this helper function.

In [40]:
rid_to_weather = {}

for _, row in tqdm(races_selected_augment_with_weather.iterrows()): 
    rid_to_weather[row['rid']] = nearest_weather_reading(row)

20574it [05:44, 59.66it/s]


In [41]:
rename_cols = {
    'index': 'rid',
    'date': 'station reading date',
    'goodness': 'station reading timedelta'
}

drop_cols = ['Station number']

df_weather = pd.DataFrame.from_dict(rid_to_weather, orient='index') \
                         .reset_index() \
                         .rename(columns=rename_cols) \
                         .drop(columns=drop_cols)
df_weather.head()

Unnamed: 0,rid,station reading date,temp,msl,rain,rhum,station reading timedelta
0,302858,1/9/97 12:00,1.6,1012.4,0.0,87,15.0
1,291347,2/16/97 15:00,8.0,992.5,0.4,87,20.0
2,75447,3/1/97 14:00,12.0,1003.5,0.0,73,0.0
3,358038,4/24/97 14:00,12.6,1011.9,0.0,72,20.0
4,78982,5/2/97 14:00,21.3,1021.4,0.0,44,135.0


In [42]:
races_selected_augment_with_weather = races_selected_augment_with_weather.merge(df_weather, how='inner', on='rid')
races_selected_augment_with_weather.head()

Unnamed: 0,rid,course,time,date,title,rclass,band,ages,distance,condition,...,station name,station lat,station lng,dist to station,station reading date,temp,msl,rain,rhum,station reading timedelta
0,302858,Thurles,01:15,97/01/09,Liffey Maiden Hurdle (Div 1),,,5yo+,2m3f,Good,...,BIRR,53.0525,-7.5325,45.288813,1/9/97 12:00,1.6,1012.4,0.0,87,15.0
1,291347,Punchestown,03:40,97/02/16,Ericsson G.S.M. Grand National Trial Handicap ...,,,5yo+,3m2f,Soft,...,CASEMENT,53.182,-6.262,24.477602,2/16/97 15:00,8.0,992.5,0.4,87,20.0
2,75447,Listowel,03:00,97/03/01,Ballybunion E.B.F. Beginners S'chase,,,4yo+,2m2f,Soft,...,SHANNON AIRPORT,52.4125,-8.5505,63.534139,3/1/97 14:00,12.0,1003.5,0.0,73,0.0
3,358038,Punchestown,02:40,97/04/24,Quinns Of Baltinglass Chase (La Touche) (Cross...,,,5yo+,4m1f,Good,...,CASEMENT,53.182,-6.262,24.477602,4/24/97 14:00,12.6,1011.9,0.0,72,20.0
4,78982,Dundalk,05:15,97/05/02,Carlingford Handicap Chase,,0-109,4yo+,3m,Firm,...,CLONES,54.11,-7.14,50.368275,5/2/97 14:00,21.3,1021.4,0.0,44,135.0


---

## Review All Columns

In [43]:
races_selected_augment_with_weather.columns

Index(['rid', 'course', 'time', 'date', 'title', 'rclass', 'band', 'ages',
       'distance', 'condition', 'hurdles', 'prizes', 'winningTime', 'prize',
       'metric', 'countryCode', 'ncond', 'class', 'runners', 'margin',
       '1st_place_rank_in_odds', '2nd_place_rank_in_odds',
       '3rd_place_rank_in_odds', '1st_rank_in_odds_place',
       '2nd_rank_in_odds_place', '3rd_rank_in_odds_place', 'placeAvailable',
       'showAvailable', 'favoriteWon', 'favoritePlaced', 'favoriteShowed',
       'lat', 'lng', 'datetime', 'station no', 'station name', 'station lat',
       'station lng', 'dist to station', 'station reading date', 'temp', 'msl',
       'rain', 'rhum', 'station reading timedelta'],
      dtype='object')

---

## Save Dataframe

In [44]:
races_selected_augment_with_weather.to_csv(f"{BASE_DIR}/data/streamline/races_selected_augment_with_weather.csv",
                                           index=False)

---