In [1]:
import bz2
import os
import tarfile
import json
from datetime import datetime
import pandas as pd

In [2]:
class PriceUpdate:
    """One last-traded-price observation for a single runner.

    Every constructor argument is stored unchanged on the instance under an
    attribute of the same name. The attributes are assigned in the order of
    the constructor parameters, so ``vars(instance)`` yields its keys in that
    same order.
    """

    def __init__(
        self,
        operation_type,
        publish_time,
        event_name,
        event_id,
        market_name,
        open_date,
        market_time,
        inplay,
        runner_name,
        runner_id,
        ltp,
    ):
        # Targets are bound left to right, which keeps the attribute order
        # identical to the parameter order.
        self.operation_type, self.publish_time = operation_type, publish_time
        # Event and market description.
        self.event_name, self.event_id = event_name, event_id
        self.market_name, self.open_date = market_name, open_date
        self.market_time, self.inplay = market_time, inplay
        # Runner identity and its price.
        self.runner_name, self.runner_id, self.ltp = runner_name, runner_id, ltp

def get_dict_from_runners(runner_list):
    """Build an id -> name lookup from a list of runner mappings.

    Args:
        runner_list: iterable of mappings, each providing ``"id"`` and
            ``"name"`` keys.

    Returns:
        dict mapping each runner's ``"id"`` to its ``"name"``. When an id
        occurs more than once, the last occurrence wins.

    Raises:
        KeyError: if an entry lacks ``"id"`` or ``"name"``.
    """
    return {entry["id"]: entry["name"] for entry in runner_list}

def get_info_from_market_definition(market_definition):
    """Extract the fields of interest from a market-definition mapping.

    Args:
        market_definition: mapping providing the keys ``"eventName"``,
            ``"eventId"``, ``"name"``, ``"inPlay"``, ``"marketTime"``,
            ``"openDate"`` and ``"runners"``.

    Returns:
        A 7-tuple ``(event_name, event_id, name, is_inplay, market_time,
        open_date, runner_dict)`` where ``runner_dict`` is the result of
        ``get_dict_from_runners`` applied to the ``"runners"`` entry.

    Raises:
        KeyError: if any of the keys listed above is missing.
    """
    # Scalar fields are read in this exact order, then the runners.
    scalar_keys = ("eventName", "eventId", "name", "inPlay", "marketTime", "openDate")
    event_name, event_id, name, is_inplay, market_time, open_date = [
        market_definition[key] for key in scalar_keys
    ]

    runner_lookup = get_dict_from_runners(market_definition["runners"])

    return event_name, event_id, name, is_inplay, market_time, open_date, runner_lookup

def get_all_price_updates(file_content):
    """Parse the raw bytes of one market file into ``PriceUpdate`` objects.

    The content is decoded as text and read as one JSON document per line.
    Each document is expected to carry ``"op"`` and ``"pt"`` (publish time in
    epoch milliseconds) and, optionally, a list of market changes under
    ``"mc"``. A market change may carry a ``"marketDefinition"`` (which
    replaces the descriptive fields remembered for that market) and/or a list
    of runner changes under ``"rc"``. One ``PriceUpdate`` is produced per
    runner change that has an ``"ltp"`` value.

    Compared with a naive line-by-line parse, this function:
      * converts ``pt`` to a naive datetime expressed in UTC, so the result
        does not depend on the machine's local timezone;
      * skips blank lines, so empty content yields an empty list;
      * skips messages without ``"mc"`` and runner changes without ``"ltp"``;
      * processes every entry of ``"mc"``, tracking definitions per market id.

    Args:
        file_content: bytes of the (already decompressed) file.

    Returns:
        list of ``PriceUpdate`` in file order.

    Raises:
        ValueError: if a priced runner change appears before any
            ``marketDefinition`` for its market.
        KeyError: if a message lacks ``"op"``/``"pt"``, or a runner id is not
            present in the market's current definition.
        json.JSONDecodeError: if a non-blank line is not valid JSON.
    """
    # Local import: the notebook's import cell only brings in ``datetime``.
    from datetime import timezone

    price_updates = []
    # Latest definition tuple seen per market id. Entries without an "id"
    # share the ``None`` key, which reproduces single-market behaviour.
    definitions = {}

    for raw_line in file_content.decode().split("\n"):
        line = raw_line.strip()
        if not line:
            # Blank or trailing lines carry no message.
            continue

        info = json.loads(line)
        operation_type = info["op"]
        # ``open_date`` / ``market_time`` are UTC ("...Z") strings, so keep the
        # publish time in UTC as well (naive, to preserve the column dtype).
        publish_time = datetime.fromtimestamp(
            int(info["pt"]) / 1000, tz=timezone.utc
        ).replace(tzinfo=None)

        for market_change in info.get("mc") or ():
            market_key = market_change.get("id")

            if "marketDefinition" in market_change:
                definitions[market_key] = get_info_from_market_definition(
                    market_change["marketDefinition"]
                )

            for runner_change in market_change.get("rc") or ():
                if "ltp" not in runner_change:
                    # Runner change without a last traded price: nothing to record.
                    continue

                definition = definitions.get(market_key)
                if definition is None:
                    raise ValueError(
                        "runner change received before any marketDefinition "
                        f"for market {market_key!r}"
                    )
                (event_name, event_id, market_name, is_inplay,
                 market_time, open_date, runner_dict) = definition

                runner_id = runner_change["id"]
                price_updates.append(
                    PriceUpdate(
                        operation_type,
                        publish_time,
                        event_name,
                        event_id,
                        market_name,
                        open_date,
                        market_time,
                        is_inplay,
                        runner_dict[runner_id],
                        runner_id,
                        runner_change["ltp"],
                    )
                )

    return price_updates

def _child_paths(parent, keep):
    """Return the full paths of the entries of ``parent`` accepted by ``keep``, sorted by name."""
    candidates = (os.path.join(parent, name) for name in sorted(os.listdir(parent)))
    return [candidate for candidate in candidates if keep(candidate)]


def get_all_prices(path):
    """Collect every price update found under a year/month/day/match tree.

    The directory layout walked is ``path/<year>/<month>/<day>/<match>/<file>``
    where each ``<file>`` is a bz2-compressed file understood by
    ``get_all_price_updates``. Entries that are not directories at the folder
    levels (or not regular files at the last level) are ignored, so stray
    files such as ``desktop.ini`` do not abort the run. Entries are visited in
    name order so the returned list is deterministic.

    Args:
        path: root directory containing the year folders.

    Returns:
        ``(price_updates, price_df)`` where ``price_updates`` is the list of
        ``PriceUpdate`` objects in traversal order and ``price_df`` is a
        DataFrame with one row per update, sorted by ``open_date``,
        ``event_name``, ``runner_name`` and ``publish_time``. When nothing is
        found the list is empty and the DataFrame is empty but still has the
        expected columns.
    """
    expected_columns = [
        "operation_type", "publish_time", "event_name", "event_id",
        "market_name", "open_date", "market_time", "inplay",
        "runner_name", "runner_id", "ltp",
    ]
    price_updates_accumulator = []

    # iterate through year folders
    for y in sorted(os.listdir(path)):
        year_path = os.path.join(path, y)
        if not os.path.isdir(year_path):
            continue
        print(f"Getting matches for {y}..")

        # month -> day -> match folders, then the files inside each match
        for month_path in _child_paths(year_path, os.path.isdir):
            for day_path in _child_paths(month_path, os.path.isdir):
                for match_path in _child_paths(day_path, os.path.isdir):
                    for match_sub_path in _child_paths(match_path, os.path.isfile):
                        # Context manager so the handle is closed promptly
                        # instead of leaking one descriptor per file.
                        with bz2.open(match_sub_path) as compressed:
                            file_content = compressed.read()
                        price_updates_accumulator += get_all_price_updates(file_content)

    # convert to dataframe
    records = [vars(update) for update in price_updates_accumulator]
    if records:
        price_df = pd.DataFrame(records)
    else:
        # Without rows pandas cannot infer the columns, and sorting would fail.
        price_df = pd.DataFrame(columns=expected_columns)
    price_df = price_df.sort_values(["open_date", "event_name", "runner_name", "publish_time"])

    return price_updates_accumulator, price_df

In [3]:
# Root of the raw-data tree (year/month/day/match folders). Set the
# BETFAIR_DATA_DIR environment variable to point at another location without
# editing this cell; the original location is kept as the default.
path = os.environ.get(
    "BETFAIR_DATA_DIR",
    "C:/Users/Afonso.sequeira/OneDrive - Entain Group/Desktop/betfair_data/Jan2018_Dec2022_raw_data/BASIC",
)
price_list, price_df = get_all_prices(path)

Getting matches for 2017..
Getting matches for 2018..
Getting matches for 2019..
Getting matches for 2020..
Getting matches for 2021..
Getting matches for 2022..
Getting matches for 2023..


In [4]:
# Sanity check: print the (rows, columns) shape, then display the first rows.
print(price_df.shape)
price_df.head()

(1114845, 11)


Unnamed: 0,operation_type,publish_time,event_name,event_id,market_name,open_date,market_time,inplay,runner_name,runner_id,ltp
59659,mcm,2017-01-01 12:45:30.823,Watford v Tottenham,28054213,To Score,2017-01-01T13:30:00.000Z,2017-01-01T13:30:00.000Z,False,Abdoulaye Doucoure,7647245,8.0
59704,mcm,2017-01-01 13:00:30.805,Watford v Tottenham,28054213,To Score,2017-01-01T13:30:00.000Z,2017-01-01T13:30:00.000Z,False,Abdoulaye Doucoure,7647245,11.5
59731,mcm,2017-01-01 13:12:30.539,Watford v Tottenham,28054213,To Score,2017-01-01T13:30:00.000Z,2017-01-01T13:30:00.000Z,False,Abdoulaye Doucoure,7647245,11.0
59857,mcm,2017-01-01 13:53:30.864,Watford v Tottenham,28054213,To Score,2017-01-01T13:30:00.000Z,2017-01-01T13:30:00.000Z,True,Adlene Guedioura,4506604,16.0
59569,mcm,2016-12-30 17:09:59.154,Watford v Tottenham,28054213,To Score,2017-01-01T13:30:00.000Z,2017-01-01T13:30:00.000Z,False,Christian Eriksen,4540367,2.98


In [6]:
# Count missing values per column.
price_df.isnull().sum()

operation_type    0
publish_time      0
event_name        0
event_id          0
market_name       0
open_date         0
market_time       0
inplay            0
runner_name       0
runner_id         0
ltp               0
dtype: int64

In [5]:
# Write the price table to CSV (without the index column). The target folder
# is created first because to_csv does not create missing directories.
output_csv = "betfairhistoricalprices/goalscorer_price_data.csv"
os.makedirs(os.path.dirname(output_csv), exist_ok=True)
price_df.to_csv(output_csv, index=False)