In [1]:
# takes 45m for 300,000 board results.
# Warning: Huge memory requirements until rewritten. Minimun memory requirement is 64GB and large pagefile.
# 32GB probably isn't sufficient for 100,000.
# Performs following steps:
# create acbl board results df and write to file.
# 1) Read sqlite db into dataframes.
# 2) Create a single dataframe suitable for board/player analysis.
# 3) Write dataframe to disk using dataframe-friendly format. SQL isn't df friendly so using pickle, parquet or such.

# Next steps:
# acbl_board_results_cleanup.ipynb cleans acbl_board_results_raw file.

# Previous steps:
# download-acbl-details-make-json.ipynb

In [2]:
# requirements:
# conda install didn't work. Gave pyarrow/brotli error. stackoverflow recommended pip install which worked.
#   must restart jupyter. as of this date, conda installs v3 whereas pip installs v4.
# pip install pyarrow
# pip install brotlipy or maybe conda install -c conda-forge brotli


In [3]:
# todo:
# only process PAIRS?
# enable del to minimize memory usage?
# output arrow or parquet instead of sql? sql takes too long and then runs out of memory.
# why is tricks_taken erroring out? must have a str cell.
# don't create columns for spd,spns, spew. Use dict/list instead.

In [4]:
import config
import pandas as pd
import pathlib
import sqlalchemy
import mlBridgeLib

In [5]:
# override pandas display options
mlBridgeLib.pd_options_display()

In [6]:
rootPath = pathlib.Path('e:/bridge/data')
acblPath = rootPath.joinpath('acbl')

In [7]:
# using pathlib to create sqlite path.
db_connection_string = 'sqlite:///'+acblPath.joinpath('acbl-details.sqlite').as_posix()
db_connection_string

'sqlite:///e:/bridge/data/acbl/acbl-details.sqlite'

In [8]:
def db_to_frames_dict(engine):
    meta = sqlalchemy.MetaData()
    meta.reflect(engine)
    tables = meta.tables.keys()
    cnx = engine.raw_connection()
    return {t: pd.read_sql(f'SELECT * FROM {t}', cnx ) for t in tables}

In [9]:
# create dict of sql SELECT statements.
sql_selects_d = {
    'events':'SELECT id AS event_id, club_id_number, type AS event_type, board_scoring_method, tb_count, club_session FROM events',
    'board_results':'SELECT id AS board_result_id, board_id, round_number, table_number, ns_pair, ew_pair, ns_score, ew_score, contract, declarer, ew_match_points, ns_match_points, opening_lead, result, tricks_taken FROM board_results',
    'boards':'SELECT id AS board_id, section_id, board_number FROM boards',
    'pair_summaries':'SELECT id AS pair_summary_id, section_id, pair_number, direction FROM pair_summaries',
    'players':'SELECT id AS player_id, pair_summary_id, id_number AS player_number, name AS player_name, mp_total FROM players',
    'sessions':'SELECT id AS session_id, event_id, hand_record_id, game_date FROM sessions',
    'sections':'SELECT id AS section_id, session_id, name AS section_name FROM sections'
    }

In [10]:
# takes 11m for db with 300,000 result files. REQUIRES at least 32GB of memory, possibly more!
# read sql tables individually to conserve memory. board_results is huge, 90% of total. Total size is at least 4GB.
# sql results are 10 * tmp_file_size.
total_file_size = 0
for k,v in sql_selects_d.items():
    engine = sqlalchemy.create_engine(db_connection_string) #, echo=True)
    with engine.connect() as con:
        if False:
            dfs = db_to_frames_dict(engine)
        else:
            #con = engine.raw_connection
            # caution: line might end with LIMIT n or WHERE board_result_id < n
            print(f'{k}:{v}')
            tmp_file_path = acblPath.joinpath(k+'.tmp.pkl')
            pd.read_sql(v, con).to_pickle(tmp_file_path)
            tmp_file_size = tmp_file_path.stat().st_size
            total_file_size += tmp_file_size
            print(f'size:{tmp_file_size}/{total_file_size}')
    engine.dispose()
print(f'total size:{total_file_size}')

events:SELECT id AS event_id, club_id_number, type AS event_type, board_scoring_method, tb_count, club_session FROM events
size:20771784/20771784
board_results:SELECT id AS board_result_id, board_id, round_number, table_number, ns_pair, ew_pair, ns_score, ew_score, contract, declarer, ew_match_points, ns_match_points, opening_lead, result, tricks_taken FROM board_results
size:5501634881/5522406665
boards:SELECT id AS board_id, section_id, board_number FROM boards
size:180628409/5703035074
pair_summaries:SELECT id AS pair_summary_id, section_id, pair_number, direction FROM pair_summaries
size:155895990/5858931064
players:SELECT id AS player_id, pair_summary_id, id_number AS player_number, name AS player_name, mp_total FROM players
size:596240105/6455171169
sessions:SELECT id AS session_id, event_id, hand_record_id, game_date FROM sessions
size:14254597/6469425766
sections:SELECT id AS section_id, session_id, name AS section_name FROM sections
size:6144098/6475569864
total size:647556986

In [12]:
# takes 30s
# read back selected data info a dictionary of dataframes. Requires 4 * total_filesize of memory.
# Currently requires 15GB of memory.
dfs = {}
for k,v in sql_selects_d.items():
    tmp_file_path = acblPath.joinpath(k+'.tmp.pkl')
    print(f'Reading:{tmp_file_path}')
    dfs[k] = pd.read_pickle(tmp_file_path)
dfs.keys()

Reading:e:\bridge\data\acbl\events.tmp.pkl
Reading:e:\bridge\data\acbl\board_results.tmp.pkl
Reading:e:\bridge\data\acbl\boards.tmp.pkl
Reading:e:\bridge\data\acbl\pair_summaries.tmp.pkl
Reading:e:\bridge\data\acbl\players.tmp.pkl
Reading:e:\bridge\data\acbl\sessions.tmp.pkl
Reading:e:\bridge\data\acbl\sections.tmp.pkl


dict_keys(['events', 'board_results', 'boards', 'pair_summaries', 'players', 'sessions', 'sections'])

In [13]:
# creates list of keys headed by 'events' and followed by sorted list
for k in (dict.fromkeys(['events']+sorted(dfs))):
    display(k,dfs[k])

'events'

Unnamed: 0,event_id,club_id_number,event_type,board_scoring_method,tb_count,club_session
0,100961,100040,PAIRS,MATCH_POINTS,3.50,Monday Evening
1,110886,100040,PAIRS,MATCH_POINTS,3.50,Monday Evening
2,114736,100040,PAIRS,MATCH_POINTS,5.00,Monday Evening
3,126385,100040,PAIRS,MATCH_POINTS,5.50,Monday Evening
4,126418,100040,PAIRS,MATCH_POINTS,4.00,Monday Evening
5,135492,100040,PAIRS,MATCH_POINTS,7.00,Wednesday Evening
6,137856,100040,PAIRS,MATCH_POINTS,8.00,Wednesday Evening
7,140329,100040,PAIRS,MATCH_POINTS,9.00,Wednesday Evening
8,143200,100040,PAIRS,MATCH_POINTS,9.00,Wednesday Evening
9,146486,100040,PAIRS,MATCH_POINTS,9.00,Wednesday Evening


'board_results'

Unnamed: 0,board_result_id,board_id,round_number,table_number,ns_pair,ew_pair,ns_score,ew_score,contract,declarer,ew_match_points,ns_match_points,opening_lead,result,tricks_taken
0,22686741,3181214,3.00,3.00,4,2,140,-140,2 S,S,0.50,1.50,DK,+1,9
1,22686740,3181214,2.00,4.00,6,5,140,-140,3 S,S,0.50,1.50,HA,=,9
2,22686742,3181214,5.00,2.00,7,3,-50,50,4 S,S,2.00,0.00,DK,-1,9
3,22686743,3181215,2.00,4.00,6,5,-450,450,4 S,E,1.50,0.50,C5,+1,11
4,22686744,3181215,3.00,3.00,4,2,-450,450,4 S,E,1.50,0.50,C6,+1,11
5,22686745,3181215,5.00,2.00,7,3,-420,420,4 S,E,0.00,2.00,C6,=,10
6,22686747,3181216,3.00,3.00,4,2,90,-90,2 D,N,1.00,1.00,S9,=,8
7,22686746,3181216,2.00,4.00,6,5,400,-400,2 NT,E,0.00,2.00,D9,-4,4
8,22686748,3181216,5.00,2.00,7,3,-50,50,2 NT,S,2.00,0.00,S3,-1,7
9,22686749,3181217,2.00,4.00,6,5,100,-100,3 D,W,1.00,1.00,HK,-1,8


'boards'

Unnamed: 0,board_id,section_id,board_number
0,3181214,119248,1
1,3181215,119248,2
2,3181216,119248,3
3,3181217,119248,4
4,3181218,119248,5
5,3181219,119248,6
6,3181220,119248,7
7,3181221,119248,8
8,3181222,119248,9
9,3181223,119248,10


'pair_summaries'

Unnamed: 0,pair_summary_id,section_id,pair_number,direction
0,1848542,119248,4,
1,1848539,119248,1,
2,1848536,119248,3,
3,1848540,119248,6,
4,1848537,119248,2,
5,1848538,119248,5,
6,1848541,119248,7,
7,2010256,129563,4,
8,2010250,129563,3,
9,2010252,129563,5,


'players'

Unnamed: 0,player_id,pair_summary_id,player_number,player_name,mp_total
0,3695653,1848542,8052557,"Robson, Dave",693.89
1,3695652,1848542,tmp:1de5c0e8-a5a2-4433-a232-caa798c84939,"Gamache, Len(swap names)",19.00
2,3695646,1848539,tmp:1238a3ab-2652-4f75-921b-767aae646b2a,"Steele, Bruce",16.00
3,3695647,1848539,tmp:9f01395a-c179-4283-adf3-886ff6aed8aa,"Hlady, Henny",
4,3695641,1848536,5637562,"Williams, Peter",139.21
5,3695640,1848536,8360774,"Higgins, Pauline(swap names)",1164.08
6,3695648,1848540,tmp:07c9f971-1117-4e23-b771-7b2eaebfb1b3,"Penhale, Russ",11.00
7,3695649,1848540,tmp:c6ae1a9b-fd89-42b0-8c5d-1b294b811c97,"Charlesworth, Jack",2.00
8,3695642,1848537,5466520,"Stevens, Mikelle",321.97
9,3695643,1848537,tmp:8bf5e062-32a8-4b17-bbb1-07ee7a14c5be,"Sanderson, Val",11.00


'sections'

Unnamed: 0,section_id,session_id,section_name
0,119248,100961,M
1,129563,110886,A
2,133527,114736,F
3,145352,126385,T
4,145382,126418,B
5,155573,135492,A
6,158226,137856,A
7,161020,140329,A
8,164356,143200,A
9,168149,146486,A


'sessions'

Unnamed: 0,session_id,event_id,hand_record_id,game_date
0,100961,100961,77908,2020-02-17 00:00:00
1,110886,110886,SHUFFLE,2020-02-24 00:00:00
2,114736,114736,86634,2020-03-02 00:00:00
3,126385,126385,94064,2020-03-09 00:00:00
4,126418,126418,94082,2020-03-16 00:00:00
5,135492,135492,,2020-05-06 00:00:00
6,137856,137856,,2020-05-13 00:00:00
7,140329,140329,,2020-05-20 00:00:00
8,143200,143200,,2020-05-27 00:00:00
9,146486,146486,,2020-06-03 00:00:00


In [14]:
# todo: implement well_known_astype_conversions here? Create dict from columns of tables. Some done already.
if False:
    well_known_astype_conversions = {'board_result_id': 'UInt64', 'board_id': 'UInt64', 'round_number': 'UInt8',
        'table_number': 'UInt8', 'ns_pair': 'UInt8','ew_pair': 'UInt8', 'ns_score': 'str', 'ew_score': 'str',
        'contract': 'category', 'declarer': 'category','ew_match_points': 'float32',
        'ns_match_points': 'float32', 'opening_lead': 'category', 'result': 'str',
        'tricks_taken': 'Int8', 'section_id': 'UInt64', 'board_number': 'UInt8', 'session_id': 'UInt64',
        'section_name': 'category', 'player_id_ns': 'object', 'player_number_ns': 'object',
        'player_name_ns': 'object', 'player_id_ew': 'object', 'player_number_ew': 'object',
        'player_name_ew': 'object', 'mp_total_ns': 'object', 'mp_total_ew': 'object'}
    for c,t in well_known_astype_conversions.items():
        print(c,t)
        df[c] = df[c].astype(t)
    df.info()

In [15]:
# todo: only want sections which are pair events. events.type=='PAIRS'

In [16]:
# takes 30s.
# merge board and board_results dataframes on 'board_id'.
br_b_df = pd.merge(dfs['board_results'],dfs['boards'],left_on='board_id',right_on='board_id') #,how='inner',indicator=True)
#del dfs['board_results']
#del dfs['boards']
br_b_df

Unnamed: 0,board_result_id,board_id,round_number,table_number,ns_pair,ew_pair,ns_score,ew_score,contract,declarer,ew_match_points,ns_match_points,opening_lead,result,tricks_taken,section_id,board_number
0,22686741,3181214,3.00,3.00,4,2,140,-140,2 S,S,0.50,1.50,DK,+1,9,119248,1
1,22686740,3181214,2.00,4.00,6,5,140,-140,3 S,S,0.50,1.50,HA,=,9,119248,1
2,22686742,3181214,5.00,2.00,7,3,-50,50,4 S,S,2.00,0.00,DK,-1,9,119248,1
3,22686743,3181215,2.00,4.00,6,5,-450,450,4 S,E,1.50,0.50,C5,+1,11,119248,2
4,22686744,3181215,3.00,3.00,4,2,-450,450,4 S,E,1.50,0.50,C6,+1,11,119248,2
5,22686745,3181215,5.00,2.00,7,3,-420,420,4 S,E,0.00,2.00,C6,=,10,119248,2
6,22686747,3181216,3.00,3.00,4,2,90,-90,2 D,N,1.00,1.00,S9,=,8,119248,3
7,22686746,3181216,2.00,4.00,6,5,400,-400,2 NT,E,0.00,2.00,D9,-4,4,119248,3
8,22686748,3181216,5.00,2.00,7,3,-50,50,2 NT,S,2.00,0.00,S3,-1,7,119248,3
9,22686749,3181217,2.00,4.00,6,5,100,-100,3 D,W,1.00,1.00,HK,-1,8,119248,4


In [17]:
# takes 10s.
# merge sections dataframe on 'section_id'.
br_b_sections_df = pd.merge(br_b_df,dfs['sections'],left_on='section_id',right_on='section_id') #,how='inner',indicator=True)
#del br_b_df
#del dfs['sections']
br_b_sections_df

Unnamed: 0,board_result_id,board_id,round_number,table_number,ns_pair,ew_pair,ns_score,ew_score,contract,declarer,ew_match_points,ns_match_points,opening_lead,result,tricks_taken,section_id,board_number,session_id,section_name
0,22686741,3181214,3.00,3.00,4,2,140,-140,2 S,S,0.50,1.50,DK,+1,9,119248,1,100961,M
1,22686740,3181214,2.00,4.00,6,5,140,-140,3 S,S,0.50,1.50,HA,=,9,119248,1,100961,M
2,22686742,3181214,5.00,2.00,7,3,-50,50,4 S,S,2.00,0.00,DK,-1,9,119248,1,100961,M
3,22686743,3181215,2.00,4.00,6,5,-450,450,4 S,E,1.50,0.50,C5,+1,11,119248,2,100961,M
4,22686744,3181215,3.00,3.00,4,2,-450,450,4 S,E,1.50,0.50,C6,+1,11,119248,2,100961,M
5,22686745,3181215,5.00,2.00,7,3,-420,420,4 S,E,0.00,2.00,C6,=,10,119248,2,100961,M
6,22686747,3181216,3.00,3.00,4,2,90,-90,2 D,N,1.00,1.00,S9,=,8,119248,3,100961,M
7,22686746,3181216,2.00,4.00,6,5,400,-400,2 NT,E,0.00,2.00,D9,-4,4,119248,3,100961,M
8,22686748,3181216,5.00,2.00,7,3,-50,50,2 NT,S,2.00,0.00,S3,-1,7,119248,3,100961,M
9,22686749,3181217,2.00,4.00,6,5,100,-100,3 D,W,1.00,1.00,HK,-1,8,119248,4,100961,M


In [18]:
# takes 10s.
# merge sessions dataframe on 'session_id'.
br_b_sections_sessions_df = pd.merge(br_b_sections_df,dfs['sessions'],left_on='session_id',right_on='session_id') #,how='inner',indicator=True)
#del br_b_sections_df
#del dfs['sessions']
br_b_sections_sessions_df

Unnamed: 0,board_result_id,board_id,round_number,table_number,ns_pair,ew_pair,ns_score,ew_score,contract,declarer,ew_match_points,ns_match_points,opening_lead,result,tricks_taken,section_id,board_number,session_id,section_name,event_id,hand_record_id,game_date
0,22686741,3181214,3.00,3.00,4,2,140,-140,2 S,S,0.50,1.50,DK,+1,9,119248,1,100961,M,100961,77908,2020-02-17 00:00:00
1,22686740,3181214,2.00,4.00,6,5,140,-140,3 S,S,0.50,1.50,HA,=,9,119248,1,100961,M,100961,77908,2020-02-17 00:00:00
2,22686742,3181214,5.00,2.00,7,3,-50,50,4 S,S,2.00,0.00,DK,-1,9,119248,1,100961,M,100961,77908,2020-02-17 00:00:00
3,22686743,3181215,2.00,4.00,6,5,-450,450,4 S,E,1.50,0.50,C5,+1,11,119248,2,100961,M,100961,77908,2020-02-17 00:00:00
4,22686744,3181215,3.00,3.00,4,2,-450,450,4 S,E,1.50,0.50,C6,+1,11,119248,2,100961,M,100961,77908,2020-02-17 00:00:00
5,22686745,3181215,5.00,2.00,7,3,-420,420,4 S,E,0.00,2.00,C6,=,10,119248,2,100961,M,100961,77908,2020-02-17 00:00:00
6,22686747,3181216,3.00,3.00,4,2,90,-90,2 D,N,1.00,1.00,S9,=,8,119248,3,100961,M,100961,77908,2020-02-17 00:00:00
7,22686746,3181216,2.00,4.00,6,5,400,-400,2 NT,E,0.00,2.00,D9,-4,4,119248,3,100961,M,100961,77908,2020-02-17 00:00:00
8,22686748,3181216,5.00,2.00,7,3,-50,50,2 NT,S,2.00,0.00,S3,-1,7,119248,3,100961,M,100961,77908,2020-02-17 00:00:00
9,22686749,3181217,2.00,4.00,6,5,100,-100,3 D,W,1.00,1.00,HK,-1,8,119248,4,100961,M,100961,77908,2020-02-17 00:00:00


In [19]:
# takes 25s
# merge events dataframe on 'event_id'.
br_b_sections_sessions_events_df = pd.merge(br_b_sections_sessions_df,dfs['events'],left_on='event_id',right_on='event_id') #,how='inner',indicator=True)
#del br_b_sections_sessions_df
#del dfs['events']
br_b_sections_sessions_events_df

Unnamed: 0,board_result_id,board_id,round_number,table_number,ns_pair,ew_pair,ns_score,ew_score,contract,declarer,ew_match_points,ns_match_points,opening_lead,result,tricks_taken,section_id,board_number,session_id,section_name,event_id,hand_record_id,game_date,club_id_number,event_type,board_scoring_method,tb_count,club_session
0,22686741,3181214,3.00,3.00,4,2,140,-140,2 S,S,0.50,1.50,DK,+1,9,119248,1,100961,M,100961,77908,2020-02-17 00:00:00,100040,PAIRS,MATCH_POINTS,3.50,Monday Evening
1,22686740,3181214,2.00,4.00,6,5,140,-140,3 S,S,0.50,1.50,HA,=,9,119248,1,100961,M,100961,77908,2020-02-17 00:00:00,100040,PAIRS,MATCH_POINTS,3.50,Monday Evening
2,22686742,3181214,5.00,2.00,7,3,-50,50,4 S,S,2.00,0.00,DK,-1,9,119248,1,100961,M,100961,77908,2020-02-17 00:00:00,100040,PAIRS,MATCH_POINTS,3.50,Monday Evening
3,22686743,3181215,2.00,4.00,6,5,-450,450,4 S,E,1.50,0.50,C5,+1,11,119248,2,100961,M,100961,77908,2020-02-17 00:00:00,100040,PAIRS,MATCH_POINTS,3.50,Monday Evening
4,22686744,3181215,3.00,3.00,4,2,-450,450,4 S,E,1.50,0.50,C6,+1,11,119248,2,100961,M,100961,77908,2020-02-17 00:00:00,100040,PAIRS,MATCH_POINTS,3.50,Monday Evening
5,22686745,3181215,5.00,2.00,7,3,-420,420,4 S,E,0.00,2.00,C6,=,10,119248,2,100961,M,100961,77908,2020-02-17 00:00:00,100040,PAIRS,MATCH_POINTS,3.50,Monday Evening
6,22686747,3181216,3.00,3.00,4,2,90,-90,2 D,N,1.00,1.00,S9,=,8,119248,3,100961,M,100961,77908,2020-02-17 00:00:00,100040,PAIRS,MATCH_POINTS,3.50,Monday Evening
7,22686746,3181216,2.00,4.00,6,5,400,-400,2 NT,E,0.00,2.00,D9,-4,4,119248,3,100961,M,100961,77908,2020-02-17 00:00:00,100040,PAIRS,MATCH_POINTS,3.50,Monday Evening
8,22686748,3181216,5.00,2.00,7,3,-50,50,2 NT,S,2.00,0.00,S3,-1,7,119248,3,100961,M,100961,77908,2020-02-17 00:00:00,100040,PAIRS,MATCH_POINTS,3.50,Monday Evening
9,22686749,3181217,2.00,4.00,6,5,100,-100,3 D,W,1.00,1.00,HK,-1,8,119248,4,100961,M,100961,77908,2020-02-17 00:00:00,100040,PAIRS,MATCH_POINTS,3.50,Monday Evening


In [20]:
# takes 8m
#spns_keys = dict(zip([t+('NS',) for t in br_b_sections_sessions_events_df[['section_id','ns_pair']].apply(tuple, axis=1)], br_b_sections_sessions_events_df['board_result_id']))
#spew_keys = dict(zip([t+('EW',) for t in br_b_sections_sessions_events_df[['section_id','ew_pair']].apply(tuple, axis=1)], br_b_sections_sessions_events_df['board_result_id']))
#spd_keys = {}
#spd_keys.update(spns_keys)
#spd_keys.update(spew_keys)
#spd_keys

In [21]:
# takes 14m
# create columns 'spns' and 'spew' to aid merging of sections_pairs dataframe later on.
# todo: don't store spns, spew in df?
br_b_sections_pairs_df = br_b_sections_sessions_events_df.copy()
br_b_sections_pairs_df['spns'] = [t+('NS',) for t in br_b_sections_sessions_events_df[['section_id','ns_pair']].apply(tuple, axis=1)]
br_b_sections_pairs_df['spew'] = [t+('EW',) for t in br_b_sections_sessions_events_df[['section_id','ew_pair']].apply(tuple, axis=1)]
# del br_b_sections_sessions_events_df
br_b_sections_pairs_df

Unnamed: 0,board_result_id,board_id,round_number,table_number,ns_pair,ew_pair,ns_score,ew_score,contract,declarer,ew_match_points,ns_match_points,opening_lead,result,tricks_taken,section_id,board_number,session_id,section_name,event_id,hand_record_id,game_date,club_id_number,event_type,board_scoring_method,tb_count,club_session,spns,spew
0,22686741,3181214,3.00,3.00,4,2,140,-140,2 S,S,0.50,1.50,DK,+1,9,119248,1,100961,M,100961,77908,2020-02-17 00:00:00,100040,PAIRS,MATCH_POINTS,3.50,Monday Evening,"(119248, 4, NS)","(119248, 2, EW)"
1,22686740,3181214,2.00,4.00,6,5,140,-140,3 S,S,0.50,1.50,HA,=,9,119248,1,100961,M,100961,77908,2020-02-17 00:00:00,100040,PAIRS,MATCH_POINTS,3.50,Monday Evening,"(119248, 6, NS)","(119248, 5, EW)"
2,22686742,3181214,5.00,2.00,7,3,-50,50,4 S,S,2.00,0.00,DK,-1,9,119248,1,100961,M,100961,77908,2020-02-17 00:00:00,100040,PAIRS,MATCH_POINTS,3.50,Monday Evening,"(119248, 7, NS)","(119248, 3, EW)"
3,22686743,3181215,2.00,4.00,6,5,-450,450,4 S,E,1.50,0.50,C5,+1,11,119248,2,100961,M,100961,77908,2020-02-17 00:00:00,100040,PAIRS,MATCH_POINTS,3.50,Monday Evening,"(119248, 6, NS)","(119248, 5, EW)"
4,22686744,3181215,3.00,3.00,4,2,-450,450,4 S,E,1.50,0.50,C6,+1,11,119248,2,100961,M,100961,77908,2020-02-17 00:00:00,100040,PAIRS,MATCH_POINTS,3.50,Monday Evening,"(119248, 4, NS)","(119248, 2, EW)"
5,22686745,3181215,5.00,2.00,7,3,-420,420,4 S,E,0.00,2.00,C6,=,10,119248,2,100961,M,100961,77908,2020-02-17 00:00:00,100040,PAIRS,MATCH_POINTS,3.50,Monday Evening,"(119248, 7, NS)","(119248, 3, EW)"
6,22686747,3181216,3.00,3.00,4,2,90,-90,2 D,N,1.00,1.00,S9,=,8,119248,3,100961,M,100961,77908,2020-02-17 00:00:00,100040,PAIRS,MATCH_POINTS,3.50,Monday Evening,"(119248, 4, NS)","(119248, 2, EW)"
7,22686746,3181216,2.00,4.00,6,5,400,-400,2 NT,E,0.00,2.00,D9,-4,4,119248,3,100961,M,100961,77908,2020-02-17 00:00:00,100040,PAIRS,MATCH_POINTS,3.50,Monday Evening,"(119248, 6, NS)","(119248, 5, EW)"
8,22686748,3181216,5.00,2.00,7,3,-50,50,2 NT,S,2.00,0.00,S3,-1,7,119248,3,100961,M,100961,77908,2020-02-17 00:00:00,100040,PAIRS,MATCH_POINTS,3.50,Monday Evening,"(119248, 7, NS)","(119248, 3, EW)"
9,22686749,3181217,2.00,4.00,6,5,100,-100,3 D,W,1.00,1.00,HK,-1,8,119248,4,100961,M,100961,77908,2020-02-17 00:00:00,100040,PAIRS,MATCH_POINTS,3.50,Monday Evening,"(119248, 6, NS)","(119248, 5, EW)"


In [22]:
#br_b_sections_pairkeydf = br_b_sections_df.copy()
#pair_keys = dfs['pair_summaries'][['section_id','pair_number','direction']].apply(tuple, axis=1)
#pair_keys = dict(zip(dfs['pair_summaries'][['section_id','pair_number','direction']].apply(tuple, axis=1), dfs['pair_summaries']['pair_summary_id']))
#pair_keys

In [23]:
# takes 1m30s.
# merge pair_summaries on pair_summary_id.
br_b_pair_players_df = pd.merge(dfs['pair_summaries'],dfs['players'],left_on='pair_summary_id',right_on='pair_summary_id') #,how='inner',indicator=True)
br_b_pair_players_df['spd'] = br_b_pair_players_df[['section_id','pair_number','direction']].apply(tuple, axis=1)
br_b_pair_players_df.set_index('spd', inplace=True)
br_b_pair_players_df.update('"' + br_b_pair_players_df[['player_name']].astype(str) + '"') # quotes 'player_name' column
br_b_pair_players_df

Unnamed: 0_level_0,pair_summary_id,section_id,pair_number,direction,player_id,player_number,player_name,mp_total
spd,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
"(119248, 4, None)",1848542,119248,4,,3695653,8052557,"""Robson, Dave""",693.89
"(119248, 4, None)",1848542,119248,4,,3695652,tmp:1de5c0e8-a5a2-4433-a232-caa798c84939,"""Gamache, Len(swap names)""",19.00
"(119248, 1, None)",1848539,119248,1,,3695646,tmp:1238a3ab-2652-4f75-921b-767aae646b2a,"""Steele, Bruce""",16.00
"(119248, 1, None)",1848539,119248,1,,3695647,tmp:9f01395a-c179-4283-adf3-886ff6aed8aa,"""Hlady, Henny""",
"(119248, 3, None)",1848536,119248,3,,3695641,5637562,"""Williams, Peter""",139.21
"(119248, 3, None)",1848536,119248,3,,3695640,8360774,"""Higgins, Pauline(swap names)""",1164.08
"(119248, 6, None)",1848540,119248,6,,3695648,tmp:07c9f971-1117-4e23-b771-7b2eaebfb1b3,"""Penhale, Russ""",11.00
"(119248, 6, None)",1848540,119248,6,,3695649,tmp:c6ae1a9b-fd89-42b0-8c5d-1b294b811c97,"""Charlesworth, Jack""",2.00
"(119248, 2, None)",1848537,119248,2,,3695642,5466520,"""Stevens, Mikelle""",321.97
"(119248, 2, None)",1848537,119248,2,,3695643,tmp:8bf5e062-32a8-4b17-bbb1-07ee7a14c5be,"""Sanderson, Val""",11.00


In [24]:
# takes 9m
# group player data on 'spd'.
gb = br_b_pair_players_df.groupby(['spd']) # ['spd','section_id','pair_number','direction']
gbdf = gb[['player_id','player_number','player_name','mp_total']].agg(list) #.reset_index()
gbdf

Unnamed: 0_level_0,player_id,player_number,player_name,mp_total
spd,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
"(3, 1, NS)","[94, 93]","[5697522, 2170833]","[""Martin, Betty"", ""Robbins, Bob(swap names)""]","[6194.71, 2978.84]"
"(3, 2, NS)","[87, 88]","[1302604, 2728524]","[""Campbell, Patrick"", ""Bahry, Sharon""]","[18.37, 3329.61]"
"(3, 3, NS)","[104, 103]","[2589702, 1835181]","[""Merritt, Steven"", ""Kleckner, Patricia(swap names)""]","[177.8, 261.78]"
"(3, 4, NS)","[95, 96]","[8644756, 5827191]","[""Spalding, Lorita"", ""Lane, Kathy""]","[2741.9, 2677.4]"
"(3, 5, NS)","[92, 91]","[4402758, 5280141]","[""Jensen, Virginia"", ""Petersen, George(swap names)""]","[1328.7, 745.93]"
"(3, 6, NS)","[101, 102]","[5437121, 7604602]","[""Mall, Jacquelyn"", ""Disbrow, Lynford""]","[219.08, 6.22]"
"(3, 7, NS)","[100, 99]","[5753813, 5159881]","[""Evans, Kathryn"", ""Weiss, Rick(swap names)""]","[1225.39, 1951.3]"
"(3, 8, NS)","[98, 97]","[7892268, 5549337]","[""Clarke, George"", ""Kornbluth, Bobbi(swap names)""]","[1047.93, 1068.37]"
"(3, 9, NS)","[90, 89]","[2123991, 4695429]","[""Scoggin, Richard"", ""Nojima, Kazuko(swap names)""]","[3834.03, 1581.76]"
"(3, 10, NS)","[86, 85]","[2351811, 8594279]","[""Lucks, Sybil"", ""Gupta, Satish(swap names)""]","[2638.8, 2885.69]"


In [25]:
# takes 1m30s
# merge sections_pairs dataframe on 'sp??' and spd.
br_b_sections_pairs_players_df = pd.merge(br_b_sections_pairs_df,gbdf,left_on='spns',right_on='spd')
br_b_sections_pairs_players_df = pd.merge(br_b_sections_pairs_players_df,gbdf,left_on='spew',right_on='spd',suffixes=('_ns','_ew'))
br_b_sections_pairs_players_df

Unnamed: 0,board_result_id,board_id,round_number,table_number,ns_pair,ew_pair,ns_score,ew_score,contract,declarer,ew_match_points,ns_match_points,opening_lead,result,tricks_taken,section_id,board_number,session_id,section_name,event_id,hand_record_id,game_date,club_id_number,event_type,board_scoring_method,tb_count,club_session,spns,spew,player_id_ns,player_number_ns,player_name_ns,mp_total_ns,player_id_ew,player_number_ew,player_name_ew,mp_total_ew
0,29145890,4065740,,,1,1,-140,140,2S,W,4.50,1.50,,,,155573,1,135492,A,135492,,2020-05-06 00:00:00,100040,PAIRS,MATCH_POINTS,7.00,Wednesday Evening,"(155573, 1, NS)","(155573, 1, EW)","[4832775, 4832776]","[tmp:995300bb-6081-4285-a93a-9342d2e7bc1b, tmp:a659aa2e-1428-4ea4-87d3-581db1c2dfad]","[""sunfeather"", ""jakspade""]","[nan, nan]","[4832778, 4832777]","[1872419, 1872427]","[""Anne Kendall"", ""Robert Coole(swap names)""]","[1606.12, 1765.12]"
1,29145897,4065741,,,1,1,650,-650,5H,N,2.50,3.50,,,,155573,2,135492,A,135492,,2020-05-06 00:00:00,100040,PAIRS,MATCH_POINTS,7.00,Wednesday Evening,"(155573, 1, NS)","(155573, 1, EW)","[4832775, 4832776]","[tmp:995300bb-6081-4285-a93a-9342d2e7bc1b, tmp:a659aa2e-1428-4ea4-87d3-581db1c2dfad]","[""sunfeather"", ""jakspade""]","[nan, nan]","[4832778, 4832777]","[1872419, 1872427]","[""Anne Kendall"", ""Robert Coole(swap names)""]","[1606.12, 1765.12]"
2,29145904,4065742,,,1,1,100,-100,3N,W,2.00,4.00,,,,155573,3,135492,A,135492,,2020-05-06 00:00:00,100040,PAIRS,MATCH_POINTS,7.00,Wednesday Evening,"(155573, 1, NS)","(155573, 1, EW)","[4832775, 4832776]","[tmp:995300bb-6081-4285-a93a-9342d2e7bc1b, tmp:a659aa2e-1428-4ea4-87d3-581db1c2dfad]","[""sunfeather"", ""jakspade""]","[nan, nan]","[4832778, 4832777]","[1872419, 1872427]","[""Anne Kendall"", ""Robert Coole(swap names)""]","[1606.12, 1765.12]"
3,29145956,4065749,,,4,1,-140,140,2H,W,3.50,2.50,,,,155573,10,135492,A,135492,,2020-05-06 00:00:00,100040,PAIRS,MATCH_POINTS,7.00,Wednesday Evening,"(155573, 4, NS)","(155573, 1, EW)","[4832788, 4832787]","[tmp:05fb8b01-1366-4fc8-9bb4-13c7f6f012a7, tmp:abf67235-1e01-4f8e-a927-95be148ea12f]","[""valqtown"", ""whqtown(swap names)""]","[nan, nan]","[4832778, 4832777]","[1872419, 1872427]","[""Anne Kendall"", ""Robert Coole(swap names)""]","[1606.12, 1765.12]"
4,29145963,4065750,,,4,1,480,-480,4S,S,0.50,5.50,,,,155573,11,135492,A,135492,,2020-05-06 00:00:00,100040,PAIRS,MATCH_POINTS,7.00,Wednesday Evening,"(155573, 4, NS)","(155573, 1, EW)","[4832788, 4832787]","[tmp:05fb8b01-1366-4fc8-9bb4-13c7f6f012a7, tmp:abf67235-1e01-4f8e-a927-95be148ea12f]","[""valqtown"", ""whqtown(swap names)""]","[nan, nan]","[4832778, 4832777]","[1872419, 1872427]","[""Anne Kendall"", ""Robert Coole(swap names)""]","[1606.12, 1765.12]"
5,29145970,4065751,,,4,1,-90,90,1N,E,1.00,5.00,,,,155573,12,135492,A,135492,,2020-05-06 00:00:00,100040,PAIRS,MATCH_POINTS,7.00,Wednesday Evening,"(155573, 4, NS)","(155573, 1, EW)","[4832788, 4832787]","[tmp:05fb8b01-1366-4fc8-9bb4-13c7f6f012a7, tmp:abf67235-1e01-4f8e-a927-95be148ea12f]","[""valqtown"", ""whqtown(swap names)""]","[nan, nan]","[4832778, 4832777]","[1872419, 1872427]","[""Anne Kendall"", ""Robert Coole(swap names)""]","[1606.12, 1765.12]"
6,29146000,4065755,,,6,1,-620,620,4S,W,5.00,1.00,,,,155573,16,135492,A,135492,,2020-05-06 00:00:00,100040,PAIRS,MATCH_POINTS,7.00,Wednesday Evening,"(155573, 6, NS)","(155573, 1, EW)","[4832795, 4832796]","[1015699, 6996043]","[""Maureen Larson"", ""Ben Levine""]","[1078.3, 309.88]","[4832778, 4832777]","[1872419, 1872427]","[""Anne Kendall"", ""Robert Coole(swap names)""]","[1606.12, 1765.12]"
7,29146007,4065756,,,6,1,-150,150,2D,N,4.50,1.50,,,,155573,17,135492,A,135492,,2020-05-06 00:00:00,100040,PAIRS,MATCH_POINTS,7.00,Wednesday Evening,"(155573, 6, NS)","(155573, 1, EW)","[4832795, 4832796]","[1015699, 6996043]","[""Maureen Larson"", ""Ben Levine""]","[1078.3, 309.88]","[4832778, 4832777]","[1872419, 1872427]","[""Anne Kendall"", ""Robert Coole(swap names)""]","[1606.12, 1765.12]"
8,29146014,4065757,,,6,1,100,-100,5DX,W,0.00,6.00,,,,155573,18,135492,A,135492,,2020-05-06 00:00:00,100040,PAIRS,MATCH_POINTS,7.00,Wednesday Evening,"(155573, 6, NS)","(155573, 1, EW)","[4832795, 4832796]","[1015699, 6996043]","[""Maureen Larson"", ""Ben Levine""]","[1078.3, 309.88]","[4832778, 4832777]","[1872419, 1872427]","[""Anne Kendall"", ""Robert Coole(swap names)""]","[1606.12, 1765.12]"
9,29145978,4065752,,,5,1,-110,110,2S,W,3.50,2.50,,,,155573,13,135492,A,135492,,2020-05-06 00:00:00,100040,PAIRS,MATCH_POINTS,7.00,Wednesday Evening,"(155573, 5, NS)","(155573, 1, EW)","[4832792, 4832791]","[5231051, 9311599]","[""Jeff Rowland"", ""Shirley Campbell(swap names)""]","[56.6, 128.2]","[4832778, 4832777]","[1872419, 1872427]","[""Anne Kendall"", ""Robert Coole(swap names)""]","[1606.12, 1765.12]"


In [26]:
#br_b_sections_pairs_players_df['tricks_taken'].isnumeric()

In [27]:
# takes 6s
# convert 'tricks_taken' to numeric.
# todo: something is wrong with tricks_taken data. maybe there's a string instead of null?
br_b_sections_pairs_players_df['tricks_taken'] = pd.to_numeric(br_b_sections_pairs_players_df['tricks_taken'])

In [28]:
# takes 2m
# tuple column ok
#br_b_sections_pairs_players_df.to_pickle('acbl-board-results_raw.pkl')

In [29]:
# takes 1m20s
# drop temporary columns 'spns', 'spew'. They were used to make merging easier and are no longer needed.
# sql doesn't allow columns which are tuples or lists. Either delete them or quote them. Probably shouldn't be in df anyway?
br_b_sections_pairs_players_df.drop(columns=['spns','spew'],inplace=True,errors='ignore') # tuples
#br_b_sections_pairs_players_df.drop(columns=['player_id','player_number','player_name'],inplace=True,errors='ignore') # lists

In [30]:
# ns_score, ew_score are usually Int16 but can contain 'PASS', 'NP', 'AVE+', 'AVE-', etc.
# result is usually Int8 but can be '=', '-+', ...
# hand_record_id has UInt64, None, '' and 'SHUFFLE'
# note: parquet doesn't handle float16

In [31]:
# takes 2m10s
# perform well know astype conversions.
# todo: assert that all columns are in the list.
# todo: making 'tricks_taken' float32 because raises conversion error when 'Int8'
well_known_astype_conversions = {'board_result_id': 'UInt64', 'board_id': 'UInt64', 'round_number': 'UInt8',
    'table_number': 'UInt8', 'ns_pair': 'UInt8','ew_pair': 'UInt8', 'ns_score': 'str', 'ew_score': 'str',
    'contract': 'category', 'declarer': 'category','ew_match_points': 'float32',
    'ns_match_points': 'float32', 'opening_lead': 'category', 'result': 'str', 'tricks_taken': 'float32',
    'section_id': 'UInt64', 'board_number': 'UInt8', 'session_id': 'UInt64', 'section_name': 'category',
    'event_id': 'UInt64', 'hand_record_id': 'str', 'game_date': 'datetime64[ns, US/Central]', 'club_id_number': 'UInt64',
    'event_type': 'category', 'board_scoring_method': 'category', 'tb_count': 'float32', 'club_session': 'category',
    'player_id_ns': 'object', 'player_number_ns': 'object', 'player_name_ns': 'object', 'player_name_ew': 'object',
    'mp_total_ns': 'object', 'mp_total_ew':'object'}
for c,t in well_known_astype_conversions.items():
    print(c,t)
    br_b_sections_pairs_players_df[c] = br_b_sections_pairs_players_df[c].astype(t)
br_b_sections_pairs_players_df.info()

board_result_id UInt64
board_id UInt64
round_number UInt8
table_number UInt8
ns_pair UInt8
ew_pair UInt8
ns_score str
ew_score str
contract category
declarer category
ew_match_points float32
ns_match_points float32
opening_lead category
result str
tricks_taken float32
section_id UInt64
board_number UInt8
session_id UInt64
section_name category
event_id UInt64
hand_record_id str
game_date datetime64[ns, US/Central]
club_id_number UInt64
event_type category
board_scoring_method category
tb_count float32
club_session category
player_id_ns object
player_number_ns object
player_name_ns object
player_name_ew object
mp_total_ns object
mp_total_ew object
<class 'pandas.core.frame.DataFrame'>
Int64Index: 47151666 entries, 0 to 47151665
Data columns (total 35 columns):
 #   Column                Dtype                     
---  ------                -----                     
 0   board_result_id       UInt64                    
 1   board_id              UInt64                    
 2   round_num

In [32]:
# takes 2m51s
# try other compression schemes
# tuple columns not allowed
#br_b_sections_pairs_players_df.to_parquet('acbl-board-results-raw.parquet.gzip',compression='gzip')  

In [33]:
# takes 1m30s
# tuple columns not allowed
#br_b_sections_pairs_players_df.to_parquet('acbl-board-results-raw.parquet')

In [34]:
# takes 1m15s
# all done. write out file.
board_results_file = acblPath.joinpath('acbl-board-results.pkl')
br_b_sections_pairs_players_df.to_pickle(board_results_file)

In [35]:
# takes 15 minutes.
# tuple and list columns not allowed
# caution: looks like dataframe is too large for sql to handle??
#db_cgfile_connection_string = 'sqlite:///acbl-board-results.sqlite'
#engine_cgfile = sqlalchemy.create_engine(db_cgfile_connection_string) #, echo=True)
#br_b_sections_pairs_players_df.to_sql('cg', con=engine_cgfile, if_exists='replace', index_label='id')
#engine_cgfile.dispose()