In [1]:
# takes 11m for 6,000,000 pair summaries.
# create pair_summaries for training

# Next steps:
# acbl_board_results_cleanup.ipynb cleans acbl_board_results_raw file.

# Previous steps:
# download-acbl-details-make-json.ipynb

# todo:
# filter out non-pair events
# predict percentage for each board result
# predict rank
# combine with board results to predict tricks taken

In [2]:
import config
import pandas as pd
import pathlib
import pickle
import sqlalchemy
import sys

In [3]:
sys.path.append(str(pathlib.Path.cwd().parent.joinpath('mlBridgeLib')))
sys.path
import mlBridgeLib

In [4]:
# override pandas display options
mlBridgeLib.pd_options_display()

In [5]:
rootPath = pathlib.Path('e:/bridge/data')
acblPath = rootPath.joinpath('acbl')
bprPath = rootPath.joinpath('bpr')

In [6]:
# using pathlib to create sqlite path.
db_connection_string = 'sqlite:///'+acblPath.joinpath('acbl-details.sqlite').as_posix()
db_connection_string

'sqlite:///e:/bridge/data/acbl/acbl-details.sqlite'

In [7]:
def db_to_frames_dict(engine):
    meta = sqlalchemy.MetaData()
    meta.reflect(engine)
    tables = meta.tables.keys()
    cnx = engine.raw_connection()
    return {t: pd.read_sql(f'SELECT * FROM {t}', cnx ) for t in tables}

In [8]:
# create dict of sql SELECT statements.
sql_selects_d = {
    # WHERE is_eligible = 1 ?
    'pair_summaries':'SELECT id AS pair_summary_id, section_id, pair_number, direction, strat, percentage, is_eligible FROM pair_summaries',
    'players':'SELECT id AS player_id, pair_summary_id, id_number AS player_number, name AS player_name, mp_total FROM players',
    'awards':'SELECT player_id, id_number as player_number, type, rank FROM awards',
    'sections':'SELECT id AS section_id, session_id, name AS section_name FROM sections',
    'sessions':'SELECT id AS session_id, event_id, hand_record_id, game_date FROM sessions',
    'events':'SELECT id AS event_id, name AS event_name, club_name, club_id_number, type as event_type, rating, board_scoring_method, stratification_type, winner_type, club_class FROM events',
    'strats':'SELECT id AS strat_id, event_id, number AS strats_number, label AS strats_label, [limit] AS strats_limit FROM strats',
    'strat_place':'SELECT id AS strat_place_id, pair_summary_id, strat_number AS strat_place_number, rank AS strat_place_rank, type AS strat_place_type FROM strat_place',
    'stratValues':'SELECT id AS stratValues_id, events AS event_id, label AS stratValues_label, [limit] AS stratValues_limit FROM stratValues'
    }

In [9]:
# takes 3m
dfs = {}
for k,v in sql_selects_d.items():
    engine = sqlalchemy.create_engine(db_connection_string) #, echo=True)
    with engine.connect() as con:
        if False:
            dfs = db_to_frames_dict(engine)
        else:
            print(f'{k}:{v}')
            dfs[k] = pd.read_sql(v, con)
    engine.dispose()

pair_summaries:SELECT id AS pair_summary_id, section_id, pair_number, direction, strat, percentage, is_eligible FROM pair_summaries
players:SELECT id AS player_id, pair_summary_id, id_number AS player_number, name AS player_name, mp_total FROM players
awards:SELECT player_id, id_number as player_number, type, rank FROM awards
sections:SELECT id AS section_id, session_id, name AS section_name FROM sections
sessions:SELECT id AS session_id, event_id, hand_record_id, game_date FROM sessions
events:SELECT id AS event_id, name AS event_name, club_name, club_id_number, type as event_type, rating, board_scoring_method, stratification_type, winner_type, club_class FROM events
strats:SELECT id AS strat_id, event_id, number AS strats_number, label AS strats_label, [limit] AS strats_limit FROM strats
strat_place:SELECT id AS strat_place_id, pair_summary_id, strat_number AS strat_place_number, rank AS strat_place_rank, type AS strat_place_type FROM strat_place
stratValues:SELECT id AS stratValues_

In [10]:
for k,v in dfs.items():
    display(k,v.head(),v.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6944605 entries, 0 to 6944604
Data columns (total 7 columns):
 #   Column           Dtype  
---  ------           -----  
 0   pair_summary_id  int64  
 1   section_id       int64  
 2   pair_number      int64  
 3   direction        object 
 4   strat            float64
 5   percentage       float64
 6   is_eligible      float64
dtypes: float64(3), int64(3), object(1)
memory usage: 370.9+ MB


'pair_summaries'

Unnamed: 0,pair_summary_id,section_id,pair_number,direction,strat,percentage,is_eligible
0,1848542,119248,4,,1.0,62.5,1.0
1,1848539,119248,1,,1.0,57.29,1.0
2,1848536,119248,3,,1.0,51.04,1.0
3,1848540,119248,6,,1.0,51.04,1.0
4,1848537,119248,2,,1.0,44.79,1.0


None

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13881248 entries, 0 to 13881247
Data columns (total 5 columns):
 #   Column           Dtype  
---  ------           -----  
 0   player_id        int64  
 1   pair_summary_id  int64  
 2   player_number    object 
 3   player_name      object 
 4   mp_total         float64
dtypes: float64(1), int64(2), object(2)
memory usage: 529.5+ MB


'players'

Unnamed: 0,player_id,pair_summary_id,player_number,player_name,mp_total
0,3695653,1848542,8052557,"Robson, Dave",693.89
1,3695652,1848542,tmp:1de5c0e8-a5a2-4433-a232-caa798c84939,"Gamache, Len(swap names)",19.0
2,3695646,1848539,tmp:1238a3ab-2652-4f75-921b-767aae646b2a,"Steele, Bruce",16.0
3,3695647,1848539,tmp:9f01395a-c179-4283-adf3-886ff6aed8aa,"Hlady, Henny",
4,3695641,1848536,5637562,"Williams, Peter",139.21


None

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5349888 entries, 0 to 5349887
Data columns (total 4 columns):
 #   Column         Dtype 
---  ------         ----- 
 0   player_id      int64 
 1   player_number  object
 2   type           object
 3   rank           object
dtypes: int64(1), object(3)
memory usage: 163.3+ MB


'awards'

Unnamed: 0,player_id,player_number,type,rank
0,3695653,8052557,section,1
1,3695652,tmp:1de5c0e8-a5a2-4433-a232-caa798c84939,section,1
2,3695646,tmp:1238a3ab-2652-4f75-921b-767aae646b2a,section,2
3,3695647,tmp:9f01395a-c179-4283-adf3-886ff6aed8aa,section,2
4,3695641,5637562,section,3/4


None

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 454087 entries, 0 to 454086
Data columns (total 3 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   section_id    454087 non-null  int64 
 1   session_id    454087 non-null  int64 
 2   section_name  454087 non-null  object
dtypes: int64(2), object(1)
memory usage: 10.4+ MB


'sections'

Unnamed: 0,section_id,session_id,section_name
0,119248,100961,M
1,129563,110886,A
2,133527,114736,F
3,145352,126385,T
4,145382,126418,B


None

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 430054 entries, 0 to 430053
Data columns (total 4 columns):
 #   Column          Non-Null Count   Dtype 
---  ------          --------------   ----- 
 0   session_id      430054 non-null  int64 
 1   event_id        430054 non-null  int64 
 2   hand_record_id  375217 non-null  object
 3   game_date       430054 non-null  object
dtypes: int64(2), object(2)
memory usage: 13.1+ MB


'sessions'

Unnamed: 0,session_id,event_id,hand_record_id,game_date
0,100961,100961,77908,2020-02-17 00:00:00
1,110886,110886,SHUFFLE,2020-02-24 00:00:00
2,114736,114736,86634,2020-03-02 00:00:00
3,126385,126385,94064,2020-03-09 00:00:00
4,126418,126418,94082,2020-03-16 00:00:00


None

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 430051 entries, 0 to 430050
Data columns (total 10 columns):
 #   Column                Non-Null Count   Dtype 
---  ------                --------------   ----- 
 0   event_id              430051 non-null  int64 
 1   event_name            430051 non-null  object
 2   club_name             430051 non-null  object
 3   club_id_number        430051 non-null  int64 
 4   event_type            430051 non-null  object
 5   rating                430051 non-null  int64 
 6   board_scoring_method  430051 non-null  object
 7   stratification_type   430051 non-null  object
 8   winner_type           430051 non-null  int64 
 9   club_class            430051 non-null  int64 
dtypes: int64(5), object(5)
memory usage: 32.8+ MB


'events'

Unnamed: 0,event_id,event_name,club_name,club_id_number,event_type,rating,board_scoring_method,stratification_type,winner_type,club_class
0,100961,Monday Evening Pairs,Cariboo Bridge Club 1,100040,PAIRS,1,MATCH_POINTS,HIGHEST,1,4
1,110886,Monday Evening Pairs,Cariboo Bridge Club 1,100040,PAIRS,1,MATCH_POINTS,HIGHEST,1,4
2,114736,Monday Evening Pairs,Cariboo Bridge Club 1,100040,PAIRS,1,MATCH_POINTS,HIGHEST,1,4
3,126385,Monday Evening Pairs,Cariboo Bridge Club 1,100040,PAIRS,1,MATCH_POINTS,HIGHEST,1,4
4,126418,Monday Evening Pairs,Cariboo Bridge Club 1,100040,PAIRS,1,MATCH_POINTS,HIGHEST,1,4


None

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1193974 entries, 0 to 1193973
Data columns (total 5 columns):
 #   Column         Non-Null Count    Dtype 
---  ------         --------------    ----- 
 0   strat_id       1193974 non-null  int64 
 1   event_id       1193974 non-null  int64 
 2   strats_number  1193974 non-null  int64 
 3   strats_label   1193974 non-null  object
 4   strats_limit   1193974 non-null  int64 
dtypes: int64(4), object(1)
memory usage: 45.5+ MB


'strats'

Unnamed: 0,strat_id,event_id,strats_number,strats_label,strats_limit
0,274629,100961,1,A,0
1,301290,110886,1,A,0
2,311626,114736,1,A,0
3,342416,126385,1,A,0
4,342417,126385,2,B,100


None

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7037141 entries, 0 to 7037140
Data columns (total 5 columns):
 #   Column              Dtype 
---  ------              ----- 
 0   strat_place_id      int64 
 1   pair_summary_id     int64 
 2   strat_place_number  int64 
 3   strat_place_rank    object
 4   strat_place_type    object
dtypes: int64(3), object(2)
memory usage: 268.4+ MB


'strat_place'

Unnamed: 0,strat_place_id,pair_summary_id,strat_place_number,strat_place_rank,strat_place_type
0,1997656,1848542,1,1,Section
1,1997654,1848539,1,2,Section
2,1997653,1848536,1,3/4,Section
3,1997655,1848540,1,3/4,Section
4,2172350,2010256,1,1,Section


None

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1193974 entries, 0 to 1193973
Data columns (total 4 columns):
 #   Column             Non-Null Count    Dtype 
---  ------             --------------    ----- 
 0   stratValues_id     1193974 non-null  object
 1   event_id           1193974 non-null  int64 
 2   stratValues_label  1193974 non-null  object
 3   stratValues_limit  1193974 non-null  int64 
dtypes: int64(2), object(2)
memory usage: 36.4+ MB


'stratValues'

Unnamed: 0,stratValues_id,event_id,stratValues_label,stratValues_limit
0,100961-1,100961,A,0
1,110886-1,110886,A,0
2,114736-1,114736,A,0
3,126385-1,126385,A,0
4,126385-2,126385,B,100


None

In [11]:
pair_summaries_dtypes = {
    'pair_summary_id':'int64',
    'section_id':'int64',
    'pair_number':'int64',
    'direction':'string',
    'strat':'Int8',
    'percentage':'float32',
    'is_eligible':'boolean'
}
df_pair_summaries = dfs['pair_summaries']
df_pair_summaries = df_pair_summaries.astype(pair_summaries_dtypes)
display(df_pair_summaries.isna().sum(), df_pair_summaries.info(), df_pair_summaries.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6944605 entries, 0 to 6944604
Data columns (total 7 columns):
 #   Column           Dtype  
---  ------           -----  
 0   pair_summary_id  int64  
 1   section_id       int64  
 2   pair_number      int64  
 3   direction        string 
 4   strat            Int8   
 5   percentage       float32
 6   is_eligible      boolean
dtypes: Int8(1), boolean(1), float32(1), int64(3), string(1)
memory usage: 264.9 MB


pair_summary_id         0
section_id              0
pair_number             0
direction          759195
strat                  44
percentage           9421
is_eligible        889123
dtype: int64

None

Unnamed: 0,pair_summary_id,section_id,pair_number,direction,strat,percentage,is_eligible
0,1848542,119248,4,,1,62.5,True
1,1848539,119248,1,,1,57.29,True
2,1848536,119248,3,,1,51.04,True
3,1848540,119248,6,,1,51.04,True
4,1848537,119248,2,,1,44.79,True


In [12]:
players_dtypes = {
    'player_id':'int64',
    'pair_summary_id':'int64',
    'player_number':'string',
    'player_name':'string',
    'mp_total':'float32'
}
df_players = dfs['players']
df_players = df_players.astype(players_dtypes)
display(df_players.isna().sum(), df_players.info(), df_players.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13881248 entries, 0 to 13881247
Data columns (total 5 columns):
 #   Column           Dtype  
---  ------           -----  
 0   player_id        int64  
 1   pair_summary_id  int64  
 2   player_number    string 
 3   player_name      string 
 4   mp_total         float32
dtypes: float32(1), int64(2), string(2)
memory usage: 476.6 MB


player_id               0
pair_summary_id         0
player_number         270
player_name             0
mp_total           388449
dtype: int64

None

Unnamed: 0,player_id,pair_summary_id,player_number,player_name,mp_total
0,3695653,1848542,8052557,"Robson, Dave",693.89
1,3695652,1848542,tmp:1de5c0e8-a5a2-4433-a232-caa798c84939,"Gamache, Len(swap names)",19.0
2,3695646,1848539,tmp:1238a3ab-2652-4f75-921b-767aae646b2a,"Steele, Bruce",16.0
3,3695647,1848539,tmp:9f01395a-c179-4283-adf3-886ff6aed8aa,"Hlady, Henny",
4,3695641,1848536,5637562,"Williams, Peter",139.21


In [13]:
awards_dtypes = {
    'player_id':'int64',
    'player_number':'string',
    'type':'string',
    'rank':'string'
}
df_awards = dfs['awards']
df_awards = df_awards.astype(awards_dtypes)
display(df_awards.isna().sum(), df_awards.info(), df_awards.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5349888 entries, 0 to 5349887
Data columns (total 4 columns):
 #   Column         Dtype 
---  ------         ----- 
 0   player_id      int64 
 1   player_number  string
 2   type           string
 3   rank           string
dtypes: int64(1), string(3)
memory usage: 163.3 MB


player_id        0
player_number    0
type             0
rank             0
dtype: int64

None

Unnamed: 0,player_id,player_number,type,rank
0,3695653,8052557,section,1
1,3695652,tmp:1de5c0e8-a5a2-4433-a232-caa798c84939,section,1
2,3695646,tmp:1238a3ab-2652-4f75-921b-767aae646b2a,section,2
3,3695647,tmp:9f01395a-c179-4283-adf3-886ff6aed8aa,section,2
4,3695641,5637562,section,3/4


In [14]:
sections_dtypes = {
    'section_id':'int64',
    'session_id':'int64',
    'section_name':'string',
}
df_sections = dfs['sections']
df_sections = df_sections.astype(sections_dtypes)
display(df_sections.isna().sum(), df_sections.info(), df_sections.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 454087 entries, 0 to 454086
Data columns (total 3 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   section_id    454087 non-null  int64 
 1   session_id    454087 non-null  int64 
 2   section_name  454087 non-null  string
dtypes: int64(2), string(1)
memory usage: 10.4 MB


section_id      0
session_id      0
section_name    0
dtype: int64

None

Unnamed: 0,section_id,session_id,section_name
0,119248,100961,M
1,129563,110886,A
2,133527,114736,F
3,145352,126385,T
4,145382,126418,B


In [15]:
sessions_dtypes = {
    'session_id':'int64',
    'event_id':'int64',
    'hand_record_id':'string', # can contain SHUFFLE, etc.
    'game_date':'string'
}
df_sessions = dfs['sessions']
df_sessions = df_sessions.astype(sessions_dtypes)
display(df_sessions.isna().sum(), df_sessions.info(), df_sessions.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 430054 entries, 0 to 430053
Data columns (total 4 columns):
 #   Column          Non-Null Count   Dtype 
---  ------          --------------   ----- 
 0   session_id      430054 non-null  int64 
 1   event_id        430054 non-null  int64 
 2   hand_record_id  375217 non-null  string
 3   game_date       430054 non-null  string
dtypes: int64(2), string(2)
memory usage: 13.1 MB


session_id            0
event_id              0
hand_record_id    54837
game_date             0
dtype: int64

None

Unnamed: 0,session_id,event_id,hand_record_id,game_date
0,100961,100961,77908,2020-02-17 00:00:00
1,110886,110886,SHUFFLE,2020-02-24 00:00:00
2,114736,114736,86634,2020-03-02 00:00:00
3,126385,126385,94064,2020-03-09 00:00:00
4,126418,126418,94082,2020-03-16 00:00:00


In [16]:
events_dtypes = {
    'event_id':'int64',
    'event_name':'string',
    'club_name':'string',
    'club_id_number':'int64',
    'event_type':'string',
    'rating':'int8',
    'board_scoring_method':'string',
    'stratification_type':'string',
    'winner_type':'int8',
    'club_class':'int8'
}
df_events = dfs['events']
df_events = df_events.astype(events_dtypes)
display(df_events.isna().sum(), df_events.info(), df_events.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 430051 entries, 0 to 430050
Data columns (total 10 columns):
 #   Column                Non-Null Count   Dtype 
---  ------                --------------   ----- 
 0   event_id              430051 non-null  int64 
 1   event_name            430051 non-null  string
 2   club_name             430051 non-null  string
 3   club_id_number        430051 non-null  int64 
 4   event_type            430051 non-null  string
 5   rating                430051 non-null  int8  
 6   board_scoring_method  430051 non-null  string
 7   stratification_type   430051 non-null  string
 8   winner_type           430051 non-null  int8  
 9   club_class            430051 non-null  int8  
dtypes: int64(2), int8(3), string(5)
memory usage: 24.2 MB


event_id                0
event_name              0
club_name               0
club_id_number          0
event_type              0
rating                  0
board_scoring_method    0
stratification_type     0
winner_type             0
club_class              0
dtype: int64

None

Unnamed: 0,event_id,event_name,club_name,club_id_number,event_type,rating,board_scoring_method,stratification_type,winner_type,club_class
0,100961,Monday Evening Pairs,Cariboo Bridge Club 1,100040,PAIRS,1,MATCH_POINTS,HIGHEST,1,4
1,110886,Monday Evening Pairs,Cariboo Bridge Club 1,100040,PAIRS,1,MATCH_POINTS,HIGHEST,1,4
2,114736,Monday Evening Pairs,Cariboo Bridge Club 1,100040,PAIRS,1,MATCH_POINTS,HIGHEST,1,4
3,126385,Monday Evening Pairs,Cariboo Bridge Club 1,100040,PAIRS,1,MATCH_POINTS,HIGHEST,1,4
4,126418,Monday Evening Pairs,Cariboo Bridge Club 1,100040,PAIRS,1,MATCH_POINTS,HIGHEST,1,4


In [17]:
strats_dtypes = {
    'strat_id':'string',
    'event_id':'int64',
    'strats_number':'int8',
    'strats_label':'string',
    'strats_limit':'int16'
}
df_strats = dfs['strats']
df_strats = df_strats.astype(strats_dtypes)
display(df_strats.isna().sum(), df_strats.info(), df_strats.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1193974 entries, 0 to 1193973
Data columns (total 5 columns):
 #   Column         Non-Null Count    Dtype 
---  ------         --------------    ----- 
 0   strat_id       1193974 non-null  string
 1   event_id       1193974 non-null  int64 
 2   strats_number  1193974 non-null  int8  
 3   strats_label   1193974 non-null  string
 4   strats_limit   1193974 non-null  int16 
dtypes: int16(1), int64(1), int8(1), string(2)
memory usage: 30.7 MB


strat_id         0
event_id         0
strats_number    0
strats_label     0
strats_limit     0
dtype: int64

None

Unnamed: 0,strat_id,event_id,strats_number,strats_label,strats_limit
0,274629,100961,1,A,0
1,301290,110886,1,A,0
2,311626,114736,1,A,0
3,342416,126385,1,A,0
4,342417,126385,2,B,100


In [18]:
strat_place_dtypes = {
    'strat_place_id':'int64',
    'pair_summary_id':'int64',
    'strat_place_number':'int8',
    'strat_place_rank':'string',
    'strat_place_type':'string'
}
df_strat_place = dfs['strat_place']
df_strat_place = df_strat_place.astype(strat_place_dtypes)
display(df_strat_place.isna().sum(), df_strat_place.info(), df_strat_place.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7037141 entries, 0 to 7037140
Data columns (total 5 columns):
 #   Column              Dtype 
---  ------              ----- 
 0   strat_place_id      int64 
 1   pair_summary_id     int64 
 2   strat_place_number  int8  
 3   strat_place_rank    string
 4   strat_place_type    string
dtypes: int64(2), int8(1), string(2)
memory usage: 221.5 MB


strat_place_id        0
pair_summary_id       0
strat_place_number    0
strat_place_rank      0
strat_place_type      0
dtype: int64

None

Unnamed: 0,strat_place_id,pair_summary_id,strat_place_number,strat_place_rank,strat_place_type
0,1997656,1848542,1,1,Section
1,1997654,1848539,1,2,Section
2,1997653,1848536,1,3/4,Section
3,1997655,1848540,1,3/4,Section
4,2172350,2010256,1,1,Section


In [19]:
stratValues_dtypes = {
    'stratValues_id':'string',
    'event_id':'int64',
    'stratValues_label':'string',
    'stratValues_limit':'int32'
}
df_stratValues = dfs['stratValues']
df_stratValues = df_stratValues.astype(stratValues_dtypes)
display(df_stratValues.isna().sum(), df_stratValues.info(), df_stratValues.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1193974 entries, 0 to 1193973
Data columns (total 4 columns):
 #   Column             Non-Null Count    Dtype 
---  ------             --------------    ----- 
 0   stratValues_id     1193974 non-null  string
 1   event_id           1193974 non-null  int64 
 2   stratValues_label  1193974 non-null  string
 3   stratValues_limit  1193974 non-null  int32 
dtypes: int32(1), int64(1), string(2)
memory usage: 31.9 MB


stratValues_id       0
event_id             0
stratValues_label    0
stratValues_limit    0
dtype: int64

None

Unnamed: 0,stratValues_id,event_id,stratValues_label,stratValues_limit
0,100961-1,100961,A,0
1,110886-1,110886,A,0
2,114736-1,114736,A,0
3,126385-1,126385,A,0
4,126385-2,126385,B,100


In [20]:
# takes 11s
# create df from merging of pair_summaries and players dataframes on 'pair_summaries'.
df_pair_summaries_players = pd.merge(df_pair_summaries,df_players,on='pair_summary_id',how='inner')
df_pair_summaries_players.sort_values('pair_summary_id')

Unnamed: 0,pair_summary_id,section_id,pair_number,direction,strat,percentage,is_eligible,player_id,player_number,player_name,mp_total
6821306,41,3,12,NS,1,48.52,,81,1838512,"Yue, Raymond",1039.74
6821307,41,3,12,NS,1,48.52,,82,4446976,"Wetzel, Tutty",2358.31
6821301,42,3,11,NS,1,52.59,,83,5674158,"Kellermeyer, Virginia(swap names)",3394.11
6821300,42,3,11,NS,1,52.59,,84,3264696,"Falk, Charlotte",7601.08
6821294,43,3,10,NS,1,57.78,,86,2351811,"Lucks, Sybil",2638.80
6821295,43,3,10,NS,1,57.78,,85,8594279,"Gupta, Satish(swap names)",2885.69
6821309,44,3,2,NS,1,42.59,,88,2728524,"Bahry, Sharon",3329.61
6821308,44,3,2,NS,1,42.59,,87,1302604,"Campbell, Patrick",18.37
6821305,45,3,9,NS,1,50.00,,89,4695429,"Nojima, Kazuko(swap names)",1581.76
6821304,45,3,9,NS,1,50.00,,90,2123991,"Scoggin, Richard",3834.03


In [21]:
# todo: drop NAs
len(df_pair_summaries_players),df_pair_summaries_players.isna().sum()

(13881248,
 pair_summary_id          0
 section_id               0
 pair_number              0
 direction          1510505
 strat                   88
 percentage           18842
 is_eligible        1778246
 player_id                0
 player_number          270
 player_name              0
 mp_total            388449
 dtype: int64)

In [22]:
# takes 15s
# merge awards on player_id
# todo: assert each player_id has same player_number 
df_pair_summaries_players_awards = pd.merge(df_pair_summaries_players,df_awards,on=['player_id','player_number'],how='inner')
df_pair_summaries_players_awards.sort_values('pair_summary_id')

Unnamed: 0,pair_summary_id,section_id,pair_number,direction,strat,percentage,is_eligible,player_id,player_number,player_name,mp_total,type,rank
2585375,42,3,11,NS,1,52.59,,83,5674158,"Kellermeyer, Virginia(swap names)",3394.11,section,5/6
2585374,42,3,11,NS,1,52.59,,84,3264696,"Falk, Charlotte",7601.08,section,5/6
2585369,43,3,10,NS,1,57.78,,85,8594279,"Gupta, Satish(swap names)",2885.69,section,2
2585368,43,3,10,NS,1,57.78,,86,2351811,"Lucks, Sybil",2638.80,section,2
2585367,47,3,1,NS,1,60.37,,93,2170833,"Robbins, Bob(swap names)",2978.84,section,1
2585366,47,3,1,NS,1,60.37,,94,5697522,"Martin, Betty",6194.71,section,1
2585377,48,3,4,NS,1,52.59,,96,5827191,"Lane, Kathy",2677.40,section,5/6
2585376,48,3,4,NS,1,52.59,,95,8644756,"Spalding, Lorita",2741.90,section,5/6
2585373,49,3,8,NS,2,57.04,,97,5549337,"Kornbluth, Bobbi(swap names)",1068.37,section,3/4
2585372,49,3,8,NS,2,57.04,,98,7892268,"Clarke, George",1047.93,section,3/4


In [23]:
# todo: drop NAs
len(df_pair_summaries_players_awards),df_pair_summaries_players_awards.isna().sum()

(5347901,
 pair_summary_id         0
 section_id              0
 pair_number             0
 direction          460156
 strat                   0
 percentage           7141
 is_eligible        883504
 player_id               0
 player_number           0
 player_name             0
 mp_total            70943
 type                    0
 rank                    0
 dtype: int64)

In [24]:
# takes 2s
df_pair_summaries_players_awards_sections = pd.merge(df_pair_summaries_players_awards,df_sections,on='section_id',how='inner')
df_pair_summaries_players_awards_sections.head()

Unnamed: 0,pair_summary_id,section_id,pair_number,direction,strat,percentage,is_eligible,player_id,player_number,player_name,mp_total,type,rank,session_id,section_name
0,1848542,119248,4,,1,62.5,True,3695653,8052557,"Robson, Dave",693.89,section,1,100961,M
1,1848542,119248,4,,1,62.5,True,3695652,tmp:1de5c0e8-a5a2-4433-a232-caa798c84939,"Gamache, Len(swap names)",19.0,section,1,100961,M
2,1848539,119248,1,,1,57.29,True,3695646,tmp:1238a3ab-2652-4f75-921b-767aae646b2a,"Steele, Bruce",16.0,section,2,100961,M
3,1848539,119248,1,,1,57.29,True,3695647,tmp:9f01395a-c179-4283-adf3-886ff6aed8aa,"Hlady, Henny",,section,2,100961,M
4,1848536,119248,3,,1,51.04,True,3695641,5637562,"Williams, Peter",139.21,section,3/4,100961,M


In [25]:
# todo: drop NAs
len(df_pair_summaries_players_awards_sections),df_pair_summaries_players_awards_sections.isna().sum()

(5347901,
 pair_summary_id         0
 section_id              0
 pair_number             0
 direction          460156
 strat                   0
 percentage           7141
 is_eligible        883504
 player_id               0
 player_number           0
 player_name             0
 mp_total            70943
 type                    0
 rank                    0
 session_id              0
 section_name            0
 dtype: int64)

In [26]:
# takes 2s
df_pair_summaries_players_awards_sections_sessions = pd.merge(df_pair_summaries_players_awards_sections,df_sessions,on='session_id',how='inner')
df_pair_summaries_players_awards_sections_sessions.head()

Unnamed: 0,pair_summary_id,section_id,pair_number,direction,strat,percentage,is_eligible,player_id,player_number,player_name,mp_total,type,rank,session_id,section_name,event_id,hand_record_id,game_date
0,1848542,119248,4,,1,62.5,True,3695653,8052557,"Robson, Dave",693.89,section,1,100961,M,100961,77908,2020-02-17 00:00:00
1,1848542,119248,4,,1,62.5,True,3695652,tmp:1de5c0e8-a5a2-4433-a232-caa798c84939,"Gamache, Len(swap names)",19.0,section,1,100961,M,100961,77908,2020-02-17 00:00:00
2,1848539,119248,1,,1,57.29,True,3695646,tmp:1238a3ab-2652-4f75-921b-767aae646b2a,"Steele, Bruce",16.0,section,2,100961,M,100961,77908,2020-02-17 00:00:00
3,1848539,119248,1,,1,57.29,True,3695647,tmp:9f01395a-c179-4283-adf3-886ff6aed8aa,"Hlady, Henny",,section,2,100961,M,100961,77908,2020-02-17 00:00:00
4,1848536,119248,3,,1,51.04,True,3695641,5637562,"Williams, Peter",139.21,section,3/4,100961,M,100961,77908,2020-02-17 00:00:00


In [27]:
# todo: drop NAs
len(df_pair_summaries_players_awards_sections_sessions),df_pair_summaries_players_awards_sections_sessions.isna().sum()

(5347901,
 pair_summary_id         0
 section_id              0
 pair_number             0
 direction          460156
 strat                   0
 percentage           7141
 is_eligible        883504
 player_id               0
 player_number           0
 player_name             0
 mp_total            70943
 type                    0
 rank                    0
 session_id              0
 section_name            0
 event_id                0
 hand_record_id      58633
 game_date               0
 dtype: int64)

In [28]:
# takes 5s
df_pair_summaries_players_awards_sections_sessions_events = pd.merge(df_pair_summaries_players_awards_sections_sessions,df_events,on='event_id',how='inner')
df_pair_summaries_players_awards_sections_sessions_events.head()

Unnamed: 0,pair_summary_id,section_id,pair_number,direction,strat,percentage,is_eligible,player_id,player_number,player_name,mp_total,type,rank,session_id,section_name,event_id,hand_record_id,game_date,event_name,club_name,club_id_number,event_type,rating,board_scoring_method,stratification_type,winner_type,club_class
0,1848542,119248,4,,1,62.5,True,3695653,8052557,"Robson, Dave",693.89,section,1,100961,M,100961,77908,2020-02-17 00:00:00,Monday Evening Pairs,Cariboo Bridge Club 1,100040,PAIRS,1,MATCH_POINTS,HIGHEST,1,4
1,1848542,119248,4,,1,62.5,True,3695652,tmp:1de5c0e8-a5a2-4433-a232-caa798c84939,"Gamache, Len(swap names)",19.0,section,1,100961,M,100961,77908,2020-02-17 00:00:00,Monday Evening Pairs,Cariboo Bridge Club 1,100040,PAIRS,1,MATCH_POINTS,HIGHEST,1,4
2,1848539,119248,1,,1,57.29,True,3695646,tmp:1238a3ab-2652-4f75-921b-767aae646b2a,"Steele, Bruce",16.0,section,2,100961,M,100961,77908,2020-02-17 00:00:00,Monday Evening Pairs,Cariboo Bridge Club 1,100040,PAIRS,1,MATCH_POINTS,HIGHEST,1,4
3,1848539,119248,1,,1,57.29,True,3695647,tmp:9f01395a-c179-4283-adf3-886ff6aed8aa,"Hlady, Henny",,section,2,100961,M,100961,77908,2020-02-17 00:00:00,Monday Evening Pairs,Cariboo Bridge Club 1,100040,PAIRS,1,MATCH_POINTS,HIGHEST,1,4
4,1848536,119248,3,,1,51.04,True,3695641,5637562,"Williams, Peter",139.21,section,3/4,100961,M,100961,77908,2020-02-17 00:00:00,Monday Evening Pairs,Cariboo Bridge Club 1,100040,PAIRS,1,MATCH_POINTS,HIGHEST,1,4


In [29]:
# todo: drop NAs
len(df_pair_summaries_players_awards_sections_sessions_events),df_pair_summaries_players_awards_sections_sessions_events.isna().sum()

(5347901,
 pair_summary_id              0
 section_id                   0
 pair_number                  0
 direction               460156
 strat                        0
 percentage                7141
 is_eligible             883504
 player_id                    0
 player_number                0
 player_name                  0
 mp_total                 70943
 type                         0
 rank                         0
 session_id                   0
 section_name                 0
 event_id                     0
 hand_record_id           58633
 game_date                    0
 event_name                   0
 club_name                    0
 club_id_number               0
 event_type                   0
 rating                       0
 board_scoring_method         0
 stratification_type          0
 winner_type                  0
 club_class                   0
 dtype: int64)

In [30]:
df_strats.sort_values('event_id').head(10)

Unnamed: 0,strat_id,event_id,strats_number,strats_label,strats_limit
624368,8,3,2,B,1500
624367,7,3,1,A,0
858160,11,5,1,A,0
625759,13,6,2,B,200
625758,12,6,1,A,1000
625845,14,7,1,A,0
625846,15,7,2,B,1500
625847,16,7,3,C,500
625938,17,8,1,A,0
625939,18,8,2,B,1000


In [31]:
# Takes 30s
# There can be multiple strats per event. Make a list of strats per event.
df_strats_agg = df_strats.groupby('event_id').agg(list)
df_strats_agg.head(10)

Unnamed: 0_level_0,strat_id,strats_number,strats_label,strats_limit
event_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
3,"[7, 8]","[1, 2]","[A, B]","[0, 1500]"
5,[11],[1],[A],[0]
6,"[12, 13]","[1, 2]","[A, B]","[1000, 200]"
7,"[14, 15, 16]","[1, 2, 3]","[A, B, C]","[0, 1500, 500]"
8,"[17, 18]","[1, 2]","[A, B]","[0, 1000]"
10,"[22, 23, 24]","[1, 2, 3]","[A, B, C]","[0, 1250, 300]"
11,"[25, 26, 27]","[1, 2, 3]","[A, B, C]","[200, 50, 20]"
12,"[28, 29, 30]","[1, 2, 3]","[A, B, C]","[0, 200, 100]"
13,"[31, 32, 33]","[1, 2, 3]","[A, B, C]","[0, 500, 100]"
14,"[34, 35]","[1, 2]","[A, B]","[1000, 100]"


In [32]:
# takes 5s
df_pair_summaries_players_awards_sections_sessions_events_strats = pd.merge(df_pair_summaries_players_awards_sections_sessions_events,df_strats_agg,on='event_id',how='inner')
df_pair_summaries_players_awards_sections_sessions_events_strats.head()

Unnamed: 0,pair_summary_id,section_id,pair_number,direction,strat,percentage,is_eligible,player_id,player_number,player_name,mp_total,type,rank,session_id,section_name,event_id,hand_record_id,game_date,event_name,club_name,club_id_number,event_type,rating,board_scoring_method,stratification_type,winner_type,club_class,strat_id,strats_number,strats_label,strats_limit
0,1848542,119248,4,,1,62.5,True,3695653,8052557,"Robson, Dave",693.89,section,1,100961,M,100961,77908,2020-02-17 00:00:00,Monday Evening Pairs,Cariboo Bridge Club 1,100040,PAIRS,1,MATCH_POINTS,HIGHEST,1,4,[274629],[1],[A],[0]
1,1848542,119248,4,,1,62.5,True,3695652,tmp:1de5c0e8-a5a2-4433-a232-caa798c84939,"Gamache, Len(swap names)",19.0,section,1,100961,M,100961,77908,2020-02-17 00:00:00,Monday Evening Pairs,Cariboo Bridge Club 1,100040,PAIRS,1,MATCH_POINTS,HIGHEST,1,4,[274629],[1],[A],[0]
2,1848539,119248,1,,1,57.29,True,3695646,tmp:1238a3ab-2652-4f75-921b-767aae646b2a,"Steele, Bruce",16.0,section,2,100961,M,100961,77908,2020-02-17 00:00:00,Monday Evening Pairs,Cariboo Bridge Club 1,100040,PAIRS,1,MATCH_POINTS,HIGHEST,1,4,[274629],[1],[A],[0]
3,1848539,119248,1,,1,57.29,True,3695647,tmp:9f01395a-c179-4283-adf3-886ff6aed8aa,"Hlady, Henny",,section,2,100961,M,100961,77908,2020-02-17 00:00:00,Monday Evening Pairs,Cariboo Bridge Club 1,100040,PAIRS,1,MATCH_POINTS,HIGHEST,1,4,[274629],[1],[A],[0]
4,1848536,119248,3,,1,51.04,True,3695641,5637562,"Williams, Peter",139.21,section,3/4,100961,M,100961,77908,2020-02-17 00:00:00,Monday Evening Pairs,Cariboo Bridge Club 1,100040,PAIRS,1,MATCH_POINTS,HIGHEST,1,4,[274629],[1],[A],[0]


In [33]:
# todo: drop NAs
len(df_pair_summaries_players_awards_sections_sessions_events_strats),df_pair_summaries_players_awards_sections_sessions_events_strats.isna().sum()

(5347901,
 pair_summary_id              0
 section_id                   0
 pair_number                  0
 direction               460156
 strat                        0
 percentage                7141
 is_eligible             883504
 player_id                    0
 player_number                0
 player_name                  0
 mp_total                 70943
 type                         0
 rank                         0
 session_id                   0
 section_name                 0
 event_id                     0
 hand_record_id           58633
 game_date                    0
 event_name                   0
 club_name                    0
 club_id_number               0
 event_type                   0
 rating                       0
 board_scoring_method         0
 stratification_type          0
 winner_type                  0
 club_class                   0
 strat_id                     0
 strats_number                0
 strats_label                 0
 strats_limit                 

In [34]:
# Takes 5m30s!
# There can be multiple strat_places per pair_summary_id. Make a list of strat_places per pair_summary_id.
df_strat_place_agg = df_strat_place.groupby('pair_summary_id').agg(list)
df_strat_place_agg.head(10)

Unnamed: 0_level_0,strat_place_id,strat_place_number,strat_place_rank,strat_place_type
pair_summary_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
42,[48],[1],[5/6],[Section]
43,"[49, 50]","[1, 1]","[2, 2]","[Section, Event]"
47,"[51, 52]","[1, 1]","[1, 1]","[Section, Event]"
48,[53],[1],[5/6],[Section]
49,"[54, 55, 56]","[1, 1, 2]","[3/4, 3/4, 1]","[Section, Event, Section]"
50,"[57, 58]","[1, 1]","[3/4, 3/4]","[Section, Event]"
52,[59],[2],[2],[Section]
65,[66],[1],[2/4],[Section]
66,[67],[1],[2/4],[Section]
67,[68],[1],[2/4],[Section]


In [35]:
# takes 8s
df_pair_summaries_players_awards_sections_sessions_events_strats_strat_place = pd.merge(df_pair_summaries_players_awards_sections_sessions_events_strats,df_strat_place_agg,on=['pair_summary_id'],how='inner')
df_pair_summaries_players_awards_sections_sessions_events_strats_strat_place.head()

Unnamed: 0,pair_summary_id,section_id,pair_number,direction,strat,percentage,is_eligible,player_id,player_number,player_name,mp_total,type,rank,session_id,section_name,event_id,hand_record_id,game_date,event_name,club_name,club_id_number,event_type,rating,board_scoring_method,stratification_type,winner_type,club_class,strat_id,strats_number,strats_label,strats_limit,strat_place_id,strat_place_number,strat_place_rank,strat_place_type
0,1848542,119248,4,,1,62.5,True,3695653,8052557,"Robson, Dave",693.89,section,1,100961,M,100961,77908,2020-02-17 00:00:00,Monday Evening Pairs,Cariboo Bridge Club 1,100040,PAIRS,1,MATCH_POINTS,HIGHEST,1,4,[274629],[1],[A],[0],[1997656],[1],[1],[Section]
1,1848542,119248,4,,1,62.5,True,3695652,tmp:1de5c0e8-a5a2-4433-a232-caa798c84939,"Gamache, Len(swap names)",19.0,section,1,100961,M,100961,77908,2020-02-17 00:00:00,Monday Evening Pairs,Cariboo Bridge Club 1,100040,PAIRS,1,MATCH_POINTS,HIGHEST,1,4,[274629],[1],[A],[0],[1997656],[1],[1],[Section]
2,1848539,119248,1,,1,57.29,True,3695646,tmp:1238a3ab-2652-4f75-921b-767aae646b2a,"Steele, Bruce",16.0,section,2,100961,M,100961,77908,2020-02-17 00:00:00,Monday Evening Pairs,Cariboo Bridge Club 1,100040,PAIRS,1,MATCH_POINTS,HIGHEST,1,4,[274629],[1],[A],[0],[1997654],[1],[2],[Section]
3,1848539,119248,1,,1,57.29,True,3695647,tmp:9f01395a-c179-4283-adf3-886ff6aed8aa,"Hlady, Henny",,section,2,100961,M,100961,77908,2020-02-17 00:00:00,Monday Evening Pairs,Cariboo Bridge Club 1,100040,PAIRS,1,MATCH_POINTS,HIGHEST,1,4,[274629],[1],[A],[0],[1997654],[1],[2],[Section]
4,1848536,119248,3,,1,51.04,True,3695641,5637562,"Williams, Peter",139.21,section,3/4,100961,M,100961,77908,2020-02-17 00:00:00,Monday Evening Pairs,Cariboo Bridge Club 1,100040,PAIRS,1,MATCH_POINTS,HIGHEST,1,4,[274629],[1],[A],[0],[1997653],[1],[3/4],[Section]


In [36]:
# todo: drop NAs
len(df_pair_summaries_players_awards_sections_sessions_events_strats_strat_place),df_pair_summaries_players_awards_sections_sessions_events_strats_strat_place.isna().sum()

(5292044,
 pair_summary_id              0
 section_id                   0
 pair_number                  0
 direction               452667
 strat                        0
 percentage                6951
 is_eligible             869457
 player_id                    0
 player_number                0
 player_name                  0
 mp_total                 69444
 type                         0
 rank                         0
 session_id                   0
 section_name                 0
 event_id                     0
 hand_record_id           58470
 game_date                    0
 event_name                   0
 club_name                    0
 club_id_number               0
 event_type                   0
 rating                       0
 board_scoring_method         0
 stratification_type          0
 winner_type                  0
 club_class                   0
 strat_id                     0
 strats_number                0
 strats_label                 0
 strats_limit                 

In [37]:
# Takes 25s
# There can be multiple stratValues per event. Make a list of stratValues per event.
df_stratValues_agg = df_stratValues.groupby('event_id').agg(list)
df_stratValues_agg.head(10)

Unnamed: 0_level_0,stratValues_id,stratValues_label,stratValues_limit
event_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
3,"[3-1, 3-2]","[A, B]","[0, 1500]"
5,[5-1],[A],[0]
6,"[6-1, 6-2]","[A, B]","[1000, 200]"
7,"[7-1, 7-2, 7-3]","[A, B, C]","[0, 1500, 500]"
8,"[8-1, 8-2]","[A, B]","[0, 1000]"
10,"[10-1, 10-2, 10-3]","[A, B, C]","[0, 1250, 300]"
11,"[11-1, 11-2, 11-3]","[A, B, C]","[200, 50, 20]"
12,"[12-1, 12-2, 12-3]","[A, B, C]","[0, 200, 100]"
13,"[13-1, 13-2, 13-3]","[A, B, C]","[0, 500, 100]"
14,"[14-1, 14-2]","[A, B]","[1000, 100]"


In [38]:
# takes 5s
df_pair_summaries_players_awards_sections_sessions_events_strats_strat_place_stratValues = pd.merge(df_pair_summaries_players_awards_sections_sessions_events_strats_strat_place,df_stratValues_agg,on='event_id',how='inner')
df_pair_summaries_players_awards_sections_sessions_events_strats_strat_place_stratValues.head()

Unnamed: 0,pair_summary_id,section_id,pair_number,direction,strat,percentage,is_eligible,player_id,player_number,player_name,mp_total,type,rank,session_id,section_name,event_id,hand_record_id,game_date,event_name,club_name,club_id_number,event_type,rating,board_scoring_method,stratification_type,winner_type,club_class,strat_id,strats_number,strats_label,strats_limit,strat_place_id,strat_place_number,strat_place_rank,strat_place_type,stratValues_id,stratValues_label,stratValues_limit
0,1848542,119248,4,,1,62.5,True,3695653,8052557,"Robson, Dave",693.89,section,1,100961,M,100961,77908,2020-02-17 00:00:00,Monday Evening Pairs,Cariboo Bridge Club 1,100040,PAIRS,1,MATCH_POINTS,HIGHEST,1,4,[274629],[1],[A],[0],[1997656],[1],[1],[Section],[100961-1],[A],[0]
1,1848542,119248,4,,1,62.5,True,3695652,tmp:1de5c0e8-a5a2-4433-a232-caa798c84939,"Gamache, Len(swap names)",19.0,section,1,100961,M,100961,77908,2020-02-17 00:00:00,Monday Evening Pairs,Cariboo Bridge Club 1,100040,PAIRS,1,MATCH_POINTS,HIGHEST,1,4,[274629],[1],[A],[0],[1997656],[1],[1],[Section],[100961-1],[A],[0]
2,1848539,119248,1,,1,57.29,True,3695646,tmp:1238a3ab-2652-4f75-921b-767aae646b2a,"Steele, Bruce",16.0,section,2,100961,M,100961,77908,2020-02-17 00:00:00,Monday Evening Pairs,Cariboo Bridge Club 1,100040,PAIRS,1,MATCH_POINTS,HIGHEST,1,4,[274629],[1],[A],[0],[1997654],[1],[2],[Section],[100961-1],[A],[0]
3,1848539,119248,1,,1,57.29,True,3695647,tmp:9f01395a-c179-4283-adf3-886ff6aed8aa,"Hlady, Henny",,section,2,100961,M,100961,77908,2020-02-17 00:00:00,Monday Evening Pairs,Cariboo Bridge Club 1,100040,PAIRS,1,MATCH_POINTS,HIGHEST,1,4,[274629],[1],[A],[0],[1997654],[1],[2],[Section],[100961-1],[A],[0]
4,1848536,119248,3,,1,51.04,True,3695641,5637562,"Williams, Peter",139.21,section,3/4,100961,M,100961,77908,2020-02-17 00:00:00,Monday Evening Pairs,Cariboo Bridge Club 1,100040,PAIRS,1,MATCH_POINTS,HIGHEST,1,4,[274629],[1],[A],[0],[1997653],[1],[3/4],[Section],[100961-1],[A],[0]


In [39]:
# todo: drop NAs
len(df_pair_summaries_players_awards_sections_sessions_events_strats_strat_place_stratValues),df_pair_summaries_players_awards_sections_sessions_events_strats_strat_place_stratValues.isna().sum()

(5292044,
 pair_summary_id              0
 section_id                   0
 pair_number                  0
 direction               452667
 strat                        0
 percentage                6951
 is_eligible             869457
 player_id                    0
 player_number                0
 player_name                  0
 mp_total                 69444
 type                         0
 rank                         0
 session_id                   0
 section_name                 0
 event_id                     0
 hand_record_id           58470
 game_date                    0
 event_name                   0
 club_name                    0
 club_id_number               0
 event_type                   0
 rating                       0
 board_scoring_method         0
 stratification_type          0
 winner_type                  0
 club_class                   0
 strat_id                     0
 strats_number                0
 strats_label                 0
 strats_limit                 

In [40]:
allpr_file = acblPath.joinpath('allpr.pkl')
df_allpr = pd.read_pickle(allpr_file)
df_allpr['player_number'] = df_allpr['player_id'].map(lambda r: r[0] if len(r) else None).astype('string')
# 'Rank', 'PR', 'Games', 'Unit' errored if 'Int...' (why?) so made 'float32'
df_allpr_dtypes = {'Rank':'float32','PR':'float32','Games':'float32','Unit':'string','Name':'string','player_id':'string'}
df_allpr = df_allpr.astype(df_allpr_dtypes)
df_allpr.rename({'Rank':'bpr_rank','PR':'bpr_pr','Games':'bpr_games','Unit':'bpr_unit'},axis='columns',inplace=True)
# obsoleted by filter() - df_allpr.drop('player_id',axis='columns',inplace=True)
display(len(df_allpr),df_allpr.info(),df_allpr.head())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4855 entries, 0 to 4854
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   bpr_rank       4855 non-null   float32
 1   bpr_pr         4855 non-null   float32
 2   bpr_games      4855 non-null   float32
 3   bpr_unit       4855 non-null   string 
 4   Name           4855 non-null   string 
 5   player_id      4855 non-null   string 
 6   player_number  4823 non-null   string 
dtypes: float32(3), string(4)
memory usage: 246.5 KB


4855

None

Unnamed: 0,bpr_rank,bpr_pr,bpr_games,bpr_unit,Name,player_id,player_number
0,1.0,72.17,137.0,108,"Grossack, Zachary",[2250896],2250896
1,2.0,71.19,60.0,174,"Wold, Eddie",['3443949'],3443949
2,3.0,71.15,86.0,373,"Shi, Sylvia",[5420105],5420105
3,4.0,71.03,164.0,128,"Cappelletti, Mike",['2601087'],2601087
4,5.0,71.02,201.0,538,"Itabashi, Mark",['6811434'],6811434


In [41]:
# takes 8s
# using filter() to remove unwanted columns: df_allpr.filter(regex='bpr_|player_number')
# merging using how='left', instead of 'inner', because bpr is 8% of total.
df = pd.merge(df_pair_summaries_players_awards_sections_sessions_events_strats_strat_place_stratValues,df_allpr.filter(regex='bpr_|player_number'),on='player_number',how='left')
bpr_cols = df.filter(regex='bpr_')
# todo: revisit. removing NA like this implies categorical? Needs astype('string') otherwise reverts to 'object'.
for cols in bpr_cols:
    df[cols] = df[cols].fillna('').astype('string')
display(len(df),df.info(),df.head())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5294888 entries, 0 to 5294887
Data columns (total 42 columns):
 #   Column                Dtype  
---  ------                -----  
 0   pair_summary_id       int64  
 1   section_id            int64  
 2   pair_number           int64  
 3   direction             string 
 4   strat                 Int8   
 5   percentage            float32
 6   is_eligible           boolean
 7   player_id             int64  
 8   player_number         string 
 9   player_name           string 
 10  mp_total              float32
 11  type                  string 
 12  rank                  string 
 13  session_id            int64  
 14  section_name          string 
 15  event_id              int64  
 16  hand_record_id        string 
 17  game_date             string 
 18  event_name            string 
 19  club_name             string 
 20  club_id_number        int64  
 21  event_type            string 
 22  rating                int8   
 23  board_s

5294888

None

Unnamed: 0,pair_summary_id,section_id,pair_number,direction,strat,percentage,is_eligible,player_id,player_number,player_name,mp_total,type,rank,session_id,section_name,event_id,hand_record_id,game_date,event_name,club_name,club_id_number,event_type,rating,board_scoring_method,stratification_type,winner_type,club_class,strat_id,strats_number,strats_label,strats_limit,strat_place_id,strat_place_number,strat_place_rank,strat_place_type,stratValues_id,stratValues_label,stratValues_limit,bpr_rank,bpr_pr,bpr_games,bpr_unit
0,1848542,119248,4,,1,62.5,True,3695653,8052557,"Robson, Dave",693.89,section,1,100961,M,100961,77908,2020-02-17 00:00:00,Monday Evening Pairs,Cariboo Bridge Club 1,100040,PAIRS,1,MATCH_POINTS,HIGHEST,1,4,[274629],[1],[A],[0],[1997656],[1],[1],[Section],[100961-1],[A],[0],,,,
1,1848542,119248,4,,1,62.5,True,3695652,tmp:1de5c0e8-a5a2-4433-a232-caa798c84939,"Gamache, Len(swap names)",19.0,section,1,100961,M,100961,77908,2020-02-17 00:00:00,Monday Evening Pairs,Cariboo Bridge Club 1,100040,PAIRS,1,MATCH_POINTS,HIGHEST,1,4,[274629],[1],[A],[0],[1997656],[1],[1],[Section],[100961-1],[A],[0],,,,
2,1848539,119248,1,,1,57.29,True,3695646,tmp:1238a3ab-2652-4f75-921b-767aae646b2a,"Steele, Bruce",16.0,section,2,100961,M,100961,77908,2020-02-17 00:00:00,Monday Evening Pairs,Cariboo Bridge Club 1,100040,PAIRS,1,MATCH_POINTS,HIGHEST,1,4,[274629],[1],[A],[0],[1997654],[1],[2],[Section],[100961-1],[A],[0],,,,
3,1848539,119248,1,,1,57.29,True,3695647,tmp:9f01395a-c179-4283-adf3-886ff6aed8aa,"Hlady, Henny",,section,2,100961,M,100961,77908,2020-02-17 00:00:00,Monday Evening Pairs,Cariboo Bridge Club 1,100040,PAIRS,1,MATCH_POINTS,HIGHEST,1,4,[274629],[1],[A],[0],[1997654],[1],[2],[Section],[100961-1],[A],[0],,,,
4,1848536,119248,3,,1,51.04,True,3695641,5637562,"Williams, Peter",139.21,section,3/4,100961,M,100961,77908,2020-02-17 00:00:00,Monday Evening Pairs,Cariboo Bridge Club 1,100040,PAIRS,1,MATCH_POINTS,HIGHEST,1,4,[274629],[1],[A],[0],[1997653],[1],[3/4],[Section],[100961-1],[A],[0],,,,


### End of Merges

In [42]:
# takes 4s
len(df),df.isna().sum()

(5294888,
 pair_summary_id              0
 section_id                   0
 pair_number                  0
 direction               452809
 strat                        0
 percentage                6954
 is_eligible             870010
 player_id                    0
 player_number                0
 player_name                  0
 mp_total                 69445
 type                         0
 rank                         0
 session_id                   0
 section_name                 0
 event_id                     0
 hand_record_id           58495
 game_date                    0
 event_name                   0
 club_name                    0
 club_id_number               0
 event_type                   0
 rating                       0
 board_scoring_method         0
 stratification_type          0
 winner_type                  0
 club_class                   0
 strat_id                     0
 strats_number                0
 strats_label                 0
 strats_limit                 

In [43]:
df.value_counts('strat')

strat
1    1870314
2    1758347
3    1666227
dtype: int64

In [44]:
len(df),df.value_counts('rank'),df['rank'].isna().sum(),df['rank'].isna().sum()/len(df)

(5294888,
 rank
 2        1527079
 1        1283192
 3        1102506
 4         643912
 5         213601
 2/3       120718
 3/4        93454
 6          92750
 1/2        72765
 4/5        51405
 5/6        19819
 7          17825
 8           9256
 6/7         9038
 2/4         7167
 3/5         6018
 9           4494
 1/3         3852
 4/6         3441
 10          2656
 7/8         1613
 5/7         1386
 11          1139
 8/9          867
 12           743
           ...   
 21            18
 20            17
 7/10          16
 8/11          14
 11/13         12
 2/6           10
 9/12           9
 23             8
 11/14          8
 16/17          8
 5/9            8
 22             7
 19             7
 12/14          6
 31             4
 25             4
 26             4
 27             4
 28             4
 29             4
 19/20          3
 15/17          3
 13/15          2
 30             2
 24             2
 Length: 74, dtype: int64,
 0,
 0.0)

In [45]:
df.value_counts('type')

type
section       4341640
ssoveralls     953248
dtype: int64

In [46]:
df.value_counts('is_eligible')

is_eligible
True     4405917
False      18961
dtype: int64

In [47]:
# takes 20s
pair_summaries_cleaned_filename = 'pair_summaries_cleaned.pkl'
pair_summaries_cleaned_file = acblPath.joinpath(pair_summaries_cleaned_filename)
with open(pair_summaries_cleaned_file, 'wb') as f:
    pickle.dump(df, f)
print(f"Saved {pair_summaries_cleaned_filename}: len:{len(df)} size:{pair_summaries_cleaned_file.stat().st_size}")

Saved pair_summaries_cleaned.pkl: len:5294888 size:1422277122
