diff --git a/acbllib/acbllib.py b/acbllib/acbllib.py index 1a3ca64..5b82535 100644 --- a/acbllib/acbllib.py +++ b/acbllib/acbllib.py @@ -1,5 +1,16 @@ # functions which are specific to acbl; downloading acbl webpages, api calls. +import logging +logger = logging.getLogger(__name__) +logger.setLevel(logging.INFO) # or DEBUG +logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') +def print_to_log_info(*args): + print_to_log(logging.INFO, *args) +def print_to_log_debug(*args): + print_to_log(logging.DEBUG, *args) +def print_to_log(level, *args): + logging.log(level, ' '.join(str(arg) for arg in args)) + import pandas as pd import re import traceback @@ -30,22 +41,22 @@ def get_club_results(cns, base_url, acbl_url, acblPath, read_local): ncn += 1 url = base_url+str(cn)+'/' file = url.replace(acbl_url,'')+str(cn)+'.html' - print(f'Processing file ({ncn}/{total_clubs}): {file}') + print_to_log_info(f'Processing file ({ncn}/{total_clubs}): {file}') path = acblPath.joinpath(file) if read_local and path.exists() and path.stat().st_size > 200: html = path.read_text(encoding="utf-8") - print(f'Reading local {file}: len={len(html)}') + print_to_log_info(f'Reading local {file}: len={len(html)}') else: - print(f'Requesting {url}') + print_to_log_info(f'Requesting {url}') try: r = requests.get(url,headers=headers) except: - print(f'Except: status:{r.status_code} {url}') + print_to_log_info(f'Except: status:{r.status_code} {url}') else: html = r.text - print(f'Creating {file}: len={len(html)}') + print_to_log_info(f'Creating {file}: len={len(html)}') if r.status_code != 200: - print(f'Error: status:{r.status_code} {url}') + print_to_log_info(f'Error: status:{r.status_code} {url}') time.sleep(60) # obsolete? failed_urls.append(url) continue @@ -54,8 +65,8 @@ def get_club_results(cns, base_url, acbl_url, acblPath, read_local): path.write_text(html, encoding="utf-8") time.sleep(1) # need to self-throttle otherwise acbl returns 403 "forbidden". obsolete? htmls[str(cn)] = html - print(f'Failed Urls: len:{len(failed_urls)} Urls:{failed_urls}') - print(f"Done: Total clubs processed:{total_clubs}: Total url failures:{len(failed_urls)}") + print_to_log_info(f'Failed Urls: len:{len(failed_urls)} Urls:{failed_urls}') + print_to_log_info(f"Done: Total clubs processed:{total_clubs}: Total url failures:{len(failed_urls)}") return htmls, total_clubs, failed_urls @@ -66,22 +77,22 @@ def extract_club_games(htmls, acbl_url): total_htmls = len(htmls) for n,(cn,html) in enumerate(htmls.items()): n += 1 - print(f'Processing club ({n}/{total_htmls}) {cn}') + print_to_log_info(f'Processing club ({n}/{total_htmls}) {cn}') bs = BeautifulSoup(html, "html.parser") # todo: do this only once. html_table = bs.find('table') if html_table is None: - print(f'Invalid club-result for {cn}') + print_to_log_info(f'Invalid club-result for {cn}') continue # /html/body/div[2]/div/div[2]/div[1]/div[2] ClubInfo = bs.find('div', 'col-md-8') - #print(ClubInfo) + #print_to_log(ClubInfo) ci = {} ci['Name'] = ClubInfo.find('h1').contents[0].strip() # get first text and strip ci['Location'] = ClubInfo.find('h5').contents[0].strip() # get first text and strip if ClubInfo.find('a'): ci['WebSite'] = ClubInfo.find('a')['href'] # get href of first a ClubInfos[cn] = ci - print(f'{ci}') + print_to_log_info(f'{ci}') # assumes first table is our target d = pd.read_html(str(html_table)) assert len(d) == 1 @@ -93,7 +104,7 @@ def extract_club_games(htmls, acbl_url): df['ResultID'] = [result.rsplit('/', 1)[-1] for result in hrefs] df['ResultUrl'] = hrefs dfs[cn] = df - print(f"Done: Total clubs processed:{len(dfs)}") + print_to_log_info(f"Done: Total clubs processed:{len(dfs)}") return dfs, ClubInfos @@ -106,7 +117,7 @@ def extract_club_result_json(dfs, filtered_clubs, starting_nclub, ending_nclub, headers={"user-agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36"} for ndf,(kdf,df) in enumerate(filtered_clubs.items()): if ndf < starting_nclub or ndf >= ending_nclub: - print(f"Skipping club #{ndf} {kdf}") # obsolete when filtered_clubs works + print_to_log_info(f"Skipping club #{ndf} {kdf}") # obsolete when filtered_clubs works continue ndf += 1 except_count = 0 @@ -116,7 +127,7 @@ def extract_club_result_json(dfs, filtered_clubs, starting_nclub, ending_nclub, total_urls_processed += 1 html_file = url.replace(acbl_url,'').replace('club-results','club-results/'+str(cn))+'.html' json_file = html_file.replace('.html','.data.json') - print(f'Processing club ({ndf}/{total_clubs}): result file ({nurl}/{total_results}): {html_file}') + print_to_log_info(f'Processing club ({ndf}/{total_clubs}): result file ({nurl}/{total_results}): {html_file}') #if ndf < 1652: # continue html_path = acblPath.joinpath(html_file) @@ -125,38 +136,38 @@ def extract_club_result_json(dfs, filtered_clubs, starting_nclub, ending_nclub, data_json = None if read_local and json_path.exists(): #if html_path.exists(): - # print(f'Found local html file: {html_file}') + # print_to_log(f'Found local html file: {html_file}') #else: - # print(f'Missing local html file: {html_file}') + # print_to_log(f'Missing local html file: {html_file}') try: with open(json_path, 'r') as f: data_json = json.load(f) except: - print(f'Exception when reading json file: {json_file}. Deleting html and json files.') + print_to_log_info(f'Exception when reading json file: {json_file}. Deleting html and json files.') else: total_local_files_read += 1 - print(f'Reading local ({total_local_files_read}/{total_local_files}) file:{json_path}: len:{json_path.stat().st_size}') + print_to_log_info(f'Reading local ({total_local_files_read}/{total_local_files}) file:{json_path}: len:{json_path.stat().st_size}') else: - print(f'Requesting {url}') + print_to_log_info(f'Requesting {url}') try: r = requests.get(url,headers=headers) except Exception as ex: - print(f'Exception: count:{except_count} type:{type(ex).__name__} args:{ex.args}') + print_to_log_info(f'Exception: count:{except_count} type:{type(ex).__name__} args:{ex.args}') if except_count > 5: - print('Except count exceeded') + print_to_log_info('Except count exceeded') break # skip url except_count += 1 time.sleep(1) # just in case the exception is transient continue # retry url except KeyboardInterrupt as e: - print(f"Error: {type(e).__name__} while processing file:{url}") - print(traceback.format_exc()) + print_to_log_info(f"Error: {type(e).__name__} while processing file:{url}") + print_to_log_info(traceback.format_exc()) canceled = True break else: except_count = 0 html = r.text - print(f'Creating {html_file}: len={len(html)}') + print_to_log_info(f'Creating {html_file}: len={len(html)}') # some clubs return 200 (ok) but with instructions to login (len < 200). # skip clubs returning errors or tiny files. assumes one failed club result will be true for all club's results. if r.status_code != 200 or len(html) < 200: @@ -167,27 +178,27 @@ def extract_club_result_json(dfs, filtered_clubs, starting_nclub, ending_nclub, html_path.write_text(html, encoding="utf-8") bs = BeautifulSoup(html, "html.parser") scripts = bs.find_all('script') - #print(scripts) + #print_to_log(scripts) for script in scripts: if script.string: # not defined for all scripts - #print(script.string) + #print_to_log(script.string) vardata = re.search('var data = (.*);\n', script.string) if vardata: data_json = json.loads(vardata.group(1)) - #print(json.dumps(data_json, indent=4)) - print(f"Writing {json_path}") + #print_to_log(json.dumps(data_json, indent=4)) + print_to_log_info(f"Writing {json_path}") with open(json_path, 'w') as f: json.dump(data_json, f, indent=2) bbo_tournament_id = data_json["bbo_tournament_id"] - print(f'bbo_tournament_id: {bbo_tournament_id}') + print_to_log_info(f'bbo_tournament_id: {bbo_tournament_id}') #time.sleep(1) # obsolete? # if no data_json file read, must be an error so delete both html and json files. if not data_json: html_path.unlink(missing_ok=True) json_path.unlink(missing_ok=True) - #print(f'Files processed ({total_urls_processed}/{total_local_files_read}/{total_urls_to_process})') - print(f'Failed Urls: len:{len(failed_urls)} Urls:{failed_urls}') - print(f"Done: Totals: clubs:{total_clubs} urls:{total_urls_processed} local files read:{total_local_files_read}: failed urls:{len(failed_urls)}") + #print_to_log(f'Files processed ({total_urls_processed}/{total_local_files_read}/{total_urls_to_process})') + print_to_log_info(f'Failed Urls: len:{len(failed_urls)} Urls:{failed_urls}') + print_to_log_info(f"Done: Totals: clubs:{total_clubs} urls:{total_urls_processed} local files read:{total_local_files_read}: failed urls:{len(failed_urls)}") return total_urls_processed, total_local_files_read, failed_urls @@ -210,18 +221,18 @@ def club_results_json_to_sql(urls, starting_nfile=0, ending_nfile=0, initially_d #r = requests.get(url) json_file = url sql_file = url.with_suffix('.sql') - print(f"Processing ({nfile}/{total_urls}): file:{json_file.as_posix()}") + print_to_log_info(f"Processing ({nfile}/{total_urls}): file:{json_file.as_posix()}") if skip_existing_files: if sql_file.exists(): - print(f"Skipping: File exists:{sql_file.as_posix()}") + print_to_log_info(f"Skipping: File exists:{sql_file.as_posix()}") continue try: data_json = None with open(json_file, 'r') as f: data_json = json.load(f) - #print(f"Reading {json_file.as_posix()} dict len:{len(data_json)}") + #print_to_log(f"Reading {json_file.as_posix()} dict len:{len(data_json)}") if len(event_types) > 0 and data_json['type'] not in event_types: - #print(f"Skipping type:{data_json['type']}: file{json_file.as_posix()}") + #print_to_log(f"Skipping type:{data_json['type']}: file{json_file.as_posix()}") continue tables = defaultdict(lambda :defaultdict(dict)) primary_keys = ['id'] @@ -230,11 +241,11 @@ def club_results_json_to_sql(urls, starting_nfile=0, ending_nfile=0, initially_d mlBridgeLib.CreateSqlFile(tables,f,primary_keys) total_files_written += 1 except Exception as e: - print(f"Error: {e}: type:{data_json['type']} file:{url.as_posix()}") + print_to_log_info(f"Error: {e}: type:{data_json['type']} file:{url.as_posix()}") else: - print(f"Writing: type:{data_json['type']} file:{sql_file.as_posix()}") + print_to_log_info(f"Writing: type:{data_json['type']} file:{sql_file.as_posix()}") - print(f"All files processed:{total_urls} files written:{total_files_written} total time:{round(time.time()-start_time,2)}") + print_to_log_info(f"All files processed:{total_urls} files written:{total_files_written} total time:{round(time.time()-start_time,2)}") return total_urls, total_files_written @@ -246,11 +257,11 @@ def club_results_create_sql_db(db_file_connection_string, create_tables_sql_file db_connection_string = db_memory_connection_string # memory based db if delete_db and sqlalchemy_utils.functions.database_exists(db_file_connection_string): - print(f"Deleting db:{db_file_connection_string}") + print_to_log_info(f"Deleting db:{db_file_connection_string}") sqlalchemy_utils.functions.drop_database(db_file_connection_string) # warning: can't delete file if in use by another app (restart kernel). if not sqlalchemy_utils.functions.database_exists(db_connection_string): - print(f"Creating db:{db_connection_string}") + print_to_log_info(f"Creating db:{db_connection_string}") sqlalchemy_utils.functions.create_database(db_connection_string) create_tables = True @@ -258,7 +269,7 @@ def club_results_create_sql_db(db_file_connection_string, create_tables_sql_file raw_connection = engine.raw_connection() if create_tables: - print(f"Creating tables from:{create_tables_sql_file}") + print_to_log_info(f"Creating tables from:{create_tables_sql_file}") with open(create_tables_sql_file, 'r') as f: create_sql = f.read() raw_connection.executescript(create_sql) # create tables @@ -284,7 +295,7 @@ def club_results_create_sql_db(db_file_connection_string, create_tables_sql_file nfile += 1 sql_file = url if (nfile % 100) == 0: - print(f"Executing SQL script ({nfile}/{total_filtered_urls}): file:{sql_file.as_posix()}") + print_to_log_info(f"Executing SQL script ({nfile}/{total_filtered_urls}): file:{sql_file.as_posix()}") try: sql_script = None @@ -293,53 +304,53 @@ def club_results_create_sql_db(db_file_connection_string, create_tables_sql_file start_script_time = time.time() raw_connection.executescript(sql_script) except Exception as e: - print(f"Error: {type(e).__name__} while processing file:{url.as_posix()}") - print(traceback.format_exc()) - print(f"Every json field must be an entry in the schema file. Update schema if needed.") - print(f"Removing {url.as_posix()}") + print_to_log_info(f"Error: {type(e).__name__} while processing file:{url.as_posix()}") + print_to_log_info(traceback.format_exc()) + print_to_log_info(f"Every json field must be an entry in the schema file. Update schema if needed.") + print_to_log_info(f"Removing {url.as_posix()}") sql_file.unlink(missing_ok=True) # delete any bad files, fix issues, rerun. continue # todo: log error. #break except KeyboardInterrupt as e: - print(f"Error: {type(e).__name__} while processing file:{url.as_posix()}") - print(traceback.format_exc()) + print_to_log_info(f"Error: {type(e).__name__} while processing file:{url.as_posix()}") + print_to_log_info(traceback.format_exc()) canceled = True break else: script_execution_time = time.time()-start_script_time if (nfile % 100) == 0: - print(f"SQL script executed: file:{url.as_posix()}: time:{round(script_execution_time,2)}") + print_to_log_info(f"SQL script executed: file:{url.as_posix()}: time:{round(script_execution_time,2)}") total_script_execution_time += script_execution_time total_scripts_executed += 1 - print(f"SQL scripts executed ({total_scripts_executed}/{total_filtered_urls}/{len(urls)}): total changes:{raw_connection.total_changes} total script execution time:{round(time.time()-start_time,2)}: avg script execution time:{round(total_script_execution_time/max(1,total_scripts_executed),2)}") + print_to_log_info(f"SQL scripts executed ({total_scripts_executed}/{total_filtered_urls}/{len(urls)}): total changes:{raw_connection.total_changes} total script execution time:{round(time.time()-start_time,2)}: avg script execution time:{round(total_script_execution_time/max(1,total_scripts_executed),2)}") # if using memory db, write memory db to disk file. if not canceled: if perform_integrity_checks: # todo: research how to detect and display failures? Which checks are needed? - print(f"Performing quick_check on file") + print_to_log_info(f"Performing quick_check on file") raw_connection.execute("PRAGMA quick_check;") # takes 7m on disk - print(f"Performing foreign_key_check on file") + print_to_log_info(f"Performing foreign_key_check on file") raw_connection.execute("PRAGMA foreign_key_check;") # takes 3m on disk - print(f"Performing integrity_check on file") + print_to_log_info(f"Performing integrity_check on file") raw_connection.execute("PRAGMA integrity_check;") # takes 25m on disk if not write_direct_to_disk: - print(f"Writing memory db to file (takes 1+ hours):{db_file_connection_string}") + print_to_log_info(f"Writing memory db to file (takes 1+ hours):{db_file_connection_string}") engine_file = sqlalchemy.create_engine(db_file_connection_string) raw_connection_file = engine_file.raw_connection() raw_connection.backup(raw_connection_file.connection) # takes 45m raw_connection_file.close() engine_file.dispose() - print(f"Saved {db_file_path}: size:{db_file_path.stat().st_size}") + print_to_log_info(f"Saved {db_file_path}: size:{db_file_path.stat().st_size}") raw_connection.close() engine.dispose() - print("Done.") + print_to_log_info("Done.") return total_scripts_executed # not sure if any return value is needed. def get_club_results_details_data(url): - print('details url:',url) + print_to_log_info('details url:',url) headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'} response = requests.get(url, headers=headers) assert response.status_code == 200, [url, response.status_code] @@ -363,7 +374,7 @@ def get_club_results_details_data(url): def get_club_results_from_acbl_number(acbl_number): url = f"https://my.acbl.org/club-results/my-results/{acbl_number}" - print('my-results url:',url) + print_to_log_info('my-results url:',url) headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'} response = requests.get(url, headers=headers) assert response.status_code == 200, [url, response.status_code] @@ -395,7 +406,7 @@ def get_tournament_session_results(session_id, acbl_api_key): query = {'id':session_id,'full_monty':1} params = urllib.parse.urlencode(query) url = path+'?'+params - print('tournament session url:',url) + print_to_log_info('tournament session url:',url) response = requests.get(url, headers=headers) return response @@ -421,28 +432,28 @@ def download_tournament_player_history(player_id, acbl_api_key): try: response = requests.get(url, headers=headers) except Exception as ex: - print(f'Exception: count:{except_count} type:{type(ex).__name__} args:{ex.args}') + print_to_log_info(f'Exception: count:{except_count} type:{type(ex).__name__} args:{ex.args}') if except_count > 5: - print('Except count exceeded') + print_to_log_info('Except count exceeded') break # skip url except_count += 1 time.sleep(1) # just in case the exception is transient continue # retry url except KeyboardInterrupt as e: - print(f"Error: {type(e).__name__} while processing file:{url}") - print(traceback.format_exc()) + print_to_log_info(f"Error: {type(e).__name__} while processing file:{url}") + print_to_log_info(traceback.format_exc()) return None else: except_count = 0 if response.status_code in [400,500,504]: # 500 is unknown response code. try skipping player - print(f'Status Code:{response.status_code}: count:{len(json_responses)} skipping') # 4476921 - Thx Merle. + print_to_log_info(f'Status Code:{response.status_code}: count:{len(json_responses)} skipping') # 4476921 - Thx Merle. # next_page_url = None # sessions_total = 0 break assert response.status_code == 200, (url, response.status_code) # 401 is authorization error often because Personal Access Token has expired. json_response = response.json() #json_pretty = json.dumps(json_response, indent=4) - #print(json_pretty) + #print_to_log(json_pretty) json_responses.append(json_response) url = json_response['next_page_url'] return path, json_responses @@ -455,21 +466,21 @@ def download_tournament_players_history(player_ids, acbl_api_key, dirPath): #canceled = False for n,player_id in enumerate(sorted(player_ids)): if player_id.startswith('tmp:') or player_id.startswith('#'): # somehow #* crept into player_id - print(f'Skipping player_id:{player_id}') + print_to_log_info(f'Skipping player_id:{player_id}') continue else: - print(f'Processing player_id:{player_id}') + print_to_log_info(f'Processing player_id:{player_id}') if dirPath.exists(): session_file_count = len(list(dirPath.glob('*.session.json'))) - print(f'dir exists: file count:{session_file_count} dir:{dirPath}') + print_to_log_info(f'dir exists: file count:{session_file_count} dir:{dirPath}') #if session_file_count == 0: # todo: ignore players who never played a tournament? - # print(f'dir empty -- skipping') + # print_to_log(f'dir empty -- skipping') # continue #if session_file_count > 0: # todo: temp? - # print(f'dir not empty -- skipping') + # print_to_log(f'dir not empty -- skipping') # continue else: - print(f'Creating dir:{dirPath}') + print_to_log_info(f'Creating dir:{dirPath}') dirPath.mkdir(parents=True,exist_ok=True) session_file_count = 0 url, json_responses = download_tournament_player_history(player_id, acbl_api_key) @@ -478,13 +489,13 @@ def download_tournament_players_history(player_ids, acbl_api_key, dirPath): get_count = len(json_responses) if get_count == 0: # skip player_id's generating errors. e.g. player_id 5103045, 5103045, 5103053 continue - print(f"{n}/{len(player_ids)} gets:{get_count} rate:{round((time.time()-start_time)/get_count,2)} {player_id=}") + print_to_log_info(f"{n}/{len(player_ids)} gets:{get_count} rate:{round((time.time()-start_time)/get_count,2)} {player_id=}") #time.sleep(1) # throttle api calling. Maybe not needed as api is taking longer than 1s. sessions_count = 0 for json_response in json_responses: sessions_total = json_response['total'] # is same for every page if sessions_total == session_file_count: # sometimes won't agree because identical sessions. revised results? - print(f'File count correct: {dirPath}: terminating {player_id} early.') + print_to_log_info(f'File count correct: {dirPath}: terminating {player_id} early.') sessions_count = sessions_total break for data in json_response['data']: @@ -493,20 +504,20 @@ def download_tournament_players_history(player_ids, acbl_api_key, dirPath): filePath_sql = dirPath.joinpath(session_id+'.session.sql') filePath_json = dirPath.joinpath(session_id+'.session.json') if filePath_sql.exists() and filePath_json.exists() and filePath_sql.stat().st_ctime > filePath_json.stat().st_ctime: - print(f'{sessions_count}/{sessions_total}: File exists: {filePath_sql}: skipping') + print_to_log_info(f'{sessions_count}/{sessions_total}: File exists: {filePath_sql}: skipping') #if filePath_json.exists(): # json file is no longer needed? - # print(f'Deleting JSON file: {filePath_json}') + # print_to_log(f'Deleting JSON file: {filePath_json}') # filePath_json.unlink(missing_ok=True) break # continue will skip file. break will move on to next player if filePath_json.exists(): - print(f'{sessions_count}/{sessions_total}: File exists: {filePath_json}: skipping') + print_to_log_info(f'{sessions_count}/{sessions_total}: File exists: {filePath_json}: skipping') break # continue will skip file. break will move on to next player response = get_tournament_session_results(session_id, acbl_api_key) assert response.status_code == 200, response.status_code session_json = response.json() #json_pretty = json.dumps(json_response, indent=4) - print(f'{sessions_count}/{sessions_total}: Writing:{filePath_json} len:{len(session_json)}') + print_to_log_info(f'{sessions_count}/{sessions_total}: Writing:{filePath_json} len:{len(session_json)}') with open(filePath_json,'w',encoding='UTF8') as f: f.write(json.dumps(session_json, indent=4)) if sessions_count != sessions_total: - print(f'Session count mismatch: {dirPath}: variance:{sessions_count-sessions_total}') + print_to_log_info(f'Session count mismatch: {dirPath}: variance:{sessions_count-sessions_total}') diff --git a/app.py b/app.py index da15f57..863103a 100644 --- a/app.py +++ b/app.py @@ -6,9 +6,15 @@ #!pip install openai python-dotenv pandas --quiet import logging +logger = logging.getLogger(__name__) +logger.setLevel(logging.INFO) # or DEBUG logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') -def print_to_log(*args): - logging.info(' '.join(str(arg) for arg in args)) +def print_to_log_info(*args): + print_to_log(logging.INFO, *args) +def print_to_log_debug(*args): + print_to_log(logging.DEBUG, *args) +def print_to_log(level, *args): + logging.log(level, ' '.join(str(arg) for arg in args)) import sys from collections import defaultdict @@ -73,14 +79,14 @@ async def create_chat_completion(messages, model=DEFAULT_AI_MODEL, functions=Non def ask_database(query): - print_to_log('ask_database query:', query) + print_to_log_info('ask_database query:', query) conn = st.session_state.conn #"""Function to query duckdb database with a provided SQL query.""" try: results = conn.execute(query) except Exception as e: results = f"query failed with error: {e}" - print_to_log('ask_database: results:', results) + print_to_log_info('ask_database: results:', results) return results @@ -163,18 +169,18 @@ async def async_chat_up_user(prompt_sql, messages, function_calls, model=None): for original, new in replacement_strings: enhanced_prompt = re.sub(original, new, enhanced_prompt.replace( ' ', ' '), flags=re.IGNORECASE) - print_to_log('enhanced_prompt:', enhanced_prompt) + print_to_log_info('enhanced_prompt:', enhanced_prompt) # add enhanced prompt to messages messages.append({"role": "user", "content": enhanced_prompt}) # request chat completion of user message chat_response = await create_chat_completion( # chat_completion_request( messages, model, function_calls) # chat's response from user input - print_to_log('chat_response status:', type(chat_response), chat_response) + print_to_log_info('chat_response status:', type(chat_response), chat_response) chat_response_json = json.loads(chat_response.model_dump_json()) # create_chat_completion returns json directly - print_to_log('chat_response_json:', type(chat_response_json), chat_response_json) - print_to_log('chat_response_json id:', type(chat_response_json['id']), chat_response_json['id']) - print_to_log('chat_response_json choices:', type(chat_response_json['choices']), chat_response_json['choices']) + print_to_log_info('chat_response_json:', type(chat_response_json), chat_response_json) + print_to_log_info('chat_response_json id:', type(chat_response_json['id']), chat_response_json['id']) + print_to_log_info('chat_response_json choices:', type(chat_response_json['choices']), chat_response_json['choices']) # restore original user prompt messages[-1] = {"role": "user", "content": up} @@ -196,7 +202,7 @@ async def async_chat_up_user(prompt_sql, messages, function_calls, model=None): {"role": "assistant", "content": f"Unexpected response from {model} (missing message). Try again later."}) return False assistant_message = first_choice['message'] - print_to_log('assistant_message:', assistant_message) + print_to_log_info('assistant_message:', assistant_message) if 'role' not in assistant_message or assistant_message['role'] != 'assistant': # fake message messages.append( @@ -214,7 +220,7 @@ async def async_chat_up_user(prompt_sql, messages, function_calls, model=None): function_call_json = json.loads( assistant_message["message"]["content"].replace('\n','')) # rarely, but sometimes, there are newlines in the json. except Exception as e: - print_to_log(f"Exception: Invalid JSON. Error: {e}") + print_to_log_info(f"Exception: Invalid JSON. Error: {e}") # fake message messages.append( {"role": "assistant", "content": f"Invalid JSON. Error: {e}"}) @@ -256,7 +262,7 @@ async def async_chat_up_user(prompt_sql, messages, function_calls, model=None): function_call_json = json.loads( assistant_message["function_call"]["arguments"].replace('\n','')) # rarely, but sometimes, there are newlines in the json. except Exception as e: - print_to_log(f"Exception: Invalid JSON. Error: {e}") + print_to_log_info(f"Exception: Invalid JSON. Error: {e}") # fake message messages.append( {"role": "assistant", "content": f"Invalid JSON. Error: {e}"}) @@ -269,7 +275,7 @@ async def async_chat_up_user(prompt_sql, messages, function_calls, model=None): # todo: execute via function call, not explicitly ask_database_results = ask_database(sql_query) - print_to_log('ask_database_results:', ask_database_results) + print_to_log_info('ask_database_results:', ask_database_results) if not isinstance(ask_database_results, duckdb.DuckDBPyConnection): # fake message messages.append( @@ -286,7 +292,7 @@ async def async_chat_up_user(prompt_sql, messages, function_calls, model=None): messages.append({"role": "assistant", "content": sql_query}) prompt_sql['sql'] = sql_query # will always return same sql for same query from now on. Is this what we want? - print_to_log('prompt_sql:', prompt_sql) + print_to_log_info('prompt_sql:', prompt_sql) return True @@ -340,7 +346,7 @@ def create_schema_string(df, conn): df_dtypes_d[col] = dtype_name dtypes_d[dtype_name].append(col) for obj in complex_objects: - print_to_log(str(obj), df[obj].iloc[0]) + print_to_log_debug(str(obj), df[obj].iloc[0]) df.drop(columns=complex_objects, inplace=True) # warning: fake sql CREATE TABLE because types are dtypes not sql types. @@ -379,11 +385,12 @@ def create_schema_string(df, conn): def chat_initialize(player_number, session_id): # todo: rename to session_id? - print_to_log(f"Retrieving latest results for {player_number}") + print_to_log_info(f"Retrieving latest results for {player_number}") conn = st.session_state.conn with st.spinner(f"Retrieving a list of games for {player_number} ..."): + t = time.time() game_urls = get_club_results_from_acbl_number(player_number) if game_urls is None: st.error(f"Player number {player_number} not found.") @@ -392,8 +399,10 @@ def chat_initialize(player_number, session_id): # todo: rename to session_id? st.error(f"Could not find any club games for {player_number}.") elif session_id is None: session_id = list(game_urls.keys())[0] # default to most recent club game + print_to_log_info('get_club_results_from_acbl_number time:', time.time()-t) # takes 4s with st.spinner(f"Retrieving a list of tournament sessions for {player_number} ..."): + t = time.time() tournament_session_urls = get_tournament_sessions_from_acbl_number(player_number, acbl_api_key) # returns [url, url, description, dfs] if tournament_session_urls is None: st.error(f"Player number {player_number} not found.") @@ -402,6 +411,7 @@ def chat_initialize(player_number, session_id): # todo: rename to session_id? st.error(f"Could not find any tournament sessions for {player_number}.") elif session_id is None: session_id = list(tournament_session_urls.keys())[0] # default to most recent tournament session + print_to_log_info('get_tournament_sessions_from_acbl_number time:', time.time()-t) # takes 2s if session_id is None: st.error(f"Could not find any club or tournament sessions for {player_number}.") @@ -414,13 +424,14 @@ def chat_initialize(player_number, session_id): # todo: rename to session_id? if session_id in game_urls: with st.spinner(f"Collecting data for club game {session_id} and player {player_number}. Might take a minute ..."): + t = time.time() # game_urls[session_id][1] is detail_url dfs = create_club_dfs(player_number, game_urls[session_id][1]) if dfs is None or 'event' not in dfs or len(dfs['event']) == 0: st.error( f"Game {session_id} has missing or invalid game data. Select a different club game or tournament session from left sidebar.") return False - print_to_log(dfs.keys()) + print_to_log_info('dfs:',dfs.keys()) # todo: probably need to check if keys exist to control error processing -- pair_summaries, event, sessions, ... @@ -443,13 +454,16 @@ def chat_initialize(player_number, session_id): # todo: rename to session_id? st.error( f"Game {session_id} is {dfs['sessions']['hand_record_id'].iloc[0]}. Expecting a valid hand record number. Select a different club game or tournament session from left sidebar.") return False + print_to_log_info('create_club_dfs time:', time.time()-t) # takes 3s - #with Profiler(): + with st.spinner(f"Processing data for club game: {session_id} and player {player_number}. Takes 30 seconds ..."): + t = time.time() df, sd_cache_d, matchpoint_ns_d = merge_clean_augment_club_dfs(dfs, {}, player_number) # doesn't use any caching if df is None: st.error( f"Game {session_id} has an invalid game file. Select a different club game or tournament session from left sidebar.") return False + print_to_log_info('merge_clean_augment_club_dfs time:', time.time()-t) # takes 30s elif session_id in tournament_session_urls: dfs = tournament_session_urls[session_id][3] @@ -457,7 +471,7 @@ def chat_initialize(player_number, session_id): # todo: rename to session_id? st.error( f"Session {session_id} has missing or invalid session data. Choose another session.") return False - print_to_log(dfs.keys()) + print_to_log_info(dfs.keys()) if dfs['event']['game_type'] != 'Pairs': st.error( @@ -470,6 +484,7 @@ def chat_initialize(player_number, session_id): # todo: rename to session_id? return False with st.spinner(f"Collecting data for tournament session {session_id} and player {player_number} from ACBL."): + t = time.time() response = get_tournament_session_results(session_id, acbl_api_key) assert response.status_code == 200, response.status_code @@ -478,7 +493,7 @@ def chat_initialize(player_number, session_id): # todo: rename to session_id? st.error( f"Session {session_id} has an invalid tournament session file. Choose another session.") return False - print_to_log(dfs_results.keys()) + print_to_log_info('dfs_results:',dfs_results.keys()) if len(dfs_results['sections']) == 0: st.error( @@ -501,8 +516,10 @@ def chat_initialize(player_number, session_id): # todo: rename to session_id? st.error( f"Session {session_id} section {section['section_label']} is {section['movement_type']}. I can only chat about Mitchell movements. Choose another session.") return False + print_to_log_info('get_tournament_session_results time:', time.time()-t) with st.spinner(f"Creating data table of tournament session {session_id} for player {player_number}. Might take a minute ..."): + t = time.time() #with Profiler(): df, sd_cache_d, matchpoint_ns_d = merge_clean_augment_tournament_dfs(tournament_session_urls[session_id][3], dfs_results, acbl_api_key, player_number) # doesn't use any caching @@ -510,6 +527,7 @@ def chat_initialize(player_number, session_id): # todo: rename to session_id? st.error( f"Session {session_id} has an invalid tournament session file. Choose another session.") return False + print_to_log_info('merge_clean_augment_tournament_dfs time:', time.time()-t) st.session_state.dfs_results = dfs_results else: @@ -519,6 +537,7 @@ def chat_initialize(player_number, session_id): # todo: rename to session_id? st.session_state.session_id = session_id with st.spinner(f"Creating everything data table."): + t = time.time() results = ask_database(st.session_state.commands_sql) assert isinstance(results, duckdb.DuckDBPyConnection), results df = results.df() # update df with results of SQL query. @@ -598,14 +617,18 @@ def chat_initialize(player_number, session_id): # todo: rename to session_id? df['Boards_Opponent_Declared'] = df['My_Pair'] & ~df['Number_Declarer'].isin( [st.session_state.player_number, st.session_state.partner_number]) # boolean break + print_to_log_info('create everything data table time:', time.time()-t) - # make predictions + # make predictions + with st.spinner(f"Making AI Predictions."): + t = time.time() Predict_Game_Results() + print_to_log_info('Predict_Game_Results time:', time.time()-t) # takes 10s - # Create a DuckDB table from the DataFrame - # register df as a table named 'results' for duckdb discovery. SQL queries will reference this df/table. - conn.register('results', df) - #conn.register('my_table', df) + # Create a DuckDB table from the DataFrame + # register df as a table named 'results' for duckdb discovery. SQL queries will reference this df/table. + conn.register('results', df) + #conn.register('my_table', df) st.session_state.df_schema_string = create_schema_string(df, conn) # temp @@ -643,9 +666,11 @@ def chat_initialize(player_number, session_id): # todo: rename to session_id? #streamlit_chat.message(f"Morty: {content}", logo=st.session_state.assistant_logo) if st.session_state.show_sql_query: + t = time.time() streamlit_chat.message( f"Morty: Here's a dataframe of game results. There's {len(df)} rows and {len(df.columns)} columns.", logo=st.session_state.assistant_logo) streamlitlib.ShowDataFrameTable(df, key='clear_conversation_game_data_df') + print_to_log_info('ShowDataFrameTable time:', time.time()-t) return True @@ -691,6 +716,7 @@ def ask_questions_without_context(ups, model=None): def ask_a_question_with_context(ups, model=None): + t = time.time() if model is None: model = st.session_state.ai_api messages = st.session_state.messages @@ -706,6 +732,7 @@ def ask_a_question_with_context(ups, model=None): with st.spinner(f"Waiting for response from {model}."): chat_up_user(ups, messages, function_calls, model) st.session_state.messages = messages + print_to_log_info('ask_a_question_with_context time:', time.time()-t) def reset_messages(): @@ -822,7 +849,7 @@ def Predict_Game_Results(): return None # todo: not needed right now. However, need to change *_augment.ipynb to output ParScore_MPs_(NS|EW) st.session_state.df['ParScore_MPs'] = st.session_state.df['ParScore_MPs_NS'] learn = mlBridgeAi.load_model(predicted_contracts_model_file) - print_to_log(st.session_state.df.isna().sum()) + print_to_log_debug('isna:',st.session_state.df.isna().sum()) #st.session_state.df['Contract'] = st.session_state.df['Contract'].str.replace(' ','').str.upper() # todo: should not be needed if cleaned # not needed here: st.session_state.df['Contract'] = st.session_state.df.apply(lambda r: 'PASS' if r['Contract'] == 'PASS' else r['Contract'].replace('NT','N')+r['Declarer_Direction'],axis='columns') # todo: assert that Contract equals BidLvl + BidSuit + Dbl + Declarer_Direction @@ -844,7 +871,7 @@ def Predict_Game_Results(): return None # todo: not needed right now. However, need to change *_augment.ipynb to output ParScore_MPs_(NS|EW) st.session_state.df['ParScore_MPs'] = st.session_state.df['ParScore_MPs_NS'] learn = mlBridgeAi.load_model(predicted_directions_model_file) - print_to_log(st.session_state.df.isna().sum()) + print_to_log_debug('isna:',st.session_state.df.isna().sum()) #st.session_state.df['Tricks'].fillna(.5,inplace=True) #st.session_state.df['Result'].fillna(.5,inplace=True) st.session_state.df['Declarer_Rating'].fillna(.5,inplace=True) # todo: NS sitout. Why is this needed? Are empty opponents required to have a declarer rating? Event id: 893775. @@ -868,7 +895,9 @@ def Predict_Game_Results(): st.error(f"Oops. {predicted_rankings_model_file} not found.") return None y_name = 'Pct_NS' - predicted_board_result_pcts_ns, _ = mlBridgeAi.make_predictions(predicted_rankings_model_file, st.session_state.df) + #predicted_board_result_pcts_ns, _ = mlBridgeAi.make_predictions(predicted_rankings_model_file, st.session_state.df) + learn = mlBridgeAi.load_model(predicted_rankings_model_file) + predicted_board_result_pcts_ns, _ = mlBridgeAi.get_predictions(learn, st.session_state.df) y_name_ns = y_name y_name_ew = y_name.replace('NS','EW') st.session_state.df[y_name_ns+'_Actual'] = st.session_state.df[y_name_ns] @@ -903,6 +932,8 @@ def reset_data(): # resets all data. used initially and when player number changes. # todo: put all session state into st.session_state.data so that clearing data clears all session states. + print_to_log_info('reset_data()') + # app #st.session_state.app_datetime = None st.session_state.help = None @@ -960,7 +991,7 @@ def reset_data(): st.session_state.section_name = None for k, v in st.session_state.items(): - print_to_log(k) + print_to_log_info('session_state:',k) if k.startswith('main_messages_df_'): # assert st.session_state[k] is None # This happened once on 29-Sep-2023. Not sure why. Maybe there's a timing issue with st.session_state and st.container being destroyed? #st.session_state[k].clear() # error: no such attribute as clear @@ -1002,6 +1033,8 @@ def app_info(): def create_sidebar(): + t = time.time() + st.sidebar.caption(st.session_state.app_datetime) st.sidebar.text_input( @@ -1126,9 +1159,11 @@ def create_sidebar(): st.number_input("Single Dummy Random Trials", min_value=1, max_value=100, value=st.session_state.sd_observations, on_change=sd_observations_changed, key='sd_observations_number_input') + print_to_log_info('create_sidebar time:', time.time()-t) def create_tab_bar(): + t = time.time() with st.container(): chat_tab, data, dtypes, schema, commands_sql, URLs, system_prompt_tab, favorites, help, release_notes, about, debug = st.tabs( @@ -1214,6 +1249,7 @@ def create_tab_bar(): st.header('Debug') st.write('Not yet implemented.') + print_to_log_info('create_tab_bar time:', time.time()-t) def create_main_section(): @@ -1223,6 +1259,7 @@ def create_main_section(): ask_a_question_with_context(user_content) # don't think DEFAULT_LARGE_AI_MODEL is needed? # output all messages except the initial system message. # only system message + t = time.time() if len(st.session_state.messages) == 1: # todo: put this message into config file. content = slash_about() @@ -1271,7 +1308,7 @@ def create_main_section(): continue match = re.match( r'```sql\n(.*)\n```', message['content']) - print_to_log('message content:', message['content'], match) + print_to_log_info('message content:', message['content'], match) if match is None: # for unknown reasons, the sql query is returned in 'content'. # hoping this is a SQL query @@ -1344,6 +1381,7 @@ def create_main_section(): # st.components.v1.html(js) streamlitlib.move_focus() + print_to_log_info('create_main_section time:', time.time()-t) def main(): diff --git a/chatlib/chatlib.py b/chatlib/chatlib.py index 0d80973..0d4a688 100644 --- a/chatlib/chatlib.py +++ b/chatlib/chatlib.py @@ -1,7 +1,13 @@ import logging +logger = logging.getLogger(__name__) +logger.setLevel(logging.INFO) # or DEBUG logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') -def print_to_log(*args): - logging.info(' '.join(str(arg) for arg in args)) +def print_to_log_info(*args): + print_to_log(logging.INFO, *args) +def print_to_log_debug(*args): + print_to_log(logging.DEBUG, *args) +def print_to_log(level, *args): + logging.log(level, ' '.join(str(arg) for arg in args)) import pandas as pd from pprint import pprint # obsolete? @@ -56,24 +62,24 @@ def print_to_log(*args): # merge acbl json dicts into logically related dicts. dicts will be used to create dfs def json_dict_to_df(d,kl,jdl): - print_to_log(kl) + print_to_log_debug(kl) dd = {} d[kl] = dd assert not isinstance(jdl,dict) for i,jd in enumerate(jdl): for k,v in jd.items(): kkl = kl+(k,i) - print_to_log(i,kl,k,kkl) + print_to_log_debug(i,kl,k,kkl) #time.sleep(.00001) if isinstance(v,list): - print_to_log('\n',type(v),kkl,v) + print_to_log_debug('\n',type(v),kkl,v) json_dict_to_df(d,kkl,v) - print_to_log('list:',kkl,len(d[kkl])) + print_to_log_debug('list:',kkl,len(d[kkl])) elif isinstance(v,dict): #kkl = kl+(k,) - print_to_log('\n',type(v),kkl,v) + print_to_log_debug('\n',type(v),kkl,v) json_dict_to_df(d,kkl,[v]) - print_to_log('dict:',kkl,len(d[kkl])) + print_to_log_debug('dict:',kkl,len(d[kkl])) else: if k not in dd: dd[k] = [] @@ -88,7 +94,7 @@ def convert_to_best_dtype(k,v): vv = v.convert_dtypes(infer_objects=True) vvv = vv.copy() for col in vv.columns: - print_to_log(col,vvv[col].dtype) + print_to_log_debug(col,vvv[col].dtype) # todo: special cases. maybe should be done before writing to acbl_club_results.sqlite? if col in ['ns_score','ew_score']: vvv[col] = vvv[col].replace('PASS','0') @@ -97,25 +103,25 @@ def convert_to_best_dtype(k,v): elif col == 'tricks_taken': vvv[col] = vvv[col].replace('','0') if vvv[col].dtype == 'string' and vvv[col].notna().all() and vvv[col].ne('').all(): - print_to_log(f"String: {col}") + print_to_log_debug(f"String: {col}") try: if vvv[col].str.contains('.',regex=False).any(): - print_to_log(f"Trying to convert {col} to float") + print_to_log_debug(f"Trying to convert {col} to float") converted_values = pd.to_numeric(vvv[col], downcast='float', errors='raise') elif vvv[col].str.contains('-',regex=False).any(): - print_to_log(f"Trying to convert {col} to integer") + print_to_log_debug(f"Trying to convert {col} to integer") converted_values = pd.to_numeric(vvv[col], downcast='integer', errors='raise') else: - print_to_log(f"Trying to convert {col} to unsigned") + print_to_log_debug(f"Trying to convert {col} to unsigned") converted_values = pd.to_numeric(vvv[col], downcast='unsigned', errors='raise') vvv[col] = converted_values - print_to_log(f"Converted {col} to {vvv[col].dtype}") + print_to_log_debug(f"Converted {col} to {vvv[col].dtype}") except ValueError: - print_to_log(f"Can't convert {col} to float. Keeping as string") - print_to_log(f"dfs_dtype_conversions['{k}'] = "+'{') + print_to_log(logging.WARNING, f"Can't convert {col} to float. Keeping as string") + print_to_log_debug(f"dfs_dtype_conversions['{k}'] = "+'{') for col in vvv.columns: - print_to_log(f" '{col}':'{v[col].dtype},{vv[col].dtype},{vvv[col].dtype}',") - print_to_log("}\n") + print_to_log_debug(f" '{col}':'{v[col].dtype},{vv[col].dtype},{vvv[col].dtype}',") + print_to_log_debug("}\n") return vvv @@ -170,55 +176,55 @@ def create_club_dfs(acbl_number,event_url): def merge_clean_augment_tournament_dfs(dfs, dfs_results, acbl_api_key, acbl_number): - print_to_log('dfs keys:',dfs.keys()) + print_to_log_info('dfs keys:',dfs.keys()) df = pd.DataFrame({k:[v] for k,v in dfs.items() if not (isinstance(v,dict) or isinstance(v,list))}) - print_to_log('df:\n', df) + print_to_log_info('df:\n', df) assert len(df) == 1, len(df) - print_to_log('dfs session:',type(dfs['session'])) + print_to_log_info('dfs session:',type(dfs['session'])) df_session = pd.DataFrame({k:[v] for k,v in dfs['session'].items() if not (isinstance(v,dict) or isinstance(v,list))}) assert len(df_session) == 1, len(df_session) - print_to_log({k:pd.DataFrame(v) for k,v in dfs['session'].items() if (isinstance(v,dict) or isinstance(v,list))}) + print_to_log_info({k:pd.DataFrame(v) for k,v in dfs['session'].items() if (isinstance(v,dict) or isinstance(v,list))}) - print_to_log('dfs event:',type(dfs['event'])) + print_to_log_info('dfs event:',type(dfs['event'])) df_event = pd.DataFrame({k:[v] for k,v in dfs['event'].items() if not (isinstance(v,dict) or isinstance(v,list))}) assert len(df_event) == 1, len(df_event) - print_to_log({k:pd.DataFrame(v) for k,v in dfs['event'].items() if (isinstance(v,dict) or isinstance(v,list))}) + print_to_log_info({k:pd.DataFrame(v) for k,v in dfs['event'].items() if (isinstance(v,dict) or isinstance(v,list))}) - print_to_log('dfs tournament:',type(dfs['tournament'])) + print_to_log_info('dfs tournament:',type(dfs['tournament'])) df_tournament = pd.DataFrame({k:[v] for k,v in dfs['tournament'].items() if not (isinstance(v,dict) or isinstance(v,list))}) assert len(df_tournament) == 1, len(df_tournament) - print_to_log({k:pd.DataFrame(v) for k,v in dfs['tournament'].items() if (isinstance(v,dict) or isinstance(v,list))}) + print_to_log_info({k:pd.DataFrame(v) for k,v in dfs['tournament'].items() if (isinstance(v,dict) or isinstance(v,list))}) for col in df.columns: - print_to_log(col,df[col].dtype) + print_to_log_debug('cols:',col,df[col].dtype) # dfs scalers: ['_id', '_event_id', 'id', 'session_number', 'start_date', 'start_time', 'description', 'sess_type', 'box_number', 'is_online', 'results_available', 'was_not_played', 'results_last_updated'] # dfs dicts: ['tournament', 'event', 'handrecord', 'sections'] # dfs lists: ['overalls'] - print_to_log('dfs_results tournament:',type(dfs_results['tournament'])) + print_to_log_info('dfs_results tournament:',type(dfs_results['tournament'])) df_results_tournament = pd.DataFrame({k:[v] for k,v in dfs_results['tournament'].items() if not (isinstance(v,dict) or isinstance(v,list))}) assert len(df_results_tournament) == 1, len(df_results_tournament) - print_to_log({k:pd.DataFrame(v) for k,v in dfs_results['tournament'].items() if (isinstance(v,dict) or isinstance(v,list))}) + print_to_log_info({k:pd.DataFrame(v) for k,v in dfs_results['tournament'].items() if (isinstance(v,dict) or isinstance(v,list))}) - print_to_log('dfs_results event:',type(dfs_results['event'])) + print_to_log_info('dfs_results event:',type(dfs_results['event'])) df_results_event = pd.DataFrame({k:[v] for k,v in dfs_results['event'].items() if not (isinstance(v,dict) or isinstance(v,list))}) assert len(df_event) == 1, len(df_event) - print_to_log({k:pd.DataFrame(v) for k,v in dfs_results['event'].items() if (isinstance(v,dict) or isinstance(v,list))}) + print_to_log_info({k:pd.DataFrame(v) for k,v in dfs_results['event'].items() if (isinstance(v,dict) or isinstance(v,list))}) - print_to_log('dfs_results overalls:',type(dfs_results['overalls'])) + print_to_log_info('dfs_results overalls:',type(dfs_results['overalls'])) df_results_overalls = pd.DataFrame(dfs_results['overalls']) #assert len(df_results_overalls) == 1, len(df_results_overalls) - print_to_log(pd.DataFrame(dfs_results['overalls'])) + print_to_log_info(pd.DataFrame(dfs_results['overalls'])) - print_to_log('dfs_results handrecord:',type(dfs_results['handrecord'])) + print_to_log_info('dfs_results handrecord:',type(dfs_results['handrecord'])) df_results_handrecord = pd.DataFrame(dfs_results['handrecord']) #assert len(df_results_handrecord) == 1, len(df_results_handrecord) - print_to_log(pd.DataFrame(dfs_results['handrecord'])) + print_to_log_info(pd.DataFrame(dfs_results['handrecord'])) - print_to_log('dfs_results sections:',type(dfs_results['sections'])) + print_to_log_info('dfs_results sections:',type(dfs_results['sections'])) df_results_sections = pd.DataFrame(dfs_results['sections']) df_board_results = pd.DataFrame() @@ -309,28 +315,28 @@ def augment_tournament_df(df,sd_cache_d): def merge_clean_augment_club_dfs(dfs,sd_cache_d,acbl_number): # todo: acbl_number obsolete? - print_to_log('dfs keys:',dfs.keys()) + print_to_log_info('merge_clean_augment_club_dfs: dfs keys:',dfs.keys()) df_brs = dfs['board_results'] - print_to_log(df_brs.head(1)) + print_to_log_info(df_brs.head(1)) assert len(df_brs.filter(regex=r'_[xy]$').columns) == 0,df_brs.filter(regex=r'_[xy]$').columns df_b = dfs['boards'].rename({'id':'board_id'},axis='columns')[['board_id','section_id','board_number']] - print_to_log(df_b.head(1)) + print_to_log_info(df_b.head(1)) assert len(df_b.filter(regex=r'_[xy]$').columns) == 0,df_b.filter(regex=r'_[xy]$').columns df_br_b = pd.merge(df_brs,df_b,on='board_id',how='left') - print_to_log(df_br_b.head(1)) + print_to_log_info(df_br_b.head(1)) assert len(df_br_b) == len(df_brs) assert len(df_br_b.filter(regex=r'_[xy]$').columns) == 0,df_br_b.filter(regex=r'_[xy]$').columns df_sections = dfs['sections'].rename({'id':'section_id','name':'section_name'},axis='columns').drop(['created_at','updated_at','pair_summaries','boards'],axis='columns') # ['pair_summaries','boards'] are unwanted dicts - print_to_log(df_sections.head(1)) + print_to_log_info(df_sections.head(1)) df_br_b_sections = pd.merge(df_br_b,df_sections,on='section_id',how='left') - print_to_log(df_br_b_sections.head(1)) + print_to_log_info(df_br_b_sections.head(1)) assert len(df_br_b_sections) == len(df_br_b) assert len(df_br_b_sections.filter(regex=r'_[xy]$').columns) == 0,df_br_b_sections.filter(regex=r'_[xy]$').columns @@ -338,37 +344,37 @@ def merge_clean_augment_club_dfs(dfs,sd_cache_d,acbl_number): # todo: acbl_numbe df_sessions = dfs['sessions'].rename({'id':'session_id','number':'session_number'},axis='columns').drop(['created_at','updated_at','hand_records','sections'],axis='columns') # ['hand_records','sections'] are unwanted dicts # can't convert to int64 because SHUFFLE is a valid hand_record_id. Need to treat as string. # df_sessions['hand_record_id'] = df_sessions['hand_record_id'].astype('int64') # change now for merge - print_to_log(df_sessions.head(1)) + print_to_log_info(df_sessions.head(1)) df_br_b_sections_sessions = pd.merge(df_br_b_sections,df_sessions,on='session_id',how='left') - print_to_log(df_br_b_sections_sessions.head(1)) + print_to_log_info(df_br_b_sections_sessions.head(1)) assert len(df_br_b_sections_sessions) == len(df_br_b_sections) assert len(df_br_b_sections_sessions.filter(regex=r'_[xy]$').columns) == 0,df_br_b_sections_sessions.filter(regex=r'_[xy]$').columns df_clubs = dfs['club'].rename({'id':'event_id','name':'club_name','type':'club_type'},axis='columns').drop(['created_at','updated_at'],axis='columns') # name and type are renamed to avoid conflict with df_events - print_to_log(df_clubs.head(1)) + print_to_log_info(df_clubs.head(1)) df_br_b_sections_sessions_clubs = pd.merge(df_br_b_sections_sessions,df_clubs,on='event_id',how='left') - print_to_log(df_br_b_sections_sessions_clubs.head(1)) + print_to_log_info(df_br_b_sections_sessions_clubs.head(1)) assert len(df_br_b_sections_sessions_clubs) == len(df_br_b_sections) assert len(df_sections.filter(regex=r'_[xy]$').columns) == 0,df_sections.filter(regex=r'_[xy]$').columns df_events = dfs['event'].rename({'id':'event_id','club_name':'event_club_name','type':'event_type'},axis='columns').drop(['created_at','updated_at','deleted_at'],axis='columns') - print_to_log(df_events.head(1)) + print_to_log_info(df_events.head(1)) df_br_b_sections_sessions_events = pd.merge(df_br_b_sections_sessions_clubs,df_events,on='event_id',how='left') - print_to_log(df_br_b_sections_sessions_events.head(1)) + print_to_log_info(df_br_b_sections_sessions_events.head(1)) assert len(df_br_b_sections_sessions_events) == len(df_br_b_sections_sessions_clubs) assert len(df_br_b_sections_sessions_events.filter(regex=r'_[xy]$').columns) == 0,df_br_b_sections_sessions_events.filter(regex=r'_[xy]$').columns df_pair_summaries = dfs['pair_summaries'].rename({'id':'pair_summary_id'},axis='columns').drop(['created_at','updated_at'],axis='columns') - print_to_log(df_pair_summaries.head(1)) + print_to_log_info(df_pair_summaries.head(1)) # todo: merge df_pair_summaries with strat_place. issue is that strat_place has multiple rows per pair_summary_id df_pair_summaries_strat = df_pair_summaries @@ -391,11 +397,11 @@ def merge_clean_augment_club_dfs(dfs,sd_cache_d,acbl_number): # todo: acbl_numbe player_w = df_players.groupby('pair_summary_id').last().reset_index().add_suffix('_w').rename({'pair_summary_id_w':'pair_summary_id_ew'},axis='columns') player_ns = pd.merge(player_n,player_s,on='pair_summary_id_ns',how='left') - print_to_log(player_ns.head(1)) + print_to_log_info(player_ns.head(1)) assert len(player_ns) == len(player_n) assert len(player_ns.filter(regex=r'_[xy]$').columns) == 0,player_ns.filter(regex=r'_[xy]$').columns player_ew = pd.merge(player_e,player_w,on='pair_summary_id_ew',how='left') - print_to_log(player_ew.head(1)) + print_to_log_info(player_ew.head(1)) assert len(player_ew) == len(player_e) assert len(player_ew.filter(regex=r'_[xy]$').columns) == 0,player_ew.filter(regex=r'_[xy]$').columns @@ -407,26 +413,26 @@ def merge_clean_augment_club_dfs(dfs,sd_cache_d,acbl_number): # todo: acbl_numbe #df_pair_summary_players = pd.merge(df_pair_summary_players_ns,df_pair_summary_players_ew,how='left') # yes, on is not needed #assert len(df_pair_summary_players) == len(df_pair_summary_players_ns) # likely this is an issue on an EW sitout. Need to compare ns,ew lengths and how on the longer one? df_br_b_sections_sessions_events_pair_summary_players = pd.merge(df_br_b_sections_sessions_events,df_pair_summary_players_ns,on=('section_id','ns_pair'),how='left') # yes, requires inner. Otherwise right df non-on columns will be NaNs. - print_to_log(df_br_b_sections_sessions_events_pair_summary_players.head(1)) + print_to_log_info(df_br_b_sections_sessions_events_pair_summary_players.head(1)) assert len(df_br_b_sections_sessions_events_pair_summary_players) == len(df_br_b_sections_sessions_events) assert len(df_br_b_sections_sessions_events_pair_summary_players.filter(regex=r'_[xy]$').columns) == 0,df_br_b_sections_sessions_events_pair_summary_players.filter(regex=r'_[xy]$').columns df_br_b_sections_sessions_events_pair_summary_players = pd.merge(df_br_b_sections_sessions_events_pair_summary_players,df_pair_summary_players_ew,on=('section_id','ew_pair'),how='left') # yes, requires inner. Otherwise right df non-on columns will be NaNs. - print_to_log(df_br_b_sections_sessions_events_pair_summary_players.head(1)) + print_to_log_info(df_br_b_sections_sessions_events_pair_summary_players.head(1)) assert len(df_br_b_sections_sessions_events_pair_summary_players) == len(df_br_b_sections_sessions_events) assert len(df_br_b_sections_sessions_events_pair_summary_players.filter(regex=r'_[xy]$').columns) == 0,df_br_b_sections_sessions_events_pair_summary_players.filter(regex=r'_[xy]$').columns df_hrs = dfs['hand_records'].rename({'hand_record_set_id':'hand_record_id'},axis='columns').drop(['points.N','points.E','points.S','points.W'],axis='columns') # don't want points (HCP) from hand_records. will compute later. - print_to_log(df_hrs.head(1)) + print_to_log_info(df_hrs.head(1)) df_br_b_sections_sessions_events_pair_summary_players_hrs = pd.merge(df_br_b_sections_sessions_events_pair_summary_players,df_hrs.astype({'hand_record_id':'string'}).drop(['id','created_at','updated_at'],axis='columns'),left_on=('hand_record_id','board_number'),right_on=('hand_record_id','board'),how='left') - print_to_log(df_br_b_sections_sessions_events_pair_summary_players_hrs.head(1)) + print_to_log_info(df_br_b_sections_sessions_events_pair_summary_players_hrs.head(1)) assert len(df_br_b_sections_sessions_events_pair_summary_players_hrs) == len(df_br_b_sections_sessions_events_pair_summary_players) assert len(df_br_b_sections_sessions_events_pair_summary_players_hrs.filter(regex=r'_[xy]$').columns) == 0,df_br_b_sections_sessions_events_pair_summary_players_hrs.filter(regex=r'_[xy]$').columns df = df_br_b_sections_sessions_events_pair_summary_players_hrs for col in df.columns: - print_to_log(col,df[col].dtype) + print_to_log_info('cols:',col,df[col].dtype) df.drop(['id','created_at','updated_at','board_id','double_dummy_ns','double_dummy_ew','pair_summary_id_ns','pair_summary_id_ew'],axis='columns',inplace=True) @@ -461,7 +467,7 @@ def merge_clean_augment_club_dfs(dfs,sd_cache_d,acbl_number): # todo: acbl_numbe }) df = clean_validate_df(df) - df, sd_cache_d, matchpoint_ns_d = augment_df(df,sd_cache_d) + df, sd_cache_d, matchpoint_ns_d = augment_df(df,sd_cache_d) # takes 5s return df, sd_cache_d, matchpoint_ns_d @@ -480,9 +486,12 @@ def clean_validate_df(df): tops[b] = df[df['Board'].eq(b)]['MatchPoints_NS'].count()-1 assert tops[b] == df[df['Board'].eq(b)]['MatchPoints_EW'].count()-1 # if any rows were dropped, the calculation of board's top/pct will be wrong (outside of (0,1)). Need to calculate Board_Top before dropping any rows. + # PerformanceWarning: DataFrame is highly fragmented. df['Board_Top'] = df['Board'].map(tops) if set(['Pct_NS', 'Pct_EW']).isdisjoint(df.columns): # disjoint means no elements of set are in df.columns + # PerformanceWarning: DataFrame is highly fragmented. df['Pct_NS'] = df['MatchPoints_NS'].astype('float32').div(df['Board_Top']) + # PerformanceWarning: DataFrame is highly fragmented. df['Pct_EW'] = df['MatchPoints_EW'].astype('float32').div(df['Board_Top']) assert set(['Pct_NS', 'Pct_EW', 'Board_Top']).issubset(df.columns) # subset means all elements of the set are in df.columns df.loc[df['Pct_NS']>1,'Pct_NS'] = 1 # assuming this can only happen if director adjusts score. todo: print >1 cases. @@ -493,22 +502,27 @@ def clean_validate_df(df): # transpose pair_name (last name, first_name). for d in 'NESW': df.rename({'player_number_'+d.lower():'Player_Number_'+d},axis='columns',inplace=True) + # PerformanceWarning: DataFrame is highly fragmented. df['iPlayer_Number_'+d] = pd.to_numeric(df['Player_Number_'+d], errors='coerce').fillna(0).astype('int32') # Convert to numeric. Make NaN into 0. Create iPlayer_Number column to match ai model column name. ai likes numerics, hates strings. + # PerformanceWarning: DataFrame is highly fragmented. df['Player_Name_'+d] = df['player_name_'+d.lower()].str.split(',').str[::-1].str.join(' ') # github Copilot wrote this line! df.drop(['player_name_'+d.lower()],axis='columns',inplace=True) # clean up contracts. Create BidLvl, BidSuit, Dbl columns. contractdf = df['Contract'].str.replace(' ','').str.upper().str.replace('NT','N').str.extract(r'^(?P\d)(?PC|D|H|S|N)(?PX*)$') + # PerformanceWarning: DataFrame is highly fragmented. df['BidLvl'] = contractdf['BidLvl'] + # PerformanceWarning: DataFrame is highly fragmented. df['BidSuit'] = contractdf['BidSuit'] + # PerformanceWarning: DataFrame is highly fragmented. df['Dbl'] = contractdf['Dbl'] del contractdf # There's all sorts of exceptional crap which needs to be done for 'PASS', 'NP', 'BYE', 'AVG', 'AV+', 'AV-', 'AVG+', 'AVG-', 'AVG+/-'. Only 'PASS' is handled, the rest are dropped. drop_rows = df['Contract'].ne('PASS')&(df['Score_NS'].eq('PASS')&df['Score_EW'].eq('PASS')&df['BidLvl'].isna()|df['BidSuit'].isna()|df['Dbl'].isna()) - print_to_log('Invalid contracts: drop_rows:',drop_rows.sum(),df[drop_rows][['Contract','BidLvl','BidSuit','Dbl']]) + print_to_log(logging.WARNING, 'Invalid contracts: drop_rows:',drop_rows.sum(),df[drop_rows][['Contract','BidLvl','BidSuit','Dbl']]) df.drop(df[drop_rows].index,inplace=True) drop_rows = ~df['Declarer_Direction'].isin(list('NESW')) # keep N,S,E,W. Drop EW, NS, w, ... < 500 cases. - print_to_log('Invalid declarers: drop_rows:',drop_rows.sum(),df[drop_rows][['Declarer_Direction']]) + print_to_log(logging.WARNING, 'Invalid declarers: drop_rows:',drop_rows.sum(),df[drop_rows][['Declarer_Direction']]) df.drop(df[drop_rows].index,inplace=True) df.loc[df['Contract'].ne('PASS'),'Contract'] = df['BidLvl']+df['BidSuit']+df['Dbl']+df['Declarer_Direction'] df['BidLvl'] = df['BidLvl'].astype('UInt8') # using UInt8 instead of uint8 because of NaNs @@ -522,9 +536,11 @@ def clean_validate_df(df): assert df['table_number'].isna().all() or df['table_number'].ge(1).all() # some events have NaN table_numbers. # create more useful Vul column + # PerformanceWarning: DataFrame is highly fragmented. df['Vul'] = df['Board'].map(mlBridgeLib.BoardNumberToVul).astype('uint8') # 0 to 3 if not pd.api.types.is_numeric_dtype(df['Score_NS']): + # PerformanceWarning: DataFrame is highly fragmented. df['Score_NS'] = df['Score_NS'].astype('string') # make sure all elements are a string df.loc[df['Score_NS'].eq('PASS'),'Score_NS'] = '0' assert df['Score_NS'].ne('PASS').all() @@ -536,6 +552,7 @@ def clean_validate_df(df): df['Score_EW'] = -df['Score_NS'] # tournaments do not have Tricks or Result columns. Create them. + # PerformanceWarning: DataFrame is highly fragmented. df['scores_l'] = mlBridgeLib.ContractToScores(df) # todo: ValueError: Cannot set a DataFrame with multiple columns to the single column scores_l on if 'Result' in df: assert df['Result'].notna().all() and df['Result'].notnull().all() @@ -543,7 +560,7 @@ def clean_validate_df(df): else: df['Result'] = df.apply(lambda r: pd.NA if r['Score_NS'] not in r['scores_l'] else r['scores_l'].index(r['Score_NS'])-(r['BidLvl']+6),axis='columns').astype('Int8') # pd.NA is due to director's adjustment if df['Result'].isna().any(): - print_to_log('NaN Results:\n',df[df['Result'].isna()][['Board','Contract','BidLvl','BidSuit','Dbl','Declarer_Direction','Score_NS','Score_EW','Result','scores_l']]) + print_to_log_info('NaN Results:\n',df[df['Result'].isna()][['Board','Contract','BidLvl','BidSuit','Dbl','Declarer_Direction','Score_NS','Score_EW','Result','scores_l']]) assert df['Result'].map(lambda x: x is pd.NA or -13 <= x <= 13).all() if 'Tricks' in df and df['Tricks'].notnull().all(): # tournaments have a Trick column with all None(?). @@ -552,7 +569,7 @@ def clean_validate_df(df): else: df['Tricks'] = df.apply(lambda r: pd.NA if r['BidLvl'] is pd.NA or r['Result'] is pd.NA else r['BidLvl']+6+r['Result'],axis='columns') # pd.NA is needed for PASS if df['Tricks'].isna().any(): - print_to_log('NaN Tricks:\n',df[df['Tricks'].isna()][['Board','Contract','BidLvl','BidSuit','Dbl','Declarer_Direction','Score_NS','Score_EW','Tricks','Result','scores_l']]) + print_to_log_info('NaN Tricks:\n',df[df['Tricks'].isna()][['Board','Contract','BidLvl','BidSuit','Dbl','Declarer_Direction','Score_NS','Score_EW','Tricks','Result','scores_l']]) df['Tricks'] = df['Tricks'].astype('UInt8') assert df['Tricks'].map(lambda x: x is pd.NA or 0 <= x <= 13).all() @@ -592,6 +609,7 @@ def TuplesToSuits(df,tuples,column,excludes=[]): d['_'.join([column,direction,suit])] = tuples.map(lambda x: x[1][i][1][j]+x[1][i+2][1][j]) for k,v in d.items(): if k not in excludes: + # PerformanceWarning: DataFrame is highly fragmented. df[k] = v return d @@ -636,15 +654,21 @@ def augment_df(df,sd_cache_d): TuplesToSuits(df,sl,'SL',['SL','SL_N','SL_E','SL_S','SL_W','SL_NS','SL_EW']) so = mlBridgeLib.CDHS for d in mlBridgeLib.NESW: + # PerformanceWarning: DataFrame is highly fragmented. df[f'SL_{d}_{so}'] = df.filter(regex=f'^SL_{d}_[{so}]$').values.tolist() # ordered from clubs to spades [CDHS] + # PerformanceWarning: DataFrame is highly fragmented. df[f'SL_{d}_{so}_J'] = df[f'SL_{d}_{so}'].map(lambda l:'-'.join([str(v) for v in l])).astype('category') # joined CDHS into category + # PerformanceWarning: DataFrame is highly fragmented. df[f'SL_{d}_ML_S'] = df[f'SL_{d}_{so}'].map(lambda l: [v for v,n in sorted([(ll,n) for n,ll in enumerate(l)],key=lambda k:(-k[0],k[1]))]) # ordered most-to-least + # PerformanceWarning: DataFrame is highly fragmented. df[f'SL_{d}_ML_SI'] = df[f'SL_{d}_{so}'].map(lambda l: [n for v,n in sorted([(ll,n) for n,ll in enumerate(l)],key=lambda k:(-k[0],k[1]))]) # ordered most-to-least containing indexes + # PerformanceWarning: DataFrame is highly fragmented. df[f'SL_{d}_ML_SJ'] = df[f'SL_{d}_ML_S'].map(lambda l:'-'.join([str(v) for v in l])).astype('category') # ordered most-to-least and joined into category # Create columns containing column names of the NS,EW longest suit. sl_cols = [('_'.join(['SL_Max',d]),['_'.join(['SL',d,s]) for s in mlBridgeLib.SHDC]) for d in mlBridgeLib.NS_EW] for d in sl_cols: + # PerformanceWarning: DataFrame is highly fragmented. df[d[0]] = df[d[1]].idxmax(axis=1).astype('category') # defaults to object so need string or category df = mlBridgeLib.append_double_dummy_results(df) @@ -659,6 +683,7 @@ def augment_df(df,sd_cache_d): df = df.astype({'LoTT_Tricks':'uint8','LoTT_Suit_Length':'uint8','LoTT_Variance':'int8'}) # ContractType + # PerformanceWarning: DataFrame is highly fragmented. df['ContractType'] = df.apply(lambda r: 'PASS' if r['BidLvl'] is pd.NA else mlBridgeLib.ContractType(r['BidLvl']+6,r['BidSuit']),axis='columns').astype('category') # Create column of contract types by partnership by suit. e.g. CT_NS_C. contract_types_d = mlBridgeLib.CategorifyContractTypeBySuit(ddmakes) @@ -683,7 +708,7 @@ def augment_df(df,sd_cache_d): for board,g in df.groupby('Board'): for score_ns,match_points_ns in zip(g['Score_NS'],g['MatchPoints_NS'].astype('float32')): if matchpoint_ns_d[board][score_ns][3] != match_points_ns: # match_points_ns is a string because it might originally have AVG+ or AVG- etc. - print_to_log(f'Board {board} score {matchpoint_ns_d[board][score_ns][3]} tuple {matchpoint_ns_d[board][score_ns]} does not match matchpoint score {match_points_ns}') # ok if off by epsilon + print_to_log(logging.WARNING,f'Board {board} score {matchpoint_ns_d[board][score_ns][3]} tuple {matchpoint_ns_d[board][score_ns]} does not match matchpoint score {match_points_ns}') # ok if off by epsilon # Vul columns df['Vul_NS'] = (df['Vul']&1).astype('bool') @@ -706,7 +731,7 @@ def augment_df(df,sd_cache_d): # recompute Score and compare against actual scores to catch scoring errors such as: Board 1 at https://my.acbl.org/club-results/details/878121 df['Computed_Score_Declarer'] = df.apply(lambda r: 0 if r['BidLvl'] is pd.NA else mlBridgeLib.score(r['BidLvl']-1, 'CDHSN'.index(r['BidSuit']), len(r['Dbl']), ('NESW').index(r['Declarer_Direction']), r['Vul_Declarer'], r['Result'],True), axis='columns') if (df['Score_Declarer'].ne(df['Computed_Score_Declarer'])|df['Score_NS'].ne(-df['Score_EW'])).any(): - print_to_log('Invalid Scores:\n',df[df['Score_Declarer'].ne(df['Computed_Score_Declarer'])|df['Score_NS'].ne(-df['Score_EW'])][['Board','Contract','BidLvl','BidSuit','Dbl','Declarer_Direction','Vul_Declarer','Score_Declarer','Computed_Score_Declarer','Score_NS','Score_EW','Result']]) + print_to_log(logging.WARNING, 'Invalid Scores:\n',df[df['Score_Declarer'].ne(df['Computed_Score_Declarer'])|df['Score_NS'].ne(-df['Score_EW'])][['Board','Contract','BidLvl','BidSuit','Dbl','Declarer_Direction','Vul_Declarer','Score_Declarer','Computed_Score_Declarer','Score_NS','Score_EW','Result']]) df['MPs_Declarer'] = df.apply(lambda r: r['MatchPoints_'+r['Pair_Declarer_Direction']], axis='columns') df['DDTricks'] = df.apply(lambda r: pd.NA if r['BidLvl'] is pd.NA else r['_'.join(['DD',r['Declarer_Direction'],r['BidSuit']])], axis='columns') # invariant diff --git a/mlBridgeLib/mlBridgeAi.py b/mlBridgeLib/mlBridgeAi.py index 7faefb2..fd9543b 100644 --- a/mlBridgeLib/mlBridgeAi.py +++ b/mlBridgeLib/mlBridgeAi.py @@ -1,14 +1,22 @@ import logging +logger = logging.getLogger(__name__) +logger.setLevel(logging.INFO) # or DEBUG logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') -def print_to_log(*args): - logging.info(' '.join(str(arg) for arg in args)) +def print_to_log_info(*args): + print_to_log(logging.INFO, *args) +def print_to_log_debug(*args): + print_to_log(logging.DEBUG, *args) +def print_to_log(level, *args): + logging.log(level, ' '.join(str(arg) for arg in args)) import pandas as pd import os from fastai.tabular.all import nn, load_learner, tabular_learner, cont_cat_split, TabularDataLoaders, TabularPandas, CategoryBlock, RegressionBlock, Categorify, FillMissing, Normalize, EarlyStoppingCallback, RandomSplitter, range_of, MSELossFlat, rmse, accuracy +import time def train_classifier(df, y_names, cat_names, cont_names, procs=None, valid_pct=0.2, seed=42, bs=1024*5, layers=[512,512,512], epochs=3, device='cuda'): + t = time.time() splits_ilocs = RandomSplitter(valid_pct=valid_pct, seed=seed)(range_of(df)) to = TabularPandas(df, procs=procs, cat_names=cat_names, @@ -26,21 +34,23 @@ def train_classifier(df, y_names, cat_names, cont_names, procs=None, valid_pct=0 # Train the model learn.fit_one_cycle(epochs) # 1 or 2 epochs is enough to get a good accuracy for large datasets + print_to_log_info('train_classifier time:', time.time()-t) return to, dls, learn +# obsolete? def load_data(df, y_names=None, cont_names=None, cat_names=None, procs=None, y_block=None, bs=None, layers=[1024]*4, valid_pct=None, seed=42, max_card=None, device='cuda'): """ Load and preprocess data using FastAI. """ - print_to_log(f"{y_names=} {cont_names=} {cat_names=} {bs=} {valid_pct=} {max_card=}") + print_to_log_info(f"{y_names=} {cont_names=} {cat_names=} {bs=} {valid_pct=} {max_card=}") # Determine number of CPU cores and set workers to cores-1 num_workers = os.cpu_count() - 1 - print_to_log(f"{y_names=} {bs=} {valid_pct=} {num_workers=}") + print_to_log_info(f"{y_names=} {bs=} {valid_pct=} {num_workers=}") if cont_names is not None: - print_to_log(f"{len(cont_names)=} {cont_names=}") + print_to_log_info(f"{len(cont_names)=} {cont_names=}") if cat_names is not None: - print_to_log(f"{len(cat_names)=} {cat_names=}") + print_to_log_info(f"{len(cat_names)=} {cat_names=}") # doesn't work for Contract. assert df.select_dtypes(include=['object','string']).columns.size == 0, df.select_dtypes(include=['object','string']).columns assert not df.isna().any().any() assert y_names in df, y_names @@ -49,9 +59,9 @@ def load_data(df, y_names=None, cont_names=None, cat_names=None, procs=None, y_b if cont_names is None and cat_names is None: cont_names, cat_names = cont_cat_split(df, max_card=max_card, dep_var=y_names) if cont_names is not None: - print_to_log(f"{len(cont_names)=} {cont_names=}") + print_to_log_info(f"{len(cont_names)=} {cont_names=}") if cat_names is not None: - print_to_log(f"{len(cat_names)=} {cat_names=}") + print_to_log_info(f"{len(cat_names)=} {cat_names=}") assert y_names not in [cont_names + cat_names] assert set(cont_names).intersection(cat_names) == set(), set(cont_names).intersection(cat_names) assert set(cont_names+cat_names+[y_names]).symmetric_difference(df.columns) == set(), set(cont_names+cat_names+[y_names]).symmetric_difference(df.columns) @@ -79,11 +89,12 @@ def load_data(df, y_names=None, cont_names=None, cat_names=None, procs=None, y_b return dls # return to? +# obsolete? def train_classification(dls, epochs=3, monitor='accuracy', min_delta=0.001, patience=3): """ Train a tabular model for classification. """ - print_to_log(f"{epochs=} {monitor=} {min_delta=} {patience=}") + print_to_log_info(f"{epochs=} {monitor=} {min_delta=} {patience=}") # Create a tabular learner learn = tabular_learner(dls, metrics=accuracy) @@ -101,7 +112,7 @@ def train_regression(dls, epochs=20, layers=[200]*10, y_range=(0,1), monitor='va """ Train a tabular model for regression. """ - print_to_log(f"{epochs=} {layers=} {y_range=} {monitor=} {min_delta=} {patience=}") + print_to_log_info(f"{epochs=} {layers=} {y_range=} {monitor=} {min_delta=} {patience=}") # todo: check that y_names is numeric, not category. learn = tabular_learner(dls, layers=layers, metrics=rmse, y_range=y_range, loss_func=MSELossFlat()) # todo: could try loss_func=L1LossFlat. @@ -116,17 +127,25 @@ def train_regression(dls, epochs=20, layers=[200]*10, y_range=(0,1), monitor='va return learn def save_model(learn, f): + t = time.time() learn.export(f) + print_to_log_info('save_model time:', time.time()-t) def load_model(f): - return load_learner(f) + t = time.time() + learn = load_learner(f) + print_to_log_info('load_model time:', time.time()-t) + return learn def get_predictions(learn, data, device='cpu'): - data[learn.dls.train.x_names].info(verbose=True) - data[learn.dls.train.y_names].info(verbose=True) + t = time.time() + if logger.isEnabledFor(logging.DEBUG): + data[learn.dls.train.x_names].info(verbose=True) + data[learn.dls.train.y_names].info(verbose=True) assert set(learn.dls.train.x_names).difference(data.columns) == set(), f"df is missing column names which are in the model's training set:{set(learn.dls.train.x_names).difference(data.columns)}" dl = learn.dls.test_dl(data, device=device) probs, actual = learn.get_preds(dl=dl) + print_to_log_info('get_predictions time:', time.time()-t) return probs, actual def predictions_to_df(data, y_names, preds): @@ -205,7 +224,8 @@ def make_predictions(f, data): # model_state_dict = torch.load(f, map_location=torch.device('cpu')) # print_to_log('y_name:', y_name, 'columns_to_scale:', columns_to_scale) - # st.session_state.df.info(verbose=True) + # if logging.isEnabledFor(logging.DEBUG): + # st.session_state.df.info(verbose=True) # assert set(columns_to_scale).difference(set(st.session_state.df.columns)) == set(), set(columns_to_scale).difference(set(st.session_state.df.columns)) # df = st.session_state.df.copy() diff --git a/mlBridgeLib/mlBridgeLib.py b/mlBridgeLib/mlBridgeLib.py index e79ea1c..366aeaa 100644 --- a/mlBridgeLib/mlBridgeLib.py +++ b/mlBridgeLib/mlBridgeLib.py @@ -11,9 +11,15 @@ import logging +logger = logging.getLogger(__name__) +logger.setLevel(logging.INFO) # or DEBUG logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') -def print_to_log(*args): - logging.info(' '.join(str(arg) for arg in args)) +def print_to_log_info(*args): + print_to_log(logging.INFO, *args) +def print_to_log_debug(*args): + print_to_log(logging.DEBUG, *args) +def print_to_log(level, *args): + logging.log(level, ' '.join(str(arg) for arg in args)) import numpy as np import pandas as pd @@ -277,12 +283,12 @@ def validate_brs(brs): sorted_brs = '22223333444455556666777788889999AAAACCCCDDDDHHHHJJJJKKKKQQQQSSSSTTTT' # sorted brs must match this string s = brs.replace('10','T') if ''.join(sorted(s)) != sorted_brs: - print_to_log('validate_brs: Invalid brs:', brs, s) + print_to_log_info('validate_brs: Invalid brs:', brs, s) return False for i in range(0,len(sorted_brs),len(sorted_brs)*4): split_shdc = re.split(r'[SHDC]',s[i:i+13+4]) if len(split_shdc) != 4+1 or sum(map(len,split_shdc)) != 13: # not validating sort order. call it correct-ish. - print_to_log('validate_brs: Invalid len:', i, brs, s[i:i+13+4], split_shdc) + print_to_log_info('validate_brs: Invalid len:', i, brs, s[i:i+13+4], split_shdc) return False return True @@ -479,7 +485,7 @@ def CategorifyContractTypeByDirection(df): cols = df.filter(regex=r'CT_(NS|EW)_[CDHSN]').columns for c in cols: for t in contract_types: - print_to_log(c,t,len((t == df[c]).values)) + print_to_log_debug('CT:',c,t,len((t == df[c]).values)) new_c = c+'_'+t contract_types_d[new_c] = (t == df[c]).values return contract_types_d @@ -621,7 +627,7 @@ def FilterBoards(df, cn=None, vul=None, direction=None, suit=None, contractType= elif vul == 'Both': df = df[df['Vul_NS'] & df['Vul_NS']] # only Both else: - print_to_log(f'FilterBoards: Error: Invalid vul:{vul}') + print_to_log_info(f'FilterBoards: Error: Invalid vul:{vul}') if not direction is None: # either 'NS','EW' # Single direction is problematic so using NS, EW df = df[df['Par_Dir'] == direction] @@ -942,7 +948,7 @@ def ListOfClubsToProcess(clubNumbers, inputFiles, outputFiles, clubsPath, forceR clubDir = clubsPath.joinpath(clubNumber.name) # all input files must exist if sum([not clubDir.joinpath(inputFileToProcess).exists() for inputFileToProcess in inputFiles]) != 0: - print_to_log( + print_to_log_info( f'ListOfClubsToProcess: Club {clubNumber.name} has some missing input files: {inputFiles}: skipping.') continue # creating list of input file sizes, first file only, for later sorting. @@ -1143,7 +1149,7 @@ def json_walk_print(key,value): else: if type(value) is str: value = '"'+value+'"' - print_to_log(key+'='+str(value)) + print_to_log_debug(key+'='+str(value)) return diff --git a/streamlitlib/streamlitlib.py b/streamlitlib/streamlitlib.py index 909c56e..e9b36be 100644 --- a/streamlitlib/streamlitlib.py +++ b/streamlitlib/streamlitlib.py @@ -1,7 +1,13 @@ import logging +logger = logging.getLogger(__name__) +logger.setLevel(logging.INFO) # or DEBUG logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') -def print_to_log(*args): - logging.info(' '.join(str(arg) for arg in args)) +def print_to_log_info(*args): + print_to_log(logging.INFO, *args) +def print_to_log_debug(*args): + print_to_log(logging.DEBUG, *args) +def print_to_log(level, *args): + logging.log(level, ' '.join(str(arg) for arg in args)) import streamlit as st import pandas as pd @@ -309,7 +315,7 @@ def create_pdf(pdf_assets, title, output_filename=None): story.extend(markdown_to_paragraphs(a, styles)) # Convert each DataFrame in the list to a reportlab table and add it to the story elif isinstance(a, pd.DataFrame): - print_to_log('a:',len(a),len(a.columns)) + print_to_log_info('a:',len(a),len(a.columns)) if len(a.columns) == 1: a = pd.concat([a,pd.Series('',name='',index=a.index)],axis='columns') # workaround: 1 column dataframes error out so append a blank column story.append(dataframe_to_table(a.iloc[0:30,0:11])) # take only first 30 rows and 12 columns