Commit

add perf monitoring and more spinners
BSalita committed Jan 29, 2024
1 parent 2594788 commit 8a0be5c
Showing 6 changed files with 304 additions and 198 deletions.
171 changes: 91 additions & 80 deletions acbllib/acbllib.py

Large diffs are not rendered by default.

98 changes: 68 additions & 30 deletions app.py

Large diffs are not rendered by default.
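
app.py's diff is too large to render here, but the commit message ("add perf monitoring and more spinners") points at a recurring pattern: wrap each slow step in a Streamlit spinner and log its elapsed time through the new print_to_log_info helper. A minimal standalone sketch of that pattern follows; the function name load_club_results and the time.sleep stand-in are hypothetical, while st.spinner and time.time() are the real APIs and print_to_log_info mirrors the helper added in this commit.

import logging
import time
import streamlit as st

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

def print_to_log_info(*args):
    # reduced version of the helper this commit adds to each module
    logging.log(logging.INFO, ' '.join(str(arg) for arg in args))

def load_club_results(club_number):  # hypothetical slow step in the app
    with st.spinner(f"Loading results for club {club_number} ..."):  # spinner stays up while this block runs
        t = time.time()
        time.sleep(2)  # stand-in for the real data-loading work
        print_to_log_info('load_club_results time:', time.time() - t)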

151 changes: 88 additions & 63 deletions chatlib/chatlib.py

Large diffs are not rendered by default.

48 changes: 34 additions & 14 deletions mlBridgeLib/mlBridgeAi.py
@@ -1,14 +1,22 @@
 
 import logging
+logger = logging.getLogger(__name__)
+logger.setLevel(logging.INFO) # or DEBUG
 logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
-def print_to_log(*args):
-    logging.info(' '.join(str(arg) for arg in args))
+def print_to_log_info(*args):
+    print_to_log(logging.INFO, *args)
+def print_to_log_debug(*args):
+    print_to_log(logging.DEBUG, *args)
+def print_to_log(level, *args):
+    logging.log(level, ' '.join(str(arg) for arg in args))
 
 import pandas as pd
 import os
 from fastai.tabular.all import nn, load_learner, tabular_learner, cont_cat_split, TabularDataLoaders, TabularPandas, CategoryBlock, RegressionBlock, Categorify, FillMissing, Normalize, EarlyStoppingCallback, RandomSplitter, range_of, MSELossFlat, rmse, accuracy
+import time
 
 def train_classifier(df, y_names, cat_names, cont_names, procs=None, valid_pct=0.2, seed=42, bs=1024*5, layers=[512,512,512], epochs=3, device='cuda'):
+    t = time.time()
     splits_ilocs = RandomSplitter(valid_pct=valid_pct, seed=seed)(range_of(df))
     to = TabularPandas(df, procs=procs,
                        cat_names=cat_names,
@@ -26,21 +34,23 @@ def train_classifier(df, y_names, cat_names, cont_names, procs=None, valid_pct=0
 
     # Train the model
     learn.fit_one_cycle(epochs) # 1 or 2 epochs is enough to get a good accuracy for large datasets
+    print_to_log_info('train_classifier time:', time.time()-t)
     return to, dls, learn
 
+# obsolete?
 def load_data(df, y_names=None, cont_names=None, cat_names=None, procs=None, y_block=None, bs=None, layers=[1024]*4, valid_pct=None, seed=42, max_card=None, device='cuda'):
     """
     Load and preprocess data using FastAI.
     """
 
-    print_to_log(f"{y_names=} {cont_names=} {cat_names=} {bs=} {valid_pct=} {max_card=}")
+    print_to_log_info(f"{y_names=} {cont_names=} {cat_names=} {bs=} {valid_pct=} {max_card=}")
     # Determine number of CPU cores and set workers to cores-1
     num_workers = os.cpu_count() - 1
-    print_to_log(f"{y_names=} {bs=} {valid_pct=} {num_workers=}")
+    print_to_log_info(f"{y_names=} {bs=} {valid_pct=} {num_workers=}")
     if cont_names is not None:
-        print_to_log(f"{len(cont_names)=} {cont_names=}")
+        print_to_log_info(f"{len(cont_names)=} {cont_names=}")
     if cat_names is not None:
-        print_to_log(f"{len(cat_names)=} {cat_names=}")
+        print_to_log_info(f"{len(cat_names)=} {cat_names=}")
     # doesn't work for Contract. assert df.select_dtypes(include=['object','string']).columns.size == 0, df.select_dtypes(include=['object','string']).columns
     assert not df.isna().any().any()
     assert y_names in df, y_names
@@ -49,9 +59,9 @@ def load_data(df, y_names=None, cont_names=None, cat_names=None, procs=None, y_b
     if cont_names is None and cat_names is None:
         cont_names, cat_names = cont_cat_split(df, max_card=max_card, dep_var=y_names)
     if cont_names is not None:
-        print_to_log(f"{len(cont_names)=} {cont_names=}")
+        print_to_log_info(f"{len(cont_names)=} {cont_names=}")
     if cat_names is not None:
-        print_to_log(f"{len(cat_names)=} {cat_names=}")
+        print_to_log_info(f"{len(cat_names)=} {cat_names=}")
     assert y_names not in [cont_names + cat_names]
     assert set(cont_names).intersection(cat_names) == set(), set(cont_names).intersection(cat_names)
     assert set(cont_names+cat_names+[y_names]).symmetric_difference(df.columns) == set(), set(cont_names+cat_names+[y_names]).symmetric_difference(df.columns)
@@ -79,11 +89,12 @@ def load_data(df, y_names=None, cont_names=None, cat_names=None, procs=None, y_b
 
     return dls # return to?
 
+# obsolete?
 def train_classification(dls, epochs=3, monitor='accuracy', min_delta=0.001, patience=3):
     """
     Train a tabular model for classification.
     """
-    print_to_log(f"{epochs=} {monitor=} {min_delta=} {patience=}")
+    print_to_log_info(f"{epochs=} {monitor=} {min_delta=} {patience=}")
 
     # Create a tabular learner
     learn = tabular_learner(dls, metrics=accuracy)
@@ -101,7 +112,7 @@ def train_regression(dls, epochs=20, layers=[200]*10, y_range=(0,1), monitor='va
"""
Train a tabular model for regression.
"""
print_to_log(f"{epochs=} {layers=} {y_range=} {monitor=} {min_delta=} {patience=}")
print_to_log_info(f"{epochs=} {layers=} {y_range=} {monitor=} {min_delta=} {patience=}")
# todo: check that y_names is numeric, not category.

learn = tabular_learner(dls, layers=layers, metrics=rmse, y_range=y_range, loss_func=MSELossFlat()) # todo: could try loss_func=L1LossFlat.
@@ -116,17 +127,25 @@ def train_regression(dls, epochs=20, layers=[200]*10, y_range=(0,1), monitor='va
     return learn
 
 def save_model(learn, f):
+    t = time.time()
     learn.export(f)
+    print_to_log_info('save_model time:', time.time()-t)
 
 def load_model(f):
-    return load_learner(f)
+    t = time.time()
+    learn = load_learner(f)
+    print_to_log_info('load_model time:', time.time()-t)
+    return learn
 
 def get_predictions(learn, data, device='cpu'):
-    data[learn.dls.train.x_names].info(verbose=True)
-    data[learn.dls.train.y_names].info(verbose=True)
+    t = time.time()
+    if logger.isEnabledFor(logging.DEBUG):
+        data[learn.dls.train.x_names].info(verbose=True)
+        data[learn.dls.train.y_names].info(verbose=True)
     assert set(learn.dls.train.x_names).difference(data.columns) == set(), f"df is missing column names which are in the model's training set:{set(learn.dls.train.x_names).difference(data.columns)}"
     dl = learn.dls.test_dl(data, device=device)
     probs, actual = learn.get_preds(dl=dl)
+    print_to_log_info('get_predictions time:', time.time()-t)
     return probs, actual
 
 def predictions_to_df(data, y_names, preds):
@@ -205,7 +224,8 @@ def make_predictions(f, data):
     # model_state_dict = torch.load(f, map_location=torch.device('cpu'))
 
     # print_to_log('y_name:', y_name, 'columns_to_scale:', columns_to_scale)
-    # st.session_state.df.info(verbose=True)
+    # if logging.isEnabledFor(logging.DEBUG):
+    # st.session_state.df.info(verbose=True)
     # assert set(columns_to_scale).difference(set(st.session_state.df.columns)) == set(), set(columns_to_scale).difference(set(st.session_state.df.columns))
 
     # df = st.session_state.df.copy()
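
The mlBridgeAi.py changes above repeat one pattern: record a start time, do the work, report the elapsed time at INFO, and gate verbose DataFrame dumps behind a DEBUG check so they cost nothing in normal runs. A self-contained sketch of that pattern, with a made-up score_boards function and a toy DataFrame standing in for the real prediction code:

import logging
import time
import pandas as pd

logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)  # or DEBUG
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

def print_to_log_info(*args):
    # reduced version of the helper this commit adds to each module
    logging.log(logging.INFO, ' '.join(str(arg) for arg in args))

def score_boards(df):  # hypothetical stand-in for get_predictions
    t = time.time()
    if logger.isEnabledFor(logging.DEBUG):  # skip the verbose dump unless DEBUG is enabled
        df.info(verbose=True)
    result = df.describe()  # stand-in for the real model inference
    print_to_log_info('score_boards time:', time.time() - t)
    return result

score_boards(pd.DataFrame({'HCP': [10, 12, 15], 'Par_Score': [110, 140, 420]}))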
22 changes: 14 additions & 8 deletions mlBridgeLib/mlBridgeLib.py
@@ -11,9 +11,15 @@
 
 
 import logging
+logger = logging.getLogger(__name__)
+logger.setLevel(logging.INFO) # or DEBUG
 logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
-def print_to_log(*args):
-    logging.info(' '.join(str(arg) for arg in args))
+def print_to_log_info(*args):
+    print_to_log(logging.INFO, *args)
+def print_to_log_debug(*args):
+    print_to_log(logging.DEBUG, *args)
+def print_to_log(level, *args):
+    logging.log(level, ' '.join(str(arg) for arg in args))
 
 import numpy as np
 import pandas as pd
@@ -277,12 +283,12 @@ def validate_brs(brs):
     sorted_brs = '22223333444455556666777788889999AAAACCCCDDDDHHHHJJJJKKKKQQQQSSSSTTTT' # sorted brs must match this string
     s = brs.replace('10','T')
     if ''.join(sorted(s)) != sorted_brs:
-        print_to_log('validate_brs: Invalid brs:', brs, s)
+        print_to_log_info('validate_brs: Invalid brs:', brs, s)
         return False
     for i in range(0,len(sorted_brs),len(sorted_brs)*4):
         split_shdc = re.split(r'[SHDC]',s[i:i+13+4])
         if len(split_shdc) != 4+1 or sum(map(len,split_shdc)) != 13: # not validating sort order. call it correct-ish.
-            print_to_log('validate_brs: Invalid len:', i, brs, s[i:i+13+4], split_shdc)
+            print_to_log_info('validate_brs: Invalid len:', i, brs, s[i:i+13+4], split_shdc)
             return False
     return True
 
@@ -479,7 +485,7 @@ def CategorifyContractTypeByDirection(df):
     cols = df.filter(regex=r'CT_(NS|EW)_[CDHSN]').columns
     for c in cols:
         for t in contract_types:
-            print_to_log(c,t,len((t == df[c]).values))
+            print_to_log_debug('CT:',c,t,len((t == df[c]).values))
             new_c = c+'_'+t
             contract_types_d[new_c] = (t == df[c]).values
     return contract_types_d
@@ -621,7 +627,7 @@ def FilterBoards(df, cn=None, vul=None, direction=None, suit=None, contractType=
     elif vul == 'Both':
         df = df[df['Vul_NS'] & df['Vul_NS']] # only Both
     else:
-        print_to_log(f'FilterBoards: Error: Invalid vul:{vul}')
+        print_to_log_info(f'FilterBoards: Error: Invalid vul:{vul}')
     if not direction is None:
         # either 'NS','EW' # Single direction is problematic so using NS, EW
         df = df[df['Par_Dir'] == direction]
@@ -942,7 +948,7 @@ def ListOfClubsToProcess(clubNumbers, inputFiles, outputFiles, clubsPath, forceR
         clubDir = clubsPath.joinpath(clubNumber.name)
         # all input files must exist
         if sum([not clubDir.joinpath(inputFileToProcess).exists() for inputFileToProcess in inputFiles]) != 0:
-            print_to_log(
+            print_to_log_info(
                 f'ListOfClubsToProcess: Club {clubNumber.name} has some missing input files: {inputFiles}: skipping.')
             continue
         # creating list of input file sizes, first file only, for later sorting.
@@ -1143,7 +1149,7 @@ def json_walk_print(key,value):
     else:
         if type(value) is str:
             value = '"'+value+'"'
-        print_to_log(key+'='+str(value))
+        print_to_log_debug(key+'='+str(value))
     return
 
 
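
The mlBridgeLib.py changes also demote per-item chatter (CategorifyContractTypeByDirection, json_walk_print) to print_to_log_debug while keeping warnings such as validate_brs failures at print_to_log_info. One subtlety: the helpers call logging.log on the root logger, so whether DEBUG messages actually appear is decided by the level passed to logging.basicConfig, not by the module-level logger.setLevel (which only feeds the logger.isEnabledFor checks). A small sketch, assuming the helpers exactly as added in this commit:

import logging

logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)  # gates the logger.isEnabledFor(...) checks only
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')  # root level decides what logging.log emits

def print_to_log_info(*args):
    print_to_log(logging.INFO, *args)

def print_to_log_debug(*args):
    print_to_log(logging.DEBUG, *args)

def print_to_log(level, *args):
    logging.log(level, ' '.join(str(arg) for arg in args))

print_to_log_info('validate_brs: Invalid brs:', 'example')  # emitted: INFO passes the root level
print_to_log_debug('CT:', 'CT_NS_S', 'Passed', 4)           # suppressed until basicConfig(level=logging.DEBUG)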
12 changes: 9 additions & 3 deletions streamlitlib/streamlitlib.py
@@ -1,7 +1,13 @@
 import logging
+logger = logging.getLogger(__name__)
+logger.setLevel(logging.INFO) # or DEBUG
 logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
-def print_to_log(*args):
-    logging.info(' '.join(str(arg) for arg in args))
+def print_to_log_info(*args):
+    print_to_log(logging.INFO, *args)
+def print_to_log_debug(*args):
+    print_to_log(logging.DEBUG, *args)
+def print_to_log(level, *args):
+    logging.log(level, ' '.join(str(arg) for arg in args))
 
 import streamlit as st
 import pandas as pd
@@ -309,7 +315,7 @@ def create_pdf(pdf_assets, title, output_filename=None):
             story.extend(markdown_to_paragraphs(a, styles))
         # Convert each DataFrame in the list to a reportlab table and add it to the story
         elif isinstance(a, pd.DataFrame):
-            print_to_log('a:',len(a),len(a.columns))
+            print_to_log_info('a:',len(a),len(a.columns))
             if len(a.columns) == 1:
                 a = pd.concat([a,pd.Series('',name='',index=a.index)],axis='columns') # workaround: 1 column dataframes error out so append a blank column
             story.append(dataframe_to_table(a.iloc[0:30,0:11])) # take only first 30 rows and 12 columns
