In [None]:
import pandas as pd
import numpy as np
# Collect, for every learner, each distinct value observed per hyperparameter.
learner_names = ['DecisionTreeClassifier', 'GradientBoostingClassifier',
                 'KNeighborsClassifier', 'LogisticRegression',
                 'RandomForestClassifier', 'SVC']
param_opts = {name: {} for name in learner_names}

ml_p = pd.read_csv('ml_p_options.csv').rename(columns={'alg_name':'classifier'})
for ml, df_ml in ml_p.groupby('classifier'):
    # Grouping by 'parameters' visits each distinct parameter string once.
    for p, _ in df_ml.groupby('parameters'):
        # NOTE(review): eval() executes whatever text is stored in the CSV;
        # only run this on a trusted ml_p_options.csv.
        for keys, v in eval(p).items():
            seen_values = param_opts[ml].setdefault(keys, [])
            if v not in seen_values:
                seen_values.append(v)
print(param_opts)

In [None]:
import pandas as pd
import numpy as np

# The results file is read without a header row; these names label its columns.
benchmark_columns = ['dataset', 'classifier', 'parameters',
                     'accuracy', 'macrof1', 'bal_accuracy']
data = pd.read_csv('sklearn-benchmark5-data-edited.tsv.gz', sep='\t',
                   names=benchmark_columns).fillna('')
print(data.head())
data['accuracy'] = data['accuracy'].apply(lambda acc: round(acc, 3))
print('loaded ',len(data['dataset'].unique()),'datasets and ',
      len(data['classifier'].unique()),'classifiers')

# subset data to classifiers used in PennAI
pennai_classifiers = ['LogisticRegression', 'RandomForestClassifier', 'SVC',
                      'KNeighborsClassifier', 'DecisionTreeClassifier', 'GradientBoostingClassifier']
mask = data['classifier'].isin(pennai_classifiers).values
data = data.loc[mask,:]
print('datasets (',len(data['dataset'].unique()),')')
print('classifiers (',len(data['classifier'].unique()),'):',data['classifier'].unique())

# Show up to five distinct parameter strings per classifier.
for ml, df_g in data.groupby('classifier'):
    distinct_params = df_g['parameters'].unique()
    print('parameters for ',ml,'(',len(distinct_params),'):',distinct_params[:5])


In [None]:
import numpy as np
# Count, per classifier, the datasets on which at least one of its runs scores
# within 1% (relative) of the best balanced accuracy seen on that dataset.
#   winner:          classifier -> number of datasets won
#   winner_datasets: classifier -> list of the datasets won
winner = {}
winner_datasets = {}
for d, df in data.groupby('dataset'):
    # Convert into a local Series rather than assigning back into the group
    # frame: the group is a temporary slice, and the vectorized call avoids a
    # per-element pd.to_numeric.
    scores = pd.to_numeric(df['bal_accuracy'])
    best_score = scores.max()
    # NOTE: when best_score is 0 or NaN the ratio is NaN/inf, the comparison is
    # False for every row, and no classifier is credited for that dataset.
    near_best = (best_score - scores) / best_score <= 0.01
    # Grouping sorts classifier names, so dict insertion order is deterministic.
    wins_by_clf = near_best.groupby(df['classifier'].values).any()
    for clf in wins_by_clf.index[wins_by_clf.values]:
        winner[clf] = winner.get(clf, 0) + 1
        winner_datasets.setdefault(clf, []).append(d)
import matplotlib.pyplot as plt
%matplotlib inline
# print(winner)
# print(winner_datasets)
plt.bar(list(winner.keys()),list(winner.values()))
plt.xticks(rotation=90)               

# Reduce GBC (GradientBoostingClassifier) results to a subset of hyperparameter values

In [None]:
# Hyperparameter settings to keep, written as the 'name=value' fragments that
# are searched for inside each row's parameter string.
n_est = ['n_estimators='+str(n) for n in [100,500,1000]]
max_depth = ['max_depth='+str(d) for d in [4,6,10]]
max_features = ['max_features='+str(f) for f in ['sqrt','log2']]

# Apply the three filters one after another. GradientBoostingClassifier rows
# survive only if they contain one of the allowed fragments for each
# hyperparameter; rows of every other classifier are always kept.
# The match mask is built as a NumPy boolean array (not a Python list) because
# logical operations between pandas objects and plain lists are deprecated.
# NOTE(review): matching is by substring, so 'max_depth=4' would also match a
# value such as 'max_depth=40' - confirm the data contains no such values.
for allowed in (n_est, max_depth, max_features):
    gbc = data['classifier']=='GradientBoostingClassifier'
    has_allowed = np.array([any(opt in p for opt in allowed)
                            for p in data['parameters'].values], dtype=bool)
    mask = ~gbc | has_allowed
    data = data.loc[mask,:]

# Reduce RF (RandomForestClassifier) results to a subset of hyperparameter values

In [None]:
# reduce Random Forest n_estimators and max_features
# `n_est` is the list of 'n_estimators=...' fragments defined in the
# GradientBoostingClassifier cell above.
max_features = ['max_features='+str(f) for f in ['sqrt','log2',None]]

# RandomForestClassifier rows survive only if they contain one of the allowed
# fragments for each hyperparameter; all other classifiers are kept untouched.
# The match mask is a NumPy boolean array (not a Python list) because logical
# operations between pandas objects and plain lists are deprecated.
# NOTE(review): matching is by substring, e.g. 'n_estimators=100' also matches
# 'n_estimators=1000' (allowed here) - confirm no unintended values match.
for allowed in (n_est, max_features):
    rfc = data['classifier']=='RandomForestClassifier'
    has_allowed = np.array([any(opt in p for opt in allowed)
                            for p in data['parameters'].values], dtype=bool)
    mask = ~rfc | has_allowed
    data = data.loc[mask,:]

data.groupby('classifier').count()

In [None]:
from collections import OrderedDict

def convert_params(params):
    """Convert an sklearn-style parameter string to a PennAI-style dictionary.

    Parameters
    ----------
    params : str
        Comma-separated ``key=value`` pairs, e.g. ``'n_estimators=100,criterion=gini'``.
        Empty tokens (including an entirely empty string) are skipped.

    Returns
    -------
    collections.OrderedDict
        Parameters ordered by key. Each value is converted to ``int`` when
        possible, otherwise to ``float``, otherwise left as the original string.
        If a key occurs more than once, the last occurrence wins.

    Raises
    ------
    IndexError
        If a non-empty token contains no ``'='``.
    """
    pdict = {}
    for kv in params.split(','):
        if not kv:
            continue
        # Split on the first '=' only, so a value that itself contains '='
        # is kept whole instead of being truncated.
        parts = kv.split('=', 1)
        value = parts[1]
        # Try the narrowest numeric type first; keep the raw string otherwise.
        for cast in (int, float):
            try:
                value = cast(value)
                break
            except ValueError:
                continue
        pdict[str(parts[0])] = value

    return OrderedDict(sorted(pdict.items()))


# Replace each raw parameter string with its parsed, key-sorted dictionary.
data['parameters'] = data['parameters'].apply(convert_params)

In [None]:
# For each classifier: how many distinct parameter settings remain, plus one example.
for ml, df_g in data.groupby('classifier'):
    distinct_params = df_g['parameters'].apply(str).unique()
    print('example parameters for ',ml,'(',len(distinct_params),'):',
          distinct_params[0])

In [None]:
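# NOTE: everything in this cell is commented out, so valid_param_combo is never
# defined and no parameter-combination filtering is applied here.
#def valid_param_combo(ml,params):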
#     print('ml',ml,type(ml).__name__)
#     print('params',params,type(params).__name__)

#    for k,v in params.items():
#        if k in param_opts[ml].keys():
#            if param_opts[ml][k] is int:
#                try:
#                    if int(v) in param_opts[ml][k]:
#                        return True
#                except: 
#                    return False
#            elif param_opts[ml][k] is float:
#                try:
#                    if float(v) in param_opts[ml][k]:
#                        return True
#                except: 
#                    return False
#            elif v not in param_opts[ml][k]:
##                 if ml == 'LogisticRegression' and k=='C':
##                     print('eliminating',params,'for',ml,'with',k,':',v)
#                    return False
#    return True
#
#mask = [valid_param_combo(row['classifier'],row['parameters']) for _, row in data.iterrows()]
#mask = mask | data.classifier=='LogisticRegression'
#data_filtered = data.loc[mask]
#
## data_filtered = data.loc[lambda x: valid_param_combo(i['classifier'],i['parameters']) for i in x]


In [None]:
# Per classifier: count of distinct parameter settings and the first one found.
for ml, df_g in data.groupby('classifier'):
    param_reprs = df_g['parameters'].apply(str).unique()
    print('example parameters for ',ml,'(',len(param_reprs),'):',
          param_reprs[0])

# How many datasets does each algorithm win? (A win means a balanced accuracy within 1% of the best score on that dataset.)

In [None]:
import numpy as np
# Recount the winners on the current `data`: a classifier wins a dataset when
# at least one of its runs is within 1% (relative) of the best balanced accuracy.
#   winner:          classifier -> number of datasets won
#   winner_datasets: classifier -> list of the datasets won
winner = {}
winner_datasets = {}
for d, df in data.groupby('dataset'):
    # Convert into a local Series rather than assigning back into the group
    # frame: the group is a temporary slice, and the vectorized call avoids a
    # per-element pd.to_numeric.
    scores = pd.to_numeric(df['bal_accuracy'])
    best_score = scores.max()
    # NOTE: when best_score is 0 or NaN the ratio is NaN/inf, the comparison is
    # False for every row, and no classifier is credited for that dataset.
    near_best = (best_score - scores) / best_score <= 0.01
    # Grouping sorts classifier names, so dict insertion order is deterministic.
    wins_by_clf = near_best.groupby(df['classifier'].values).any()
    for clf in wins_by_clf.index[wins_by_clf.values]:
        winner[clf] = winner.get(clf, 0) + 1
        winner_datasets.setdefault(clf, []).append(d)
import matplotlib.pyplot as plt
%matplotlib inline
print(winner)
print(winner_datasets)
plt.bar(list(winner.keys()),list(winner.values()))
plt.xticks(rotation=90)               

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline
print(winner)
print(winner_datasets)
plt.bar(list(winner.keys()),list(winner.values()))
plt.xticks(rotation=90)

In [None]:
import itertools
# Datasets won by any classifier other than GradientBoostingClassifier
# (a dataset appears once per classifier that won it).
other_winners = list(itertools.chain.from_iterable(
    datasets for clf, datasets in winner_datasets.items()
    if clf != 'GradientBoostingClassifier'))
print(other_winners)
# Datasets on which GradientBoostingClassifier is the sole winner. A set makes
# the membership test cheap, and .get() yields an empty result instead of a
# KeyError when GradientBoostingClassifier won nothing.
other_winner_set = set(other_winners)
only_gbc = [d for d in winner_datasets.get('GradientBoostingClassifier', [])
            if d not in other_winner_set]
print(only_gbc)
len(only_gbc)

# remove datasets where GBC is the only winner

In [None]:
# Drop every row belonging to a dataset listed in only_gbc.
in_gbc_only_dataset = data['dataset'].isin(only_gbc)
data = data.loc[~in_gbc_only_dataset]
print('# datasets left:',len(data['dataset'].unique()))

In [None]:
# List every distinct KNeighborsClassifier parameter setting still in the data.
knn_rows = data.loc[data['classifier'] == 'KNeighborsClassifier']
print(knn_rows['parameters'].apply(str).unique())

In [None]:
# `data_filtered` is not assigned until a later cell (where it is bound to
# `data`), so preview `data` directly; otherwise this cell raises NameError
# when the notebook is run top to bottom on a fresh kernel.
data.head()

In [None]:
# Summary of the current results table. `data_filtered` is only bound in a
# later cell (`data_filtered = data`), so read `data` directly to avoid a
# NameError on a fresh top-to-bottom run; the printed labels are unchanged.
print('data_filtered shape: ', data.shape)
print('datasets (',len(data['dataset'].unique()),')')
print('classifiers (',len(data['classifier'].unique()),'):',data['classifier'].unique())
for ml, df_g in data.groupby('classifier'):
    distinct_params = df_g['parameters'].apply(str).unique()
    print('parameters for ',ml,'(',len(distinct_params),'):',
           distinct_params[:5])

In [None]:
# Shape, dataset count, classifier list and up to five parameter settings per
# classifier. Reads `data` because `data_filtered` is not assigned until a
# later cell (`data_filtered = data`); using it here would raise NameError on a
# fresh top-to-bottom run. The printed labels are unchanged.
print('data_filtered shape: ', data.shape)
print('datasets (',len(data['dataset'].unique()),')')
print('classifiers (',len(data['classifier'].unique()),'):',data['classifier'].unique())
for ml, df_g in data.groupby('classifier'):
    param_reprs = df_g['parameters'].apply(str).unique()
    print('parameters for ',ml,'(',len(param_reprs),'):',
           param_reprs[:5])

# remove big datasets

In [None]:
data_filtered = data
# Names of the datasets to exclude.
big_datasets = ['poker', 'kddcup', 'sleep', 'fars', 'mnist', 'connect-4', 'shuttle', 'adult', 'krkopt',
                'letter', 'magic', 'nursery', 'pendigits', 'coil2000', 'agaricus-lepiota','optdigits']
# True for rows whose dataset is NOT in the exclusion list.
mask = ~data_filtered['dataset'].isin(big_datasets).values
clean_data = data_filtered.loc[mask,:]
# Not the last expression of the cell, so this result is not displayed.
clean_data.groupby('dataset').count()
print(len(clean_data['dataset'].unique()),'datasets left')

# remove alg-params that don't cover all datasets

In [None]:
# Key every row by "classifier|parameters". The key lives in its own Series, so
# no helper column is written into (and later dropped from) clean_data, which
# is itself a filtered slice and can trigger SettingWithCopyWarning on assignment.
algp_key = clean_data['classifier'] + '|' + clean_data['parameters'].apply(str)
all_datasets = np.unique(clean_data['dataset'].values)
nd = len(all_datasets)

# Count DISTINCT datasets per combination: a plain row count would let
# duplicated rows hide a dataset that is actually missing.
datasets_covered = clean_data['dataset'].groupby(algp_key.values).nunique()
algp_toremove = list(datasets_covered.index[(datasets_covered < nd).values])

# Keep only rows whose combination has results for every dataset.
mask = ~algp_key.isin(algp_toremove).values
print('removing',len(algp_toremove),'algorithm-parameter combinations')
print('rows removed:',np.sum(~mask))
clean_data = clean_data.loc[mask,:]
print('new size:',len(clean_data))

clean_data.groupby('classifier').count()

In [None]:
import numpy as np
# Recount the winners on `clean_data`: a classifier wins a dataset when at
# least one of its runs is within 1% (relative) of the best balanced accuracy.
#   winner:          classifier -> number of datasets won
#   winner_datasets: classifier -> list of the datasets won
winner = {}
winner_datasets = {}
for d, df in clean_data.groupby('dataset'):
    # Convert into a local Series rather than assigning back into the group
    # frame: the group is a temporary slice, and the vectorized call avoids a
    # per-element pd.to_numeric.
    scores = pd.to_numeric(df['bal_accuracy'])
    best_score = scores.max()
    # NOTE: when best_score is 0 or NaN the ratio is NaN/inf, the comparison is
    # False for every row, and no classifier is credited for that dataset.
    near_best = (best_score - scores) / best_score <= 0.01
    # Grouping sorts classifier names, so dict insertion order is deterministic.
    wins_by_clf = near_best.groupby(df['classifier'].values).any()
    for clf in wins_by_clf.index[wins_by_clf.values]:
        winner[clf] = winner.get(clf, 0) + 1
        winner_datasets.setdefault(clf, []).append(d)
import matplotlib.pyplot as plt
%matplotlib inline
print(winner)
print(winner_datasets)
plt.bar(list(winner.keys()),list(winner.values()))
plt.xticks(rotation=90)               

In [None]:
# Per classifier in clean_data: number of distinct parameter settings and one example.
for ml, df_g in clean_data.groupby('classifier'):
    distinct_params = df_g['parameters'].apply(str).unique()
    print('example parameters for ',ml,'(',len(distinct_params),'):',
          distinct_params[0])

# write modified data to file

In [None]:
# Column labels written as the header row; they are applied positionally to
# clean_data's columns.
output_header = ['dataset', 'algorithm', 'parameters',
                 'accuracy', 'macrof1', 'bal_accuracy']
# Write a gzip-compressed, tab-separated file without the index column.
clean_data.to_csv('sklearn-benchmark5-data-mock_experiment.tsv.gz',
                  sep='\t', index=False, compression='gzip',
                  header=output_header)