Author: Kyle Nickerson

# Intro  
This notebook has code to try and fit a variety of ml classifiers to pmlb datasets. You can change which classifiers are used, and the criteria for which datasets are included

In [1]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB

from sklearn.model_selection import train_test_split, GridSearchCV

import pandas as pd

from pmlb import fetch_data, classification_dataset_names

### Overview of datasets in pmlb

In [4]:
data_df = pd.read_csv("data_df.csv")

In [5]:
data_df

Unnamed: 0,dataset,n_instances,n_features,n_binary_features,n_categorical_features,n_continuous_features,endpoint_type,n_classes,imbalance,task
0,1027_ESL,488,4,0,0,4,continuous,9.0,0.099363,regression
1,1028_SWD,1000,10,0,0,10,continuous,4.0,0.108291,regression
2,1029_LEV,1000,4,0,0,4,continuous,5.0,0.111245,regression
3,1030_ERA,1000,4,0,0,4,continuous,9.0,0.031251,regression
4,1089_USCrime,47,13,0,0,13,continuous,42.0,0.002970,regression
...,...,...,...,...,...,...,...,...,...,...
279,wine_quality_red,1599,11,0,0,11,categorical,6.0,0.228804,classification
280,wine_quality_white,4898,11,0,0,11,categorical,7.0,0.211974,classification
281,wine_recognition,178,13,0,2,11,categorical,3.0,0.012530,classification
282,xd6,973,9,9,0,0,categorical,2.0,0.114332,classification


#### Set classifiers & parameters to try

In [7]:
all_clfs = [(RandomForestClassifier(), {"n_estimators": [100, 500], "min_samples_split": [2, 4, 8], "max_features": ["auto", "sqrt", "log2"]}),
# (GradientBoostingClassifier(), {"learning_rate": [0.05, 0.1, 0.2], "min_samples_split": [2, 4, 8], "max_features": ["auto", "sqrt", "log2"]}),
 (KNeighborsClassifier(), {"n_neighbors": [3, 5, 9, 15], "weights": ["uniform", "distance"]}),
  (LogisticRegression(max_iter=1000), { "penalty": ["l1", "l2", "elasticnet", "none"], "C": [0.5, 0.75, 1.0, 1.5, 2.0]}),
 (GaussianNB(), { "var_smoothing": [1e-5, 1e-9, 1e-13]}),
 (SVC(cache_size=800, max_iter=10_000), {"C": [0.5, 0.75, 1.0, 1.5, 2.0], "kernel": ["linear", "poly", "rbf", "sigmoid"]})
]
all_clfs

[(RandomForestClassifier(),
  {'n_estimators': [100, 500],
   'min_samples_split': [2, 4, 8],
   'max_features': ['auto', 'sqrt', 'log2']}),
 (KNeighborsClassifier(),
  {'n_neighbors': [3, 5, 9, 15], 'weights': ['uniform', 'distance']}),
 (LogisticRegression(max_iter=1000),
  {'penalty': ['l1', 'l2', 'elasticnet', 'none'],
   'C': [0.5, 0.75, 1.0, 1.5, 2.0]}),
 (GaussianNB(), {'var_smoothing': [1e-05, 1e-09, 1e-13]}),
 (SVC(cache_size=800, max_iter=10000),
  {'C': [0.5, 0.75, 1.0, 1.5, 2.0],
   'kernel': ['linear', 'poly', 'rbf', 'sigmoid']})]

## Set up dataset filters
Settings here can be used to select a subset of the datasets which meet specific criteria

In [8]:
min_samples = 100
max_samples= 5_000

min_features = 3
max_features = 1000

bin_feats_ok = True  # use datasets with binary features ?
cat_feats_ok = True  # use datasets with categorical features ?
cont_feats_ok = True # use datasets with continous features ?

use_reg = False  # use datasets with regression problems?
use_clas = True  # use datasets with classification problems?

min_classes = 2 # number of classes in dataset (for classification)
max_classes = 2

### Filter datasets

In [9]:
conds = data_df["n_instances"] >= min_samples
conds = conds & (data_df["n_instances"] <= max_samples)

conds = conds & (data_df["n_features"] >= min_features)
conds = conds & (data_df["n_features"] <= max_features)

feat_type_info = [
    ("n_binary_features", bin_feats_ok),
    ("n_categorical_features", cat_feats_ok),
    ("n_continuous_features", cont_feats_ok),
]

for col_name, is_ok in feat_type_info:
    if not is_ok:
        conds = conds & (data_df[col_name] == 0)

task_info = [
    ("regression", use_reg),
    ("classification", use_clas),    
]

for task, use in task_info:
    if not use:
        conds = conds & (data_df["task"] != task)    
        
        
conds = conds & (data_df["n_classes"] >= min_classes)
conds = conds & (data_df["n_classes"] <= max_classes)  
    
print(f"{sum(conds)} selected out of {len(conds)}")

69 selected out of 284


In [22]:
df = data_df[conds]
df

Unnamed: 0,dataset,n_instances,n_features,n_binary_features,n_categorical_features,n_continuous_features,endpoint_type,n_classes,imbalance,task
120,GAMETES_Epistasis_2_Way_1000atts_0.4H_EDM_1_ED...,1600,1000,31,969,0,categorical,2.0,0.000000,classification
121,GAMETES_Epistasis_2_Way_20atts_0.1H_EDM_1_1,1600,20,0,20,0,categorical,2.0,0.000000,classification
122,GAMETES_Epistasis_2_Way_20atts_0.4H_EDM_1_1,1600,20,2,18,0,categorical,2.0,0.000000,classification
123,GAMETES_Epistasis_3_Way_20atts_0.2H_EDM_1_1,1600,20,0,20,0,categorical,2.0,0.000000,classification
124,GAMETES_Heterogeneity_20atts_1600_Het_0.4_0.2_...,1600,20,1,19,0,categorical,2.0,0.000000,classification
...,...,...,...,...,...,...,...,...,...,...
269,tic_tac_toe,958,9,0,9,0,categorical,2.0,0.094181,classification
271,tokyo1,959,44,0,0,44,categorical,2.0,0.077515,classification
274,vote,435,16,0,16,0,categorical,2.0,0.051795,classification
278,wdbc,569,30,0,0,30,categorical,2.0,0.064940,classification


In [12]:
# ds_name: (short_name, index)
ds_order = { 'breast': ("Breast", 0),
             'GAMETES_Epistasis_2_Way_1000atts_0.4H_EDM_1_EDM_1_1': ("GAMETES_Epistasis", 4),
             'Hill_Valley_without_noise': ("HV_without_noise", 2),
             'Hill_Valley_with_noise': ("HV_with_noise", 3),
             'monk2': ("Monk2", 1),
             'parity5+5': ("Parity5+5", 5), 
           }

ds_order

{'breast': ('Breast', 0),
 'GAMETES_Epistasis_2_Way_1000atts_0.4H_EDM_1_EDM_1_1': ('GAMETES_Epistasis',
  4),
 'Hill_Valley_without_noise': ('HV_without_noise', 2),
 'Hill_Valley_with_noise': ('HV_with_noise', 3),
 'monk2': ('Monk2', 1),
 'parity5+5': ('Parity5+5', 5)}

In [13]:
len(ds_order)

6

In [14]:
def get_ordered_names(dso, full=False):
    x = [0]*len(dso)
    for fname, (sname, i) in dso.items():
        if full:
            x[i] = fname
        else:
            x[i] = sname
            
    return x

datasets = get_ordered_names(ds_order, full=True)
datasets

['breast',
 'monk2',
 'Hill_Valley_without_noise',
 'Hill_Valley_with_noise',
 'GAMETES_Epistasis_2_Way_1000atts_0.4H_EDM_1_EDM_1_1',
 'parity5+5']

### Try fitting a variety of classifiers to the filtered datasets

In [16]:
from sklearn.metrics import balanced_accuracy_score, make_scorer

use_p = 1   # if less than 1, each dataset is included with this probability (this lets you randomly select subset of the datasets)
n_jobs = 5
metric = balanced_accuracy_score

cv_scores = {}
test_scores = {}
training_scores = {}
ds_names = []

def score_grid_clf(clf, x, y):
    return metric(y, clf.best_estimator_.predict(x))

import warnings

with warnings.catch_warnings():
    warnings.simplefilter("ignore")

    for i, ds in enumerate(df.dataset):
        if ds in datasets:   # only run on specific subset of datasets
#         if np.random.random() < use_p:

            print("Running with dataset =", ds, "({} of {})".format(i, len(df.dataset)))
            ds_names.append(ds)
            display(df[df["dataset"]==ds])
            print()
            X, y = fetch_data(ds, return_X_y=True)
            train_X, test_X, train_y, test_y = train_test_split(X, y)


            for clf_info in all_clfs:
                clf = GridSearchCV(*clf_info, scoring=make_scorer(metric), n_jobs=n_jobs)
                clf_name = str(clf.get_params()['estimator']).strip("()")
                print("Fitting", clf_name, end="\t")
                clf.fit(train_X, train_y)     

                resdf = pd.DataFrame(clf.cv_results_) 

                took = (sum(clf.cv_results_["mean_fit_time"])+ sum(clf.cv_results_["mean_score_time"]))*5/n_jobs
                print("Took: {:.2f} secs".format(took), end="\t")
                

                if clf_name not in cv_scores.keys():
                    cv_scores[clf_name] = []
                
                if clf_name not in test_scores.keys():
                    test_scores[clf_name] = []
                    
                if clf_name not in training_scores.keys():   
                    training_scores[clf_name] = []
                    
                cv_acc = np.mean(clf.cv_results_["mean_test_score"])
#                 test_acc = clf.best_estimator_.score(test_X, test_y)
#                 tr_acc =  clf.best_estimator_.score(train_X, train_y)
                test_acc = score_grid_clf(clf, test_X, test_y)
                tr_acc =  score_grid_clf(clf, train_X, train_y)
                
                                
                cv_scores[clf_name].append(cv_acc)
                test_scores[clf_name].append(test_acc)
                training_scores[clf_name].append(tr_acc)
                
                
                print("Training acc = {:.2f}".format(tr_acc), "\tCV acc = {:.2f}".format(cv_acc), "\tTest acc = {:.2f}".format(test_acc))
                
                print("\n")
        
     

Running with dataset = GAMETES_Epistasis_2_Way_1000atts_0.4H_EDM_1_EDM_1_1 (0 of 69)


Unnamed: 0,dataset,n_instances,n_features,n_binary_features,n_categorical_features,n_continuous_features,endpoint_type,n_classes,imbalance,task
120,GAMETES_Epistasis_2_Way_1000atts_0.4H_EDM_1_ED...,1600,1000,31,969,0,categorical,2.0,0.0,classification



Fitting RandomForestClassifier	Took: 32.92 secs	Training acc = 1.00 	CV acc = 0.50 	Test acc = 0.53


Fitting KNeighborsClassifier	Took: 5.11 secs	Training acc = 0.69 	CV acc = 0.49 	Test acc = 0.50


Fitting LogisticRegression(max_iter=1000	Took: 4.47 secs	Training acc = 1.00 	CV acc = nan 	Test acc = 0.49


Fitting GaussianNB	Took: 0.19 secs	Training acc = 0.75 	CV acc = 0.51 	Test acc = 0.48


Fitting SVC(cache_size=800, max_iter=10000	Took: 34.65 secs	Training acc = 1.00 	CV acc = 0.51 	Test acc = 0.47


Running with dataset = Hill_Valley_with_noise (6 of 69)


Unnamed: 0,dataset,n_instances,n_features,n_binary_features,n_categorical_features,n_continuous_features,endpoint_type,n_classes,imbalance,task
126,Hill_Valley_with_noise,1212,100,0,0,100,categorical,2.0,0.0,classification



Fitting RandomForestClassifier	Took: 36.44 secs	Training acc = 1.00 	CV acc = 0.53 	Test acc = 0.59


Fitting KNeighborsClassifier	Took: 0.20 secs	Training acc = 1.00 	CV acc = 0.50 	Test acc = 0.49


Fitting LogisticRegression(max_iter=1000	Took: 13.05 secs	Training acc = 0.99 	CV acc = nan 	Test acc = 0.96


Fitting GaussianNB	Took: 0.02 secs	Training acc = 0.50 	CV acc = 0.50 	Test acc = 0.48


Fitting SVC(cache_size=800, max_iter=10000	Took: 1.44 secs	Training acc = 0.85 	CV acc = 0.56 	Test acc = 0.88


Running with dataset = Hill_Valley_without_noise (7 of 69)


Unnamed: 0,dataset,n_instances,n_features,n_binary_features,n_categorical_features,n_continuous_features,endpoint_type,n_classes,imbalance,task
127,Hill_Valley_without_noise,1212,100,0,0,100,categorical,2.0,9.8e-05,classification



Fitting RandomForestClassifier	Took: 38.15 secs	Training acc = 1.00 	CV acc = 0.57 	Test acc = 0.57


Fitting KNeighborsClassifier	Took: 0.20 secs	Training acc = 1.00 	CV acc = 0.56 	Test acc = 0.57


Fitting LogisticRegression(max_iter=1000	Took: 1.55 secs	Training acc = 1.00 	CV acc = nan 	Test acc = 1.00


Fitting GaussianNB	Took: 0.01 secs	Training acc = 0.51 	CV acc = 0.51 	Test acc = 0.50


Fitting SVC(cache_size=800, max_iter=10000	Took: 1.39 secs	Training acc = 0.83 	CV acc = 0.59 	Test acc = 0.84


Running with dataset = breast (16 of 69)


Unnamed: 0,dataset,n_instances,n_features,n_binary_features,n_categorical_features,n_continuous_features,endpoint_type,n_classes,imbalance,task
157,breast,699,10,0,1,9,categorical,2.0,0.096375,classification



Fitting RandomForestClassifier	Took: 11.04 secs	Training acc = 0.99 	CV acc = 0.96 	Test acc = 0.99


Fitting KNeighborsClassifier	Took: 0.05 secs	Training acc = 1.00 	CV acc = 0.57 	Test acc = 0.52


Fitting LogisticRegression(max_iter=1000	Took: 0.06 secs	Training acc = 0.50 	CV acc = nan 	Test acc = 0.50


Fitting GaussianNB	Took: 0.01 secs	Training acc = 0.96 	CV acc = 0.70 	Test acc = 0.96


Fitting SVC(cache_size=800, max_iter=10000	Took: 0.30 secs	Training acc = 0.50 	CV acc = 0.50 	Test acc = 0.50


Running with dataset = monk2 (51 of 69)


Unnamed: 0,dataset,n_instances,n_features,n_binary_features,n_categorical_features,n_continuous_features,endpoint_type,n_classes,imbalance,task
230,monk2,601,6,2,4,0,categorical,2.0,0.098895,classification



Fitting RandomForestClassifier	Took: 11.27 secs	Training acc = 1.00 	CV acc = 0.77 	Test acc = 0.84


Fitting KNeighborsClassifier	Took: 0.04 secs	Training acc = 1.00 	CV acc = 0.61 	Test acc = 0.72


Fitting LogisticRegression(max_iter=1000	Took: 0.11 secs	Training acc = 0.50 	CV acc = nan 	Test acc = 0.50


Fitting GaussianNB	Took: 0.01 secs	Training acc = 0.50 	CV acc = 0.50 	Test acc = 0.50


Fitting SVC(cache_size=800, max_iter=10000	Took: 0.23 secs	Training acc = 0.69 	CV acc = 0.51 	Test acc = 0.59


Running with dataset = parity5+5 (54 of 69)


Unnamed: 0,dataset,n_instances,n_features,n_binary_features,n_categorical_features,n_continuous_features,endpoint_type,n_classes,imbalance,task
240,parity5+5,1124,10,10,0,0,categorical,2.0,7.9e-05,classification



Fitting RandomForestClassifier	Took: 12.60 secs	Training acc = 1.00 	CV acc = 0.49 	Test acc = 0.60


Fitting KNeighborsClassifier	Took: 0.08 secs	Training acc = 1.00 	CV acc = 0.49 	Test acc = 0.56


Fitting LogisticRegression(max_iter=1000	Took: 0.06 secs	Training acc = 0.52 	CV acc = nan 	Test acc = 0.45


Fitting GaussianNB	Took: 0.01 secs	Training acc = 0.52 	CV acc = 0.43 	Test acc = 0.45


Fitting SVC(cache_size=800, max_iter=10000	Took: 0.47 secs	Training acc = 0.48 	CV acc = 0.43 	Test acc = 0.48




#### View results
Note: This might only shows the results for the datasets I was working with, but the results for all should be in 'test_scores'

In [26]:
result = pd.DataFrame.from_dict(test_scores, ) # df.dataset
result["dataset"] = datasets
result.set_index("dataset")

Unnamed: 0_level_0,RandomForestClassifier,KNeighborsClassifier,LogisticRegression(max_iter=1000,GaussianNB,"SVC(cache_size=800, max_iter=10000"
dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
breast,0.525303,0.497975,0.490199,0.477823,0.472422
monk2,0.593164,0.494263,0.963768,0.484011,0.884042
Hill_Valley_without_noise,0.570949,0.574414,1.0,0.501177,0.839144
Hill_Valley_with_noise,0.986609,0.522345,0.5,0.964953,0.5
GAMETES_Epistasis_2_Way_1000atts_0.4H_EDM_1_EDM_1_1,0.836786,0.724337,0.5,0.5,0.594625
parity5+5,0.596794,0.559593,0.452926,0.452926,0.475242
