In [1]:
import pandas as pd
import numpy as np
from sklearn import tree
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from warnings import filterwarnings

from DatasetsEvaluator import DatasetsEvaluator as de

filterwarnings('ignore')

## Example finding a single file

In [2]:
datasets_tester = de.DatasetsTester()
matching_datasets = datasets_tester.find_by_name(['pol'], "classification")
matching_datasets

Unnamed: 0,did,name,version,uploader,status,format,MajorityClassSize,MaxNominalAttDistinctValues,MinorityClassSize,NumberOfClasses,NumberOfFeatures,NumberOfInstances,NumberOfInstancesWithMissingValues,NumberOfMissingValues,NumberOfNumericFeatures,NumberOfSymbolicFeatures
201,201,pol,1,1,active,ARFF,,,,0.0,49.0,15000.0,0.0,0.0,49.0,0.0
722,722,pol,2,2,active,ARFF,9959.0,2.0,5041.0,2.0,49.0,15000.0,0.0,0.0,48.0,1.0


## Example collecting all datasets meeting some specified criteria

In [3]:
matching_datasets = datasets_tester.find_datasets( 
    problem_type = "classification",
    min_num_classes = 2,
    max_num_classes = 20,
    min_num_minority_class = 5,
    max_num_minority_class = np.inf,
    min_num_features = 0,
    max_num_features = np.inf,
    min_num_instances = 500,
    max_num_instances = 5_000,
    min_num_numeric_features = 2,
    max_num_numeric_features = 50,
    min_num_categorical_features=0,
    max_num_categorical_features=50)

print("Number matching datasets found:", len(matching_datasets))
display(matching_datasets.head())

Number matching datasets found: 237


Unnamed: 0,did,name,version,uploader,status,format,MajorityClassSize,MaxNominalAttDistinctValues,MinorityClassSize,NumberOfClasses,NumberOfFeatures,NumberOfInstances,NumberOfInstancesWithMissingValues,NumberOfMissingValues,NumberOfNumericFeatures,NumberOfSymbolicFeatures
2,2,anneal,1,1,active,ARFF,684.0,7.0,8.0,5.0,39.0,898.0,898.0,22175.0,6.0,33.0
11,11,balance-scale,1,1,active,ARFF,288.0,3.0,49.0,3.0,5.0,625.0,0.0,0.0,4.0,1.0
15,15,breast-w,1,1,active,ARFF,458.0,2.0,241.0,2.0,10.0,699.0,16.0,16.0,9.0,1.0
18,18,mfeat-morphological,1,1,active,ARFF,200.0,10.0,200.0,10.0,7.0,2000.0,0.0,0.0,6.0,1.0
22,22,mfeat-zernike,1,1,active,ARFF,200.0,10.0,200.0,10.0,48.0,2000.0,0.0,0.0,47.0,1.0


## Example collecting the datasets specified above and running classification tests

In [4]:
# After viewing the matching datasets, it's possible to collect all, or some subset of these. The following
# code collects 5 matching datasets.
# Note: some datasets may have errors loading. 
# Note: As this uses the default False for keep_duplicated_names, some datasets may be removed.
datasets_tester.collect_data(max_num_datasets_used=5, method_pick_sets='pick_first', preview_data=False) 

# The following code undoes the previous collection and collects all matching datasets. 
# This is currently commented out, as it takes longer to execute. 
# datasets_tester.collect_data(max_num_datasets_used=-1, preview_data=False)

dt_1 = tree.DecisionTreeClassifier(min_samples_split=50, max_depth=6, random_state=0)
dt_2 = tree.DecisionTreeClassifier(min_samples_split=25, max_depth=5, random_state=0)
knn_1 = KNeighborsClassifier(n_neighbors=5)
knn_2 = KNeighborsClassifier(n_neighbors=10)

summary_df = datasets_tester.run_tests(estimators_arr = [
        ("Decision Tree", "Original Features", "min_samples_split=50, max_depth=6", dt_1),
        ("Decision Tree", "Original Features", "min_samples_split=25, max_depth=5", dt_2),
        ("kNN", "Original Features", "n_neighbors=5", knn_1),
        ("kNN", "Original Features", "n_neighbors=10", knn_2)]) 

display(summary_df)

Loading dataset from openml: 0, id: 15, name: breast-w
Loading dataset from openml: 1, id: 29, name: credit-approval
Loading dataset from openml: 2, id: 31, name: credit-g
Loading dataset from openml: 3, id: 37, name: diabetes
Loading dataset from openml: 4, id: 38, name: sick

Running test on 5 datastets
Running tests on dataset: breast-w
	Running tests with model: Decision Tree (Original Features), (min_samples_split=50, max_depth=6)
	Running tests with model: Decision Tree (Original Features), (min_samples_split=25, max_depth=5)
	Running tests with model: kNN (Original Features), (n_neighbors=5)
	Running tests with model: kNN (Original Features), (n_neighbors=10)
Running tests on dataset: credit-approval
	Running tests with model: Decision Tree (Original Features), (min_samples_split=50, max_depth=6)
	Running tests with model: Decision Tree (Original Features), (min_samples_split=25, max_depth=5)
	Running tests with model: kNN (Original Features), (n_neighbors=5)
	Running tests with

Unnamed: 0,index,Dataset,Dataset Version,Model,Feature Engineering Description,Hyperparameter Description,Avg f1_macro,Std dev between folds,Train-Test Gap,# Columns,Model Complexity,Fit Time
0,0,breast-w,1,Decision Tree,Original Features,"min_samples_split=50, max_depth=6",0.930688,0.031191,0.025364,10,19.4,0.003183
1,0,breast-w,1,Decision Tree,Original Features,"min_samples_split=25, max_depth=5",0.928311,0.048087,0.035165,10,18.6,0.0
2,0,breast-w,1,kNN,Original Features,n_neighbors=5,0.269224,0.118357,0.690275,10,0.0,0.010577
3,0,breast-w,1,kNN,Original Features,n_neighbors=10,0.256489,0.117787,0.653372,10,0.0,0.0
4,0,credit-approval,1,Decision Tree,Original Features,"min_samples_split=50, max_depth=6",0.644976,0.256457,0.311579,57,25.8,0.003
5,0,credit-approval,1,Decision Tree,Original Features,"min_samples_split=25, max_depth=5",0.644976,0.256457,0.319273,57,25.0,0.003126
6,0,credit-approval,1,kNN,Original Features,n_neighbors=5,0.717754,0.155466,0.17241,57,0.0,0.003076
7,0,credit-approval,1,kNN,Original Features,n_neighbors=10,0.69352,0.143793,0.167491,57,0.0,0.003126
8,0,credit-g,1,Decision Tree,Original Features,"min_samples_split=50, max_depth=6",0.587075,0.053987,0.124751,77,36.6,0.005213
9,0,credit-g,1,Decision Tree,Original Features,"min_samples_split=25, max_depth=5",0.596116,0.053187,0.108552,77,39.8,0.0


## Example collecting regression datasets and performing regression tests on these

In [6]:
datasets_tester = de.DatasetsTester()

# This example uses the default settings to select the datasets, then displays the results. 
# In the subsequent cell, we choose to collect a subset of these. 
matching_datasets = datasets_tester.find_datasets(problem_type = "regression",)
print("Number matching datasets found:", len(matching_datasets))
display(matching_datasets.head())

Number matching datasets found: 97


Unnamed: 0,did,name,version,uploader,status,format,MajorityClassSize,MaxNominalAttDistinctValues,MinorityClassSize,NumberOfClasses,NumberOfFeatures,NumberOfInstances,NumberOfInstancesWithMissingValues,NumberOfMissingValues,NumberOfNumericFeatures,NumberOfSymbolicFeatures
209,209,quake,1,1,active,ARFF,,,,0.0,4.0,2178.0,0.0,0.0,4.0,0.0
223,223,stock,1,1,active,ARFF,,,,0.0,10.0,950.0,0.0,0.0,10.0,0.0
482,482,arsenic-male-bladder,1,2,active,ARFF,,43.0,,0.0,5.0,559.0,0.0,0.0,4.0,1.0
494,494,analcatdata_hiroshima,1,2,active,ARFF,,1.0,,0.0,3.0,649.0,0.0,0.0,2.0,1.0
504,504,analcatdata_supreme,1,2,active,ARFF,,,,0.0,8.0,4052.0,0.0,0.0,8.0,0.0


In [9]:
dt = tree.DecisionTreeRegressor(min_samples_split=50, max_depth=5, random_state=0)
knn = KNeighborsRegressor(n_neighbors=10)

datasets_tester.collect_data(max_num_datasets_used=10)

# This provides an example using some non-default parameters. 
summary_df = datasets_tester.run_tests(estimators_arr = [
                                        ("Decision Tree", "Original Features", "Default", dt),
                                        ("kNN", "Original Features", "Default", knn)],
                                       num_cv_folds=3,
                                       scoring_metric='r2',
                                       show_warnings=True) 

display(summary_df)

Loading dataset from openml: 0, id: 223, name: stock
Loading dataset from openml: 1, id: 482, name: arsenic-male-bladder
Loading dataset from openml: 2, id: 494, name: analcatdata_hiroshima
Loading dataset from openml: 3, id: 504, name: analcatdata_supreme
Loading dataset from openml: 4, id: 507, name: space_ga
Loading dataset from openml: 5, id: 512, name: balloon
Loading dataset from openml: 6, id: 513, name: arsenic-female-lung
Loading dataset from openml: 7, id: 516, name: pbcseq
Loading dataset from openml: 8, id: 522, name: pm10
Loading dataset from openml: 9, id: 529, name: pollen

Running test on 10 datastets
Running tests on dataset: stock
	Running tests with model: Decision Tree (Original Features), (Default)
	Running tests with model: kNN (Original Features), (Default)
Running tests on dataset: arsenic-male-bladder
	Running tests with model: Decision Tree (Original Features), (Default)
	Running tests with model: kNN (Original Features), (Default)
Running tests on dataset: an

Unnamed: 0,index,Dataset,Dataset Version,Model,Feature Engineering Description,Hyperparameter Description,Avg r2,Std dev between folds,Train-Test Gap,# Columns,Model Complexity,Fit Time
0,0,stock,1,Decision Tree,Original Features,Default,-6.059462,1.523434,7.021839,10,33.0,0.004657
1,0,stock,1,kNN,Original Features,Default,-6.17719,1.761161,7.160525,10,0.0,0.002285
2,0,arsenic-male-bladder,1,Decision Tree,Original Features,Default,-0.603088,0.974131,1.032964,4,17.0,0.00521
3,0,arsenic-male-bladder,1,kNN,Original Features,Default,-0.068044,0.069576,0.415249,4,0.0,0.005209
4,0,analcatdata_hiroshima,1,Decision Tree,Original Features,Default,-3.727387,2.793093,4.499434,4,19.0,0.00521
5,0,analcatdata_hiroshima,1,kNN,Original Features,Default,-3.139985,3.512115,4.0781,4,0.0,0.0
6,0,analcatdata_supreme,1,Decision Tree,Original Features,Default,-0.045823,0.388331,0.798055,8,35.0,0.0
7,0,analcatdata_supreme,1,kNN,Original Features,Default,-0.29111,0.496101,0.98538,8,0.0,0.014211
8,0,space_ga,1,Decision Tree,Original Features,Default,0.276183,0.053406,0.379849,7,51.0,0.005208
9,0,space_ga,1,kNN,Original Features,Default,0.320651,0.027241,0.288462,7,0.0,0.0


## Example wrting to and reading from local cache

In [10]:
cache_folder = "c:\\dataset_cache"

# This will read from openml.org
datasets_tester.collect_data(max_num_datasets_used=10, preview_data=False, save_local_cache=True, path_local_cache=cache_folder)

# This will read from the local cache
datasets_tester.collect_data(max_num_datasets_used=10, preview_data=False, check_local_cache=True, path_local_cache=cache_folder)

  and should_run_async(code)


Loading dataset from openml: 0, id: 223, name: stock
Loading dataset from openml: 1, id: 482, name: arsenic-male-bladder
Loading dataset from openml: 2, id: 494, name: analcatdata_hiroshima
Loading dataset from openml: 3, id: 504, name: analcatdata_supreme
Loading dataset from openml: 4, id: 507, name: space_ga
Loading dataset from openml: 5, id: 512, name: balloon
Loading dataset from openml: 6, id: 513, name: arsenic-female-lung
Loading dataset from openml: 7, id: 516, name: pbcseq
Loading dataset from openml: 8, id: 522, name: pm10
Loading dataset from openml: 9, id: 529, name: pollen
Reading from local cache: 0, id: 223, name: stock
Reading from local cache: 1, id: 482, name: arsenic-male-bladder
Reading from local cache: 2, id: 494, name: analcatdata_hiroshima
Reading from local cache: 3, id: 504, name: analcatdata_supreme
Reading from local cache: 4, id: 507, name: space_ga
Reading from local cache: 5, id: 512, name: balloon
Reading from local cache: 6, id: 513, name: arsenic-fem

## Example Comparing Two Pipelines

In [12]:
datasets_tester = de.DatasetsTester()
matching_datasets = datasets_tester.find_by_name(['arsenic-male-bladder'], "classification")
datasets_tester.collect_data() 

pipe1 = Pipeline([('scaler', MinMaxScaler()), ('knn_classifier', KNeighborsClassifier())])
pipe2 = Pipeline([('scaler', StandardScaler()), ('knn_classifier', KNeighborsClassifier())])

# This provides an example using some non-default parameters. 
summary_df = datasets_tester.run_tests(estimators_arr = [
                                        ("kNN with MinMaxScaler", "Original Features", "Default", pipe1),
                                        ("kNN with StandardScaler", "Original Features", "Default", pipe2)],
                                       num_cv_folds=3,
                                       show_warnings=True) 

display(summary_df)

Loading dataset from openml: 0, id: 947, name: arsenic-male-bladder

Running test on 1 datastets
Running tests on dataset: arsenic-male-bladder
	Running tests with model: kNN with MinMaxScaler (Original Features), (Default)
	Running tests with model: kNN with StandardScaler (Original Features), (Default)


Unnamed: 0,index,Dataset,Dataset Version,Model,Feature Engineering Description,Hyperparameter Description,Avg f1_macro,Std dev between folds,Train-Test Gap,# Columns,Model Complexity,Fit Time
0,0,arsenic-male-bladder,2,kNN with MinMaxScaler,Original Features,Default,0.524381,0.063724,0.32618,4,0,0.00521
1,0,arsenic-male-bladder,2,kNN with StandardScaler,Original Features,Default,0.544931,0.048746,0.315521,4,0,0.005211
