In [1]:
import pandas as pd
import numpy as np
from sklearn import tree
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from warnings import filterwarnings

from DatasetsEvaluator import DatasetsEvaluator as de

# Install a blanket 'ignore' filter so Python warnings are not printed by any
# of the cells below (keeps the example output compact).
filterwarnings('ignore')

## Example finding a single file

In [4]:
# Create the tester object that the following cells use to search for,
# collect, and evaluate datasets.
datasets_tester = de.DatasetsTester()

# Look up datasets by exact name; the second argument gives the problem type.
dataset_names = ['pol']
matching_datasets = datasets_tester.find_by_name(dataset_names, "classification")

# Bare last expression so the notebook renders the resulting table.
matching_datasets

Unnamed: 0,did,name,version,uploader,status,format,MajorityClassSize,MaxNominalAttDistinctValues,MinorityClassSize,NumberOfClasses,NumberOfFeatures,NumberOfInstances,NumberOfInstancesWithMissingValues,NumberOfMissingValues,NumberOfNumericFeatures,NumberOfSymbolicFeatures
201,201,pol,1,1,active,ARFF,,,,0.0,49.0,15000.0,0.0,0.0,49.0,0.0
722,722,pol,2,2,active,ARFF,9959.0,2.0,5041.0,2.0,49.0,15000.0,0.0,0.0,48.0,1.0


## Example collecting all datasets meeting some specified criteria

In [None]:
# Search criteria, grouped by the dataset property they constrain. np.inf is
# used where no upper bound is wanted.
search_criteria = dict(
    problem_type="classification",
    # number of target classes
    min_num_classes=2,
    max_num_classes=20,
    # size of the smallest class
    min_num_minority_class=5,
    max_num_minority_class=np.inf,
    # total number of features
    min_num_features=0,
    max_num_features=np.inf,
    # number of rows
    min_num_instances=500,
    max_num_instances=5000,
    # numeric features
    min_num_numeric_features=2,
    max_num_numeric_features=50,
    # categorical features
    min_num_categorical_features=0,
    max_num_categorical_features=50,
)

matching_datasets = datasets_tester.find_datasets(**search_criteria)

# Report how many datasets matched and preview the first few rows.
print(f"Number matching datasets found: {len(matching_datasets)}")
display(matching_datasets.head())

## Example collecting the datasets specified above and running classification tests

In [None]:
# Having inspected the matching datasets, collect all of them or just a subset.
# Here the first 5 matches are collected.
# Note: some datasets may fail to load.
# Note: keep_duplicated_names is left at its default of False, so some datasets
# may be removed.
datasets_tester.collect_data(
    max_num_datasets_used=5,
    method_pick_sets='pick_first',
    preview_data=False,
)

# Alternative: undo the collection above and collect every matching dataset.
# Left commented out because it takes longer to execute.
# datasets_tester.collect_data(max_num_datasets_used=-1, preview_data=False)

# Two decision trees and two kNN models with different hyper-parameters.
dt_1 = tree.DecisionTreeClassifier(min_samples_split=50, max_depth=6, random_state=0)
dt_2 = tree.DecisionTreeClassifier(min_samples_split=25, max_depth=5, random_state=0)
knn_1 = KNeighborsClassifier(n_neighbors=5)
knn_2 = KNeighborsClassifier(n_neighbors=10)

# Each entry is (model label, feature-set label, hyper-parameter label, estimator).
classifiers_to_compare = [
    ("Decision Tree", "Original Features", "min_samples_split=50, max_depth=6", dt_1),
    ("Decision Tree", "Original Features", "min_samples_split=25, max_depth=5", dt_2),
    ("kNN", "Original Features", "n_neighbors=5", knn_1),
    ("kNN", "Original Features", "n_neighbors=10", knn_2),
]

summary_df = datasets_tester.run_tests(estimators_arr=classifiers_to_compare)

display(summary_df)

## Example collecting regression datasets and performing regression tests on these

In [2]:
# Start from a fresh tester for the regression examples. The class must be
# reached through the `de` module alias imported in the first cell; the bare
# name `DatasetsTester` is never imported and raises NameError on a clean kernel.
datasets_tester = de.DatasetsTester()

# This example uses the default settings to select the datasets, then displays the results.
# In the subsequent cell, we choose to collect a subset of these.
matching_datasets = datasets_tester.find_datasets(problem_type="regression")
print("Number matching datasets found:", len(matching_datasets))
display(matching_datasets.head())

Number matching datasets found: 96


Unnamed: 0,did,name,version,uploader,status,format,MajorityClassSize,MaxNominalAttDistinctValues,MinorityClassSize,NumberOfClasses,NumberOfFeatures,NumberOfInstances,NumberOfInstancesWithMissingValues,NumberOfMissingValues,NumberOfNumericFeatures,NumberOfSymbolicFeatures
209,209,quake,1,1,active,ARFF,,,,0.0,4.0,2178.0,0.0,0.0,4.0,0.0
223,223,stock,1,1,active,ARFF,,,,0.0,10.0,950.0,0.0,0.0,10.0,0.0
482,482,arsenic-male-bladder,1,2,active,ARFF,,43.0,,0.0,5.0,559.0,0.0,0.0,4.0,1.0
494,494,analcatdata_hiroshima,1,2,active,ARFF,,1.0,,0.0,3.0,649.0,0.0,0.0,2.0,1.0
504,504,analcatdata_supreme,1,2,active,ARFF,,,,0.0,8.0,4052.0,0.0,0.0,8.0,0.0


In [None]:
# Regressors to compare on the collected regression datasets.
dt = tree.DecisionTreeRegressor(min_samples_split=50, max_depth=5, random_state=0)
knn = KNeighborsRegressor(n_neighbors=10)

# Collect up to 10 of the datasets found by the previous search.
datasets_tester.collect_data(max_num_datasets_used=10)

# This provides an example using some non-default parameters.
# The third field of each tuple describes the hyper-parameters actually used
# (as in the classification example), so the summary rows are labelled
# accurately rather than as "Default".
summary_df = datasets_tester.run_tests(estimators_arr = [
                                        ("Decision Tree", "Original Features", "min_samples_split=50, max_depth=5", dt),
                                        ("kNN", "Original Features", "n_neighbors=10", knn)],
                                       num_cv_folds=3,
                                       scoring_metric='r2',
                                       show_warnings=True)

display(summary_df)

## Example writing to and reading from local cache

In [3]:
# Folder used for the on-disk dataset cache.
# NOTE(review): this is an absolute, Windows-specific path; adjust it for your machine.
cache_folder = "c:\\dataset_cache"

# Arguments shared by both collect_data calls below.
shared_collect_args = dict(
    max_num_datasets_used=10,
    preview_data=False,
    path_local_cache=cache_folder,
)

# First pass: read from openml.org and save what was loaded to the cache folder.
datasets_tester.collect_data(save_local_cache=True, **shared_collect_args)

# Second pass: read the same datasets back from the local cache.
datasets_tester.collect_data(check_local_cache=True, **shared_collect_args)

Loading dataset from openml: 0, id: 223, name: stock
Loading dataset from openml: 1, id: 482, name: arsenic-male-bladder
Loading dataset from openml: 2, id: 494, name: analcatdata_hiroshima
Loading dataset from openml: 3, id: 504, name: analcatdata_supreme
Loading dataset from openml: 4, id: 507, name: space_ga
Loading dataset from openml: 5, id: 512, name: balloon
Loading dataset from openml: 6, id: 513, name: arsenic-female-lung
Loading dataset from openml: 7, id: 516, name: pbcseq
Loading dataset from openml: 8, id: 522, name: pm10
Loading dataset from openml: 9, id: 529, name: pollen
as list:  [False, False, False, False, False, False, False, False, False]
Reading from local cache: 0, id: 223, name: stock
as list:  [True, False, False, False]
Reading from local cache: 1, id: 482, name: arsenic-male-bladder
as list:  [False, True]
Reading from local cache: 2, id: 494, name: analcatdata_hiroshima
as list:  [False, False, False, False, False, False, False]
Reading from local cache: 3,

## Example Comparing Two Pipelines

In [None]:
# Fresh tester for this example. The class is reached through the `de` module
# alias imported in the first cell; the bare name `DatasetsTester` is never
# imported and raises NameError on a clean kernel.
datasets_tester = de.DatasetsTester()

# NOTE(review): 'arsenic-male-bladder' is listed among the regression datasets
# earlier in this notebook (NumberOfClasses = 0.0) but is requested here as
# "classification" and scored with classifiers -- confirm this is intended.
matching_datasets = datasets_tester.find_by_name(['arsenic-male-bladder'], "classification")
datasets_tester.collect_data()

# Two pipelines that differ only in the scaling step placed before the kNN classifier.
pipe1 = Pipeline([('scaler', MinMaxScaler()), ('knn_classifier', KNeighborsClassifier())])
pipe2 = Pipeline([('scaler', StandardScaler()), ('knn_classifier', KNeighborsClassifier())])

# This provides an example using some non-default parameters.
summary_df = datasets_tester.run_tests(estimators_arr = [
                                        ("kNN with MinMaxScaler", "Original Features", "Default", pipe1),
                                        ("kNN with StandardScaler", "Original Features", "Default", pipe2)],
                                       num_cv_folds=3,
                                       show_warnings=True)

display(summary_df)