In [1]:
import pandas as pd
import numpy as np
from sklearn import tree
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from warnings import filterwarnings
import shutil

# todo: put back!!
#from DatasetsEvaluator.DatasetsEvaluator import DatasetsTester

import sys
# Put the DatasetsEvaluator project folder at the front of the module search path so
# that the import below can resolve it.
# The path is written as a raw string: it contains backslashes ("\p", "\D") that a
# normal string literal would parse as invalid escape sequences, which triggers a
# DeprecationWarning/SyntaxWarning. The resulting string value is unchanged.
# NOTE(review): this is a machine-specific absolute path -- adjust for your environment.
sys.path.insert(0, r'C:\python_projects\DatasetsEvaluator_project\DatasetsEvaluator')
import DatasetsEvaluator as de

# Install a global 'ignore' filter for all warnings raised from this point on
# (presumably to keep the cell outputs below uncluttered).
filterwarnings('ignore')

In [2]:
# Folder locations used by the examples below (absolute Windows paths).
# cache_folder is passed to collect_data() as path_local_cache;
# partial_result_folder is passed to run_tests() and is cleared before use.
cache_folder = r"c:\dataset_cache"
partial_result_folder = r"c:\intermediate_results"

In [3]:
# Generally we wish to delete the partial results folder in order to create it fresh each
# execution, unless this execution is continuing from a previous failure, such as power
# loss etc.
# ignore_errors=True keeps the clean-up best-effort (for example, the folder may not exist
# yet) without a bare `except:`, which would also hide KeyboardInterrupt or a NameError
# caused by partial_result_folder not having been defined.
shutil.rmtree(partial_result_folder, ignore_errors=True)

## Example finding a single file

In [4]:
# Create the tester, then look up classification datasets matching a single name.
# todo: should be able to remove the de. now
names_to_find = ['pol']
datasets_tester = de.DatasetsTester()
matching_datasets = datasets_tester.find_by_name(names_to_find, "classification")

# Leave the result as the last expression so the notebook renders it.
matching_datasets

ConnectionError: HTTPSConnectionPool(host='www.openml.org', port=443): Max retries exceeded with url: /api/v1/xml/data/list/limit/10000/offset/0 (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x000002AA6BD446A0>: Failed to establish a new connection: [Errno 11001] getaddrinfo failed'))

## Example collecting all datasets meeting some specified criteria

In [None]:
# Selection criteria for the search. Each entry is forwarded to find_datasets() as a
# keyword argument of the same name; np.inf leaves an upper bound open.
search_criteria = dict(
    problem_type="classification",
    min_num_classes=2,
    max_num_classes=20,
    min_num_minority_class=5,
    max_num_minority_class=np.inf,
    min_num_features=0,
    max_num_features=np.inf,
    min_num_instances=500,
    max_num_instances=5_000,
    min_num_numeric_features=2,
    max_num_numeric_features=50,
    min_num_categorical_features=0,
    max_num_categorical_features=50,
)
matching_datasets = datasets_tester.find_datasets(**search_criteria)

# Report how many datasets matched and show the first few rows.
num_matching = len(matching_datasets)
print("Number matching datasets found:", num_matching)
display(matching_datasets.head())

## Example collecting the datasets specified above and running classification tests

In [None]:
# Once the matching datasets have been viewed, all of them, or a subset, can be collected.
# The call below collects 5 matching datasets.
# Note: some datasets may have errors loading.
# Note: keep_duplicated_names is left at its default of False, so some datasets may be removed.
datasets_tester.collect_data(
    max_num_datasets_used=5,
    method_pick_sets='pick_first',
    preview_data=False,
)

# The following code undoes the previous collection and collects all matching datasets.
# This is currently commented out, as it takes longer to execute.
# datasets_tester.collect_data(max_num_datasets_used=-1, preview_data=False)

# Estimators to compare: two decision trees and two kNN classifiers.
dt_1 = tree.DecisionTreeClassifier(min_samples_split=50, max_depth=6, random_state=0)
dt_2 = tree.DecisionTreeClassifier(min_samples_split=25, max_depth=5, random_state=0)
knn_1 = KNeighborsClassifier(n_neighbors=5)
knn_2 = KNeighborsClassifier(n_neighbors=10)

# Each entry pairs three descriptive labels with the estimator they describe.
classification_estimators = [
    ("Decision Tree", "Original Features", "min_samples_split=50, max_depth=6", dt_1),
    ("Decision Tree", "Original Features", "min_samples_split=25, max_depth=5", dt_2),
    ("kNN", "Original Features", "n_neighbors=5", knn_1),
    ("kNN", "Original Features", "n_neighbors=10", knn_2),
]
summary_df = datasets_tester.run_tests(estimators_arr=classification_estimators)

display(summary_df)

## Example collecting regression datasets and performing regression tests on these

In [None]:
# Start with a new tester for the regression examples.
datasets_tester = de.DatasetsTester()

# Only the problem type is specified, so every other selection setting keeps its
# default. The matches are displayed here; the subsequent cell collects a subset of them.
matching_datasets = datasets_tester.find_datasets(problem_type="regression")
num_matching = len(matching_datasets)
print("Number matching datasets found:", num_matching)
display(matching_datasets.head())

In [None]:
# Regressors to compare. Neither uses default hyperparameters, so the description labels
# passed to run_tests() below state the actual settings rather than "Default" (the same
# labelling convention as the classification example).
dt = tree.DecisionTreeRegressor(min_samples_split=50, max_depth=5, random_state=0)
knn = KNeighborsRegressor(n_neighbors=10)

# Collect at most 10 of the datasets found in the previous cell.
datasets_tester.collect_data(max_num_datasets_used=10)

# This provides an example using some non-default parameters of run_tests().
summary_df = datasets_tester.run_tests(
    estimators_arr=[
        ("Decision Tree", "Original Features", "min_samples_split=50, max_depth=5", dt),
        ("kNN", "Original Features", "n_neighbors=10", knn),
    ],
    num_cv_folds=3,
    scoring_metric='r2',
    show_warnings=True,
)

display(summary_df)

## Example writing to and reading from local cache

In [None]:
# Arguments common to both calls: up to 10 datasets, no preview, and cache_folder as
# the local cache location.
shared_collect_args = dict(
    max_num_datasets_used=10,
    preview_data=False,
    path_local_cache=cache_folder,
)

# This will read from openml.org
datasets_tester.collect_data(save_local_cache=True, **shared_collect_args)

# This will read from the local cache
datasets_tester.collect_data(check_local_cache=True, **shared_collect_args)

## Example Comparing Two Pipelines

In [None]:
# New tester restricted to the one named classification dataset, collected with the
# default collect_data() settings.
datasets_tester = de.DatasetsTester()
matching_datasets = datasets_tester.find_by_name(['arsenic-male-bladder'], "classification")
datasets_tester.collect_data()

# Two pipelines that differ only in the scaler placed in front of the kNN classifier.
pipe1 = Pipeline([
    ('scaler', MinMaxScaler()),
    ('knn_classifier', KNeighborsClassifier()),
])
pipe2 = Pipeline([
    ('scaler', StandardScaler()),
    ('knn_classifier', KNeighborsClassifier()),
])

# This provides an example using some non-default parameters.
pipeline_estimators = [
    ("kNN with MinMaxScaler", "Original Features", "Default", pipe1),
    ("kNN with StandardScaler", "Original Features", "Default", pipe2),
]
summary_df = datasets_tester.run_tests(
    estimators_arr=pipeline_estimators,
    num_cv_folds=3,
    show_warnings=True,
)

display(summary_df)

## Example Resuming after a failure

In [None]:
# run_tests() has the ability to save intermediate results to a folder and continue after
# a failure from a given start point. This example simulates this by setting ending_point
# in the first call to run_tests() and setting starting_point in the 2nd call.

# Begin from an empty partial-results folder. ignore_errors=True keeps the clean-up
# best-effort (the folder may not exist) without a bare `except:`, which would also hide
# KeyboardInterrupt or a NameError from an undefined partial_result_folder.
shutil.rmtree(partial_result_folder, ignore_errors=True)

datasets_tester = de.DatasetsTester()
matching_datasets = datasets_tester.find_by_name(['pol', 'credit-g'], "classification")

datasets_tester.collect_data(save_local_cache=True, check_local_cache=True, path_local_cache=cache_folder)
ds = datasets_tester.get_dataset_collection()


def make_resume_estimators():
    """Return a fresh list of the two decision-tree entries used by both runs.

    New estimator instances are created on every call, so each run_tests() call below
    receives its own unfitted objects, exactly as when the list was written out twice.
    """
    return [
        ("dt", "Original Features", "max_depth=2", tree.DecisionTreeClassifier(max_depth=2)),
        ("dt", "Original Features", "max_depth=3", tree.DecisionTreeClassifier(max_depth=3)),
    ]


# Settings that must be identical in the interrupted run and the resumed run.
resume_settings = dict(
    num_cv_folds=3,
    show_warnings=True,
    partial_result_folder=partial_result_folder,
)

# First run: simulate the failure by ending at ending_point=1.
summary_df = datasets_tester.run_tests(
    estimators_arr=make_resume_estimators(),
    ending_point=1,
    **resume_settings)

# Second run: continue from starting_point=1 using the same partial-results folder.
summary_df = datasets_tester.run_tests(
    estimators_arr=make_resume_estimators(),
    starting_point=1,
    **resume_settings)

summary_df