In [1]:
from sklearn import datasets
import numpy as np

In [2]:
def print_dataset_characteristics(dataset):
    X = dataset.data
    y = dataset.target
    if dataset.task == 'classification':
        n_classes = len(np.unique(dataset.target))
        target_names = dataset.target_names
    else:
        n_classes = type(y[0])
        target_names = dataset.get('target_filename',None)
    print("""{7}
    Data example:
    {0}
    Data type:{1}
    Data shape:{2}
    Feature names:{3}
    Target example:
    {4}
    Target_names:{5}
    Taks:{8} ({6})
    """.format(
        X[0,:], type(X), X.shape,
        dataset.feature_names, y[0], target_names,
        n_classes, dataset.DESCR, dataset.task))

In [12]:
# we must specify:
#     'name':name of the dataset,
#     'task':'classification' or 'regression'
#     'cat_features':list of the categorical features,
#     'cont_features':list of the continuous features,
#     'data':data (features x, not including target),
#     'target':target (y),
#     'test_data':if there is a default data for test, None otherwise,
#     'test_target':if there is a default data for test, None otherwise

# Scikit-learn datasets

In [3]:
# iris
iris = datasets.load_iris(as_frame=True)
iris.update({
    'name':'iris',
    'task':'classification',
    'cat_features':[],
    'cont_features':[0, 1, 2, 3],
#     'data': already specified when loading the dataset from scikit-learn
#     'target': already specified when loading the dataset from scikit-learn
    'test_data':None,
    'test_target':None
})
# diabetes
diabetes = datasets.load_diabetes(as_frame=True)
diabetes.update({
    'name':'diabetes',
    'task':'regression',
    'cat_features':[],
    'cont_features':[i for i in range(10)],
#     'data': already specified when loading the dataset from scikit-learn
#     'target': already specified when loading the dataset from scikit-learn
    'test_data':None,
    'test_target':None
})
# wine
wine = datasets.load_wine(as_frame=True)
wine.update({
    'name':'wine',
    'task':'classification',
    'cat_features':[],
    'cont_features':[i for i in range(13)],
#     'data': already specified when loading the dataset from scikit-learn
#     'target': already specified when loading the dataset from scikit-learn
    'test_data':None,
    'test_target':None
})
# breast cancer
breast_cancer = datasets.load_breast_cancer(as_frame=True)
breast_cancer.update({
    'name':'breast_cancer',
    'task':'classification',
    'cat_features':[],
    'cont_features':[i for i in range(30)],
#     'data': already specified when loading the dataset from scikit-learn
#     'target': already specified when loading the dataset from scikit-learn
    'test_data':None,
    'test_target':None
})
# covtype
covtype = datasets.fetch_covtype(as_frame=True)
covtype.update({
    'name':'covtype',
    'task':'classification',
    'cat_features':[i for i in range(10,54)],
    'cont_features':[i for i in range(10)],
#     'data': already specified when loading the dataset from scikit-learn
#     'target': already specified when loading the dataset from scikit-learn
    'test_data':None,
    'test_target':None
})
# kddcup99_sa
kddcup99_sa = datasets.fetch_kddcup99(subset='SA', as_frame=True)
kddcup99_sa.update({
    'name':'fetch_kddcup99_sa',
    'task':'classification',
    'cat_features':[1, 2, 3, 6, 11, 20, 21],
    'cont_features':[0, 4, 5] + [i for i in range(7, 11)] + [i for i in range(12, 20)] + [i for i in range(22, 40)],
    'data': kddcup99_sa['data'].loc[~kddcup99_sa['target'].isin([b'nmap.', b'pod.', b'portsweep.'])]
    'target': kddcup99_sa['target'].loc[~kddcup99_sa['target'].isin([b'nmap.', b'pod.', b'portsweep.'])]
    'test_data':None,
    'test_target':None
})
# kddcup99_sa
# obs we have a problem in the data, the classes b'nmap.', b'pod.', b'portsweep.' only have one single example,
# so we filter it - > Actually we will not use it for now, because each time we fetch the dataset we select a random
# sample from the total dataset and different cases of infrequent classes must be handle....to do one day
# kddcup99_sa = datasets.fetch_kddcup99(subset='SA', as_frame=True, random_state=SEED)
# kddcup99_sa.update({
#     'name':'fetch_kddcup99_sa',
#     'task':'classification',
#     'cat_features':[1, 2, 3, 6, 11, 20, 21],
#     'cont_features':[0, 4, 5] + [i for i in range(7, 11)] + [i for i in range(12, 20)] + [i for i in range(22, 40)],
#     'data': kddcup99_sa['data'].loc[~kddcup99_sa['target'].isin([b'nmap.', b'pod.', b'portsweep.'])],
#     'target': kddcup99_sa['target'].loc[~kddcup99_sa['target'].isin([b'nmap.', b'pod.', b'portsweep.'])],
#     'test_data':None,
#     'test_target':None
# })
# california_housing 
california_housing = datasets.fetch_california_housing(as_frame=True)
california_housing.update({
    'name':'california_housing',
    'task':'regression',
    'cat_features':[],
    'cont_features':[i for i in range(7)],
#     'data': already specified when loading the dataset from scikit-learn
#     'target': already specified when loading the dataset from scikit-learn
    'test_data':None,
    'test_target':None
})

# UCI Datasets

In [None]:
# download dataset from uci
url_data = 'https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data'
url_test = 'https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.test'
nan_token = ' ?'
adult = {
    'name':'adult',
    'task':'classification',
    'cat_features':[1, 3, 5, 6, 7, 8, 9, 13],
    'cont_features':[0, 2, 4, 10, 11, 12],
    'data':pd.read_csv(url_data, header=None, na_values=nan_token).iloc[:,0:-1],
    'target':pd.read_csv(url_data, header=None, na_values=nan_token).iloc[:,-1],
    'test_data':pd.read_csv(url_test, header=None, skiprows=1, na_values=nan_token).iloc[:,0:-1],
    'test_target':pd.read_csv(url_test, header=None, skiprows=1, na_values=nan_token).iloc[:,-1].str[:-1]
}

In [None]:
datasets = [iris, diabetes, wine, breast_cancer, covtype, fetch_kddcup99_sf, california_housing, adult]

In [4]:
for dataset in scikit_datasets:
    print_dataset_characteristics(dataset)

.. _iris_dataset:

Iris plants dataset
--------------------

**Data Set Characteristics:**

    :Number of Instances: 150 (50 in each of three classes)
    :Number of Attributes: 4 numeric, predictive attributes and the class
    :Attribute Information:
        - sepal length in cm
        - sepal width in cm
        - petal length in cm
        - petal width in cm
        - class:
                - Iris-Setosa
                - Iris-Versicolour
                - Iris-Virginica
                
    :Summary Statistics:

                    Min  Max   Mean    SD   Class Correlation
    sepal length:   4.3  7.9   5.84   0.83    0.7826
    sepal width:    2.0  4.4   3.05   0.43   -0.4194
    petal length:   1.0  6.9   3.76   1.76    0.9490  (high!)
    petal width:    0.1  2.5   1.20   0.76    0.9565  (high!)

    :Missing Attribute Values: None
    :Class Distribution: 33.3% for each of 3 classes.
    :Creator: R.A. Fisher
    :Donor: Michael Marshall (MARSHALL%PLU@io.arc.nasa.gov)
    :