# Datasets

In [1]:
import os
from os.path import join

import numpy as np
import pandas as pd

from sklearn import datasets

In [6]:
def check_mkdir(path):
    """Create directory ``path`` (with any missing parents) unless it exists.

    Creation is attempted directly instead of being guarded by a prior
    ``os.path.exists`` test, so nothing can create the path between the
    check and the ``os.makedirs`` call and make the latter raise.

    Parameters
    ----------
    path : str
        Directory to create.

    Returns
    -------
    None
        When the path is already present, ``'<path> - already exist'`` is
        printed and nothing is created.
    """
    try:
        os.makedirs(path)
    except FileExistsError:
        # Same notice the notebook has always printed for an existing path.
        print(f'{path} - already exist')

In [8]:
# Root directory under which every dataset folder is created; the empty string
# makes all joined paths relative to the current working directory.
base_path = ''

## sklearn datasets

In [29]:
from sklearn.model_selection import train_test_split

In [30]:
# Parent folder for the datasets that ship with scikit-learn.
sklearn_path = join(base_path, 'sklearn')

In [31]:
# (folder name, scikit-learn loader name) for every bundled dataset to export.
# Loaders are resolved by name so that one missing from the installed
# scikit-learn is skipped instead of aborting the whole cell: `load_boston`
# was deprecated in scikit-learn 1.0 and removed in 1.2, where accessing it
# raises ImportError.
sklearn_loader_names = [
    ('breast_cancer', 'load_breast_cancer'),
    ('boston', 'load_boston'),
    ('diabetes', 'load_diabetes'),
    ('iris', 'load_iris'),
    ('wine', 'load_wine'),
]

# Each entry is (output directory, loaded dataset, CSV file name).
dataset_path_pairs = []
for sk_name, loader_name in sklearn_loader_names:
    try:
        loader = getattr(datasets, loader_name)
    except (AttributeError, ImportError):
        print(f'{sk_name} - {loader_name} is not available in this scikit-learn version, skipped')
        continue
    dataset_path_pairs.append(
        (join(sklearn_path, sk_name), loader(), sk_name + '.csv')
    )

In [32]:
# Export every loaded scikit-learn dataset as train/test CSV files plus a
# Readme.md holding the dataset description.
for path, data, filename in dataset_path_pairs:
    check_mkdir(path)

    # Fixed random_state keeps the train/test split identical between runs.
    X_train, X_test, y_train, y_test = train_test_split(
        data['data'], data['target'], random_state=42
    )

    # Features and target are stored together; the target is the last column.
    df_train = pd.DataFrame(X_train, columns=data['feature_names'])
    df_train['target'] = y_train

    df_test = pd.DataFrame(X_test, columns=data['feature_names'])
    df_test['target'] = y_test

    df_train.to_csv(join(path, 'train_' + filename), index=False)
    df_test.to_csv(join(path, 'test_' + filename), index=False)

    # Explicit UTF-8 so writing the description does not depend on the
    # platform's default text encoding.
    with open(join(path, 'Readme.md'), 'w', encoding='utf-8') as f:
        f.write(data['DESCR'])

    print(path)


sklearn/breast_cancer - already exist
sklearn/breast_cancer
sklearn/boston - already exist
sklearn/boston
sklearn/diabetes - already exist
sklearn/diabetes
sklearn/iris - already exist
sklearn/iris
sklearn/wine - already exist
sklearn/wine


## OpenML datasets

In [33]:
# Parent folder for the datasets downloaded from OpenML.
openml_path = join(base_path, 'openml')

### BinaryClassification

In [36]:
# OpenML dataset names passed to `datasets.fetch_openml`; each name also
# becomes the dataset's folder name and CSV file stem. Order is the
# download order.
bin_class_dataset_names = [
    'credit-g',
    'blood-transfusion-service-center',
    'monks-problems-2',
    'tic-tac-toe',
    'monks-problems-1',
    'steel-plates-fault',
    'kr-vs-kp',
    'qsar-biodeg',
    'wdbc',
    'phoneme',
    'diabetes',
    'ozone-level-8hr',
    'hill-valley',
    'kc1',
    'kc2',
    'eeg-eye-state',
    'climate-model-simulation-crashes',
    'spambase',
    'ilpd',
    'banknote-authentication',
    'electricity',
    'madelon',
    'gina_agnostic',
    'bank-marketing',
    'Click_prediction_small',
    'PhishingWebsites',
    'Bioresponse',
    'Amazon_employee_access',
    'SpeedDating',
    'credit-approval',
    'irish',
    'churn',
]

In [37]:
# Folder that receives one sub-directory per binary-classification dataset.
bin_class_path = join(openml_path, 'binary_classification')

In [38]:
# Download every binary-classification dataset from OpenML and export it as
# train/test CSV files plus a Readme.md holding the dataset description.
for data_name in bin_class_dataset_names:
    data = datasets.fetch_openml(data_name)

    # One folder per dataset, named after the dataset itself.
    path = join(bin_class_path, data_name)
    filename = data_name + '.csv'

    check_mkdir(path)

    # Fixed random_state keeps the train/test split identical between runs.
    X_train, X_test, y_train, y_test = train_test_split(
        data['data'], data['target'], random_state=42
    )

    # Features and target are stored together; the target is the last column.
    df_train = pd.DataFrame(X_train, columns=data['feature_names'])
    df_train['target'] = y_train

    df_test = pd.DataFrame(X_test, columns=data['feature_names'])
    df_test['target'] = y_test

    df_train.to_csv(join(path, 'train_' + filename), index=False)
    df_test.to_csv(join(path, 'test_' + filename), index=False)

    # Explicit UTF-8 so writing the description does not depend on the
    # platform's default text encoding.
    with open(join(path, 'Readme.md'), 'w', encoding='utf-8') as f:
        f.write(data['DESCR'])

    print(path)

  warn(
  warn(


openml/binary_classification/credit-g
openml/binary_classification/blood-transfusion-service-center
openml/binary_classification/monks-problems-2
openml/binary_classification/tic-tac-toe
openml/binary_classification/monks-problems-1
openml/binary_classification/steel-plates-fault
openml/binary_classification/kr-vs-kp
openml/binary_classification/qsar-biodeg
openml/binary_classification/wdbc


  warn(
  warn(


openml/binary_classification/phoneme
openml/binary_classification/diabetes
openml/binary_classification/ozone-level-8hr


  warn(


openml/binary_classification/hill-valley
openml/binary_classification/kc1
openml/binary_classification/kc2
openml/binary_classification/eeg-eye-state
openml/binary_classification/climate-model-simulation-crashes


  warn(


openml/binary_classification/spambase
openml/binary_classification/ilpd
openml/binary_classification/banknote-authentication


  warn(
  warn(


openml/binary_classification/electricity
openml/binary_classification/madelon
openml/binary_classification/gina_agnostic


  warn(


openml/binary_classification/bank-marketing


  warn(


openml/binary_classification/Click_prediction_small
openml/binary_classification/PhishingWebsites
openml/binary_classification/Bioresponse


  warn(


openml/binary_classification/Amazon_employee_access


  warn(


openml/binary_classification/SpeedDating
openml/binary_classification/credit-approval
openml/binary_classification/irish
openml/binary_classification/churn


In [21]:


# Fetch check: download each binary-classification dataset in list order and
# print its index followed by an empty line. `data` is overwritten on every
# iteration, so only the last fetched dataset remains bound afterwards.
for i, dataset_name in enumerate(bin_class_dataset_names):
    data = datasets.fetch_openml(dataset_name)
    print(f'{i}\n')

  warn(
  warn(


0

1

2

3

4

5

6

7

8

9

10



  warn(
  warn(
  warn(


11

12

13

14

15



  warn(


16

17



  warn(


18

19



  warn(


20

21

22



  warn(


23



  warn(


24

25

26



  warn(


27



  warn(


28

29

30

31

