# Datasets

In [1]:
import os
from os.path import join

import numpy as np
import pandas as pd

from sklearn import datasets

In [6]:
def check_mkdir(path):
    """Create directory ``path`` (including missing parents) if it is absent.

    When the path already exists nothing is created and a notice is printed
    instead.

    Args:
        path: Directory path to create.

    Returns:
        None.
    """
    # Attempt the creation directly rather than testing for existence first:
    # a separate exists() check can race with another process creating the
    # same directory between the check and the makedirs() call.
    try:
        os.makedirs(path)
    except FileExistsError:
        print(f'{path} - already exist')

In [8]:
# Root under which every dataset folder is created; an empty string keeps all
# generated paths relative to the current working directory.
base_path = ""

## sklearn datasets

In [29]:
from sklearn.model_selection import train_test_split

In [30]:
# Destination directory for the datasets bundled with scikit-learn.
sklearn_path = os.path.join(base_path, 'sklearn')

In [31]:
# (dataset folder name, name of the loader function in sklearn.datasets).
_sklearn_loader_names = [
    ('breast_cancer', 'load_breast_cancer'),
    ('boston', 'load_boston'),
    ('diabetes', 'load_diabetes'),
    ('iris', 'load_iris'),
    ('wine', 'load_wine'),
]

# Each entry is (output directory, loaded dataset, CSV base filename).
# Loaders are resolved by name so that one missing from the installed
# scikit-learn is skipped instead of aborting the whole cell: load_boston was
# removed in scikit-learn 1.2, where accessing it raises ImportError.
dataset_path_pairs = []
for _name, _loader_name in _sklearn_loader_names:
    try:
        _loader = getattr(datasets, _loader_name)
    except (AttributeError, ImportError):
        print(f'{_loader_name} is not available in the installed scikit-learn - skipped')
        continue
    dataset_path_pairs.append((join(sklearn_path, _name), _loader(), _name + '.csv'))

In [32]:
# Split every bundled scikit-learn dataset into train/test parts and store each
# part as a CSV, next to a Readme.md holding the dataset description.
for path, data, filename in dataset_path_pairs:
    check_mkdir(path)

    # A fixed random_state keeps the split identical between runs.
    X_train, X_test, y_train, y_test = train_test_split(data['data'], data['target'], random_state=42)

    # The target is appended as the last column of each exported table.
    df_train = pd.DataFrame(X_train, columns=data['feature_names'])
    df_train['target'] = y_train

    df_test = pd.DataFrame(X_test, columns=data['feature_names'])
    df_test['target'] = y_test

    df_train.to_csv(join(path, 'train_' + filename), index=False)
    df_test.to_csv(join(path, 'test_' + filename), index=False)

    # Explicit UTF-8: the description may contain non-ASCII characters, which
    # cannot be encoded under a non-UTF-8 platform default encoding.
    with open(join(path, 'Readme.md'), 'w', encoding='utf-8') as f:
        f.write(data['DESCR'])

    print(path)


sklearn/breast_cancer - already exist
sklearn/breast_cancer
sklearn/boston - already exist
sklearn/boston
sklearn/diabetes - already exist
sklearn/diabetes
sklearn/iris - already exist
sklearn/iris
sklearn/wine - already exist
sklearn/wine


## OpenML datasets

In [33]:
# Destination directory for all datasets fetched from OpenML.
openml_path = os.path.join(base_path, 'openml')

### BinaryClassification

In [36]:
# OpenML dataset names exported under the binary-classification directory.
bin_class_dataset_names = [
    "credit-g",
    "blood-transfusion-service-center",
    "monks-problems-2",
    "tic-tac-toe",
    "monks-problems-1",
    "steel-plates-fault",
    "kr-vs-kp",
    "qsar-biodeg",
    "wdbc",
    "phoneme",
    "diabetes",
    "ozone-level-8hr",
    "hill-valley",
    "kc1",
    "kc2",
    "eeg-eye-state",
    "climate-model-simulation-crashes",
    "spambase",
    "ilpd",
    "banknote-authentication",
    "electricity",
    "madelon",
    "gina_agnostic",
    "bank-marketing",
    "Click_prediction_small",
    "PhishingWebsites",
    "Bioresponse",
    "Amazon_employee_access",
    "SpeedDating",
    "credit-approval",
    "irish",
    "churn",
]

In [37]:
# Output directory for the OpenML binary-classification datasets.
bin_class_path = os.path.join(openml_path, 'binary_classification')

In [38]:
# Fetch every binary-classification dataset from OpenML by name, split it into
# train/test parts and store each part as a CSV, next to a Readme.md holding the
# dataset description.
for data_name in bin_class_dataset_names:
    data = datasets.fetch_openml(data_name)

    path = join(bin_class_path, data_name)
    filename = data_name + '.csv'

    check_mkdir(path)

    # A fixed random_state keeps the split identical between runs.
    X_train, X_test, y_train, y_test = train_test_split(data['data'], data['target'], random_state=42)

    # The target is appended as the last column of each exported table.
    df_train = pd.DataFrame(X_train, columns=data['feature_names'])
    df_train['target'] = y_train

    df_test = pd.DataFrame(X_test, columns=data['feature_names'])
    df_test['target'] = y_test

    df_train.to_csv(join(path, 'train_' + filename), index=False)
    df_test.to_csv(join(path, 'test_' + filename), index=False)

    # Explicit UTF-8: the description may contain non-ASCII characters, which
    # cannot be encoded under a non-UTF-8 platform default encoding.
    with open(join(path, 'Readme.md'), 'w', encoding='utf-8') as f:
        f.write(data['DESCR'])

    print(path)

  warn(
  warn(


openml/binary_classification/credit-g
openml/binary_classification/blood-transfusion-service-center
openml/binary_classification/monks-problems-2
openml/binary_classification/tic-tac-toe
openml/binary_classification/monks-problems-1
openml/binary_classification/steel-plates-fault
openml/binary_classification/kr-vs-kp
openml/binary_classification/qsar-biodeg
openml/binary_classification/wdbc


  warn(
  warn(


openml/binary_classification/phoneme
openml/binary_classification/diabetes
openml/binary_classification/ozone-level-8hr


  warn(


openml/binary_classification/hill-valley
openml/binary_classification/kc1
openml/binary_classification/kc2
openml/binary_classification/eeg-eye-state
openml/binary_classification/climate-model-simulation-crashes


  warn(


openml/binary_classification/spambase
openml/binary_classification/ilpd
openml/binary_classification/banknote-authentication


  warn(
  warn(


openml/binary_classification/electricity
openml/binary_classification/madelon
openml/binary_classification/gina_agnostic


  warn(


openml/binary_classification/bank-marketing


  warn(


openml/binary_classification/Click_prediction_small
openml/binary_classification/PhishingWebsites
openml/binary_classification/Bioresponse


  warn(


openml/binary_classification/Amazon_employee_access


  warn(


openml/binary_classification/SpeedDating
openml/binary_classification/credit-approval
openml/binary_classification/irish
openml/binary_classification/churn


### Regression

In [44]:
# OpenML dataset names exported under the regression directory.
regression_dataset_names = [
    "cholesterol",
    "cloud",
    "analcatdata_negotiation",
    "meta",
    "bodyfat",
    "CPMP-2015-regression",
    "kin8nm",
    "plasma_retinol",
    "auto_price",
    "topo_2_1",
    "mv",
    "puma8NH",
]

In [45]:
# Output directory for the OpenML regression datasets.
# NOTE(review): this rebinds bin_class_path, which earlier held the
# binary-classification directory; the regression loop below reads the
# directory under this name, so the name no longer matches its content.
bin_class_path = os.path.join(openml_path, 'regression')

In [46]:
# Fetch every regression dataset from OpenML by name, split it into train/test
# parts and store each part as a CSV, next to a Readme.md holding the dataset
# description.
for data_name in regression_dataset_names:
    data = datasets.fetch_openml(data_name)

    # bin_class_path was reassigned to the regression directory before this loop.
    path = join(bin_class_path, data_name)
    filename = data_name + '.csv'

    check_mkdir(path)

    # A fixed random_state keeps the split identical between runs.
    X_train, X_test, y_train, y_test = train_test_split(data['data'], data['target'], random_state=42)

    # The target is appended as the last column of each exported table.
    df_train = pd.DataFrame(X_train, columns=data['feature_names'])
    df_train['target'] = y_train

    df_test = pd.DataFrame(X_test, columns=data['feature_names'])
    df_test['target'] = y_test

    df_train.to_csv(join(path, 'train_' + filename), index=False)
    df_test.to_csv(join(path, 'test_' + filename), index=False)

    # Explicit UTF-8: the description may contain non-ASCII characters, which
    # cannot be encoded under a non-UTF-8 platform default encoding.
    with open(join(path, 'Readme.md'), 'w', encoding='utf-8') as f:
        f.write(data['DESCR'])

    print(path)

  warn(


openml/regression/cholesterol


  warn(


openml/regression/cloud


  warn(


openml/regression/analcatdata_negotiation


  warn(


openml/regression/meta


  warn(


openml/regression/bodyfat
openml/regression/CPMP-2015-regression


  warn(


openml/regression/kin8nm


  warn(


openml/regression/plasma_retinol


  warn(


openml/regression/auto_price
openml/regression/topo_2_1


  warn(


openml/regression/mv


  warn(


openml/regression/puma8NH


### MulticlassClassification

In [47]:
# OpenML dataset names exported under the multiclass directory.
multiclass_dataset_names = [
    "pbc",
    "liver-disorders",
    "cleveland",
    "analcatdata_gsssexsurvey",
    "chscase_foot",
    "cpu_small",
    "pol",
]

In [48]:
# Output directory for the OpenML multiclass datasets.
# NOTE(review): this rebinds bin_class_path once more; the multiclass loop
# below reads the directory under this name, so the name no longer matches
# its content.
bin_class_path = os.path.join(openml_path, 'multiclass')

In [49]:
# Fetch every multiclass dataset from OpenML by name, split it into train/test
# parts and store each part as a CSV, next to a Readme.md holding the dataset
# description.
for data_name in multiclass_dataset_names:
    data = datasets.fetch_openml(data_name)

    # bin_class_path was reassigned to the multiclass directory before this loop.
    path = join(bin_class_path, data_name)
    filename = data_name + '.csv'

    check_mkdir(path)

    # A fixed random_state keeps the split identical between runs.
    X_train, X_test, y_train, y_test = train_test_split(data['data'], data['target'], random_state=42)

    # The target is appended as the last column of each exported table.
    df_train = pd.DataFrame(X_train, columns=data['feature_names'])
    df_train['target'] = y_train

    df_test = pd.DataFrame(X_test, columns=data['feature_names'])
    df_test['target'] = y_test

    df_train.to_csv(join(path, 'train_' + filename), index=False)
    df_test.to_csv(join(path, 'test_' + filename), index=False)

    # Explicit UTF-8: the description may contain non-ASCII characters, which
    # cannot be encoded under a non-UTF-8 platform default encoding.
    with open(join(path, 'Readme.md'), 'w', encoding='utf-8') as f:
        f.write(data['DESCR'])

    print(path)

  warn(


openml/multiclass/pbc
openml/multiclass/liver-disorders


  warn(


openml/multiclass/cleveland


  warn(


openml/multiclass/analcatdata_gsssexsurvey
openml/multiclass/chscase_foot


  warn(


openml/multiclass/cpu_small


  warn(


openml/multiclass/pol


## Kaggle

Для скачивания нужно использовать токен Kaggle API (файл `kaggle.json`), получаемый в аккаунте.

In [51]:
# Install the Kaggle command-line client into the running kernel's environment.
# The explicit %pip magic does not depend on IPython's automagic setting.
%pip install -q kaggle

Note: you may need to restart the kernel to use updated packages.


In [52]:
# Create the ~/.kaggle directory; -p makes this a no-op when it already exists.
!mkdir -p ~/.kaggle

In [53]:
# Copy kaggle.json from the current working directory into ~/.kaggle/.
# NOTE(review): presumably this file is the Kaggle API token - confirm.
!cp kaggle.json ~/.kaggle/

In [57]:
# Restrict kaggle.json to read/write for its owner only.
!chmod 600 ~/.kaggle/kaggle.json

In [59]:
# List datasets through the Kaggle command-line client.
# NOTE(review): presumably also serves as a check that the client and credentials work - confirm.
!kaggle datasets list

ref                                                             title                                       size  lastUpdated          downloadCount  voteCount  usabilityRating  
--------------------------------------------------------------  -----------------------------------------  -----  -------------------  -------------  ---------  ---------------  
akshaydattatraykhare/diabetes-dataset                           Diabetes Dataset                             9KB  2022-10-06 08:55:25          13228        412  1.0              
whenamancodes/covid-19-coronavirus-pandemic-dataset             COVID -19 Coronavirus Pandemic Dataset      11KB  2022-09-30 04:05:11          10445        325  1.0              
thedevastator/240000-household-electricity-consumption-records  Household Electricity Consumption            3MB  2022-10-24 01:22:40            886         28  1.0              
akshaydattatraykhare/data-for-admission-in-the-university       Data for Admission in the University

In [None]:
# TODO: the command is incomplete - no dataset reference follows -d.
# Supply one in the form shown in the "ref" column of `kaggle datasets list` before running.
!kaggle datasets download -d 

### BinaryClassification