# Get Datasets

In [1]:
import pandas as pd
import numpy as np
from ucimlrepo import fetch_ucirepo
from openml import datasets
import os

# Resume the processing log from disk when a previous run left one behind;
# otherwise start an empty log with the expected columns.
preprocessing_log = (
    pd.read_csv('dataset_processing_log.csv')
    if os.path.exists('dataset_processing_log.csv')
    else pd.DataFrame(
        columns=[
            'did',
            'Dataset Name',
            'Class Column',
            'No. of Rows',
            'No. of Columns',
            'No. of Extracted Meta-Features',
            'One-Hot Encoded?',
            'Label Encoded?',
        ]
    )
)

# Dataset index tables, read from CSV files in the working directory.
uci_datasets_index = pd.read_csv('uci_datasets_index.csv')
openml_datasets_index = pd.read_csv('openml_datasets_index.csv')

# Working aliases; later cells rebind these names to filtered views.
openml_index, uci_index = openml_datasets_index, uci_datasets_index

In [2]:
from math import ceil
# Start again from the unfiltered index tables so that re-running this cell
# always yields the same result.
openml_index = openml_datasets_index
uci_index = uci_datasets_index

# Keep only two-class datasets that contain no missing values.
openml_index = openml_index[
    (openml_index['NumberOfClasses'] == 2)
    & (openml_index['NumberOfMissingValues'] == 0)
    & (openml_index['NumberOfInstancesWithMissingValues'] == 0)
]

sample_size = 100  # each class needs MORE than this many rows in the test split
test_size = 0.5    # fraction of every dataset counted as the test split

# Rows of each class that fall into the test split, rounded up.
# The class sizes are used directly: dividing by NumberOfInstances and
# multiplying back introduces floating-point error that can push an exact
# boundary value (e.g. 200 * 0.5 == 100) over the strict threshold below.
# np.ceil also yields NaN for a missing class size (the comparison is then
# simply False) where math.ceil would raise ValueError.
majority_in_test = np.ceil(openml_index['MajorityClassSize'] * test_size)
minority_in_test = np.ceil(openml_index['MinorityClassSize'] * test_size)

has_enough_rows = (majority_in_test > sample_size) & (minority_in_test > sample_size)
passing_dids = openml_index.loc[has_enough_rows, 'did'].tolist()

openml_index = openml_index[openml_index['did'].isin(passing_dids)]

openml_index

Unnamed: 0,did,name,version,uploader,status,format,MajorityClassSize,MaxNominalAttDistinctValues,MinorityClassSize,NumberOfClasses,NumberOfFeatures,NumberOfInstances,NumberOfInstancesWithMissingValues,NumberOfMissingValues,NumberOfNumericFeatures,NumberOfSymbolicFeatures
1,3,kr-vs-kp,1,1,active,ARFF,1669.0,3.0,1527.0,2.0,37.0,3196.0,0.0,0.0,0.0,37.0
26,31,credit-g,1,1,active,ARFF,700.0,10.0,300.0,2.0,21.0,1000.0,0.0,0.0,7.0,14.0
31,37,diabetes,1,1,active,ARFF,500.0,2.0,268.0,2.0,9.0,768.0,0.0,0.0,8.0,1.0
38,44,spambase,1,1,active,ARFF,2788.0,2.0,1813.0,2.0,58.0,4601.0,0.0,0.0,57.0,1.0
42,50,tic-tac-toe,1,1,active,ARFF,626.0,3.0,332.0,2.0,10.0,958.0,0.0,0.0,0.0,10.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5860,46553,Loan_Status,1,45956,active,arff,35000.0,,10000.0,2.0,14.0,45000.0,0.0,0.0,8.0,5.0
5861,46554,Loan_Status,2,45956,active,arff,35000.0,,10000.0,2.0,14.0,45000.0,0.0,0.0,8.0,6.0
5869,46562,German-Credit-Data-Creditability-Preprocessed-...,1,45575,active,arff,700.0,,300.0,2.0,34.0,1000.0,0.0,0.0,33.0,1.0
5870,46563,Loan_Approval_Status_Classification,2,45956,active,arff,35000.0,,10000.0,2.0,14.0,45000.0,0.0,0.0,13.0,1.0


# Download Datasets

In [3]:
import openml
import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

# Build a requests session that retries a failed call at most once, with a
# short backoff, and only for the listed HTTP status codes.
session = requests.Session()

retry_strategy = Retry(
    status_forcelist=[429, 500, 502, 503, 504],  # statuses that trigger the retry
    backoff_factor=0.1,                          # backoff factor between attempts
    total=1,                                     # at most one retry
)
adapter = HTTPAdapter(max_retries=retry_strategy)

# The same adapter serves both plain and TLS connections.
session.mount("http://", adapter)
session.mount("https://", adapter)

# NOTE(review): confirm that the installed openml version actually reads
# `openml.config.requests_session`; if it does not, this assignment only sets
# an unused module attribute and the retry policy above is never applied.
openml.config.requests_session = session

In [4]:
import pickle

# Only the first few datasets of the filtered index are downloaded here.
max_downloads = 5
untreated_dir = "./untreated_datasets"

# Without this, a missing output folder would make every write below fail.
os.makedirs(untreated_dir, exist_ok=True)

did_list = openml_index['did'].tolist()
for did in did_list[:max_downloads]:
    try:
        dataset = openml.datasets.get_dataset(int(did), download_data=True)
        dataset_name = dataset.name
        X, y, categorical_indicator, attribute_names = dataset.get_data(
            dataset_format="dataframe", target=dataset.default_target_attribute
        )
        if y is None:
            raise ValueError("dataset has no default target attribute")

        # Store features and target together in a single CSV.
        X[y.name] = y
        X.to_csv(f"{untreated_dir}/{dataset_name}__{did}.csv", index=False)

        # The did is part of the file name, matching the CSV above, so that
        # datasets sharing a name do not overwrite each other's metadata.
        with open(f"{untreated_dir}/{dataset_name}__{did}_meta.pkl", 'wb') as f:
            pickle.dump(
                {
                    'did': did,
                    'dataset_name': dataset_name,
                    'class_name': y.name,
                    'categorical_indicator': categorical_indicator,
                    'attribute_names': attribute_names,
                },
                f,
            )
    except Exception as exc:
        # Best effort: report the failure and move on to the next dataset.
        # KeyboardInterrupt is deliberately not caught, so the cell can be stopped.
        print(f"Skipping did {did}: {type(exc).__name__}: {exc}")
        continue

# Run

In [None]:
from run_tests import run_tests
from utils import run_with_timeout

did_list = openml_index['did'].tolist()

checkpoint_path = 'current_did.txt'
treated_dir = './treated_datasets'
os.makedirs(treated_dir, exist_ok=True)

# Resume from the did stored in the checkpoint file. A missing, malformed or
# stale checkpoint (did no longer in the filtered index) restarts from the top
# instead of raising.
_start = 0
if os.path.exists(checkpoint_path):
    with open(checkpoint_path, 'r') as f:
        checkpoint_text = f.read().strip()
    try:
        current_did = int(float(checkpoint_text))
        _start = did_list.index(current_did)
    except (ValueError, OverflowError):
        print(f"Ignoring unusable checkpoint {checkpoint_text!r}; starting from the first dataset")

# dids that already have a row in the log. The checkpoint is written BEFORE a
# dataset is processed, so on resume the checkpointed did may already be
# complete; skipping it avoids a duplicate log row.
logged_dids = set(preprocessing_log['did'].tolist())

for i in range(_start, len(did_list)):
    did = did_list[i]
    print(f"ITER. {i} - DID. {did}")

    if did in logged_dids:
        print("  already in the processing log - skipping")
        continue

    try:
        with open(checkpoint_path, 'w') as f:
            f.write(str(did))

        dataset = datasets.get_dataset(int(did))
        dataset_name = dataset.name
        X, y, categorical_indicator, attribute_names = dataset.get_data(
            dataset_format="dataframe", target=dataset.default_target_attribute
        )
        if y is None:
            raise ValueError("dataset has no default target attribute")
    except Exception as exc:
        # Best effort: one bad dataset must not stop the run. KeyboardInterrupt
        # is deliberately not caught, so the cell can still be interrupted.
        print(f"  skipped: {type(exc).__name__}: {exc}")
        continue

    # A None result is skipped without logging. Presumably it signals that
    # run_tests hit the 600 s limit - verify against utils.run_with_timeout.
    result = run_with_timeout(600, run_tests, X, y, categorical_indicator)
    if result is None:
        continue

    X_processed, y_processed, dataset_shape, num_meta_features_extracted, one_hot_encoded, label_encoded = result

    # Features and target side by side, with columns numbered from 1.
    data_processed = np.column_stack((X_processed, y_processed))
    processed_df = pd.DataFrame(columns=[_i for _i in range(1, data_processed.shape[1] + 1)], data=data_processed)

    preprocessing_log.loc[len(preprocessing_log.index)] = [did, dataset_name, y.name, dataset_shape[0], dataset_shape[1], num_meta_features_extracted, one_hot_encoded, label_encoded]
    logged_dids.add(did)

    # NOTE(review): this file is keyed by dataset name only, so two dids that
    # share a name overwrite each other here - confirm whether that is intended.
    processed_df.to_csv(f"{treated_dir}/{dataset_name}.csv", index=False)

    # Persist the log after every dataset so an interrupted run loses nothing.
    preprocessing_log.to_csv("dataset_processing_log.csv", index=False)

preprocessing_log