# Get Datasets

In [None]:
import pandas as pd
import numpy as np
from ucimlrepo import fetch_ucirepo
from openml import datasets
import os

# Resume the preprocessing log from a previous run when its CSV is present;
# otherwise start from an empty frame that already has the expected columns.
preprocessing_log = (
    pd.read_csv('dataset_processing_log.csv')
    if os.path.exists('dataset_processing_log.csv')
    else pd.DataFrame(
        columns=[
            'did',
            'Dataset Name',
            'Class Column',
            'No. of Rows',
            'No. of Columns',
            'No. of Extracted Meta-Features',
            'One-Hot Encoded?',
            'Label Encoded?',
        ]
    )
)

# Raw dataset catalogues, as stored on disk.
openml_datasets_index = pd.read_csv('openml_datasets_index.csv')
uci_datasets_index = pd.read_csv('uci_datasets_index.csv')

# Working aliases; later cells rebind these to filtered views.
openml_index, uci_index = openml_datasets_index, uci_datasets_index

# create openml index (with filtering)

In [None]:
from math import ceil
# Start again from the unfiltered catalogues so this cell can be re-run.
openml_index = openml_datasets_index
uci_index = uci_datasets_index

# Keep only binary-classification datasets without any missing values.
openml_index = openml_index[
    (openml_index['NumberOfClasses'] == 2)
    & (openml_index['NumberOfMissingValues'] == 0)
    & (openml_index['NumberOfInstancesWithMissingValues'] == 0)
]

# A dataset passes when, after holding out `test_size` of the rows, both the
# majority and the minority class still contribute more than `sample_size`
# rows to the held-out part.
sample_size = 100
test_size = 0.5

# The per-class counts are taken straight from the class-size columns instead
# of being rebuilt as (size / n_rows) * n_rows: that round trip can land a hair
# above an exact boundary (e.g. 100.00000000000001) and make the ceiling admit
# a dataset that should fail. Rows with a NaN class size compare False below
# and are therefore excluded rather than raising.
pos_number_sample_test = np.ceil(openml_index['MajorityClassSize'] * test_size)
neg_number_sample_test = np.ceil(openml_index['MinorityClassSize'] * test_size)
has_enough_samples = (
    (pos_number_sample_test > sample_size)
    & (neg_number_sample_test > sample_size)
)

passing_dids = openml_index.loc[has_enough_samples, 'did'].tolist()

openml_index = openml_index[openml_index['did'].isin(passing_dids)]

# fetch_datasets

In [None]:
import openml
import requests
import pickle
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

def fetch_datasets(openml_index: pd.DataFrame):
    """Download every not-yet-fetched OpenML dataset listed in ``openml_index``.

    Each dataset is pickled to ``./untreated_datasets/<dataset name>.pkl`` as a
    dict with the keys ``dataset_name``, ``X``, ``y``,
    ``categorical_indicator`` and ``attribute_names``. The mapping from dataset
    id to pickle file name is kept in
    ``./untreated_datasets/openml_did_index.csv`` and rewritten after every
    successful download, so an interrupted run picks up where it stopped.

    Datasets that fail to download or to be saved are reported on stdout and
    skipped; the function itself returns ``None``.

    Args:
        openml_index: Frame with a ``did`` column holding OpenML dataset ids.
    """
    # Keep HTTP retries short: a single retry with a small back-off, and only
    # for the listed transient status codes.
    retry_strategy = Retry(
        total=1,            # Only one retry
        backoff_factor=0.1, # Delay between retries
        status_forcelist=[429, 500, 502, 503, 504],  # Retry for specific status codes
    )
    adapter = HTTPAdapter(max_retries=retry_strategy)
    session = requests.Session()
    session.mount("http://", adapter)
    session.mount("https://", adapter)
    # NOTE(review): confirm the installed openml version actually reads
    # `config.requests_session`; if it does not, this assignment has no effect.
    openml.config.requests_session = session

    # Without the directory every save below would fail for every dataset.
    os.makedirs("./untreated_datasets", exist_ok=True)

    if os.path.exists("./untreated_datasets/openml_did_index.csv"):
        openml_did_index = pd.read_csv('./untreated_datasets/openml_did_index.csv')
    else:
        openml_did_index = pd.DataFrame(columns=['did', 'file'])

    # Only fetch ids that are not recorded in the on-disk index yet.
    already_fetched = set(openml_did_index['did'].tolist())
    did_list = sorted(set(openml_index['did'].tolist()) - already_fetched)

    for did in did_list:
        try:
            dataset = openml.datasets.get_dataset(int(did), download_data=True)
            dataset_name = dataset.name
            X, y, categorical_indicator, attribute_names = dataset.get_data(
                dataset_format="dataframe", target=dataset.default_target_attribute
            )

            # NOTE(review): files are keyed by dataset name only, so two ids
            # that share a name end up in the same pickle (the later one wins).
            with open(f"./untreated_datasets/{dataset_name}.pkl", 'wb') as f:
                pickle.dump({
                    'dataset_name': dataset_name,
                    'X': X,
                    'y': y,
                    'categorical_indicator': categorical_indicator,
                    'attribute_names': attribute_names
                }, f)

            # Persist the index after each dataset so progress survives a crash.
            openml_did_index.loc[len(openml_did_index)] = [did, f'{dataset_name}.pkl']
            openml_did_index.to_csv('./untreated_datasets/openml_did_index.csv', index=False)
        except Exception as exc:
            # Best effort: one bad dataset must not stop the batch, but the
            # failure is reported instead of vanishing. KeyboardInterrupt is
            # deliberately not caught so the run can still be aborted.
            print(f"Skipping OpenML dataset {did}: {exc!r}")
            continue

# # # fetch_datasets(openml_index=openml_index)

In [4]:
# import pickle

# file = "BNG(breast-cancer,nominal,1000000).pkl"
# with open(f"./untreated_datasets/{file}", 'rb') as f:
#     data = pickle.load(f)
#     dataset_name = data['dataset_name']
#     X = data['X']
#     y = data['y']
#     categorical_indicator = data['categorical_indicator']
#     attribute_names = data['attribute_names']

In [None]:
def load_dataframe(file):
    """Read one pickled dataset from ``./untreated_datasets``.

    Args:
        file: File name of the pickle, relative to ``./untreated_datasets``.

    Returns:
        The tuple ``(dataset_name, X, y, categorical_indicator,
        attribute_names)`` taken from the pickled dict.
    """
    wanted_keys = (
        'dataset_name',
        'X',
        'y',
        'categorical_indicator',
        'attribute_names',
    )
    with open(f"./untreated_datasets/{file}", 'rb') as handle:
        payload = pickle.load(handle)
    return tuple(payload[key] for key in wanted_keys)

# treat_datasets

In [None]:
from utils import run_tests, run_with_timeout 

# Preprocess every fetched dataset in turn. The id being processed is written
# to 'current_did.txt' so that an interrupted run resumes from that dataset.
did_list = openml_index['did'].tolist()
datasets_pickles = [f for f in os.listdir('./untreated_datasets') if f.endswith('.pkl')]

# First run: seed the checkpoint with the first dataset id.
if not os.path.exists('current_did.txt'):
    with open('current_did.txt', 'w') as f:
        f.write(str(did_list[0]))

with open('current_did.txt', 'r') as f:
    current_did = int(f.read().strip())

openml_did_index = pd.read_csv('./untreated_datasets/openml_did_index.csv')

# Without the output directory every dataset would fail at the write step.
os.makedirs('./treated_datasets', exist_ok=True)

# Raises ValueError when the checkpointed id is not part of the current
# (filtered) index, e.g. after the filtering criteria were changed.
_start = did_list.index(current_did)
for i, did in enumerate(did_list[_start:], start=_start):
    print(f"ITER. {i} - DID. {did}")

    try:
        # Checkpoint before the work starts so a crash resumes at this dataset.
        with open('current_did.txt', 'w') as f:
            f.write(str(did))

        # Look up the pickle that was saved for this dataset id; an id that was
        # never fetched has no row and is reported by the handler below.
        row = openml_did_index[openml_did_index['did'] == did]
        file = row['file'].tolist()[0]
        dataset_name, X, y, categorical_indicator, attribute_names = load_dataframe(file)

        # run_tests comes from utils; its result is unpacked as a 6-tuple.
        result = run_tests(X, y, categorical_indicator)
        (
            X_processed,
            y_processed,
            dataset_shape,
            num_meta_features_extracted,
            one_hot_encoded,
            label_encoded,
        ) = result

        # Features followed by the target as the last column; the columns are
        # labelled 1..n.
        data_processed = np.column_stack((X_processed, y_processed))
        processed_df = pd.DataFrame(
            columns=list(range(1, data_processed.shape[1] + 1)),
            data=data_processed,
        )

        # Write the processed data first, so the log never lists a dataset
        # whose CSV could not be written.
        processed_df.to_csv(f"./treated_datasets/{dataset_name}.csv", index=False)

        # NOTE(review): the "+ 1" presumably accounts for the target column --
        # confirm against what run_tests reports in dataset_shape.
        preprocessing_log.loc[len(preprocessing_log.index)] = [
            did,
            dataset_name,
            y.name,
            dataset_shape[0],
            dataset_shape[1] + 1,
            num_meta_features_extracted,
            one_hot_encoded,
            label_encoded,
        ]
        preprocessing_log.to_csv("dataset_processing_log.csv", index=False)
    except Exception as exc:
        # Best effort: keep going with the next dataset, but say what went
        # wrong. KeyboardInterrupt is not caught, so the loop can be aborted.
        print(f"Skipping DID. {did}: {exc!r}")
        continue

preprocessing_log

ITER. 0 - DID. 3
ITER. 1 - DID. 31
ITER. 2 - DID. 37
ITER. 3 - DID. 44
ITER. 4 - DID. 50
ITER. 5 - DID. 72
ITER. 6 - DID. 73
ITER. 7 - DID. 77
ITER. 8 - DID. 120
ITER. 9 - DID. 121
ITER. 10 - DID. 122
ITER. 11 - DID. 124
ITER. 12 - DID. 126
ITER. 13 - DID. 128
ITER. 14 - DID. 131
ITER. 15 - DID. 132
ITER. 16 - DID. 135
ITER. 17 - DID. 137
ITER. 18 - DID. 139
ITER. 19 - DID. 140
ITER. 20 - DID. 142
ITER. 21 - DID. 143
ITER. 22 - DID. 146
ITER. 23 - DID. 151
ITER. 24 - DID. 152
ITER. 25 - DID. 153
ITER. 26 - DID. 161
ITER. 27 - DID. 162
ITER. 28 - DID. 246
ITER. 29 - DID. 251
ITER. 30 - DID. 256
ITER. 31 - DID. 257
ITER. 32 - DID. 258
ITER. 33 - DID. 260
ITER. 34 - DID. 262
ITER. 35 - DID. 264
ITER. 36 - DID. 267
ITER. 37 - DID. 269
ITER. 38 - DID. 273
ITER. 39 - DID. 274
ITER. 40 - DID. 293
ITER. 41 - DID. 310
ITER. 42 - DID. 312
ITER. 43 - DID. 333
ITER. 44 - DID. 334
ITER. 45 - DID. 335
ITER. 46 - DID. 350
ITER. 47 - DID. 351
ITER. 48 - DID. 354
ITER. 49 - DID. 357
ITER. 50 - DID. 715

Unnamed: 0,did,Dataset Name,Class Column,No. of Rows,No. of Columns,No. of Extracted Meta-Features,One-Hot Encoded?,Label Encoded?
0,3,kr-vs-kp,class,3196,38,0,True,True
1,31,credit-g,class,1000,49,0,True,True
2,37,diabetes,class,768,9,0,False,True
3,44,spambase,class,4601,58,0,False,True
4,50,tic-tac-toe,Class,958,19,0,True,True
...,...,...,...,...,...,...,...,...
661,46553,Loan_Status,loan_status,45000,23,0,True,True
662,46554,Loan_Status,loan_status,45000,23,0,True,True
663,46562,German-Credit-Data-Creditability-Preprocessed-...,creditability,1000,34,0,False,True
664,46563,Loan_Approval_Status_Classification,loan_status,45000,14,0,False,True
