# Prepare dataset for OpenDataGen

You can prepare new dataset for OpenDataGen in following 2 steps:

1. Generate config.json which contain useful information for dataset
2. Split the csv into train.csv and test.csv and put them in the folder

## Config.json Generation

In [1]:
from sklearn.model_selection import train_test_split
import pandas as pd
import json
import os

In [2]:
# config
data_file_list = [
    'company_bankruptcy_prediction.csv',
    'dna_40670.csv',
    'parkinsons_updrs.csv',
    'credit_card_customers.csv',
    'PieChart3_1453.csv',
    'darwin.csv',
    'jannis_41168.csv'
]

data_name_list = [
    'company_bankruptcy_prediction',
    'dna',
    'parkinsons_updrs',
    'credit_card_customers',
    'piechart3',
    'darwin',
    'jannis'
]

data_file_name_dic = {
    'company_bankruptcy_prediction.csv': 'company_bankruptcy_prediction',
    'dna_40670.csv': 'dna',
    'parkinsons_updrs.csv': 'parkinsons_updrs',
    'credit_card_customers.csv': 'credit_card_customers',
    'PieChart3_1453.csv': 'piechart3',
    'darwin.csv': 'darwin',
    'jannis_41168.csv': 'jannis'
}

dataset_path = './datasets'

In [3]:
def get_dataset_config(data_file, data_file_name_dic, dataset_path):
    data = pd.read_csv(data_file)
    # train test split
    data_train, data_test, _, _ = train_test_split(data, data, test_size=0.33, random_state=42)

    # get name
    name = data_file_name_dic[data_file]
#     print('name:', name)
    # get 4 num: column_num train_row_num test_row_num total_row_num
    column_num = len(list(data.columns))
    train_row_num = len(data_train)
    test_row_num = len(data_test)
    total_row_num = len(data)
#     print('column_num:', column_num)
#     print('train_row_num:', train_row_num)
#     print('test_row_num:', test_row_num)
#     print('total_row_num:', total_row_num)

    os.mkdir("./{}".format(name))

    train_file = "{}/{}/train.csv".format(dataset_path, name)
    test_file = "{}/{}/test.csv".format(dataset_path, name)
#     print('train_file:', train_file)
#     print('test_file:', test_file)

    train_file_save = "{}/train.csv".format(name)
    test_file_save = "{}/test.csv".format(name)
#     print('train_file_save:', train_file_save)
#     print('test_file_save:', test_file_save)

    data_train.to_csv(train_file_save, index=0, header=0)
    data_test.to_csv(test_file_save, index=0, header=0)

    tmp_config = {
        "name": name,
        "column_num": column_num,
        "train_row_num": train_row_num,
        "test_row_num": test_row_num,
        "total_row_num": total_row_num,
        "train_file": train_file,
        "test_file": test_file
    }

    # get columns
    tmp_config_columns = []
    for column in data.columns:
        column_name = column
        column_type = str(data[column].dtype)
        column_value = []
        if column_type == 'object':
            column_value = list(data[column].unique())
            column_type = 'string'
        elif 'int' in column_type:
            column_type = 'int'
        elif 'float' in column_type:
            column_type = 'float'
        else:
            raise TypeError('column_type is not in [string, int, float]')
#         print('column_name:', column_name)
#         print('column_type:', column_type)
#         print('column_value:', column_value)
        tmp_config_columns.append({
            'column_name': column_name,
            'column_type': column_type,
            'column_value': column_value
        })

    tmp_config['columns'] = tmp_config_columns
    return tmp_config

def get_datasets_config(data_file_list, data_file_name_dic, dataset_path):
    datasets = []
    for data_file in data_file_list:
        tmp_config = get_dataset_config(data_file, data_file_name_dic, dataset_path)
        datasets.append(tmp_config)
    return datasets

In [9]:
datasets = get_datasets_config(data_file_list, data_file_name_dic, dataset_path)

In [16]:
config = json.load(open("//home/ruc/xiaotong/OpenDataGen/log/20231201/open-data-gen/src/config_ori.json"))
config1202 = config
config1202['datasets'] = datasets

config_file1202 = '/home/ruc/xiaotong/OpenDataGen/log/20231201/open-data-gen/src/config.json'
js_str = json.dumps(config1202, indent=4)
js_file = open(config_file1202, 'w')
js_file.write(js_str)

162089

## Move csv to the path

In [26]:
source_path = "."
target_path = "/home/ruc/xiaotong/OpenDataGen/log/20231201/open-data-gen/datasets"

for name in data_name_list:
    print("mv {}/{} {}/{}".format(source_path, name, target_path, name))

mv ./company_bankruptcy_prediction /home/ruc/xiaotong/OpenDataGen/log/20231201/open-data-gen/datasets/company_bankruptcy_prediction
mv ./dna /home/ruc/xiaotong/OpenDataGen/log/20231201/open-data-gen/datasets/dna
mv ./parkinsons_updrs /home/ruc/xiaotong/OpenDataGen/log/20231201/open-data-gen/datasets/parkinsons_updrs
mv ./credit_card_customers /home/ruc/xiaotong/OpenDataGen/log/20231201/open-data-gen/datasets/credit_card_customers
mv ./piechart3 /home/ruc/xiaotong/OpenDataGen/log/20231201/open-data-gen/datasets/piechart3
mv ./darwin /home/ruc/xiaotong/OpenDataGen/log/20231201/open-data-gen/datasets/darwin
mv ./jannis /home/ruc/xiaotong/OpenDataGen/log/20231201/open-data-gen/datasets/jannis


In [27]:
# run the code above in cmd
# mv ./company_bankruptcy_prediction /home/ruc/xiaotong/OpenDataGen/log/20231201/open-data-gen/datasets/company_bankruptcy_prediction
# mv ./dna /home/ruc/xiaotong/OpenDataGen/log/20231201/open-data-gen/datasets/dna
# mv ./parkinsons_updrs /home/ruc/xiaotong/OpenDataGen/log/20231201/open-data-gen/datasets/parkinsons_updrs
# mv ./credit_card_customers /home/ruc/xiaotong/OpenDataGen/log/20231201/open-data-gen/datasets/credit_card_customers
# mv ./piechart3 /home/ruc/xiaotong/OpenDataGen/log/20231201/open-data-gen/datasets/piechart3
# mv ./darwin /home/ruc/xiaotong/OpenDataGen/log/20231201/open-data-gen/datasets/darwin
# mv ./jannis /home/ruc/xiaotong/OpenDataGen/log/20231201/open-data-gen/datasets/jannis

In [35]:
pwd

'/home/ruc/xiaotong/OpenDataGen/log/20231122/open-data-gen/candidate_data_tong/20231202'