In [1]:
import io
import json
import os
import shutil
import warnings

import pandas as pd
# Suppress every Python warning for the rest of the kernel session. Note that
# this also hides deprecation notices emitted by the libraries used below.
warnings.filterwarnings('ignore')

In [2]:
def _read_dataset(file_path):
    """Load one dataset file into a DataFrame, or return None if unsupported.

    The reader is selected from the file extension (compared
    case-insensitively): ``.jsonl``, ``.json``, ``.csv``, ``.xlsx`` and
    ``.xls`` are recognised; anything else yields ``None``.
    """
    extension = os.path.splitext(file_path)[1].lower()
    if extension == ".jsonl":
        return pd.read_json(file_path, lines=True, encoding="utf-8")
    if extension == ".json":
        return pd.read_json(file_path, encoding="utf-8")
    if extension == ".csv":
        return pd.read_csv(file_path, encoding="latin-1")
    if extension == ".xlsx":
        return pd.read_excel(file_path, engine="openpyxl")
    if extension == ".xls":
        return pd.read_excel(file_path)
    return None


def make_json(path_directory_datasets, path_dataset_json):
    """Collect every dataset under a directory tree into one JSON file.

    Each sub-directory of ``path_directory_datasets`` is treated as a group and
    each supported file inside it as a dataset. The output file is a JSON array
    with one object per dataset, carrying three keys:

    * ``group_name``   - name of the sub-directory the file was found in,
    * ``dataset_name`` - file name without its extension,
    * ``dataset``      - the table serialised by ``DataFrame.to_json(orient="index")``
      (stored as a JSON string, not as a nested object).

    Parameters
    ----------
    path_directory_datasets : str
        Directory containing one sub-directory per group.
    path_dataset_json : str
        Path of the JSON file to write. Missing parent directories are
        created; an existing file is overwritten.

    Notes
    -----
    Entries that are not directories (at the group level), not regular files
    (at the dataset level) or that have an unsupported extension are skipped.
    Directory listings are sorted so the output order is reproducible.
    """
    output_directory = os.path.dirname(path_dataset_json)
    # dirname is "" when the target lives in the current directory;
    # os.makedirs("") would raise, so only create a directory when there is one.
    if output_directory:
        os.makedirs(output_directory, exist_ok=True)

    records = []
    for dir_name in sorted(os.listdir(path_directory_datasets)):
        directory_group = os.path.join(path_directory_datasets, dir_name)
        if not os.path.isdir(directory_group):
            # A stray file next to the group directories cannot be listed.
            continue

        for file_name in sorted(os.listdir(directory_group)):
            file = os.path.join(directory_group, file_name)
            if not os.path.isfile(file):
                continue

            df = _read_dataset(file)
            if df is None:
                # Unsupported extension: skip it instead of reusing the frame
                # loaded for the previous file.
                continue

            records.append({
                "group_name": dir_name,
                "dataset_name": os.path.splitext(file_name)[0],
                "dataset": df.to_json(orient="index"),
            })

    # "w" (not "a") so that a re-run never appends a second array to an
    # existing file; json.dump also emits a valid "[]" when nothing was found.
    with open(path_dataset_json, "w", encoding="utf-8") as json_file:
        json.dump(records, json_file, indent=4)

In [3]:
def make_clusters(path_dataset_json, path_cluster):
    """Expand the combined dataset JSON into a tree of CSV files.

    The JSON file must hold an array of objects with the keys ``group_name``,
    ``dataset_name`` and ``dataset``, where ``dataset`` is a JSON string in
    pandas' ``orient="index"`` layout. For every entry the table is written to
    ``<path_cluster>/<dataset_name>/<group_name>.csv`` - i.e. one folder per
    dataset name containing one CSV per group.

    Parameters
    ----------
    path_dataset_json : str
        Path of the JSON file to read.
    path_cluster : str
        Root directory for the CSV output; created if it does not exist.
    """
    # Create the root even when the JSON array is empty.
    os.makedirs(path_cluster, exist_ok=True)

    with open(path_dataset_json, 'r', encoding='utf-8') as f:
        entries = json.load(f)

    for entry in entries:
        group_name = entry["group_name"]
        dataset_name = entry["dataset_name"]

        directory = os.path.join(path_cluster, dataset_name)
        os.makedirs(directory, exist_ok=True)

        # Wrap the JSON text in a file-like object: passing a literal JSON
        # string to pd.read_json is deprecated in recent pandas releases.
        df = pd.read_json(io.StringIO(entry["dataset"]), orient="index")
        df.to_csv(os.path.join(directory, group_name) + ".csv", index=False)

In [4]:
def main():
    """Rebuild the combined dataset JSON and the CSV cluster tree from scratch.

    Source files are read from ``./Datasets``; the combined JSON is written to
    ``./DatasetJSON/datasets.json`` and the CSV output to ``./ClustersCSV``.
    Both output directories are deleted first if they already exist.
    """
    # Build paths with os.path.join rather than hard-coded "\\" separators so
    # the notebook also works on systems where "\" is not a path separator.
    path_directory_datasets = os.path.join(".", "Datasets")
    directory_dataset_json = os.path.join(".", "DatasetJSON")
    path_dataset_json = os.path.join(directory_dataset_json, "datasets.json")
    path_cluster = os.path.join(".", "ClustersCSV")

    # Remove the results of any previous run before regenerating them.
    for output_directory in (directory_dataset_json, path_cluster):
        if os.path.exists(output_directory):
            shutil.rmtree(output_directory)

    make_json(path_directory_datasets, path_dataset_json)
    make_clusters(path_dataset_json, path_cluster)

In [5]:
# Run the pipeline. main() deletes the existing output directories before
# regenerating them, so re-running this cell discards earlier results.
main()