In [None]:
import glob
import os
import json
import re
import pandas as pd

In [None]:
# Root of the CSV input: schemas.json and the per-dataset folders of part-* files are read from here.
SOURCE_BASE_DIR = 'Data/retail_db'
# Root of the JSON Lines output: each dataset is written to its own sub-folder under this path.
TARGET_BASE_DIR = 'Data/retail_db_json'

In [None]:
def get_column_names(schemas, ds_name, sorting_key = 'column_position'):
    """Return the column names of one dataset, ordered by a schema field.

    Args:
        schemas: Mapping of dataset name to a list of column-detail dicts.
        ds_name: Key of the dataset to look up in ``schemas``.
        sorting_key: Field of each column-detail dict used for ordering.

    Returns:
        The ``'column_name'`` value of every entry, in ascending
        ``sorting_key`` order.

    Raises:
        KeyError: If ``ds_name`` is not in ``schemas`` or an entry lacks
            ``sorting_key`` / ``'column_name'``.
    """
    # Copy first so the caller's schema list is never reordered.
    ordered_details = list(schemas[ds_name])
    ordered_details.sort(key=lambda detail: detail[sorting_key])

    names = []
    for detail in ordered_details:
        names.append(detail['column_name'])
    return names

In [None]:
def read_csv_to_df(file, schemas):
    """Load a header-less CSV file into a DataFrame named from its schema.

    The dataset name is the name of the directory that directly contains
    ``file``; its column names are looked up in ``schemas`` via
    ``get_column_names``.

    Args:
        file: Path to the CSV file, using ``/`` or ``\\`` as separator. It
            must include at least one directory component.
        schemas: Mapping of dataset name to a list of column-detail dicts.

    Returns:
        A ``pandas.DataFrame`` whose columns carry the schema's names.

    Raises:
        IndexError: If ``file`` has no parent directory component.
        KeyError: If the derived dataset name is not in ``schemas``.
    """
    # Raw string so the character class (forward slash or backslash) holds
    # no invalid escape sequence; a non-raw '\]' triggers a SyntaxWarning.
    file_path_list = re.split(r'[/\\]', file)
    ds_name = file_path_list[-2]
    columns = get_column_names(schemas, ds_name)
    # header=None: the first line of the file is data, not column names.
    df = pd.read_csv(file, names=columns, header=None)
    return df

In [None]:
def convert_to_jsonl(df, ds_name, file_name):
    """Write a DataFrame as JSON Lines under the target base directory.

    The output goes to ``TARGET_BASE_DIR/ds_name/file_name``; the dataset
    folder is created when it does not exist yet.

    Args:
        df: DataFrame to serialise, one JSON object per row.
        ds_name: Dataset name, used as the output sub-folder.
        file_name: Name of the output file inside that sub-folder.
    """
    dataset_dir = f'{TARGET_BASE_DIR}/{ds_name}'
    os.makedirs(dataset_dir, exist_ok=True)
    # orient='records' with lines=True emits one record per line.
    df.to_json(f'{dataset_dir}/{file_name}', orient='records', lines=True)

In [None]:
def file_converter(ds_name):
    """Convert every ``part-*`` file of one dataset from CSV to JSON Lines.

    Reads ``schemas.json`` from ``SOURCE_BASE_DIR``, then loads each matching
    part file with ``read_csv_to_df`` and writes it with ``convert_to_jsonl``
    under the same file name.

    Args:
        ds_name: Dataset name; also the sub-folder of ``SOURCE_BASE_DIR``
            that holds the part files.
    """
    # Context manager so the schema file handle is closed deterministically.
    with open(f'{SOURCE_BASE_DIR}/schemas.json') as schema_file:
        schemas = json.load(schema_file)
    files = glob.glob(f'{SOURCE_BASE_DIR}/{ds_name}/part-*')

    for file in files:
        df = read_csv_to_df(file, schemas)
        # Raw string: split on '/' or '\' without an invalid escape sequence,
        # so the last element is the bare file name on any platform.
        file_name = re.split(r'[/\\]', file)[-1]
        convert_to_jsonl(df, ds_name, file_name)

In [None]:
def process_files(ds_names = None):
    """Convert the given datasets, or every dataset in the schema file.

    Args:
        ds_names: Iterable of dataset names to convert. When ``None``, the
            keys of ``SOURCE_BASE_DIR/schemas.json`` are used.
    """
    if ds_names is None:
        # Context manager so the schema file handle is closed deterministically.
        with open(f'{SOURCE_BASE_DIR}/schemas.json') as schema_file:
            schemas = json.load(schema_file)
        ds_names = schemas.keys()

    for ds_name in ds_names:
        file_converter(ds_name)

In [None]:
# No dataset names are passed, so every dataset listed in schemas.json is converted.
process_files()