In [6]:
pip install xmltodict

Collecting xmltodict
  Downloading xmltodict-0.13.0-py2.py3-none-any.whl (10.0 kB)
Installing collected packages: xmltodict
Successfully installed xmltodict-0.13.0


In [7]:
import pandas as pd
import json
import xmltodict

def parse_csv_data(filepath):
    # Read CSV data into a Pandas DataFrame
    df = pd.read_csv(filepath)

    # Standardize column names and data types
    df.columns = [standardize_column_name(col) for col in df.columns]
    for col in df.columns:
        df[col].fillna('-', inplace=True)

    return df.to_dict('records')

def parse_json_data(filepath):
    # Read JSON data into a Python dictionary
    with open(filepath) as f:
        data = json.load(f)

    # Standardize data structure and values
    standardized_data = standardize_json_data(data)

    return standardized_data

def parse_xml_data(filepath):
    # Convert XML data to a Python dictionary
    with open(filepath) as f:
        xml_data = f.read()

    dict_data = xmltodict.parse(xml_data)

    # Standardize data structure and values
    standardized_data = standardize_xml_data(dict_data)

    return standardized_data

def standardize_column_name(column_name):
    # Replace spaces and special characters with underscores
    column_name = column_name.lower().replace(' ', '_')
    column_name = column_name.replace('-', '_')

    return column_name

def standardize_json_data(data):
    # Recursively standardize data structure and values
    if isinstance(data, dict):
        for key, value in data.items():
            standardized_key = standardize_column_name(key)
            if isinstance(value, list):
                value = [standardize_json_data(item) for item in value]
            elif isinstance(value, dict):
                value = standardize_json_data(value)
            data[standardized_key] = value
    elif isinstance(data, list):
        for i, item in enumerate(data):
            data[i] = standardize_json_data(item)

    return data

def standardize_xml_data(data):
    # Recursively standardize data structure and values
    if isinstance(data, dict):
        for key, value in data.items():
            if isinstance(value, list):
                data[key] = [standardize_xml_data(item) for item in value]
            elif isinstance(value, dict):
                data[key] = standardize_xml_data(value)

    return data

def parse_and_transform_data(filepath, format):
    if format == 'csv':
        parsed_data = parse_csv_data('/content/healthcare_dataset.csv')
    # elif format == 'json':
    #     parsed_data = parse_json_data(filepath)
    # elif format == 'xml':
    #     parsed_data = parse_xml_data(filepath)
    else:
        raise ValueError(f"Unsupported data format: {format}")

    return parsed_data


In [15]:
parsed_data = parse_and_transform_data('/content/healthcare_dataset.csv', format='csv')

In [16]:
# Process the standardized data here
print(parsed_data)


IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [17]:

def store_parsed_data(data, filepath):
  with open(filepath, 'w') as json_file:
    json.dump(data, json_file, indent=4)


# Example usage
parsed_data = parse_and_transform_data('/content/healthcare_dataset.csv', 'csv')
store_parsed_data(parsed_data, '/content/untitled.json')
