# In [None]:
import json
import os
import pandas as pd
import numpy as np
from tqdm import tqdm

# --- User configuration ------------------------------------------------------
# Replace the placeholder paths below before running the script.
INPUT_JSON = "path/to/input.json"  # Set input file
CLINICAL_CSV = "path/to/clinical_data.csv"  # Set clinical csv file from vitaldb
OUTPUT_JSON = "path/to/output.json"  # Set output file

# Substitute waveform (100 identical samples) written into a segment whose
# 'data' entry is missing or fails validation.
DEFAULT_PPG = [38.21] * 100
# Label used when no usable 'preop_dm' value can be found for a case.
DEFAULT_PREOP_DM = 0.0

def load_clinical_data(csv_path):
    """Load the clinical table and index it by case identifier.

    Args:
        csv_path: Path of the clinical CSV. To be usable the file must
            contain a ``case_id`` column, or a ``caseid`` column which is
            renamed to ``case_id``.

    Returns:
        A DataFrame indexed by ``case_id`` with the identifiers converted to
        strings. An empty DataFrame is returned (after printing a message)
        when the file does not exist or has no case identifier column.
    """
    if not os.path.exists(csv_path):
        print(f"Clinical CSV not found: {csv_path}. Using default preop_dm = 0.0")
        return pd.DataFrame()

    df = pd.read_csv(csv_path)

    # Only rename when 'case_id' is absent, so a file carrying both spellings
    # does not end up with two columns named 'case_id'.
    if 'case_id' not in df.columns and 'caseid' in df.columns:
        df = df.rename(columns={'caseid': 'case_id'})

    # Without an identifier column no case can be looked up; degrade the same
    # way as a missing file instead of failing with a KeyError.
    if 'case_id' not in df.columns:
        print(f"Clinical CSV has no 'caseid'/'case_id' column: {csv_path}. "
              "Using default preop_dm = 0.0")
        return pd.DataFrame()

    # String identifiers, so lookups by string case id match the index.
    df['case_id'] = df['case_id'].astype(str)
    return df.set_index('case_id')

def get_preop_dm(case_id, clinical_df):
    """Return the binary ``preop_dm`` label for one case.

    Args:
        case_id: Label to look up in the index of ``clinical_df``.
        clinical_df: Clinical table indexed by case identifier; may be empty.

    Returns:
        ``1.0`` when the stored ``preop_dm`` value converts to a float
        greater than zero, ``0.0`` when it converts to anything else, and
        ``DEFAULT_PREOP_DM`` when the table is empty, the case is not in the
        index, or the value cannot be converted to a float.
    """
    case_known = (not clinical_df.empty) and case_id in clinical_df.index
    if not case_known:
        return DEFAULT_PREOP_DM

    raw_value = clinical_df.loc[case_id].get('preop_dm', DEFAULT_PREOP_DM)
    try:
        numeric = float(raw_value)
    except (ValueError, TypeError):
        # Non-numeric entries fall back to the default label.
        return DEFAULT_PREOP_DM

    if numeric > 0:
        return 1.0
    return 0.0

def _is_valid_ppg(ppg):
    """Return True when *ppg* is a list of exactly 100 finite numbers."""
    if not isinstance(ppg, list) or len(ppg) != 100:
        return False
    return all(
        isinstance(x, (int, float)) and not (np.isnan(x) or np.isinf(x))
        for x in ppg
    )


def repair_json_file(input_path, clinical_df, output_path):
    """Repair a segment JSON file and write the cleaned copy to disk.

    The input is processed as a mapping ``case_id -> uid -> list of segment
    dicts``. Every output segment holds a ``data`` list of 100 floats and the
    case's ``preop_dm`` label; ``segment_idx``, ``start_time_sec`` and
    ``center_time_sec`` are copied through when present.

    Issue accounting:
        * A case that is not a dict, or a uid entry that is not a list, is
          replaced by an empty container and counted as found only.
        * A segment that is not a dict is counted as found and fixed, and is
          then treated as an empty segment (so its missing ``data`` is
          counted once more).
        * A segment whose ``data`` is invalid gets ``DEFAULT_PPG`` and is
          counted as found and fixed.

    Args:
        input_path: Path of the JSON file to repair.
        clinical_df: Clinical table passed on to ``get_preop_dm``.
        output_path: Destination path; its parent directory is created when
            the path has one.

    Returns:
        Tuple ``(issues_found, issues_fixed)``.

    Raises:
        FileNotFoundError: If ``input_path`` does not exist.
    """
    if not os.path.exists(input_path):
        raise FileNotFoundError(f"Input JSON not found: {input_path}")

    with open(input_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    total_cases = len(data)
    issues_found = 0
    issues_fixed = 0
    repaired_data = {}

    print(f"Repairing {os.path.basename(input_path)}...")

    for case_id, case_data in tqdm(data.items(), total=total_cases, desc="Cases"):
        if not isinstance(case_data, dict):
            issues_found += 1
            repaired_data[case_id] = {}
            continue

        repaired_case = {}
        preop_dm = get_preop_dm(case_id, clinical_df)

        # uid keys are kept as the strings found in the file; they are not
        # parsed, so a non-numeric key cannot abort the repair.
        for uid_str, segments in case_data.items():
            if not isinstance(segments, list):
                issues_found += 1
                repaired_case[uid_str] = []
                continue

            repaired_segments = []
            for seg in segments:
                if not isinstance(seg, dict):
                    issues_found += 1
                    issues_fixed += 1
                    seg = {}

                ppg = seg.get('data')
                if _is_valid_ppg(ppg):
                    ppg = [float(x) for x in ppg]
                else:
                    # Copy so output segments never alias the shared default.
                    ppg = list(DEFAULT_PPG)
                    issues_fixed += 1
                    issues_found += 1

                repaired_seg = {
                    "data": ppg,
                    "preop_dm": preop_dm,
                }

                for key in ('segment_idx', 'start_time_sec', 'center_time_sec'):
                    if key in seg:
                        repaired_seg[key] = seg[key]

                repaired_segments.append(repaired_seg)

            repaired_case[uid_str] = repaired_segments

        repaired_data[case_id] = repaired_case

    # dirname() is '' for a bare filename, and os.makedirs('') raises, so only
    # create the directory when the path actually has one.
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(repaired_data, f, indent=2)

    print(f"Repaired file saved: {output_path}")
    print(f"Issues found : {issues_found}")
    print(f"Issues fixed : {issues_fixed}")

    return issues_found, issues_fixed

def main():
    """Load the clinical table, repair the input JSON and print a verdict.

    Reads the paths from the module-level ``CLINICAL_CSV``, ``INPUT_JSON``
    and ``OUTPUT_JSON`` settings, then reports whether every issue that was
    found was also fixed.
    """
    clinical_table = load_clinical_data(CLINICAL_CSV)
    found, fixed = repair_json_file(INPUT_JSON, clinical_table, OUTPUT_JSON)

    if found == 0:
        verdict = "No issues detected."
    else:
        verdict = "All fixed" if found == fixed else "Not all fixed"
    print(verdict)

    print("Repair complete.")

# Run the repair only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()