In [None]:
import pandas as pd
import numpy as np

# Reference cohort whose columns and summary statistics drive the generator below.
sample_data = pd.read_csv(filepath_or_buffer='physionet_wo_missing.csv')

# 0/1 outcome and comorbidity flags; each is drawn as a Bernoulli variable whose
# success probability is the column mean observed in the sample.
_BINARY_COLUMNS = (
    're.admission.within.6.months', 'return.to.emergency.department.within.6.months',
    're.admission.within.3.months', 're.admission.within.28.days',
    'death.within.6.months', 'death.within.3.months', 'death.within.28.days',
    'moderate.to.severe.chronic.kidney.disease', 'diabetes', 'dementia',
)

# Discrete columns drawn from fixed tables: column -> (possible values, raw weights).
# Weights are renormalised before use, so they do not have to sum to exactly 1.
_CATEGORICAL_SPECS = {
    # Support is 0-15, but only scores 0-5 carry non-zero weight.
    'CCI.score': (range(16), [0.6, 0.2, 0.1, 0.05, 0.03, 0.02] + [0.0] * 10),
    'visit.times': (range(1, 6), [0.7, 0.2, 0.05, 0.03, 0.02]),
    # Values 3-14 share a small weight each; 15 dominates.
    'GCS': (range(3, 16), [0.01] * 12 + [0.88]),
    'eye.opening': (range(1, 5), [0.01, 0.01, 0.08, 0.9]),
    'verbal.response': (range(1, 6), [0.01, 0.01, 0.01, 0.07, 0.9]),
    'fio2': (
        [21, 24, 28, 32, 33, 35, 40, 45, 50, 60, 70, 80, 90, 100],
        [0.05, 0.03, 0.6, 0.05, 0.1, 0.05, 0.05, 0.02, 0.02, 0.01, 0.01,
         0.005, 0.005, 0.005],
    ),
}

# Continuous columns rounded to 3 decimals; every other continuous column uses 2.
_THREE_DECIMAL_COLUMNS = (
    'basophil.ratio', 'basophil.count', 'eosinophil.ratio', 'high.sensitivity.troponin',
)

# (cause, effects): wherever `cause` is 1, every listed effect is forced to 1.
# Order matters: the 28-day rules run first so the 3-month rules see their result.
_OUTCOME_IMPLICATIONS = (
    ('death.within.28.days',
     ('death.within.3.months', 'death.within.6.months')),
    ('death.within.3.months',
     ('death.within.6.months',)),
    ('re.admission.within.28.days',
     ('re.admission.within.3.months', 're.admission.within.6.months',
      'return.to.emergency.department.within.6.months')),
    ('re.admission.within.3.months',
     ('re.admission.within.6.months',
      'return.to.emergency.department.within.6.months')),
)


def _draw_weighted(rng, values, weights, size):
    """Draw `size` items from `values` with probability proportional to `weights`."""
    total = sum(weights)
    return rng.choice(values, size, p=[w / total for w in weights])


def _draw_clipped_normal(rng, observed, size, decimals):
    """Draw from a normal fitted to `observed`, clipped to its min/max and rounded."""
    spread = observed.std()
    if pd.isna(spread):
        # A single observation has no sample std; fall back to a constant column
        # instead of letting NaN propagate into every generated value.
        spread = 0.0
    draws = rng.normal(observed.mean(), spread, size)
    draws = np.clip(draws, observed.min(), observed.max())
    return np.round(draws, decimals)


def _resample_observed(rng, observed, size):
    """Bootstrap `size` values (with replacement) from a non-numeric column."""
    pool = observed.to_numpy()
    if len(pool) == 0:
        # Nothing to sample from: emit missing values rather than failing.
        return np.full(size, None, dtype=object)
    return pool[rng.randint(0, len(pool), size)]


def _enforce_outcome_consistency(frame):
    """Apply `_OUTCOME_IMPLICATIONS` in place, skipping columns `frame` lacks."""
    for cause, effects in _OUTCOME_IMPLICATIONS:
        if cause not in frame.columns:
            continue
        triggered = frame[cause] == 1
        for effect in effects:
            if effect in frame.columns:
                frame.loc[triggered, effect] = 1


def generate_synthetic_physionet_data(sample_df, num_records=100, seed=None):
    """Generate a synthetic table with the same columns as `sample_df`.

    Each column is filled independently, in `sample_df` column order:

    * ``'Unnamed: 0'`` gets sequential ids ``1..num_records``.
    * Columns in ``_BINARY_COLUMNS`` are Bernoulli draws using the sample mean.
    * ``'dischargeDay'`` is a uniform integer in ``[1, 30]``.
    * Columns in ``_CATEGORICAL_SPECS`` are drawn from their fixed weight table.
    * Other numeric columns are normal draws (sample mean/std) clipped to the
      sample min/max and rounded to 2 or 3 decimals.
    * Non-numeric columns are resampled with replacement from the sample.

    Afterwards the time-window outcome flags are made mutually consistent
    (e.g. death within 28 days implies death within 3 and 6 months). Outcome
    columns missing from `sample_df` are simply skipped.

    Args:
        sample_df: Reference DataFrame supplying column names and statistics.
        num_records: Number of synthetic rows to produce (must be >= 0).
        seed: Optional seed for reproducible output. When ``None`` the draws
            come from NumPy's global random state, so a prior
            ``np.random.seed(...)`` call still controls the result.

    Returns:
        A new DataFrame with `num_records` rows, a default integer index, and
        the columns of `sample_df` in the same order.

    Raises:
        ValueError: If `num_records` is negative.
    """
    if num_records < 0:
        raise ValueError("num_records must be non-negative")

    # The np.random module and RandomState expose the same sampling functions,
    # so either can serve as the generator below.
    rng = np.random if seed is None else np.random.RandomState(seed)

    generated = {}
    for col in sample_df.columns:
        if col == 'Unnamed: 0':
            generated[col] = list(range(1, num_records + 1))
        elif col in _BINARY_COLUMNS:
            generated[col] = rng.binomial(1, sample_df[col].mean(), num_records)
        elif col == 'dischargeDay':
            generated[col] = rng.randint(1, 31, num_records)
        elif col in _CATEGORICAL_SPECS:
            values, weights = _CATEGORICAL_SPECS[col]
            generated[col] = _draw_weighted(rng, values, weights, num_records)
        elif pd.api.types.is_numeric_dtype(sample_df[col]):
            decimals = 3 if col in _THREE_DECIMAL_COLUMNS else 2
            generated[col] = _draw_clipped_normal(
                rng, sample_df[col], num_records, decimals
            )
        else:
            generated[col] = _resample_observed(rng, sample_df[col], num_records)

    synthetic_df = pd.DataFrame(generated, columns=sample_df.columns)
    _enforce_outcome_consistency(synthetic_df)
    return synthetic_df

# Build a 100-row synthetic cohort modelled on the loaded sample.
synthetic_data = generate_synthetic_physionet_data(sample_df=sample_data, num_records=100)

# Persist it without the DataFrame index so no extra unnamed column is written.
synthetic_data.to_csv(path_or_buf='synthetic_physionet_data.csv', index=False)

# Preview the first five rows as a sanity check.
synthetic_data.head(5)

Unnamed: 0.1,Unnamed: 0,re.admission.within.6.months,return.to.emergency.department.within.6.months,re.admission.within.3.months,re.admission.within.28.days,death.within.6.months,death.within.3.months,dischargeDay,death.within.28.days,uric.acid,...,eye.opening,visit.times,creatinine.enzymatic.method,hematocrit,GCS,coefficient.of.variation.of.red.blood.cell.distribution.width,verbal.response,fio2,eosinophil.ratio,high.sensitivity.troponin
0,1,0,0,0,0,0,0,18,0,546.61,...,4,1,83.51,0.27,15,11.9,5,28,0.0,0.733
1,2,0,0,0,0,0,0,24,0,508.84,...,4,1,149.31,0.23,15,17.15,4,33,0.074,0.649
2,3,1,0,0,0,0,0,13,0,279.88,...,4,1,58.13,0.25,15,15.31,5,28,0.01,0.0
3,4,0,1,0,0,0,0,22,0,409.84,...,4,1,49.2,0.24,15,14.44,5,21,0.029,1.695
4,5,1,1,0,0,1,0,15,0,625.03,...,4,1,125.57,0.37,15,14.3,5,21,0.0,0.0
