### Importing necessary libraries

In [None]:
import pandas as pd
import numpy as np
# Lift pandas' display truncation so that every column and every row of a
# DataFrame is shown when it is rendered (None means "no limit").
pd.set_option('display.max_columns',None)
pd.set_option('display.max_rows',None)

### Code to generate simulated data for case_1 and case_2
- For case_2, just change the number of rows (`n_samples`).

In [34]:
# Seed NumPy's global RNG so the simulated table is identical on every run
np.random.seed(42)

# Number of rows (simulated patients) to produce
n_samples = 100

# Declarative description of each column. Columns are built in the order they
# are listed here, so a derived column must come after the column it depends on.
config = {
    "features": {
        "age": {"type": "int", "min": 18, "max": 80},
        "gender": {
            "type": "categorical",
            "values": ["Male", "Female"],
            "probabilities": [0.5, 0.5],
        },
        # Sequential identifiers rendered through a str.format template
        "patient_ids": {"type": "id", "format": "{:0d}"},
        "Feature_A": {
            "type": "categorical",
            "values": ["Low", "Normal", "High"],
            "probabilities": [0.3, 0.3, 0.4],
        },
        # One-to-one relationship with Feature_A
        "Feature_B": {
            "type": "one_to_one",
            "dependency": "Feature_A",
            "mapping": {"Low": "class_0", "Normal": "class_1", "High": "class_2"},
        },
        # Many-to-one relationship with Feature_B
        "Feature_C": {
            "type": "many_to_one",
            "dependency": "Feature_B",
            "mapping": {"class_0": "No", "class_1": "Yes", "class_2": "Yes"},
        },
        # One-to-many relationship with Feature_A: each parent value has its
        # own set of candidate values and sampling probabilities
        "Feature_D": {
            "type": "one_to_many",
            "dependency": "Feature_A",
            "mapping": {
                "Low": {"values": ["Med_A", "Med_B"], "probabilities": [0.7, 0.3]},
                "Normal": {"values": ["Med_C", "Med_E"], "probabilities": [0.5, 0.5]},
                "High": {"values": ["Med_A", "Med_D"], "probabilities": [0.4, 0.6]},
            },
        },
    }
}


def _draw_int(spec, built):
    """Draw n_samples integers from spec["min"] up to spec["max"]."""
    # NOTE(review): np.random.randint treats its upper bound as exclusive, so
    # the configured "max" itself is never drawn -- confirm this is intended.
    low, high = spec["min"], spec["max"]
    return np.random.randint(low, high, size=n_samples).astype(np.int64)


def _draw_categorical(spec, built):
    """Draw n_samples labels from spec["values"] with the given probabilities."""
    return np.random.choice(spec["values"], size=n_samples, p=spec["probabilities"])


def _make_ids(spec, built):
    """Render the row numbers 1..n_samples through the spec["format"] template."""
    template = spec["format"]
    return [template.format(row + 1) for row in range(n_samples)]


def _map_parent(spec, built):
    """Translate every value of the parent column through spec["mapping"]."""
    parent = built[spec["dependency"]]
    table = spec["mapping"]
    return np.array([table[key] for key in parent])


def _sample_per_parent(spec, built):
    """Draw one value per row from the options attached to the parent's value."""
    parent = built[spec["dependency"]]
    table = spec["mapping"]
    picks = []
    # One RNG call per row, in row order
    for key in parent:
        option = table[key]
        picks.append(np.random.choice(option["values"], p=option["probabilities"]))
    return np.array(picks)


# Feature "type" -> function that builds the column
_BUILDERS = {
    "int": _draw_int,
    "categorical": _draw_categorical,
    "id": _make_ids,
    "one_to_one": _map_parent,
    "many_to_one": _map_parent,
    "one_to_many": _sample_per_parent,
}

# Build the columns in configuration order
data = {}
for name, spec in config["features"].items():
    builder = _BUILDERS.get(spec["type"])
    if builder is None:
        continue  # unrecognised types produce no column
    data[name] = builder(spec, data)

# Order in which the columns appear in the final table
column_order = ["patient_ids", "age", "gender", "Feature_A", "Feature_B", "Feature_C", "Feature_D"]

# Assemble the DataFrame
df = pd.DataFrame(data, columns=column_order)


### Code to generate simulated data for case_3

In [58]:
# Pin the global NumPy RNG so the case_3 table can be regenerated exactly
np.random.seed(42)

# How many rows to simulate
n_samples = 100

# Column specification, including Feature_E. Entries are processed top to
# bottom, so each derived feature is listed after the feature it depends on.
config = {
    "features": {
        "age": {"type": "int", "min": 18, "max": 80},
        "gender": {
            "type": "categorical",
            "values": ["Male", "Female"],
            "probabilities": [0.5, 0.5],
        },
        # Zero-padded, four-digit sequential IDs
        "patient_ids": {"type": "id", "format": "{:04d}"},
        "Feature_A": {
            "type": "categorical",
            "values": ["Low", "Normal", "High"],
            "probabilities": [0.3, 0.3, 0.4],
        },
        # One-to-one relationship with Feature_A
        "Feature_B": {
            "type": "one_to_one",
            "dependency": "Feature_A",
            "mapping": {"Low": "class_0", "Normal": "class_1", "High": "class_2"},
        },
        # Many-to-one relationship with Feature_B
        "Feature_C": {
            "type": "many_to_one",
            "dependency": "Feature_B",
            "mapping": {"class_0": "No", "class_1": "Yes", "class_2": "Yes"},
        },
        # One-to-many relationship with Feature_A
        "Feature_D": {
            "type": "one_to_many",
            "dependency": "Feature_A",
            "mapping": {
                "Low": {"values": ["Med_A", "Med_B"], "probabilities": [0.7, 0.3]},
                "Normal": {"values": ["Med_C", "Med_E"], "probabilities": [0.5, 0.5]},
                "High": {"values": ["Med_A", "Med_D"], "probabilities": [0.4, 0.6]},
            },
        },
        # One-to-one relationship with Feature_C
        "Feature_E": {
            "type": "one_to_one",
            "dependency": "Feature_C",
            "mapping": {"Yes": "present", "No": "absent"},
        },
    }
}


def _build_feature(spec, generated, size):
    """Build one column from its spec; return None for an unrecognised type.

    ``generated`` holds the columns built so far and is consulted by the
    dependency-based feature types.
    """
    kind = spec["type"]

    if kind == "int":
        # NOTE(review): np.random.randint excludes the upper bound, so the
        # configured "max" is never drawn -- confirm this is intended.
        drawn = np.random.randint(spec["min"], spec["max"], size=size)
        return drawn.astype(np.int64)

    if kind == "categorical":
        return np.random.choice(spec["values"], size=size, p=spec["probabilities"])

    if kind == "id":
        template = spec["format"]
        return [template.format(number) for number in range(1, size + 1)]

    if kind in ("one_to_one", "many_to_one"):
        source = generated[spec["dependency"]]
        lookup = spec["mapping"]
        return np.array([lookup[item] for item in source])

    if kind == "one_to_many":
        source = generated[spec["dependency"]]
        lookup = spec["mapping"]
        chosen = []
        # Sample row by row, one RNG call per row
        for item in source:
            candidates = lookup[item]
            chosen.append(
                np.random.choice(candidates["values"], p=candidates["probabilities"])
            )
        return np.array(chosen)

    return None


# Build every configured column in order
data = {}
for name, spec in config["features"].items():
    column = _build_feature(spec, data, n_samples)
    if column is not None:
        data[name] = column

# Final column layout, including Feature_E
column_order = [
    "patient_ids", "age", "gender", "Feature_A", "Feature_B",
    "Feature_C", "Feature_D", "Feature_E",
]

# Assemble the DataFrame
df = pd.DataFrame(data, columns=column_order)

### Code to generate simulated data for case_4

In [54]:
# Set a random seed for reproducibility
np.random.seed(42)

# Define the number of samples
n_samples = 100

# Configuration for feature generation.
# Features are generated in the order listed, so every derived feature must
# appear after the feature(s) it depends on.
config = {
    "features": {
        "age": {
            "type": "int",
            "min": 18,
            "max": 80
        },
        "gender": {
            "type": "categorical",
            "values": ["Male", "Female"],
            "probabilities": [0.8, 0.2]
        },
        "patient_ids": {
            "type": "id",
            "format": "{:04d}"  # Format for unique IDs
        },
        "Feature_A": {
            "type": "categorical",
            "values": ["Low", "Normal", "High"],
            "probabilities": [0.5, 0.3, 0.2]
        },
        "Feature_B": {
            "type": "one_to_one",  # One-to-one relationship with Feature_A
            "dependency": "Feature_A",
            "mapping": {
                "Low": "class_0",
                "Normal": "class_1",
                "High": "class_2"
            }
        },
        "Feature_C": {
            "type": "many_to_one",  # Many-to-one relationship with Feature_B
            "dependency": "Feature_B",
            "mapping": {
                "class_0": "No",
                "class_1": "Yes",
                "class_2": "Yes"
            }
        },
        "Feature_D": {
            "type": "one_to_many",  # One-to-many relationship with Feature_A
            "dependency": "Feature_A",
            "mapping": {
                "Low": {
                    "values": ["Med_A", "Med_B"],
                    "probabilities": [0.7, 0.3]
                },
                "Normal": {
                    "values": ["Med_C", "Med_E"],
                    "probabilities": [0.5, 0.5]
                },
                "High": {
                    "values": ["Med_A", "Med_D"],
                    "probabilities": [0.4, 0.6]
                }
            }
        },
        "Feature_E": {
            "type": "one_to_one",  # One-to-one relationship with Feature_C
            "dependency": "Feature_C",
            "mapping": {
                "Yes": "present",
                "No": "absent"
            }
        },
        "Feature_F": {
            "type": "one_to_many",  # One-to-many relationship with Feature_E
            "dependency": "Feature_E",
            "mapping": {
                "absent": {
                    "values": ["class_a", "class_b"],
                    "probabilities": [0.5, 0.5]
                },
                "present": {
                    "values": ["class_c", "class_d"],
                    "probabilities": [0.6, 0.4]
                }
            }
        },
        "Feature_G": {
            "type": "many_to_one",  # Many-to-one relationship with Feature_D
            "dependency": "Feature_D",
            "mapping": {
                "Med_A": "type_0",
                "Med_B": "type_1",
                "Med_C": "type_1",
                "Med_D": "type_2",
                "Med_E": "type_2"
            }
        },
        "Feature_H": {
            # Looked up by the tuple of dependency values; combinations not
            # listed fall back to the "default" entry.
            "type": "many_to_one_combined",
            "dependencies": ["Feature_A", "Feature_E"],
            "mapping": {
                ("Low", "absent"): "no_pain",
                ("Normal", "present"): "pain",
                ("High", "present"): "pain",
                "default": "no_pain"
            }
        },
        "Feature_I": {
            # Rules are tried in order; the first rule whose condition holds
            # for a row supplies that row's value.
            "type": "conditional_rule",
            "dependencies": ["Feature_G", "Feature_H"],
            "rules": [
                {
                    "condition": {"Feature_G": "type_2", "Feature_H": "pain"},
                    "output": "B"
                },
                {
                    "condition": {},  # Catch-all rule
                    "output": "A"
                }
            ]
        },
        "Feature_J": {
            "type": "one_to_one",  # One-to-one relationship with Feature_I
            "dependency": "Feature_I",
            "mapping": {
                "A": "X",
                "B": "Y"
            }
        },
        "Feature_K": {
            "type": "many_to_one",  # Many-to-one relationship with Feature_D
            "dependency": "Feature_D",
            "mapping": {
                "Med_A": "positive",
                "Med_B": "negative",
                "Med_C": "positive",
                "Med_D": "negative",
                "Med_E": "positive"
            }
        },
        "Feature_L": {
            "type": "many_to_one",  # Many-to-one relationship with Feature_F
            "dependency": "Feature_F",
            "mapping": {
                "class_a": "stage_1",
                "class_b": "stage_1",
                "class_c": "stage_2",
                "class_d": "stage_2"
            }
        }
    }
}


def _apply_conditional_rules(params, generated, n_rows):
    """Evaluate a "conditional_rule" feature row by row.

    Each rule's "condition" maps a dependency name to the value that
    dependency must have. Rules are tried in order and the first match wins.
    An empty condition matches every row. Rows that match no rule receive
    params["default"] if present, otherwise "A".

    Raises:
        ValueError: if a rule refers to a feature that is not listed in
            params["dependencies"].
    """
    dependencies = params["dependencies"]
    rules = params["rules"]
    fallback = params.get("default", "A")

    # Reject conditions on undeclared features instead of silently ignoring them
    for rule in rules:
        undeclared = [name for name in rule["condition"] if name not in dependencies]
        if undeclared:
            raise ValueError(
                "conditional_rule condition refers to undeclared dependencies: {!r}".format(undeclared)
            )

    columns = {name: generated[name] for name in dependencies}
    output = []
    for row in range(n_rows):
        for rule in rules:
            if all(columns[name][row] == wanted for name, wanted in rule["condition"].items()):
                output.append(rule["output"])
                break
        else:
            output.append(fallback)
    return np.array(output)


def generate_features(features_config, n_rows):
    """Generate one column per configured feature and return them as a dict.

    Features are produced in the iteration order of ``features_config``;
    dependency-based features read the already generated columns, so a feature
    must be listed after everything it depends on. Random draws use NumPy's
    global RNG.

    Supported feature types:
        int: integers drawn with np.random.randint(min, max).
        categorical: labels drawn from "values" with "probabilities".
        id: row numbers 1..n_rows rendered through the "format" template.
        one_to_one / many_to_one: dependency value looked up in "mapping".
        one_to_many: one draw per row from the options "mapping" holds for
            the dependency value.
        many_to_one_combined: tuple of all dependency values looked up in
            "mapping", falling back to mapping["default"] (or "unknown").
        conditional_rule: first matching rule's "output" (see
            _apply_conditional_rules).

    Args:
        features_config: mapping of feature name -> parameter dict.
        n_rows: number of rows to generate.

    Returns:
        dict mapping feature name -> generated column (NumPy array, or a list
        of strings for "id" features).

    Raises:
        ValueError: if a feature has an unsupported "type".
        KeyError: if a feature depends on a feature not generated yet.
    """
    generated = {}
    for feature, params in features_config.items():
        kind = params["type"]
        if kind == "int":
            # NOTE(review): np.random.randint excludes the upper bound, so the
            # configured "max" itself is never drawn -- confirm this is intended.
            column = np.random.randint(params["min"], params["max"], size=n_rows).astype(np.int64)
        elif kind == "categorical":
            column = np.random.choice(params["values"], size=n_rows, p=params["probabilities"])
        elif kind == "id":
            column = [params["format"].format(i + 1) for i in range(n_rows)]
        elif kind in ("one_to_one", "many_to_one"):
            # Both are plain dictionary lookups; they differ only in whether
            # the mapping happens to be injective.
            dependency = generated[params["dependency"]]
            mapping = params["mapping"]
            column = np.array([mapping[val] for val in dependency])
        elif kind == "one_to_many":
            dependency = generated[params["dependency"]]
            mapping = params["mapping"]
            # One RNG call per row, in row order, so seeded output is stable
            column = np.array([
                np.random.choice(mapping[val]["values"], p=mapping[val]["probabilities"])
                for val in dependency
            ])
        elif kind == "many_to_one_combined":
            dep_columns = [generated[name] for name in params["dependencies"]]
            mapping = params["mapping"]
            default_value = mapping.get("default", "unknown")
            column = np.array([
                mapping.get(combo, default_value)
                for combo in zip(*dep_columns)
            ])
        elif kind == "conditional_rule":
            column = _apply_conditional_rules(params, generated, n_rows)
        else:
            # A skipped feature would silently become an all-NaN column later
            raise ValueError(
                "Unsupported feature type {!r} for feature {!r}".format(kind, feature)
            )
        generated[feature] = column
    return generated


# Generate the data
data = generate_features(config["features"], n_samples)

# Column order of the final table
column_order = ["patient_ids", "age", "gender", "Feature_A", "Feature_B", "Feature_C",
                "Feature_D", "Feature_E", "Feature_F", "Feature_G", "Feature_H", "Feature_I",
                "Feature_J", "Feature_K", "Feature_L"]

# Convert to DataFrame
df = pd.DataFrame(data, columns=column_order)


In [None]:
# Display the generated table. Every case cell above rebinds `df`, so this
# shows the output of whichever generation cell was executed last.
df

### Make sure that no duplicate rows are generated

In [None]:
# Count rows that are exact duplicates of an earlier row (all columns compared).
# NOTE(review): patient_ids is a distinct sequential value on every row, so this
# count is 0 regardless of the other columns -- confirm whether the ID column
# should be excluded from the comparison.
df.duplicated().sum()

### Save the generated simulated data in the output directory

In [None]:
# Write the current `df` to CSV without the row index. The path is relative to
# the working directory.
# NOTE(review): presumably the Real_simulated_data/ folder must already exist,
# and the file name should be changed to match the case that was generated -- confirm.
df.to_csv(r'Real_simulated_data/test_check_v3_imbalance.csv', index=False)