In [1]:
import pandas as pd
import numpy as np

In [3]:
# Load the base synthetic table; the derived Feature_* columns are attached
# to this frame further down in the notebook.
source_csv = '../Synthetic_data/ctgan/synthetic_ctgan_test_check_v3_imbalance_independent.csv'
independent_data = pd.read_csv(source_csv)

In [4]:
# Preview the frame that was just read from the CSV (bare last expression,
# so the notebook renders it as the cell output).
independent_data

Unnamed: 0,patient_ids,age,gender,Feature_A
0,100,79,Male,Low
1,52,78,Male,Normal
2,69,66,Male,Normal
3,63,20,Female,Low
4,67,64,Female,Low
...,...,...,...,...
95,40,57,Male,Normal
96,84,42,Female,Low
97,73,48,Female,Low
98,100,38,Male,Normal


In [5]:
# Count the rows that exactly repeat an earlier row (all columns compared;
# the first occurrence of each row is not counted).
independent_data.duplicated().sum()

5

In [6]:
# Remove the repeated rows, keeping the first occurrence of each. This mutates
# `independent_data` in place and does not reset the index, so surviving rows
# keep their original row labels.
independent_data.drop_duplicates(inplace=True)

In [7]:
# (rows, columns) of the frame after de-duplication.
independent_data.shape

(95, 4)

In [8]:
# Column that drives every derived feature built in this cell.
Feature_A = independent_data['Feature_A']

# Feature_B: one-to-one relabelling of Feature_A. The first matching rule wins;
# categories outside Low/Normal/High fall through to 'Unknown'.
Feature_B = np.select(
    [Feature_A == "Low", Feature_A == "Normal", Feature_A == "High"],
    ['class_0', 'class_1', 'class_2'],
    default='Unknown',
)

# Feature_C: many-to-one collapse of Feature_B -- only class_0 gives "No",
# every other label (including 'Unknown') gives "Yes".
Feature_C = np.where(Feature_B != "class_0", "Yes", "No")

# Seeded generator so the randomly sampled columns (Feature_D, Feature_F) come
# out identical every time this cell runs; change RANDOM_SEED for another draw.
RANDOM_SEED = 42
rng = np.random.default_rng(RANDOM_SEED)


def _sample_by_group(groups, mapping, rng):
    """Draw one categorical value per element of ``groups``.

    Parameters
    ----------
    groups : iterable
        Conditioning category for each row, in row order.
    mapping : dict
        ``{category: {"values": [...], "probabilities": [...]}}``. The
        probabilities are passed to ``rng.choice`` as ``p``.
    rng : numpy.random.Generator
        Source of randomness.

    Returns
    -------
    numpy.ndarray
        One sampled value per element of ``groups``, in the same order.

    Raises
    ------
    KeyError
        If a category in ``groups`` has no entry in ``mapping``.
    """
    return np.array([
        rng.choice(mapping[group]["values"], p=mapping[group]["probabilities"])
        for group in groups
    ])


# Generate Feature_D (one-to-many: sampled conditionally on Feature_A)
# Define the candidate values and their probabilities for each Feature_A category.
# Note: unlike Feature_B there is no 'Unknown' fallback here, so a Feature_A
# value outside Low/Normal/High raises KeyError.
Feature_D_mapping = {
    "Low": {
        "values": ["Med_A", "Med_B"],  # Possible values for "Low"
        "probabilities": [0.7, 0.3]   # Probabilities for Med_A and Med_B
    },
    "Normal": {
        "values": ["Med_C", "Med_E"],  # Possible values for "Normal"
        "probabilities": [0.5, 0.5]    # Probabilities for Med_C and Med_E
    },
    "High": {
        "values": ["Med_A", "Med_D"],  # Possible values for "High"
        "probabilities": [0.4, 0.6]    # Probabilities for Med_A and Med_D
    }
}

Feature_D = _sample_by_group(Feature_A, Feature_D_mapping, rng)

# Generate Feature_E (functionally dependent on Feature_C (one-to-one))
Feature_E = np.where(Feature_C == "Yes", 'present', 
            np.where(Feature_C == "No", 'absent','Unknown'))

# Generate Feature_F (one-to-many: sampled conditionally on Feature_E)
Feature_F_mapping = {
    "absent": {
        "values": ["class_a", "class_b"],  # Possible values for "absent"
        "probabilities": [0.5, 0.5]   # Probabilities for class_a and class_b
    },
    "present": {
        "values": ["class_c", "class_d"],  # Possible values for "present"
        "probabilities": [0.6, 0.4]    # Probabilities for class_c and class_d
    }
}

Feature_F = _sample_by_group(Feature_E, Feature_F_mapping, rng)


# Feature_G: many-to-one lookup keyed on Feature_D
# (Med_B/Med_C share type_1, Med_D/Med_E share type_2).
Feature_G_mapping = dict(
    Med_A="type_0",
    Med_B="type_1",
    Med_C="type_1",
    Med_D="type_2",
    Med_E="type_2",
)
Feature_G = np.array([Feature_G_mapping[medication] for medication in Feature_D])

# Feature_H: pain flag derived jointly from Feature_A and Feature_E.
# Rows that match neither rule fall back to "no_pain".
low_and_absent = np.isin(Feature_A, ["Low"]) & np.isin(Feature_E, ["absent"])
elevated_and_present = (
    np.isin(Feature_A, ["Normal", "High"]) & np.isin(Feature_E, ["present"])
)
Feature_H_conditions = [low_and_absent, elevated_and_present]
Feature_H_choices = ["no_pain", "pain"]
Feature_H = np.select(Feature_H_conditions, Feature_H_choices, default="no_pain")

# Feature_I: "B" only where Feature_G is type_2 AND Feature_H is pain;
# every other row gets "A".
type2_with_pain = np.logical_and(Feature_G == "type_2", Feature_H == "pain")
Feature_I_conditions = [type2_with_pain]
Feature_I_choices = ["B"]
Feature_I = np.select(Feature_I_conditions, Feature_I_choices, default="A")

# Feature_J: one-to-one relabelling of Feature_I ("A" -> "X", anything else -> "Y").
Feature_J = np.where(Feature_I != "A", "Y", "X")

# Feature_K: many-to-one lookup keyed on Feature_D
# (Med_A/Med_C/Med_E -> positive, Med_B/Med_D -> negative).
Feature_K_mapping = dict(
    Med_A="positive",
    Med_B="negative",
    Med_C="positive",
    Med_D="negative",
    Med_E="positive",
)
Feature_K = np.array([Feature_K_mapping[medication] for medication in Feature_D])

# Feature_L: many-to-one lookup keyed on Feature_F
# (class_a/class_b -> stage_1, class_c/class_d -> stage_2).
Feature_L_mapping = dict(
    class_a="stage_1",
    class_b="stage_1",
    class_c="stage_2",
    class_d="stage_2",
)
Feature_L = np.array([Feature_L_mapping[label] for label in Feature_F])

In [9]:
# Attach each derived array to the frame as a new column, in B..L order.
# The arrays are positional (one entry per row), so they line up with the
# frame's rows regardless of its index labels.
derived_columns = {
    'Feature_B': Feature_B,
    'Feature_C': Feature_C,
    'Feature_D': Feature_D,
    'Feature_E': Feature_E,
    'Feature_F': Feature_F,
    'Feature_G': Feature_G,
    'Feature_H': Feature_H,
    'Feature_I': Feature_I,
    'Feature_J': Feature_J,
    'Feature_K': Feature_K,
    'Feature_L': Feature_L,
}
for column_name, column_values in derived_columns.items():
    independent_data[column_name] = column_values

In [10]:
# Show the frame with the derived Feature_B..Feature_L columns attached.
independent_data

Unnamed: 0,patient_ids,age,gender,Feature_A,Feature_B,Feature_C,Feature_D,Feature_E,Feature_F,Feature_G,Feature_H,Feature_I,Feature_J,Feature_K,Feature_L
0,100,79,Male,Low,class_0,No,Med_A,absent,class_a,type_0,no_pain,A,X,positive,stage_1
1,52,78,Male,Normal,class_1,Yes,Med_C,present,class_d,type_1,pain,A,X,positive,stage_2
2,69,66,Male,Normal,class_1,Yes,Med_E,present,class_c,type_2,pain,B,Y,positive,stage_2
3,63,20,Female,Low,class_0,No,Med_B,absent,class_b,type_1,no_pain,A,X,negative,stage_1
4,67,64,Female,Low,class_0,No,Med_B,absent,class_b,type_1,no_pain,A,X,negative,stage_1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,40,57,Male,Normal,class_1,Yes,Med_E,present,class_c,type_2,pain,B,Y,positive,stage_2
96,84,42,Female,Low,class_0,No,Med_A,absent,class_a,type_0,no_pain,A,X,positive,stage_1
97,73,48,Female,Low,class_0,No,Med_A,absent,class_b,type_0,no_pain,A,X,positive,stage_1
98,100,38,Male,Normal,class_1,Yes,Med_E,present,class_d,type_2,pain,B,Y,positive,stage_2


# Writes the frame (without its index) to the "v0" dependent-mapping CSV.
# NOTE(review): this statement has no execution counter in the export, so it
# looks like it was not run in this session -- confirm whether the v0 output is
# still wanted alongside the v3 file written by the next cell.
independent_data.to_csv('../Synthetic_data/ctgan/synthetic_ctgan_test_v0_dependent_mapping.csv',index=False)

In [11]:
# Persist the frame, including the derived Feature_* columns, as CSV.
# The row index is not written.
output_csv = '../Synthetic_data/ctgan/synthetic_ctgan_test_check_v3_imbalance_dependent_mapping.csv'
independent_data.to_csv(output_csv, index=False)