In [3]:
from src.simulator import SyntheticDataGenerator, DataGenerationConfig

# Generate synthetic data.
# Every keyword below is forwarded unchanged to DataGenerationConfig; what each
# setting controls is defined in src.simulator, which is not visible from this
# notebook, so the values are left uncommented rather than guessed at.
config = DataGenerationConfig(
    n_samples=10000,
    n_sites=10,
    n_timepoints=1,
    n_predictive_continuous=8,
    n_predictive_categorical=8,
    n_noise_continuous=8,
    n_noise_categorical=8,
    classification_noise=0.1,
    include_interactions=True,
    include_nonlinear=True,
    feature_effect_size=1,
    interaction_probability=0.3,
    interaction_effect_size=0.5,
    max_interaction_order=5,
    subgroup_effect_size=0.5,
    feature_specific_subgroup_scale=0.5,
    hierarchical_effect_scale=0.7,
    site_prevalence_range=(0.1, 0.4),
    missing_rate=0.1,
    random_state=42,  # fixed seed — presumably makes generation reproducible; confirm in src.simulator
)

generator = SyntheticDataGenerator(config)
# generate() returns two objects. `data` is the frame every later cell works on;
# `relationships` is not referenced again in the cells visible here.
data, relationships = generator.generate()

In [4]:
# Print summary statistics: the frame's (rows, columns) shape and the mean of
# the `outcome` column.
print("Data Shape:", data.shape)
print("Average Outcome Rate:", data['outcome'].mean())

Data Shape: (10000, 36)
Average Outcome Rate: 0.2493


In [10]:
# Print a heading, then show the generator's feature-importance table as the
# cell's last expression so the notebook renders it.
print("Feature Importance:")
generator.get_feature_importance()

Feature Importance:


Unnamed: 0,feature,importance,type
5038,subgroup_main_M_0-9,1.363052e+00,subgroup_main
5046,subgroup_main_F_60-69,1.217548e+00,subgroup_main
5032,subgroup_main_M_20-29,1.112813e+00,subgroup_main
5035,subgroup_main_M_60-69,1.089613e+00,subgroup_main
5050,subgroup_main_F_90-99,9.428546e-01,subgroup_main
...,...,...,...
1978,pred_cont_0+pred_cont_3+pred_cat_1+pred_cat_3+...,2.992644e-06,interaction
3896,pred_cont_3+pred_cont_5+pred_cat_1+pred_cat_4+...,1.579328e-06,interaction
2634,pred_cont_1+pred_cont_3+pred_cont_5+pred_cat_0...,1.220402e-06,interaction
4885,pred_cont_7+pred_cat_3+pred_cat_7+age_group+sex,8.596449e-07,interaction


In [11]:
# Print a heading, then show the generator's per-site statistics table as the
# cell's last expression so the notebook renders it.
print("Site Statistics:")
generator.get_site_statistics()

Site Statistics:


Unnamed: 0,site,n_samples,outcome_rate,missing_rate
0,site_0,1053,0.366572,0.094938
1,site_1,985,0.299492,0.098908
2,site_2,996,0.199799,0.096811
3,site_3,971,0.23275,0.098867
4,site_4,962,0.16632,0.096138
5,site_5,1021,0.133203,0.09735
6,site_6,1017,0.26647,0.096868
7,site_7,967,0.399173,0.095923
8,site_8,994,0.332998,0.096092
9,site_9,1034,0.099613,0.095012


In [12]:
# Print a heading, then show the generator's subgroup statistics table as the
# cell's last expression so the notebook renders it.
print("Subgroup Statistics:")
generator.get_subgroup_statistics()

Subgroup Statistics:


Unnamed: 0,sex,age_group,n_samples,n_patients,outcome_rate,missing_rate
0,F,0-9,18,18,0.277778,0.104377
1,F,10-19,95,95,0.431579,0.097608
2,F,20-29,339,339,0.486726,0.100742
3,F,30-39,824,824,0.116505,0.096646
4,F,40-49,1230,1230,0.219512,0.097512
5,F,50-59,1216,1216,0.133224,0.095544
6,F,60-69,788,788,0.521574,0.095639
7,F,70-79,378,378,0.412698,0.089466
8,F,80-89,87,87,0.287356,0.091257
9,F,90-99,21,21,0.380952,0.066378


In [13]:
# Preview the first 10 rows of the generated frame (features plus `outcome`).
data.head(10)

Unnamed: 0,site,sex,age_group,pred_cont_0,pred_cont_1,pred_cont_2,pred_cont_3,pred_cont_4,pred_cont_5,pred_cont_6,...,noise_cont_7,noise_cat_0,noise_cat_1,noise_cat_2,noise_cat_3,noise_cat_4,noise_cat_5,noise_cat_6,noise_cat_7,outcome
0,site_6,F,50-59,0.147706,-0.911677,-0.075848,1.036306,0.419115,0.949613,-0.876967,...,-0.044212,0.0,2,0.0,0.0,,0.0,0,1.0,0.0
1,site_3,F,40-49,-0.833138,0.399127,-1.857056,0.661869,,0.312332,-0.46446,...,2.401784,0.0,1,3.0,0.0,1.0,0.0,3,1.0,1.0
2,site_7,M,50-59,,-0.497766,-0.510832,0.261482,3.035493,0.603012,0.657234,...,0.626899,1.0,1,3.0,0.0,,2.0,3,0.0,1.0
3,site_4,F,40-49,-0.78174,0.353797,-0.245343,-0.450238,1.447067,-1.836961,-0.270923,...,0.814998,1.0,1,,0.0,0.0,2.0,3,0.0,0.0
4,site_6,F,20-29,-1.027774,-1.913272,,,1.327225,-0.173652,0.202914,...,0.128313,0.0,1,3.0,1.0,2.0,,4,0.0,0.0
5,site_9,M,50-59,-0.646687,,0.512151,0.108315,-1.275375,-0.139152,0.070406,...,0.172653,2.0,1,1.0,0.0,0.0,1.0,2,0.0,0.0
6,site_2,F,30-39,-1.325884,0.746907,-0.752082,-1.144344,-1.620709,1.68392,,...,1.251335,2.0,2,3.0,0.0,1.0,1.0,3,,1.0
7,site_6,F,50-59,-0.39514,0.619531,-0.774206,0.732223,,,-0.42814,...,-0.429297,1.0,1,0.0,0.0,0.0,,1,1.0,0.0
8,site_7,F,70-79,-0.713249,,1.306974,-1.774313,-0.30445,-0.660151,-0.7258,...,-1.872798,0.0,1,,1.0,2.0,0.0,1,0.0,1.0
9,site_4,F,40-49,,0.915318,-0.293908,0.306988,1.847726,-0.718864,-1.647372,...,0.23328,,0,,,1.0,1.0,3,1.0,0.0


In [14]:
# Linear classifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score
import pandas as pd

# Build the feature matrix from every column except the target
X = data.drop(columns=['outcome'])
# Convert categorical columns to one-hot encoding. This runs on the full frame
# before splitting, so train/validation/test all share the same dummy columns.
X = pd.get_dummies(X)
y = data['outcome']
# 80/20 train/test split, then a further 80/20 split of the training part into
# train/validation.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)
# Mean imputation: column means are computed on the training split only and
# reused to fill the validation and test splits.
X_mean = X_train.mean()
X_train = X_train.fillna(X_mean)
X_val = X_val.fillna(X_mean)
X_test = X_test.fillna(X_mean)
# Train classifier (X_val / y_val are created above but not used by this model)
clf = LogisticRegression(random_state=42)
clf.fit(X_train, y_train)

# Predict: hard labels for accuracy, positive-class probabilities (column 1) for ROC AUC
y_pred = clf.predict(X_test)
y_pred_proba = clf.predict_proba(X_test)[:, 1]

# Evaluate on the held-out test split
accuracy = accuracy_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred_proba)
print(f"Accuracy: {accuracy:.4f}")
print(f"ROC AUC: {roc_auc:.4f}")

Accuracy: 0.8210
ROC AUC: 0.8385


In [15]:
# Random Forest classifier
from sklearn.ensemble import RandomForestClassifier

# Train classifier
# NOTE: X_train / y_train / X_test / y_test are not defined in this cell; it
# uses whatever the most recently executed cell left in the kernel. The XGBoost
# and CatBoost cells further down rebind the same names, so re-run the logistic
# regression cell first if those have been executed.
# `clf` is rebound here, replacing the logistic regression model.
clf = RandomForestClassifier(random_state=42)
clf.fit(X_train, y_train)

# Predict: hard labels for accuracy, positive-class probabilities (column 1) for ROC AUC
y_pred = clf.predict(X_test)
y_pred_proba = clf.predict_proba(X_test)[:, 1]

# Evaluate on the held-out test split
accuracy = accuracy_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred_proba)
print(f"Accuracy: {accuracy:.4f}")
print(f"ROC AUC: {roc_auc:.4f}")

Accuracy: 0.8160
ROC AUC: 0.8478


In [16]:
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.preprocessing import LabelEncoder

def prepare_xgboost_data(data: pd.DataFrame, categorical_columns=None):
    """
    Label-encode the categorical columns of a frame so XGBoost can consume it.

    The input frame is never modified; all work happens on a copy. Each
    selected column is cast to ``str`` before encoding, so missing values are
    encoded as their own class (the string form of NaN) rather than dropped.

    Args:
        data: Input DataFrame.
        categorical_columns: Names of the columns to encode. When None, every
            column of dtype ``category`` or ``object`` is used. Names that are
            not present in ``data`` are silently skipped.

    Returns:
        Tuple of (encoded copy of ``data``, dict mapping each encoded column
        name to the fitted LabelEncoder used for it).
    """
    if categorical_columns is None:
        categorical_columns = data.select_dtypes(include=['category', 'object']).columns

    encoded = data.copy()
    encoders = {}

    for name in categorical_columns:
        # Tolerate names for columns the caller has already dropped.
        if name not in encoded.columns:
            continue
        encoder = LabelEncoder()
        encoded[name] = encoder.fit_transform(encoded[name].astype(str))
        encoders[name] = encoder

    return encoded, encoders

# Prepare the data
# NOTE(review): only 'category' dtype columns are selected here, whereas
# prepare_xgboost_data's own default also includes 'object' columns.
categorical_columns = data.select_dtypes(include=['category']).columns
X = data.drop(columns=['outcome'])
y = data['outcome']

# Process the data: label-encode the categorical columns on the full frame,
# before any splitting.
X_processed, label_encoders = prepare_xgboost_data(X, categorical_columns)

# Split the processed data: 80/20 train/test, then 80/20 train/validation.
# This rebinds X_train / X_test / y_train / y_test from the scikit-learn cells.
X_train, X_test, y_train, y_test = train_test_split(X_processed, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

# Configure XGBoost classifier with categorical feature support
# NOTE(review): the categorical columns were already replaced by integer codes
# in prepare_xgboost_data, so presumably XGBoost sees them as plain numeric
# features and 'enable_categorical' has no effect here — confirm against the
# XGBoost categorical-data documentation.
params = {
    'objective': 'binary:logistic',
    'random_state': 42,
    'eval_metric': 'auc',
    'enable_categorical': True,
}

# Train classifier. The validation split is passed as eval_set; no
# early-stopping option is set in `params`.
clf = xgb.XGBClassifier(**params)
clf.fit(
    X_train,
    y_train,
    eval_set=[(X_val, y_val)],
    verbose=False
)

# Predict: hard labels for accuracy, positive-class probabilities (column 1) for ROC AUC
y_pred = clf.predict(X_test)
y_pred_proba = clf.predict_proba(X_test)[:, 1]

# Evaluate on the held-out test split
accuracy = accuracy_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred_proba)
print(f"Accuracy: {accuracy:.4f}")
print(f"ROC AUC: {roc_auc:.4f}")

# Get feature importance: pair each input column with the fitted model's
# feature_importances_ entry (X_train has the same columns as X_processed).
importance_df = pd.DataFrame({
    'feature': X_processed.columns,
    'importance': clf.feature_importances_
})
print("\nTop 10 Most Important Features:")
print(importance_df.sort_values('importance', ascending=False).head(10))

Accuracy: 0.8345
ROC AUC: 0.8561

Top 10 Most Important Features:
        feature  importance
2     age_group    0.125353
0          site    0.060844
1           sex    0.052735
18   pred_cat_7    0.043751
17   pred_cat_6    0.040036
16   pred_cat_5    0.038444
8   pred_cont_5    0.035869
12   pred_cat_1    0.035783
5   pred_cont_2    0.033609
11   pred_cat_0    0.032380


In [17]:
import numpy as np
import pandas as pd
from catboost import CatBoostClassifier, Pool
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score

def prepare_catboost_data(data: pd.DataFrame):
    """
    Stringify the categorical columns of a frame and report their positions.

    Works on a copy, so the caller's frame is left untouched. Every column of
    dtype ``category`` or ``object`` is cast to ``str``, and any value equal to
    the string 'nan' (which is how a missing value appears after that cast) is
    replaced by the label 'Missing'. Other columns are passed through as-is.

    Args:
        data: Input DataFrame.

    Returns:
        Tuple of (processed copy of ``data``, list with the positional index
        of each categorical column in that copy).
    """
    frame = data.copy()

    # Detected once up front; casting to str below does not change this list.
    cat_names = frame.select_dtypes(include=['category', 'object']).columns

    for name in cat_names:
        as_text = frame[name].astype(str)
        frame[name] = as_text.replace('nan', 'Missing')

    cat_positions = [frame.columns.get_loc(name) for name in cat_names]

    return frame, cat_positions

# Prepare the data
X = data.drop(columns=['outcome'])
y = data['outcome']

# Get categorical feature indices and process the data (categorical columns
# become strings, with missing values labelled 'Missing')
X_processed, cat_features = prepare_catboost_data(X)

# Split the data: 80/20 train/test, then 80/20 train/validation, using the same
# test_size and random_state as the earlier model cells.
# This rebinds X_train / X_test / y_train / y_test from those cells.
X_train, X_test, y_train, y_test = train_test_split(
    X_processed, y, test_size=0.2, random_state=42
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=0.2, random_state=42
)

# Create Pool objects with processed data; cat_features holds the positional
# indices returned by prepare_catboost_data.
train_pool = Pool(X_train, y_train, cat_features=cat_features)
val_pool = Pool(X_val, y_val, cat_features=cat_features)
test_pool = Pool(X_test, y_test, cat_features=cat_features)

# Configure CatBoost classifier
params = {
    'random_seed': 42,
    'verbose': False,
    'eval_metric': 'AUC',
    'early_stopping_rounds': 50,
    'nan_mode': 'Min'  # Specify how to handle NaN in numeric features
}

# Train classifier; the validation pool is supplied as eval_set alongside the
# early_stopping_rounds setting above.
clf = CatBoostClassifier(**params)
clf.fit(
    train_pool,
    eval_set=val_pool,
    plot=False
)

# Predict: hard labels for accuracy, positive-class probabilities (column 1) for ROC AUC
y_pred = clf.predict(test_pool)
y_pred_proba = clf.predict_proba(test_pool)[:, 1]

# Evaluate on the held-out test split
accuracy = accuracy_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred_proba)
print(f"Accuracy: {accuracy:.4f}")
print(f"ROC AUC: {roc_auc:.4f}")

# Get feature importance: pair each input column with the fitted model's
# feature_importances_ entry (X_train has the same columns as X_processed).
importance_df = pd.DataFrame({
    'feature': X_processed.columns,
    'importance': clf.feature_importances_
})
print("\nTop 10 Most Important Features:")
print(importance_df.sort_values('importance', ascending=False).head(10))

Accuracy: 0.8340
ROC AUC: 0.8563

Top 10 Most Important Features:
        feature  importance
2     age_group   17.162215
0          site   11.600372
8   pred_cont_5    8.536799
5   pred_cont_2    7.730142
18   pred_cat_7    4.236939
1           sex    3.808202
17   pred_cat_6    3.732482
16   pred_cat_5    3.624105
11   pred_cat_0    3.280939
7   pred_cont_4    3.145828
