In [1]:
import os
import pandas as pd
import numpy as np
# NOTE(review): `path` is not used by this cell; it is kept in case other cells
# rely on it. Prefer a configurable, non-absolute data directory.
path = "/groups/umcg-lifelines/tmp02/projects/ov20_0110/Lifelines"

# Load the cleaned dataset from the notebook's working directory.
df = pd.read_csv('df_cleaned.csv')

# Preview only the first rows: rendering the whole frame bloats the saved
# notebook and can expose participant-level records in the stored output.
df.head()

In [3]:
# Show the column labels of the loaded frame.
df.columns

Index(['Diastolic blood pressure', 'Systolic blood pressure', 'Age',
       'Cholesterol', 'HDL cholesterol', 'LDL cholesterol', 'Triglycerides',
       'Glucose', 'Glycated haemoglobin', 'Childhood trauma score',
       'Depressive symptoms score', 'Anxiety symptoms score',
       'Physically abused by family as a child',
       'Felt hated by family member as a child',
       'Sexually molested as a child',
       'Someone to take to doctor when needed as a child', 'Felt loved',
       'Hypertension', 'Smoking status', 'Physical activity', 'Depressed mood',
       'Anhedonia', 'Appetite changes', 'Sleep problems',
       'Psychomotor changes', 'Fatigue', 'Feelings of inadequacy',
       'Cognitive problems', 'Suicidal ideation', 'Anxiety', 'Restlessness',
       'Lack of relaxation', 'Concentration problems', 'Irritability',
       'Antidepressant use', 'Diabetes', 'Gender', 'CVD Family history',
       'CVD'],
      dtype='object')

In [2]:
import tensorflow as tf
import numpy as np
import pandas as pd
import time
from sklearn.utils import shuffle
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, OrdinalEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
import matplotlib.pyplot as plt
import seaborn as sns

# Seed NumPy's and TensorFlow's global random generators with the same fixed value.
np.random.seed(42)
tf.random.set_seed(42)

class DataGenerator:
    """Balance a dataset on its 'CVD' column using a GAN and undersampling.

    Rows with ``CVD == 1`` ("cases") are used to train a small fully connected
    GAN; its generator then produces synthetic cases until ``target_case_count``
    is reached. Rows with ``CVD == 0`` ("controls") are randomly undersampled
    to ``target_control_count``.

    Features are fed to the GAN in a fixed layout: numerical columns (min-max
    scaled), then ordinal columns (raw values), then one-hot encoded
    categorical columns. ``_get_full_column_order`` defines this layout and
    both preprocessing and postprocessing use it.
    """

    def __init__(self, original_df, target_case_count=10000, target_control_count=10000,
                 ordinal_columns=None, categorical_columns=None, numerical_columns=None,
                 latent_dim=100, generator_optimizer=None,
                 discriminator_optimizer=None,
                 epochs=1000, batch_size=32):
        """Store configuration and create unfitted preprocessing tools.

        Args:
            original_df: DataFrame containing the features and a 'CVD' column.
                A copy is stored; the caller's frame is not modified.
            target_case_count: Desired number of cases (real + synthetic).
            target_control_count: Number of controls to sample.
            ordinal_columns: Names of ordinal feature columns (default: none).
            categorical_columns: Names of categorical feature columns, which
                are one-hot encoded (default: none).
            numerical_columns: Names of numerical feature columns, which are
                min-max scaled (default: none).
            latent_dim: Size of the generator's noise input.
            generator_optimizer: Optimizer for the generator. ``None`` creates
                a fresh ``Adam(1e-4)`` for this instance.
            discriminator_optimizer: Optimizer for the discriminator. ``None``
                creates a fresh ``Adam(1e-4)`` for this instance.
            epochs: Number of passes over the real cases during training.
            batch_size: Number of rows per training step.
        """
        self.original_df = original_df.copy()
        self.target_case_count = target_case_count
        self.target_control_count = target_control_count
        # Copy into plain lists so the column groups can be concatenated with `+`.
        self.ordinal_columns = list(ordinal_columns) if ordinal_columns is not None else []
        self.categorical_columns = list(categorical_columns) if categorical_columns is not None else []
        self.numerical_columns = list(numerical_columns) if numerical_columns is not None else []
        self.latent_dim = latent_dim

        # Optimizers are created here rather than in the signature: a default
        # argument is evaluated once, so every instance would otherwise share
        # the same optimizer object (and its state).
        if generator_optimizer is None:
            generator_optimizer = tf.keras.optimizers.Adam(1e-4)
        if discriminator_optimizer is None:
            discriminator_optimizer = tf.keras.optimizers.Adam(1e-4)
        self.generator_optimizer = generator_optimizer
        self.discriminator_optimizer = discriminator_optimizer

        self.epochs = epochs
        self.batch_size = batch_size

        # Preprocessing tools
        self.scaler = MinMaxScaler()
        # Not used by any method of this class; kept so the attribute still exists.
        self.ordinal_encoder = OrdinalEncoder()
        self.categorical_encoder = OneHotEncoder(handle_unknown='ignore')

        # Models (built in _initialize_gan)
        self.generator = None
        self.discriminator = None

        # One-hot output column names, filled in when the encoder is fitted
        self.encoded_cat_columns = []

    def balance_data(self):
        """Train the GAN on the real cases and assemble a balanced dataset.

        Returns:
            tuple: ``(balanced_df, cases, synthetic_df)`` where ``balanced_df``
            is the shuffled concatenation of real cases, synthetic cases and
            sampled controls, ``cases`` are the real cases and ``synthetic_df``
            the generated ones (both with a reset index).

        Note:
            Controls are sampled without replacement, so pandas raises if
            fewer than ``target_control_count`` controls are available.
        """
        cases = self.original_df[self.original_df['CVD'] == 1]
        controls = self.original_df[self.original_df['CVD'] == 0]

        print(f"Original cases: {len(cases)}")
        print(f"Original controls: {len(controls)}")

        undersampled_controls = controls.sample(
            n=self.target_control_count, replace=False, random_state=42)

        num_synthetic_cases = max(0, self.target_case_count - len(cases))
        print(f"Generating {num_synthetic_cases} synthetic cases")

        features = cases.drop(columns=['CVD'])
        processed_features = self._preprocess_features(features, fit=True)

        self._initialize_gan(processed_features)
        self._train_gan(processed_features.astype(np.float32))

        synthetic_data = self._generate_synthetic_data(num_synthetic_cases)
        synthetic_df = self._postprocess_features(synthetic_data)
        synthetic_df['CVD'] = 1

        balanced_df = pd.concat([cases, synthetic_df, undersampled_controls], ignore_index=True)
        balanced_df = shuffle(balanced_df, random_state=42)

        return balanced_df, cases.reset_index(drop=True), synthetic_df.reset_index(drop=True)

    def _preprocess_features(self, df, fit=False):
        """Encode and scale features into the layout the GAN is trained on.

        Args:
            df: Feature DataFrame (without the 'CVD' column).
            fit: When True, fit the one-hot encoder and the scaler on ``df``
                before transforming.

        Returns:
            DataFrame whose columns are exactly ``_get_full_column_order()``,
            in that order. Columns of ``df`` that are not declared in any of
            the three column groups are dropped.

        Raises:
            ValueError: If a declared categorical column is missing, or the
                encoded width differs from the fitted encoder's column names.
        """
        df = df.copy()

        if self.categorical_columns:
            # Check all expected categorical columns are present
            missing = [col for col in self.categorical_columns if col not in df.columns]
            if missing:
                raise ValueError(f"Missing categorical columns: {missing}")

            if fit:
                self.categorical_encoder.fit(df[self.categorical_columns])
                self.encoded_cat_columns = self.categorical_encoder.get_feature_names_out(self.categorical_columns)

            encoded_array = self.categorical_encoder.transform(df[self.categorical_columns])

            # Convert to dense if sparse matrix (OneHotEncoder may return sparse)
            if hasattr(encoded_array, "toarray"):
                encoded_array = encoded_array.toarray()

            # Check shape before creating DataFrame
            actual_shape = encoded_array.shape[1]
            expected_shape = len(self.encoded_cat_columns)

            if actual_shape != expected_shape:
                raise ValueError(
                    f"Mismatch in encoded shape: got {actual_shape}, expected {expected_shape}.\n"
                    f"Most likely cause: some categories seen during fit() are missing in current data."
                )

            encoded_df = pd.DataFrame(encoded_array, columns=self.encoded_cat_columns, index=df.index)

            df = df.drop(columns=self.categorical_columns)
            df = pd.concat([df, encoded_df], axis=1)

        if self.numerical_columns:
            if fit:
                self.scaler.fit(df[self.numerical_columns])
            df[self.numerical_columns] = self.scaler.transform(df[self.numerical_columns])

        # Enforce the layout that _postprocess_features assumes. Without this the
        # generator output would be labelled by position using a different order
        # than the training data whenever the input frame's column order differs
        # from numerical + ordinal + categorical, silently mixing up features.
        return df[self._get_full_column_order()]

    def _postprocess_features(self, generated_data):
        """Convert raw generator output back to the original feature space.

        Args:
            generated_data: 2-D array whose columns follow
                ``_get_full_column_order()``.

        Returns:
            DataFrame with numerical columns inverse-scaled, ordinal columns
            rounded to integers and clipped to the range observed in
            ``original_df``, and categorical columns decoded by the one-hot
            encoder. Column order is numerical, ordinal, categorical.
        """
        df = pd.DataFrame(generated_data, columns=self._get_full_column_order())
        parts = []

        if self.numerical_columns:
            restored = self.scaler.inverse_transform(df[self.numerical_columns])
            parts.append(pd.DataFrame(restored, columns=self.numerical_columns, index=df.index))

        if self.ordinal_columns:
            ordinal = df[self.ordinal_columns].round().astype(int)
            for col in self.ordinal_columns:
                # Keep generated levels inside the range seen in the real data.
                min_val = int(self.original_df[col].min())
                max_val = int(self.original_df[col].max())
                ordinal[col] = ordinal[col].clip(min_val, max_val)
            parts.append(ordinal)

        if self.categorical_columns:
            cat_data = df[list(self.encoded_cat_columns)].values
            decoded = self.categorical_encoder.inverse_transform(cat_data)
            parts.append(pd.DataFrame(decoded, columns=self.categorical_columns, index=df.index))

        if not parts:
            return pd.DataFrame(index=df.index)
        return pd.concat(parts, axis=1)

    def _get_full_column_order(self):
        """Return the GAN feature layout: numerical, ordinal, then one-hot columns."""
        return (
            self.numerical_columns +
            self.ordinal_columns +
            list(self.encoded_cat_columns)
        )

    def _initialize_gan(self, processed_features):
        """Build generator and discriminator sized to the processed feature width."""
        input_dim = processed_features.shape[1]
        self.generator = self._build_generator(input_dim)
        self.discriminator = self._build_discriminator(input_dim)

    def _build_generator(self, output_dim):
        """Return an MLP mapping ``latent_dim`` noise to ``output_dim`` linear outputs."""
        return tf.keras.Sequential([
            tf.keras.layers.Input(shape=(self.latent_dim,)),
            tf.keras.layers.Dense(128, activation='relu'),
            tf.keras.layers.Dense(256, activation='relu'),
            tf.keras.layers.Dense(512, activation='relu'),
            tf.keras.layers.Dense(output_dim, activation='linear')
        ])

    def _build_discriminator(self, input_dim):
        """Return an MLP mapping ``input_dim`` features to one sigmoid probability."""
        return tf.keras.Sequential([
            tf.keras.layers.Input(shape=(input_dim,)),
            tf.keras.layers.Dense(512, activation='relu'),
            tf.keras.layers.Dense(256, activation='relu'),
            tf.keras.layers.Dense(128, activation='relu'),
            tf.keras.layers.Dense(1, activation='sigmoid')  # Use sigmoid since from_logits=False
        ])

    def _train_gan(self, real_data):
        """Train generator and discriminator on ``real_data`` for ``self.epochs`` epochs.

        Args:
            real_data: Processed real samples (DataFrame or array-like); it is
                converted to a float32 NumPy array before batching.
        """
        # Hand plain float32 arrays to the traced step instead of pandas
        # objects, so TensorFlow converts each batch to a tensor.
        real_data = np.asarray(real_data, dtype=np.float32)

        cross_entropy = tf.keras.losses.BinaryCrossentropy(from_logits=False)

        @tf.function
        def train_step(real_samples):
            # One noise vector per real sample in the batch.
            noise = tf.random.normal([tf.shape(real_samples)[0], self.latent_dim])

            with tf.GradientTape() as gen_tape, tf.GradientTape() as disc_tape:
                fake_samples = self.generator(noise, training=True)

                real_output = self.discriminator(real_samples, training=True)
                fake_output = self.discriminator(fake_samples, training=True)

                # Generator wants fakes classified as real (label 1).
                gen_loss = cross_entropy(tf.ones_like(fake_output), fake_output)
                # Discriminator wants real -> 1 and fake -> 0.
                disc_loss = (
                    cross_entropy(tf.ones_like(real_output), real_output) +
                    cross_entropy(tf.zeros_like(fake_output), fake_output)
                )

            gradients_gen = gen_tape.gradient(gen_loss, self.generator.trainable_variables)
            gradients_disc = disc_tape.gradient(disc_loss, self.discriminator.trainable_variables)

            self.generator_optimizer.apply_gradients(zip(gradients_gen, self.generator.trainable_variables))
            self.discriminator_optimizer.apply_gradients(zip(gradients_disc, self.discriminator.trainable_variables))

        print(f"Training GAN for {self.epochs} epochs...")
        for epoch in range(self.epochs):
            for i in range(0, real_data.shape[0], self.batch_size):
                batch = real_data[i:i+self.batch_size]
                train_step(batch)

            if (epoch + 1) % 10 == 0:
                print(f"Epoch {epoch + 1}/{self.epochs} completed")

    def _generate_synthetic_data(self, num_samples):
        """Return ``num_samples`` generator outputs as a NumPy array."""
        noise = tf.random.normal([num_samples, self.latent_dim])
        generated = self.generator(noise, training=False).numpy()
        return generated


if __name__ == "__main__":
    # Load the cleaned dataset from the working directory.
    df = pd.read_csv('df_cleaned.csv')

    # Columns passed to the generator as ordinal features.
    ordinal_cols = [
        'Childhood trauma score',
        'Depressive symptoms score',
        'Anxiety symptoms score',
        'Physically abused by family as a child',
        'Felt hated by family member as a child',
        'Sexually molested as a child',
        'Someone to take to doctor when needed as a child',
        'Felt loved',
        'Hypertension',
        'Smoking status',
        'Physical activity',
    ]

    # Columns passed to the generator as categorical (one-hot encoded) features.
    categorical_cols = [
        'Depressed mood',
        'Anhedonia',
        'Appetite changes',
        'Sleep problems',
        'Psychomotor changes',
        'Fatigue',
        'Feelings of inadequacy',
        'Cognitive problems',
        'Suicidal ideation',
        'Anxiety',
        'Restlessness',
        'Lack of relaxation',
        'Concentration problems',
        'Irritability',
        'Antidepressant use',
        'Diabetes',
        'Gender',
        'CVD Family history',
    ]

    # Columns passed to the generator as numerical (min-max scaled) features.
    numerical_cols = [
        'Diastolic blood pressure',
        'Systolic blood pressure',
        'Age',
        'Cholesterol',
        'HDL cholesterol',
        'LDL cholesterol',
        'Triglycerides',
        'Glucose',
        'Glycated haemoglobin',
    ]

    # Target 5000 cases and 5000 controls; train the GAN for 1000 epochs.
    gen = DataGenerator(
        df,
        target_case_count=5000,
        target_control_count=5000,
        ordinal_columns=ordinal_cols,
        categorical_columns=categorical_cols,
        numerical_columns=numerical_cols,
        epochs=1000,
        batch_size=32,
    )

    # Returns the balanced frame plus the real and synthetic cases separately.
    balanced_df, real_cases, synthetic_cases = gen.balance_data()


In [21]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from scipy.stats import mannwhitneyu, chi2_contingency
from IPython.display import display

class DataInspector:
    """Compare real and synthetic data with per-variable tests and KDE plots.

    Attributes:
        alpha: Significance threshold; a test is flagged 'Significant' when
            its p-value is below this value.
        results: DataFrame with one row per tested variable and the columns
            listed in ``RESULT_COLUMNS``.
        figures: Matplotlib figures created by the last evaluation.
    """

    RESULT_COLUMNS = ['Variable', 'Type', 'Test', 'Statistic', 'P-value', 'Significant']

    def __init__(self, alpha=0.05):
        """Create an inspector with empty results.

        Args:
            alpha: Significance threshold applied to every test (default 0.05).
        """
        self.alpha = alpha
        self.results = pd.DataFrame(columns=self.RESULT_COLUMNS)
        self.figures = []

    def evaluate_statistical_similarity(self, real_df, synthetic_df, numerical_cols, ordinal_cols, categorical_cols):
        """Plot and test every listed variable, storing the outcome on the instance.

        Results from a previous call are discarded. Plots go to
        ``self.figures`` and test rows to ``self.results``; the table is not
        displayed automatically (call ``_display_results()`` for that).

        Args:
            real_df: DataFrame with the real observations.
            synthetic_df: DataFrame with the synthetic observations.
            numerical_cols: Columns compared with a two-sample KS test.
            ordinal_cols: Columns compared with a Mann-Whitney U test.
            categorical_cols: Columns compared with a chi-square test.
        """
        # Reset results for multiple runs
        self.results = pd.DataFrame(columns=self.RESULT_COLUMNS)
        self.figures = []

        # Combine all columns for plotting
        all_cols = numerical_cols + ordinal_cols + categorical_cols

        # Create figure for all distributions
        self._create_kde_subplots(real_df, synthetic_df, all_cols)

        # Evaluate each variable type
        if numerical_cols:
            self._evaluate_numerical(real_df, synthetic_df, numerical_cols)
        if ordinal_cols:
            self._evaluate_ordinal(real_df, synthetic_df, ordinal_cols)
        if categorical_cols:
            self._evaluate_categorical(real_df, synthetic_df, categorical_cols)

    def _create_kde_subplots(self, real_df, synthetic_df, all_cols):
        """Create a grid of KDE plots comparing real and synthetic for each variable.

        Columns missing from either frame get no plot and their grid cell is
        hidden. The figure is appended to ``self.figures``.
        """
        if not all_cols:
            return

        n_cols = 3  # Number of columns in the subplot grid
        n_rows = int(np.ceil(len(all_cols) / n_cols))

        # squeeze=False always yields a 2-D array, so one flattening path
        # covers single-row and multi-row grids alike.
        fig, axes = plt.subplots(n_rows, n_cols, figsize=(15, 5*n_rows), squeeze=False)
        axes = axes.ravel()

        for idx, col in enumerate(all_cols):
            ax = axes[idx]

            if col not in real_df.columns or col not in synthetic_df.columns:
                # Hide the cell instead of leaving an empty set of axes.
                ax.axis('off')
                continue

            # Plot KDE for both real and synthetic data
            sns.kdeplot(real_df[col], label='Real', ax=ax, fill=True, alpha=0.3, color='blue')
            sns.kdeplot(synthetic_df[col], label='Synthetic', ax=ax, fill=True, alpha=0.3, color='orange')

            ax.set_title(col)
            ax.set_xlabel('')
            ax.legend()

        # Hide any empty subplots
        for ax in axes[len(all_cols):]:
            ax.axis('off')

        plt.tight_layout()
        self.figures.append(fig)

    def _append_results(self, records):
        """Append a batch of result records (dicts keyed by RESULT_COLUMNS) to ``self.results``."""
        if not records:
            return
        new_rows = pd.DataFrame(records, columns=self.RESULT_COLUMNS)
        if self.results.empty:
            # Avoid concatenating with an empty frame; just adopt the new rows.
            self.results = new_rows
        else:
            self.results = pd.concat([self.results, new_rows], ignore_index=True)

    def _make_record(self, variable, var_type, test_name, statistic, p_value):
        """Build one result row, flagging significance against ``self.alpha``."""
        return {
            'Variable': variable,
            'Type': var_type,
            'Test': test_name,
            'Statistic': statistic,
            'P-value': p_value,
            'Significant': p_value < self.alpha
        }

    def _evaluate_numerical(self, real_df, synthetic_df, numerical_cols):
        """Evaluate numerical variables with KS test (missing values dropped)."""
        records = []
        for col in numerical_cols:
            if col not in real_df.columns or col not in synthetic_df.columns:
                continue

            # Two-sample KS test: the second argument is a sample, not a CDF.
            ks_stat, p_value = stats.kstest(real_df[col].dropna(),
                                            synthetic_df[col].dropna())
            records.append(self._make_record(col, 'Numerical', 'KS Test', ks_stat, p_value))

        self._append_results(records)

    def _evaluate_ordinal(self, real_df, synthetic_df, ordinal_cols):
        """Evaluate ordinal variables with a two-sided Mann-Whitney U test."""
        records = []
        for col in ordinal_cols:
            if col not in real_df.columns or col not in synthetic_df.columns:
                continue

            # Request the two-sided alternative explicitly so the result does
            # not depend on the installed SciPy's default.
            stat, p_value = mannwhitneyu(real_df[col].dropna(),
                                         synthetic_df[col].dropna(),
                                         alternative='two-sided')
            records.append(self._make_record(col, 'Ordinal', 'Mann-Whitney U', stat, p_value))

        self._append_results(records)

    def _evaluate_categorical(self, real_df, synthetic_df, categorical_cols):
        """Evaluate categorical variables with Chi-Square test on category counts."""
        records = []
        for col in categorical_cols:
            if col not in real_df.columns or col not in synthetic_df.columns:
                continue

            # Prepare contingency table over the union of observed categories
            real_counts = real_df[col].value_counts()
            syn_counts = synthetic_df[col].value_counts()
            all_cats = list(set(real_counts.index) | set(syn_counts.index))
            contingency = pd.DataFrame({
                'Real': real_counts.reindex(all_cats, fill_value=0),
                'Synthetic': syn_counts.reindex(all_cats, fill_value=0)
            })

            # Chi-Square Test
            chi2, p_value, _, _ = chi2_contingency(contingency)
            records.append(self._make_record(col, 'Categorical', 'Chi-Square', chi2, p_value))

        self._append_results(records)

    def _display_results(self):
        """Display all results in organized format, highlighting significant rows."""
        # Create a display copy with formatted numbers
        display_df = self.results.copy()
        display_df['Statistic'] = display_df['Statistic'].apply(lambda x: f"{x:.4f}" if isinstance(x, (int, float)) else x)
        display_df['P-value'] = display_df['P-value'].apply(lambda x: f"{x:.4f}" if isinstance(x, (int, float)) else x)

        # Display results table
        print("\n" + "="*80)
        print("Statistical Test Results Summary")
        print("="*80)

        def highlight_significant(row):
            return ['background-color: #ffcccc' if row['Significant'] else '' for _ in row]

        display(display_df.style.apply(highlight_significant, axis=1))

# Compare the real CVD cases with the GAN-generated ones. Both frames come from
# `gen.balance_data()` in the generator cell (`real_cases`, `synthetic_cases`);
# the names previously used here were not defined anywhere in the notebook.
inspector = DataInspector()
inspector.evaluate_statistical_similarity(
    real_cases,
    synthetic_cases,
    numerical_cols=numerical_cols,
    ordinal_cols=ordinal_cols,
    categorical_cols=categorical_cols
)