In [1]:
import os
import pandas as pd
import numpy as np

df = pd.read_csv('converted_data.csv')

In [2]:
high_priority_vars = [
    'Appetite changes',
    'Physically abused by family as a child',
    'Antidepressant use', 
    'CVD Family history'
]

# Keep rows missing ≤1 out of 4 variables (25% missingness allowed)
threshold = 0.25  # Adjust based on your needs (0.10 = 10%)
mask = df[high_priority_vars].isnull().mean(axis=1) <= threshold
df_clean = df[mask]

#% missingness
missing_values_sum = df_clean.isna().sum()
total_rows = df_clean.shape[0]
missing_values_percentage = (missing_values_sum / total_rows) * 100

In [23]:
def analyze_missing_values_and_uniques(df):
    # Calculate missing values percentage (only true NaN values)
    missing_values_sum = df.isna().sum()
    total_rows = df.shape[0]
    missing_values_percentage = (missing_values_sum / total_rows) * 100
    print("Original missing values (%):")
    print(missing_values_percentage)
    
    # Loop through each column
    for column in df.columns:
        print(f"\n--- Unique values for '{column}' ---")
        
        # Special handling for ID columns
        if column == 'project_pseudo_id':
            num_unique = df[column].nunique()
            print(f"Total unique IDs: {num_unique} (expected 1 per participant)")
            print(f"Duplicates found: {df[column].duplicated().sum()}")
            continue
            
        # Get value counts (including '$7' and other strings)
        value_counts = df[column].value_counts(dropna=False).reset_index()
        value_counts.columns = ['Unique Value', 'Count']
        
        # Calculate percentages safely
        total_count = value_counts['Count'].sum()
        
        # Create percentage column with mixed types
        percentages = []
        for count in value_counts['Count']:
            try:
                pct = (float(count) / total_count) * 100
                percentages.append(f"{pct:.2f}%")
            except (TypeError, ValueError):
                percentages.append("N/A")
        
        value_counts['Percentage'] = percentages
        display(value_counts)
    
    return df

In [33]:
# List of continuous variables to analyze
continuous_vars = [
     'Physical activity', 
    'Diastolic blood pressure', 
    'Systolic blood pressure', 
    # 'Age',
    # 'Cholesterol', 
    # 'HDL cholesterol', 
    # 'LDL cholesterol', 
    # 'Triglycerides',
    # 'Glucose', 
    # 'Glycated haemoglobin'
 
]

# Calculate and print stats for each variable
for var in continuous_vars:
    var_mean = df_clean[var].mean()
    var_std = df_clean[var].std()
    
    print(f"\n--- {var} ---")
    print(f"Mean: {var_mean:.2f}")  # Rounded to 2 decimal places
    print(f"Standard Deviation: {var_std:.2f}")
    print(f"Range: {df[var].min():.2f} - {df[var].max():.2f}")  # Bonus: shows min/max


--- Physical activity ---
Mean: 4.35
Standard Deviation: 2.20
Range: 0.00 - 25.00

--- Diastolic blood pressure ---
Mean: 74.55
Standard Deviation: 9.50
Range: 35.00 - 143.00

--- Systolic blood pressure ---
Mean: 127.14
Standard Deviation: 15.93
Range: 72.00 - 225.00


In [7]:

num_cases = df_clean[df_clean['CVD'] == 1].shape[0]

num_cases

#stats = analyze_missing_values_and_uniques(df)
#df.columns

1276

In [3]:
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer, SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

df = df_clean.copy()

# Define columns properly
ordinal_columns = [
    'Childhood trauma score',
    'Depressive symptoms score', 'Anxiety symptoms score',  
    'Physically abused by family as a child',
    'Felt hated by family member as a child',
    'Sexually molested as a child',
    'Someone to take to doctor when needed as a child',
    'Felt loved', 
    'Hypertension', 'Smoking status', 'Physical activity', 
]

true_categorical = [
    'Depressed mood', 'Anhedonia', 'Appetite changes', 'Sleep problems',
    'Psychomotor changes', 'Fatigue', 'Feelings of inadequacy',
    'Cognitive problems', 'Suicidal ideation', 'Anxiety', 'Restlessness',
    'Lack of relaxation',  'Concentration problems',
    'Irritability', 
    'Antidepressant use', 'Diabetes', 'Gender',
    'CVD Family history', 'CVD'
]

continuous_columns = [
    'Diastolic blood pressure', 'Systolic blood pressure',
    'Age', 'Cholesterol', 'HDL cholesterol', 'LDL cholesterol', 'Triglycerides',
    'Glucose', 'Glycated haemoglobin'
]

# Create transformers WITHOUT scaling
numeric_transformer = Pipeline([
    ('imputer', IterativeImputer(max_iter=10, random_state=42))
])

ordinal_transformer = Pipeline([
    ('imputer', IterativeImputer(max_iter=10, random_state=42))
])

categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent'))
])

# Combine transformers
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, continuous_columns),
        ('ord', ordinal_transformer, ordinal_columns),
        ('cat', categorical_transformer, true_categorical)
    ],
    remainder='passthrough'
)

# Apply transformations
df_imputed = pd.DataFrame(
    preprocessor.fit_transform(df),
    columns=continuous_columns + ordinal_columns + true_categorical
)

# Convert back to appropriate types
for col in ordinal_columns + continuous_columns:
    df_imputed[col] = df_imputed[col].round().astype(int) if col in ordinal_columns else df_imputed[col].astype(float)

for col in true_categorical:
    df_imputed[col] = df_imputed[col].astype('category')

# Verify
print("Missing values after imputation:")
print(df_imputed.isnull().sum().sum())  # Should be 0
print("\nFirst few rows of imputed data:")
print(df_imputed.head())

In [9]:
import matplotlib.pyplot as plt
# Before imputation (pairwise complete)
corr_before = df_clean[continuous_columns + ordinal_columns + true_categorical].corr(method='spearman')

# After imputation
corr_after = df_imputed[continuous_columns + ordinal_columns + true_categorical].corr(method='spearman')

# Difference in correlations
corr_diff = corr_after - corr_before
print("\nMaximum absolute correlation difference:", corr_diff.abs().max().max())

# Visualize correlation differences
plt.figure(figsize=(10, 8))
plt.imshow(corr_diff, cmap='coolwarm', vmin=-0.3, vmax=0.3)
plt.colorbar()
plt.title("Difference in Correlations (After - Before)")
plt.xticks(range(len(corr_diff.columns)), corr_diff.columns, rotation=90)
plt.yticks(range(len(corr_diff.columns)), corr_diff.columns)
plt.show()

In [10]:
corr_diff_unstacked = corr_diff.unstack().sort_values(key=abs, ascending=False)
print("Top 5 correlation changes:")
print(corr_diff_unstacked.head(5))

Top 5 correlation changes:
Anxiety insomnia           Depressive symptoms score   -0.174348
Depressive symptoms score  Anxiety insomnia            -0.174348
                           Anxiety fatigue             -0.157388
Anxiety fatigue            Depressive symptoms score   -0.157388
                           Anxiety symptoms score      -0.155103
dtype: float64


In [9]:
###Outliers
from sklearn.ensemble import IsolationForest

# Define features for outlier detection
features = [
    'Diastolic blood pressure', 'Systolic blood pressure', 
    'Cholesterol', 'HDL cholesterol', 'LDL cholesterol', 
    'Triglycerides', 'Glucose', 'Glycated haemoglobin'
]

# Extract the subset for outlier detection
features_data = df_imputed[features].copy()

# Fit Isolation Forest
iso_forest = IsolationForest(contamination=0.05, random_state=42)
outlier_flags = iso_forest.fit_predict(features_data)

# Create a mask for inliers (non-outliers)
inlier_mask = outlier_flags != -1  # Inliers are marked as 1

# Get cleaned feature data (outliers removed)
cleaned_features = features_data[inlier_mask].copy()

# Step 1: Get the indices of inliers (to align with original dataframe)
inlier_indices = features_data[inlier_mask].index

# Step 2: Select all columns from the original dataframe, but only keep rows that are inliers
df_cleaned = df_imputed.loc[inlier_indices].copy()

# Verify: The feature columns in df_cleaned should match cleaned_features
assert df_cleaned[features].equals(cleaned_features), "Mismatch in cleaned data!"

df_cleaned

df_cleaned.to_csv("df_cleaned.csv", index=False)