In [None]:
import os
import sys

# Remember the directory the notebook runs from; it is restored below after
# the temporary chdir.
notebook_dir = os.getcwd()
print(notebook_dir)

# The helper module is imported by temporarily switching the working directory
# to ./code (presumably where data_preprocess.py lives -- confirm).
# try/finally guarantees the working directory is restored even if the import
# raises; otherwise the kernel would be left inside ./code and a re-run of this
# cell would record the wrong notebook_dir.
os.chdir('./code')
try:
    from data_preprocess import read_rdata_to_df
finally:
    os.chdir(notebook_dir)


In [None]:
# Relative path of the RData file to load; it is resolved against the current
# working directory, which the first cell resets to the notebook directory.
rdata_file = 'data/nhanes_data.RData'
# Load the file with the project helper imported from data_preprocess. The
# following cells use the result as a pandas DataFrame (.columns, .rename,
# .describe).
df = read_rdata_to_df(rdata_file)

In [None]:
# Show the column labels of the loaded frame; the feature mapping in the next
# cell selects columns by these names.
df.columns

In [None]:
# Select and rename features.
# Keys are the column names present in the loaded frame; values are the names
# used from this cell onward (presumably the feature names the saved models
# were fitted with -- confirm against the training code).
feature_mapping = {
    'FEMALE': 'female',
    'AGE': 'baseline_age',
    'CVD_HISTORY': 'cvd_hx_baseline',
    'BLACK': 'black',
    'SMOKE': 'smoke',
    'BMI': 'bmi',
    'SBP': 'sbp',
    'DBP': 'dbp',
    'HR': 'hr',
    'HBA1C': 'hba1c',
    'TCHOL': 'chol',
    'LDL': 'ldl',
    'HDL': 'hdl',
    'TRIG': 'trig',
    'FG': 'fpg',
    'POTASSIUM': 'potassium',
    'SCREAT': 'screat',
    'UACR': 'uacr',
    'BPRX': 'bprx',
    'STATIN': 'statin'
}

source_columns = list(feature_mapping.keys())
renamed_columns = list(feature_mapping.values())
missing_columns = [col for col in source_columns if col not in df.columns]

# Select and rename columns.
# This cell rebinds `df`, so it must tolerate being run twice: on a second run
# the source columns are gone and only the renamed ones remain.
if not missing_columns:
    df = df[source_columns].rename(columns=feature_mapping)
elif all(col in df.columns for col in renamed_columns):
    # Already renamed by a previous run of this cell; just keep the selection
    # (and its column order) instead of failing with a KeyError.
    df = df[renamed_columns]
else:
    # Same exception type pandas would raise, but with an explicit message.
    raise KeyError(
        f"Columns missing from the loaded data: {missing_columns}"
    )

print("Selected features after renaming:")
print(df.columns.tolist())

# Display basic statistics for all variables
print("\nBasic statistics for all variables:")
print(df.describe().round(2))

In [None]:
import joblib
import pandas as pd
import numpy as np

# Where the saved models live and how many runs to scan.
# Expected layout: <MODELS_ROOT>/run_NNN/<model_name>/<model_name>.joblib
MODELS_ROOT = 'risk_models'
N_RUNS = 50

# Dictionary to store all predictions, keyed by output column name
predictions_dict = {}

# (model_name, run, stage) for every model that could not be loaded or applied
failed_models = []

# Model names for loading (keys are the on-disk names; values are display
# labels, which this cell does not use)
model_names = {
    'logistic_regression': 'Logistic Regression',
    'random_forest': 'Random Forest',
    'xgboost': 'XGBoost',
    'lightgbm': 'LightGBM',
    'svm': 'SVM'
}

# Load models and get predictions.
# This is deliberately best-effort: a model that is missing or fails is
# reported and skipped so the remaining models still produce predictions.
for model_name in model_names.keys():
    for run in range(1, N_RUNS + 1):
        run_dir = f'{MODELS_ROOT}/run_{run:03d}'
        model_path = os.path.join(run_dir, model_name, f'{model_name}.joblib')

        # Load the model.
        # NOTE(security): joblib.load unpickles the file and can execute
        # arbitrary code; only load model files from a trusted source.
        try:
            model = joblib.load(model_path)
        except Exception as e:
            print(f"Error loading {model_name} from run {run}: {str(e)}")
            failed_models.append((model_name, run, 'load'))
            continue

        # Get predictions. Column 1 of predict_proba is the probability of the
        # model's second class (presumably the positive/event class -- confirm
        # against the training labels).
        try:
            y_pred_proba = model.predict_proba(df)[:, 1]
        except Exception as e:
            # Reported separately so a prediction failure is not mislabelled
            # as a loading failure.
            print(f"Error predicting with {model_name} from run {run}: {str(e)}")
            failed_models.append((model_name, run, 'predict'))
            continue

        # Store predictions with model name and run number
        col_name = f"risk_prediction_{model_name}_run{run}"
        predictions_dict[col_name] = y_pred_proba

        print(f"Successfully loaded and predicted with {model_name} from run {run}")

if failed_models:
    print(f"\n{len(failed_models)} model/run combinations were skipped: {failed_models}")

# Create DataFrame with all predictions.
# Each prediction array has one entry per row of df, in df's row order, so the
# frame is labelled with df's index. Without this it would get a fresh
# RangeIndex and a later column-wise concat with df could misalign rows.
risk_predictions = pd.DataFrame(predictions_dict, index=df.index)

# Print the shape and first few rows
print("\nPredictions DataFrame Shape:", risk_predictions.shape)
print("\nFirst few rows of predictions:")
print(risk_predictions.head())

# Basic statistics of predictions
print("\nSummary statistics of predictions:")
print(risk_predictions.describe())

In [None]:
# Combine original features with risk predictions.
# pd.concat(axis=1) aligns rows by index label, but the predictions are aligned
# with df by position (one prediction per df row, in order). Re-label the
# predictions with df's index first so a non-default df index cannot produce
# NaN-padded, duplicated rows in the output.
if risk_predictions.shape[1] == 0:
    # No model produced predictions; keep only the original features.
    aligned_predictions = pd.DataFrame(index=df.index)
elif len(risk_predictions) != len(df):
    raise ValueError(
        f"Row count mismatch: df has {len(df)} rows but risk_predictions has "
        f"{len(risk_predictions)}; re-run the prediction cell on the current df."
    )
else:
    aligned_predictions = risk_predictions.set_axis(df.index, axis=0)

combined_df = pd.concat([df, aligned_predictions], axis=1)

# Print info about the combined dataset
print("Combined DataFrame Shape:", combined_df.shape)
print("\nColumns in combined dataset:")
print("Original features:", df.columns.tolist())
print("\nRisk prediction columns:", risk_predictions.columns.tolist())
print("\nFirst few rows of combined dataset:")
print(combined_df.head())

# Save the combined dataframe to CSV (the index is not written)
output_path = os.path.join('data', 'nhanes_data_prediction.csv')
combined_df.to_csv(output_path, index=False)

print(f"Saved combined dataset to {output_path}")
print(f"File shape: {combined_df.shape[0]} rows × {combined_df.shape[1]} columns")
print(f"Contains {len(df.columns)} original features and {len(risk_predictions.columns)} risk prediction columns")