In [None]:
import os
import sys
import pandas as pd
import numpy as np
# Remember the directory the notebook runs from so it can be restored after
# the temporary switch below.
notebook_dir = os.getcwd()
print(notebook_dir)

# The import is performed from inside ./code (presumably where
# data_preprocess.py lives -- confirm). The switch back is placed in a
# `finally` so that a failed import cannot leave the kernel stranded in
# ./code, which would break every later relative path and make a re-run of
# this cell record the wrong `notebook_dir`.
os.chdir('./code')
try:
    from data_preprocess import read_rdata_to_df
finally:
    os.chdir(notebook_dir)


In [None]:
# Load the two feature tables and the outcome table, in this order, from the
# CSV files under data/ (paths are relative to the current working directory).
feature_data_blr, feature_data_f24, outcome_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        'data/accord_blr.csv',
        'data/accord_f24.csv',
        'data/accord_outcomes.csv',
    )
)

In [None]:
# Display the column labels of the table read from data/accord_f24.csv.
# Left as the last expression of the cell so the notebook renders the Index.
feature_data_f24.columns

In [None]:
import joblib
import pandas as pd
import numpy as np


# Model names for loading.
# Keys are the directory / file stems used to locate each saved estimator on
# disk; values are display labels (not used in this cell).
model_names = {
    'logistic_regression': 'Logistic Regression',
    'random_forest': 'Random Forest',
    'xgboost': 'XGBoost',
    'lightgbm': 'LightGBM',
    'svm': 'SVM'
}

# Output file stem -> feature table to score.
df_to_inference = {
    'accord_blr_risks':feature_data_blr,
    'accord_f24_risks':feature_data_f24
}

# Saved runs are looked up as risk_models/run_001 ... risk_models/run_050.
N_RUNS = 50

for name, feat_data in df_to_inference.items():

    # Keep the complete table (identifiers included) for the output file; the
    # models are given every remaining column once "MaskID" and "Visit" are
    # dropped.
    df_full = feat_data.copy()
    df = df_full.drop(columns=["MaskID","Visit"])

    # Column name -> array of predictions, one entry per (model, run).
    predictions_dict = {}

    # Run inference with every saved model of every run. A failure for one
    # model is reported and skipped so the remaining models still contribute.
    for model_name in model_names.keys():
        for run in range(1, N_RUNS + 1):
            run_dir = f'risk_models/run_{run:03d}'
            model_path = os.path.join(run_dir, model_name, f'{model_name}.joblib')

            # NOTE(security): joblib.load unpickles the file and can execute
            # arbitrary code; only load model files from a trusted source.
            try:
                model = joblib.load(model_path)
            except Exception as e:
                print(f"Error loading {model_name} from run {run}: {str(e)}")
                continue

            # Loading and predicting are handled separately so that a
            # prediction failure (e.g. a feature mismatch) is not reported as
            # a loading problem.
            try:
                # Column 1 of predict_proba -- presumably the probability of
                # the positive / event class; confirm against training code.
                y_pred_proba = model.predict_proba(df)[:, 1]
            except Exception as e:
                print(f"Error predicting with {model_name} from run {run}: {str(e)}")
                continue

            # Store predictions with model name and run number
            col_name = f"risk_prediction_{model_name}_run{run}"
            predictions_dict[col_name] = y_pred_proba

            print(f"Successfully loaded and predicted with {model_name} from run {run}")

    # Without a single successful model there is nothing to summarise or save
    # (describe() would raise on a frame with no columns), so say so clearly
    # and move on to the next table.
    if not predictions_dict:
        print(f"No predictions were produced for {name}; skipping {name}.csv")
        continue

    # Build the predictions on the feature table's own index: concat(axis=1)
    # aligns on index labels, so this keeps every prediction on its source row
    # even when the table does not carry a default 0..n-1 index.
    risk_predictions = pd.DataFrame(predictions_dict, index=df_full.index)

    # Print the shape and first few rows
    print("\nPredictions DataFrame Shape:", risk_predictions.shape)
    print("\nFirst few rows of predictions:")
    print(risk_predictions.head())

    # Basic statistics of predictions
    print("\nSummary statistics of predictions:")
    print(risk_predictions.describe())

    combined_df = pd.concat([df_full, risk_predictions], axis=1)
    # Save predictions to CSV with column headers but without row index
    output_path = os.path.join('data', f'{name}.csv')
    combined_df.to_csv(output_path, index=False)

    print(f"Saved predictions to {output_path}")
    # Report the shape of what was actually written (features + predictions),
    # not just the prediction columns.
    print(f"File shape: {combined_df.shape[0]} rows x {combined_df.shape[1]} columns")