# In [None]:
import numpy as np
import pandas as pd
import json
from xgboost import XGBClassifier
from imblearn.pipeline import Pipeline
from sklearn.model_selection import StratifiedKFold, cross_validate
import shap
import warnings

# --- Inputs -----------------------------------------------------------------
# `method` names the sub-directory used for both the input data and results.
method = 'TCP_LT_AOF'
data = pd.read_csv(f'../input_data/{method}/data.csv')

# Hyperparameters read from the results directory; they are applied to the
# modelling pipeline through `set_params` further down.
with open(f'../results/{method}/final_hyperparameters.json', 'r') as params_file:
    best_params = json.loads(params_file.read())

# Columns of `data` used as model features.
parameters = [
    'normal_convergence_rate',
    'subducting_ocean_floor_age',
    'obliquity_of_subduction',
    'migration_rate_x_distance',
]

X = data[parameters]
y = data['cu_mt']
# Binary target: 1 where `cu_mt` is greater than 2, otherwise 0.
y_cat = np.where(y > 2, 1, 0)

# --- Repeat configuration ---------------------------------------------------
# One shuffle seed per cross-validation repeat, drawn reproducibly from a
# fixed global NumPy seed.
CV_iterations = 1000
np.random.seed(42)
random_states = np.random.randint(9999, size=CV_iterations)

# SHAP storage: {sample label -> {CV repeat number -> placeholder}}.
# Every (sample, repeat) slot starts as an empty dict and is overwritten
# with that sample's SHAP values once the repeat has been processed.
shap_values_per_cv = {
    sample: {repeat: {} for repeat in range(CV_iterations)}
    for sample in X.index
}

# Per-repeat performance metrics: mean over the folds of one repeat ...
roc_auc_mean = []
precision_mean = []
recall_mean = []
f1_score_mean = []
# ... and the matching standard deviation over those folds.
roc_auc_std = []
precision_std = []
recall_std = []
f1_score_std = []

# Repeated cross-validations
#
# For every repeat:
#   1. build a 5-fold stratified splitter seeded with that repeat's seed,
#   2. score the tuned pipeline with `cross_validate` and record the
#      mean / standard deviation of each metric over the folds,
#   3. refit the tuned pipeline on every training fold and store the SHAP
#      values of the held-out samples under the current repeat number.

# Scorers passed to `cross_validate`; identical for every repeat, so the
# mapping is built once.
scoring = {
    'roc_auc': 'roc_auc',
    'precision': 'precision_macro',
    'recall': 'recall_macro',
    'f1': 'f1_macro'
}

# `CV.split` yields positional indices, whereas `shap_values_per_cv` is keyed
# by the labels of `X.index`; translate explicitly so the two always agree.
sample_labels = X.index.to_numpy()

for CV_iteration in range(CV_iterations):
    print('\n------------ CV Repeat number:', CV_iteration)

    CV = StratifiedKFold(n_splits=5, shuffle=True,
                         random_state=random_states[CV_iteration])

    pipeline = Pipeline(steps=[
        ('model', XGBClassifier(seed=42))
    ])
    # Apply the tuned hyperparameters BEFORE scoring. Previously they were
    # set only inside the fold loop below, so `cross_validate` evaluated a
    # default-parameter model while SHAP explained the tuned one.
    pipeline.set_params(**best_params)

    scores = cross_validate(pipeline, X, y_cat, cv=CV, scoring=scoring, n_jobs=-1)

    # Store performance metrics
    roc_auc_mean.append(np.mean(scores['test_roc_auc']))
    precision_mean.append(np.mean(scores['test_precision']))
    recall_mean.append(np.mean(scores['test_recall']))
    f1_score_mean.append(np.mean(scores['test_f1']))
    roc_auc_std.append(np.std(scores['test_roc_auc']))
    precision_std.append(np.std(scores['test_precision']))
    recall_std.append(np.std(scores['test_recall']))
    f1_score_std.append(np.std(scores['test_f1']))

    # Process each fold. The splitter has a fixed integer seed, so splitting
    # again reproduces the folds that `cross_validate` used above.
    for train_outer_ix, test_outer_ix in CV.split(X, y_cat):
        X_train, X_test = X.iloc[train_outer_ix, :], X.iloc[test_outer_ix, :]
        y_train = y_cat[train_outer_ix]

        pipeline.fit(X_train, y_train)

        model = pipeline.named_steps['model']
        explainer = shap.TreeExplainer(model)
        shap_values = explainer.shap_values(X_test)

        # Store the SHAP values of each test sample under this repeat.
        for j, test_position in enumerate(test_outer_ix):
            test_label = sample_labels[test_position]
            shap_values_per_cv[test_label][CV_iteration] = shap_values[j]

# Save performance metrics
# Each list holds one value per CV repeat, so the resulting table has one row
# per repeat: four columns of per-repeat means followed by the four matching
# standard deviations.
metric_columns = (
    'roc_auc', 'precision', 'recall', 'f1_score',
    'roc_auc_std', 'precision_std', 'recall_std', 'f1_score_std',
)
metric_series = (
    roc_auc_mean, precision_mean, recall_mean, f1_score_mean,
    roc_auc_std, precision_std, recall_std, f1_score_std,
)
performance_metrics = dict(zip(metric_columns, metric_series))

performance_metrics_df = pd.DataFrame(performance_metrics)
performance_metrics_df.to_csv('performance_metrics.csv', index=False)

# Save SHAP values for each sample
# One CSV per observation, named after its position in `data`; each column of
# the file holds the values stored for one CV repeat.
for n in range(len(data)):
    per_repeat_values = shap_values_per_cv[n]
    shaps_per_obs = pd.DataFrame.from_dict(per_repeat_values)
    shaps_per_obs.to_csv(f'sample_{n}.csv', index=False)