In [1]:
#call it with this
#load deps
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from xgboost import XGBRegressor
from scipy import stats
import re
import time
from GenerateControlFuncs import *

### Readme

This code uses multithreading, so the more cores you give it, the faster it will train.
The outputs can be used in R Steps 5 and 6.

In [6]:
import os
import pandas as pd
from GenerateControlFuncs import *

# ========================
# 1) Load Data
# ========================
# Read the six CSV files from the Step1_LoadingAndCleaningData directory.
# `map` is lazy; the tuple unpacking on the left consumes it, so the files
# are read one after another in the order listed.
AM, FM, AP, FP, AMP, FMP = map(pd.read_csv, (
    '../Step1_LoadingAndCleaningData/AM.csv',
    '../Step1_LoadingAndCleaningData/FM.csv',
    '../Step1_LoadingAndCleaningData/AP.csv',
    '../Step1_LoadingAndCleaningData/FP.csv',
    '../Step1_LoadingAndCleaningData/AMP.csv',
    '../Step1_LoadingAndCleaningData/FMP.csv',
))

# Dataset name -> DataFrame. Insertion order (FM first) is the order in which
# the mapping is iterated.
datasets = dict(FM=FM, AM=AM, AP=AP, FP=FP, FMP=FMP, AMP=AMP)

# ========================
# 2) Define Best Params
# ========================
# One hyperparameter set per dataset name; the keys mirror those of `datasets`.
# Each inner dict lists the parameters in the same order:
# subsample, n_estimators, max_depth, learning_rate, colsample_bytree.
best_params = dict(
    FM=dict(subsample=0.8, n_estimators=500, max_depth=3,
            learning_rate=0.2, colsample_bytree=0.6),
    AM=dict(subsample=0.8, n_estimators=500, max_depth=3,
            learning_rate=0.2, colsample_bytree=0.6),
    AP=dict(subsample=1.0, n_estimators=100, max_depth=3,
            learning_rate=0.01, colsample_bytree=1.0),
    FP=dict(subsample=1.0, n_estimators=100, max_depth=5,
            learning_rate=0.01, colsample_bytree=1.0),
    FMP=dict(subsample=0.6, n_estimators=500, max_depth=3,
             learning_rate=0.2, colsample_bytree=0.6),
    AMP=dict(subsample=0.8, n_estimators=200, max_depth=9,
             learning_rate=0.05, colsample_bytree=0.8),
)

# ========================
# 3) Other Settings
# ========================
# Group labels forwarded as `selected_groups` to the analysis call below.
selected_groups = ["M_Rapa", "M_Aca", "M_Cana", "M_CR", "M_17aE2", "M_Cont_12"]

# Cross-validation settings. These are the single source of truth: the call
# below reads them instead of repeating literal values, so changing them here
# is enough to change the run.
k_folds = 10  # number of folds per cross-validation pass
iters = 2     # number of repeated CV iterations (raise it, e.g. to 20, for a longer run)

# NOTE(review): the prefix advertises "20i" while `iters` is 2. The string is
# left untouched because output names may be consumed by later steps; confirm
# which of the two is intended.
output_prefix = "XGBoost_Analysis_10k_20i_details_fig1"

# Column names forwarded as `cols_to_ignore`; presumably identifier/label
# columns that must not be used as predictors -- verify against
# GenerateControlFuncs.
cols_to_ignore = [
    'median_lifespan_increase', 'Grp_Sex', 'Lifespan_Increased2', 'Grp',
    'Mouse', 'ID', 'group', 'Treatment', 'X18198', 'Condition', 'Sex',
    'Mouse_ID', 'Unnamed: 0'
]

# ========================
# 4) Run the Analysis
# ========================
# Every argument comes from the configuration defined above; nothing is
# hard-coded at the call site.
results = perform_xgboost_analysis_kfold(
    datasets=datasets,
    selected_groups=selected_groups,
    k=k_folds,
    output_prefix=output_prefix,
    cols_to_ignore=cols_to_ignore,
    hyperparam_dict=best_params,  # per-dataset hyperparameters
    iters=iters,
)

# ========================
# 5) Access the Results
# ========================
# Pull the three entries this cell reports on out of the returned mapping.
all_results, control_results, group_labels = (
    results[key] for key in ('all_results', 'control_results', 'group_labels')
)

# Shapes of the two result tables, one line each.
print(
    "\nSummary of results:",
    f"All results shape: {all_results.shape}",
    f"Control results shape: {control_results.shape}",
    sep="\n",
)

# Labels recorded for each dataset.
print("\nGroup labels:")
for dataset_name, labels in group_labels.items():
    print(f"{dataset_name}: {labels}")

# Optional: Additional post-analysis
# For every dataset, report the mean and standard deviation of
# 'Median_Prediction' within each 'Grp_Sex' value, in order of first appearance.
for dataset_name in all_results['Dataset'].unique():
    print(f"\nDataset: {dataset_name}")
    dataset_res = all_results.loc[all_results['Dataset'] == dataset_name]
    for group in dataset_res['Grp_Sex'].unique():
        grp_preds = dataset_res.loc[dataset_res['Grp_Sex'] == group, 'Median_Prediction']
        print(f"  {group}: Mean = {grp_preds.mean():.2f}, Std = {grp_preds.std():.2f}")


Starting XGBoost analysis with k-fold cross-validation (XGBoost's own parallelism)

Processing dataset: FM

Starting analysis for dataset: FM
After adding median_lifespan_increase. Shape: (278, 1058)
After adding Mouse_ID. Shape: (278, 1059)
After filtering for selected groups. Shape: (94, 1059)
After selecting numeric predictors. Shape: (94, 1051)
Column names cleaned
After removing columns with NaN or Inf. Shape: (94, 1051)
Processing fold 1 of 10 (Iteration 0)
  Training XGBoost model for fold 1, iteration 0
  Making predictions for fold 1, iteration 0
Processing fold 2 of 10 (Iteration 0)
  Training XGBoost model for fold 2, iteration 0
  Making predictions for fold 2, iteration 0
Processing fold 3 of 10 (Iteration 0)
  Training XGBoost model for fold 3, iteration 0
  Making predictions for fold 3, iteration 0
Processing fold 4 of 10 (Iteration 0)
  Training XGBoost model for fold 4, iteration 0
  Making predictions for fold 4, iteration 0
Processing fold 5 of 10 (Iteration 0)
  Tr