In [7]:
#call it with this
#load deps
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from xgboost import XGBRegressor
from scipy import stats
import re
import time
from GenerateControlFuncs import *
import os


#write versions
import sys
import pkg_resources

# Record the interpreter version so the environment used for this step
# can be reconstructed later.
python_version = sys.version

# One "name==version" entry per distribution visible to this kernel,
# sorted alphabetically.
# NOTE(review): setuptools documents pkg_resources as deprecated in favour of
# importlib.metadata; kept here so the recorded names stay unchanged.
installed_packages = sorted(
    "{}=={}".format(dist.project_name, dist.version)
    for dist in pkg_resources.working_set
)

# Define output file
output_file = "../Paper_Figures/Package_Info/pythonStep2_environment.txt"

# Create the target folder when it is missing; without this, open() raises
# FileNotFoundError on a checkout where Package_Info does not exist yet.
os.makedirs(os.path.dirname(output_file), exist_ok=True)

# Write to file (explicit encoding so the result does not depend on the
# platform's default locale).
with open(output_file, "w", encoding="utf-8") as f:
    f.write(f"Python Version:\n{python_version}\n\n")
    f.write("Installed Packages:\n")
    f.write("\n".join(installed_packages))

print(f"Python environment details saved to {output_file}")

Python environment details saved to ../Paper_Figures/Package_Info/pythonStep2_environment.txt


### Readme

This code uses multithreading (XGBoost's own parallelism), so the more cores you give it, the faster it will train.
The outputs can be used in R Steps 5 and 6.

In [1]:
import os
import pandas as pd
from GenerateControlFuncs import *

# ========================
# 1) Load Data
# ========================
# One CSV per dataset, read from the Step 1 output folder.
data_dir = '../Step1_LoadingAndCleaningData'
AM = pd.read_csv(f'{data_dir}/AM.csv')
FM = pd.read_csv(f'{data_dir}/FM.csv')
AP = pd.read_csv(f'{data_dir}/AP.csv')
FP = pd.read_csv(f'{data_dir}/FP.csv')
AMP = pd.read_csv(f'{data_dir}/AMP.csv')
FMP = pd.read_csv(f'{data_dir}/FMP.csv')

# Keyed by dataset name; the same keys index `best_params` below.
datasets = {
    'FM': FM,
    'AM': AM,
    'AP': AP,
    'FP': FP,
    'FMP': FMP,
    'AMP': AMP
}

# ========================
# 2) Define Best Params
# ========================
# Per-dataset XGBoost hyperparameters, passed to the analysis as `hyperparam_dict`.
best_params = {
    'FM':  {'subsample': 0.8, 'n_estimators': 500, 'max_depth': 3, 'learning_rate': 0.2, 'colsample_bytree': 0.6},
    'AM':  {'subsample': 0.8, 'n_estimators': 500, 'max_depth': 3, 'learning_rate': 0.2, 'colsample_bytree': 0.6},
    'AP':  {'subsample': 1.0, 'n_estimators': 100, 'max_depth': 3, 'learning_rate': 0.01, 'colsample_bytree': 1.0},
    'FP':  {'subsample': 1.0, 'n_estimators': 100, 'max_depth': 5, 'learning_rate': 0.01, 'colsample_bytree': 1.0},
    'FMP': {'subsample': 0.6, 'n_estimators': 500, 'max_depth': 3, 'learning_rate': 0.2, 'colsample_bytree': 0.6},
    'AMP': {'subsample': 0.8, 'n_estimators': 200, 'max_depth': 9, 'learning_rate': 0.05, 'colsample_bytree': 0.8}
}

# ========================
# 3) Other Settings
# ========================
# These settings are now actually passed to the call in section 4; previously
# the call used its own hard-coded literals and silently ignored them.
selected_groups = ["M_Rapa", "M_Aca", "M_Cana", "M_CR", "M_17aE2", "M_Cont_12"]
k_folds = 10
iters = 20   # repeated CV iterations (the value the call previously hard-coded)
# NOTE(review): the call previously passed output_prefix="results"; this is the
# prefix the recovery cell uses for the final files -- confirm it is the intended one.
output_prefix = "XGBoost_Analysis_10k_20i_details_fig1"
cols_to_ignore = [
    'median_lifespan_increase', 'Grp_Sex', 'Lifespan_Increased2', 'Grp',
    'Mouse', 'ID', 'group', 'Treatment', 'X18198', 'Condition', 'Sex',
    'Mouse_ID', 'Unnamed: 0'
]

# ========================
# 4) Run the Analysis
# ========================
# perform_xgboost_analysis_kfold comes from GenerateControlFuncs (star import).
results = perform_xgboost_analysis_kfold(
    datasets=datasets,
    selected_groups=selected_groups,
    k=k_folds,
    output_prefix=output_prefix,
    cols_to_ignore=cols_to_ignore,
    hyperparam_dict=best_params,
    iters=iters,
    save_intermediate=True,           # presumably writes partial results per iteration -- verify in GenerateControlFuncs
    intermediate_dir="Intermediate"   # optional custom folder for those partial results
)

# ========================
# 5) Access the Results
# ========================
all_results = results['all_results']
control_results = results['control_results']
group_labels = results['group_labels']

print("\nSummary of results:")
print(f"All results shape: {all_results.shape}")
print(f"Control results shape: {control_results.shape}")

print("\nGroup labels:")
for dataset_name, labels in group_labels.items():
    print(f"{dataset_name}: {labels}")

# Optional: Additional post-analysis
# Mean and standard deviation of Median_Prediction for each dataset / group pair.
for dataset_name in all_results['Dataset'].unique():
    print(f"\nDataset: {dataset_name}")
    dataset_res = all_results[all_results['Dataset'] == dataset_name]
    for group in dataset_res['Grp_Sex'].unique():
        grp_preds = dataset_res.loc[dataset_res['Grp_Sex'] == group, 'Median_Prediction']
        print(f"  {group}: Mean = {grp_preds.mean():.2f}, Std = {grp_preds.std():.2f}")


Starting XGBoost analysis with k-fold cross-validation (XGBoost's own parallelism)

Processing dataset: FM

Starting analysis for dataset: FM
After adding median_lifespan_increase. Shape: (278, 1058)
After adding Mouse_ID. Shape: (278, 1059)
After filtering for selected groups. Shape: (94, 1059)
After selecting numeric predictors. Shape: (94, 1051)
Column names cleaned
After removing columns with NaN or Inf. Shape: (94, 1051)
Processing fold 1 of 10 (Iteration 0)
  Training XGBoost model for fold 1, iteration 0
  Making predictions for fold 1, iteration 0
Processing fold 2 of 10 (Iteration 0)
  Training XGBoost model for fold 2, iteration 0
  Making predictions for fold 2, iteration 0
Processing fold 3 of 10 (Iteration 0)
  Training XGBoost model for fold 3, iteration 0
  Making predictions for fold 3, iteration 0
Processing fold 4 of 10 (Iteration 0)
  Training XGBoost model for fold 4, iteration 0
  Making predictions for fold 4, iteration 0
Processing fold 5 of 10 (Iteration 0)
  Tr

In [3]:
# In case the final files were saved to the wrong place, rebuild them manually by concatenating the CSVs in the Intermediate folder
import os
import glob
import pandas as pd
import numpy as np

# Set the intermediate folder where partial CSV files were saved
intermediate_dir = "Intermediate"
# Value of Grp_Sex that identifies the control group
control_group = "M_Cont_12"


def summarize_all_results(predictions):
    """Median and standard deviation of Prediction per Dataset / Grp_Sex / Iteration.

    Parameters
    ----------
    predictions : pandas.DataFrame
        Must contain the columns 'Dataset', 'Grp_Sex', 'Iteration' and 'Prediction'.

    Returns
    -------
    pandas.DataFrame
        Columns: 'Dataset', 'Grp_Sex', 'Iteration', 'Median_Prediction',
        'Std_Dev_Prediction'; one row per group.
    """
    # Named aggregation yields flat column names directly, so no MultiIndex
    # has to be flattened by hand afterwards.
    return (
        predictions
        .groupby(['Dataset', 'Grp_Sex', 'Iteration'], as_index=False)
        .agg(
            Median_Prediction=('Prediction', 'median'),
            Std_Dev_Prediction=('Prediction', 'std'),
        )
    )


def summarize_control_results(predictions, control_group="M_Cont_12"):
    """Median Prediction per control-group mouse and iteration.

    Keeps only rows whose 'Grp_Sex' equals `control_group`, groups them by
    Dataset, Grp_Sex, Mouse_ID, Actual and Iteration, and returns the median
    of 'Prediction' for each group as a flat DataFrame.
    """
    control_rows = predictions[predictions['Grp_Sex'] == control_group]
    return (
        control_rows
        .groupby(['Dataset', 'Grp_Sex', 'Mouse_ID', 'Actual', 'Iteration'])['Prediction']
        .median()
        .reset_index()
    )


# Get all CSV files in the intermediate folder. glob returns them in arbitrary
# order, so sort to make the row order of the concatenated output reproducible.
csv_files = sorted(glob.glob(os.path.join(intermediate_dir, "*.csv")))
print(f"Found {len(csv_files)} intermediate CSV files.")

# Read each CSV file and store into a list of DataFrames
dfs = []
for file in csv_files:
    print(f"Reading file: {file}")
    dfs.append(pd.read_csv(file))

# Concatenate all the intermediate DataFrames into one final DataFrame
if dfs:
    all_predictions = pd.concat(dfs, ignore_index=True)
else:
    print("No intermediate CSV files found.")
    all_predictions = pd.DataFrame()

if not all_predictions.empty:
    # 1. All Results: per Dataset / Grp_Sex / Iteration summary of Prediction.
    all_results = summarize_all_results(all_predictions)

    # 2. Control Results: per-mouse median Prediction for the control group.
    control_results = summarize_control_results(all_predictions, control_group)

    # 3. All Predictions All Groups:
    # This is simply the concatenated DataFrame "all_predictions".

    # Define output prefix and file names
    output_prefix = "XGBoost_Analysis_10k_20i_details_fig1"
    all_results_file = f"{output_prefix}_All_Results.xlsx"
    control_results_file = f"{output_prefix}_Control_Results.xlsx"
    all_predictions_file = f"{output_prefix}_All_Predictions_All_Groups.xlsx"

    # Save final documents to Excel
    all_results.to_excel(all_results_file, index=False)
    control_results.to_excel(control_results_file, index=False)
    all_predictions.to_excel(all_predictions_file, index=False)

    print("Final documents saved:")
    print(f" - {all_results_file}")
    print(f" - {control_results_file}")
    print(f" - {all_predictions_file}")
else:
    print("No prediction data to process.")


Found 120 intermediate CSV files.
Reading file: Intermediate/AM_iter_2.csv
Reading file: Intermediate/FM_iter_17.csv
Reading file: Intermediate/AP_iter_1.csv
Reading file: Intermediate/AM_iter_0.csv
Reading file: Intermediate/FM_iter_4.csv
Reading file: Intermediate/AM_iter_9.csv
Reading file: Intermediate/FMP_iter_2.csv
Reading file: Intermediate/AM_iter_19.csv
Reading file: Intermediate/FP_iter_14.csv
Reading file: Intermediate/FMP_iter_18.csv
Reading file: Intermediate/FMP_iter_11.csv
Reading file: Intermediate/FP_iter_3.csv
Reading file: Intermediate/FP_iter_11.csv
Reading file: Intermediate/AM_iter_18.csv
Reading file: Intermediate/AP_iter_5.csv
Reading file: Intermediate/AMP_iter_13.csv
Reading file: Intermediate/AP_iter_6.csv
Reading file: Intermediate/AMP_iter_4.csv
Reading file: Intermediate/FM_iter_1.csv
Reading file: Intermediate/AMP_iter_11.csv
Reading file: Intermediate/AP_iter_3.csv
Reading file: Intermediate/AMP_iter_19.csv
Reading file: Intermediate/AM_iter_4.csv
Readin

In [6]:
#write versions
import os
import sys
import pkg_resources

# Record the interpreter version so the environment used for this step
# can be reconstructed later.
python_version = sys.version

# One "name==version" entry per distribution visible to this kernel,
# sorted alphabetically.
# NOTE(review): setuptools documents pkg_resources as deprecated in favour of
# importlib.metadata; kept here so the recorded names stay unchanged.
installed_packages = sorted(
    "{}=={}".format(dist.project_name, dist.version)
    for dist in pkg_resources.working_set
)

# Define output file
output_file = "../Paper_Figures/Package_Info/pythonStep2_environment.txt"

# Create the target folder when it is missing; without this, open() raises
# FileNotFoundError on a checkout where Package_Info does not exist yet.
os.makedirs(os.path.dirname(output_file), exist_ok=True)

# Write to file (explicit encoding so the result does not depend on the
# platform's default locale).
with open(output_file, "w", encoding="utf-8") as f:
    f.write(f"Python Version:\n{python_version}\n\n")
    f.write("Installed Packages:\n")
    f.write("\n".join(installed_packages))

print(f"Python environment details saved to {output_file}")

Python environment details saved to ../Paper_Figures/Package_Info/pythonStep2_environment.txt
