# Cross Vaccine markers

I'm going to look at stable features.

In [4]:
import pandas as pd
import glob
import os

def aggregate_shap_features(folder_path, threshold=3):
    """Summarise SHAP feature importances across the CSV files of a folder.

    Every file in ``folder_path`` whose name matches
    ``*2_label_Consensus_*.csv`` is loaded and stacked into one table. The
    table is grouped by the ``Model`` and ``Feature`` columns to obtain, for
    each pair, the number of rows it occurs in (``frequency``) and the mean
    of its ``Importance`` column (``mean_importance``).

    Args:
        folder_path: Directory that is searched for the SHAP CSV files.
        threshold: Minimum ``frequency`` a feature needs to be kept.

    Returns:
        A dict mapping each model name to a DataFrame of its retained
        features, ordered by descending ``mean_importance``; ``None`` when
        no file matches the pattern. The per-model tables are also printed.

    NOTE(review): ``frequency`` is a row count, so it equals the number of
    runs only if each file holds at most one row per (Model, Feature) pair
    -- confirm against the files that are produced upstream.
    """
    pattern = os.path.join(folder_path, "*2_label_Consensus_*.csv")
    csv_paths = glob.glob(pattern)
    if len(csv_paths) == 0:
        print("No files found in", folder_path)
        return None

    # Load every matching file and stack the rows into a single table.
    frames = []
    for csv_path in csv_paths:
        frames.append(pd.read_csv(csv_path))
    combined = pd.concat(frames, ignore_index=True)

    # One summary row per (Model, Feature) pair.
    grouped = combined.groupby(['Model', 'Feature'])
    summary = grouped.agg(
        frequency=('Feature', 'count'),
        mean_importance=('Importance', 'mean'),
    )
    summary = summary.reset_index()

    # Keep, per model, only the features that reach the frequency threshold.
    frequent_enough = summary['frequency'] >= threshold
    robust_dict = {}
    for model in summary['Model'].unique():
        selected = summary.loc[(summary['Model'] == model) & frequent_enough]
        ranked = selected.sort_values(by='mean_importance', ascending=False)
        robust_dict[model] = ranked
        print(f"\nRobust features for {model} (appearing in at least {threshold} runs):")
        print(ranked)

    return robust_dict

# Result folders holding the per-run SHAP CSV files of each vaccine.
measles_folder = "../data_created/best_models/results/Measles/"
hepatitis_folder = "../data_created/best_models/results/Hepatitis/"

# Same threshold (3) for both cohorts so their tables can be compared.
print("------Measles-------")
measles_robust = aggregate_shap_features(measles_folder, threshold=3)

print("-----Hepatitis------")
hepatitis_robust = aggregate_shap_features(hepatitis_folder, threshold=3)



------Measles-------

Robust features for Clonal Breadth (appearing in at least 3 runs):
            Model                Feature  frequency  mean_importance
0  Clonal Breadth  fraction_sequences_ab          5          0.39962
1  Clonal Breadth  fraction_sequences_gd          5          0.19460

Robust features for Clonal Depth (appearing in at least 3 runs):
          Model                    Feature  frequency  mean_importance
2  Clonal Depth  uniqueMoleculeFraction_ab          5          0.51852
3  Clonal Depth  uniqueMoleculeFraction_gd          5          0.42704

Robust features for Cytokines (appearing in at least 3 runs):
        Model      Feature  frequency  mean_importance
11  Cytokines  HHV6.Status          3         0.070933
8   Cytokines         GCSF          3         0.048467
15  Cytokines         IL-8          4         0.043550
16  Cytokines         IL-9          3         0.034733
17  Cytokines     IL12-p40          3         0.021700
13  Cytokines        IL-13      