# Cross Vaccine markers

I'm going to look at stable features.

In [79]:
import pandas as pd
import glob
import os
import json

from PIL.features import features

In [80]:
def _average_metrics_by_data(file_list, cols):
    """Average the metric columns ``cols`` per ``Data`` value across CSV files.

    Args:
        file_list: Paths of the CSV files to read.
        cols: Names of the metric columns to average.

    Returns:
        A DataFrame with one row per distinct ``Data`` value and the mean of
        each column in ``cols``; an empty DataFrame when no file qualifies.
    """
    frames = []
    for file in file_list:
        df = pd.read_csv(file)
        # Skip files that lack the "Data" column or any requested metric column
        if "Data" in df.columns and all(col in df.columns for col in cols):
            frames.append(df[["Data"] + cols])
    if not frames:
        return pd.DataFrame()
    combined = pd.concat(frames, ignore_index=True)
    # Group by "Data" and compute the mean for the specified columns
    return combined.groupby("Data")[cols].mean().reset_index()


def aggregate_shap_features(folder_path, threshold=3):
    """Collect features that recur across the result CSV files of one folder.

    Every file in ``folder_path`` matching ``*2_label_Consensus_*_.csv`` is
    read and concatenated. Rows are grouped by ``Model`` and ``Feature``; a
    feature is kept for a model when it appears in at least ``threshold`` rows.
    The kept features are printed, followed by the average train and test
    metrics found in the same folder.

    Args:
        folder_path: Folder containing the result CSV files.
        threshold: Minimum number of occurrences for a feature to be kept.

    Returns:
        A dict mapping each ``Model`` value to a DataFrame with the columns
        ``Model``, ``Feature``, ``frequency`` and ``mean_importance``, sorted
        by ``mean_importance`` in descending order. ``None`` when no file
        matches the pattern.
    """
    # List all CSV files that match the result-file pattern in the folder
    files = glob.glob(os.path.join(folder_path, "*2_label_Consensus_*_.csv"))
    print(files)
    if not files:
        print("No files found in", folder_path)
        return None

    # Read and concatenate all matched files. The pattern also matches the
    # train/test metric files; their rows have no Model/Feature value and are
    # left out by groupby, which drops missing group keys by default.
    all_shap = pd.concat([pd.read_csv(file) for file in files], ignore_index=True)

    # Group by Model and Feature to compute frequency and mean importance
    robust_features = (
        all_shap.groupby(['Model', 'Feature'])
        .agg(frequency=('Feature', 'count'),
             mean_importance=('Importance', 'mean'))
        .reset_index()
    )

    robust_dict = {}
    for model in robust_features['Model'].unique():
        model_features = robust_features[robust_features['Model'] == model]
        robust = model_features[model_features['frequency'] >= threshold]
        robust = robust.sort_values(by='mean_importance', ascending=False)
        robust_dict[model] = robust
        print(f"\nRobust features for {model} (appearing in at least {threshold} runs):")
        print(robust)

    # Metric files are looked up in the folder the caller passed in, so each
    # dataset reports its own train/test metrics.
    all_files = glob.glob(os.path.join(folder_path, "*2_label_Consensus_*__.csv"))

    # Separate files into train and test based on the filename
    train_files = [file for file in all_files if "train" in os.path.basename(file).lower()]
    test_files = [file for file in all_files if "test" in os.path.basename(file).lower()]

    # Average the metrics for train and test separately
    avg_train_metrics = _average_metrics_by_data(
        train_files, ["TRAIN Accuracy", "TRAIN Balanced_Accuracy"]
    )
    avg_test_metrics = _average_metrics_by_data(test_files, ["Accuracy"])

    print("\nAverage Train Metrics per Model:")
    print(avg_train_metrics)

    print("\nAverage Test Metrics per Model:")
    print(avg_test_metrics)

    return robust_dict

# Robust features for the Measles result folder
measles_folder = "../data_created/best_models/results/Measles/"
print("------Measles-------")
measles_robust = aggregate_shap_features(folder_path=measles_folder, threshold=3)

# Same aggregation for the Hepatitis result folder
hepatitis_folder = "../data_created/best_models/results/Hepatitis/"
print("-----Hepatitis------")
hepatitis_robust = aggregate_shap_features(folder_path=hepatitis_folder, threshold=3)

------Measles-------
['../data_created/best_models/results/Measles/shap_feature_2_label_Consensus_compressed_4__.csv', '../data_created/best_models/results/Measles/train_2_label_Consensus_compressed_4__.csv', '../data_created/best_models/results/Measles/test_2_label_Consensus_compressed_4__.csv', '../data_created/best_models/results/Measles/shap_feature_2_label_Consensus_compressed_2__.csv', '../data_created/best_models/results/Measles/test_2_label_Consensus_compressed_2__.csv', '../data_created/best_models/results/Measles/train_2_label_Consensus_compressed_2__.csv', '../data_created/best_models/results/Measles/test_2_label_Consensus_compressed_5__.csv', '../data_created/best_models/results/Measles/train_2_label_Consensus_compressed_5__.csv', '../data_created/best_models/results/Measles/shap_feature_2_label_Consensus_compressed_5__.csv', '../data_created/best_models/results/Measles/train_2_label_Consensus_compressed_3__.csv', '../data_created/best_models/results/Measles/test_2_label_Co

In [81]:
def pp(df):
    """Return the 'Feature' values stored under the 'RNA Data' key as a list.

    ``df`` is indexed first by 'RNA Data' and then by 'Feature'; the values
    are returned in their original order.
    """
    return list(df['RNA Data']['Feature'])

# Robust compressed-cluster features selected for each vaccine
measles_rna_features = {*pp(measles_robust)}
print(measles_rna_features)

hepatitis_rna_features = {*pp(hepatitis_robust)}
print(hepatitis_rna_features)

# Features that are robust in both datasets
common_global = measles_rna_features & hepatitis_rna_features
print("Global common features between Measles and Hepatitis:")
print(common_global)

{'cluster26_Compressed', 'cluster12_Compressed', 'cluster27_Compressed', 'cluster23_Compressed', 'cluster33_Compressed', 'cluster30_Compressed', 'cluster25_Compressed', 'cluster8_Compressed'}
{'cluster10_Compressed', 'cluster28_Compressed', 'cluster27_Compressed', 'cluster33_Compressed', 'cluster19_Compressed', 'cluster32_Compressed', 'cluster20_Compressed'}
Global common features between Measles and Hepatitis:
{'cluster27_Compressed', 'cluster33_Compressed'}


In [82]:
def load_json(json_path):
    """Read the JSON file at ``json_path`` and return the parsed object.

    The file is decoded as UTF-8 explicitly, so the result does not depend on
    the platform's default text encoding.
    """
    with open(json_path, 'r', encoding='utf-8') as file:
        return json.load(file)

def decompress_features(df, json_path):
    """Expand the 'RNA Data' features into the entries listed in a JSON file.

    Each value of ``df['RNA Data']['Feature']`` has the substring
    "_Compressed" removed and is then looked up in the mapping loaded from
    ``json_path``. Found entries are appended to the result in order; a
    warning is printed for every key that is missing.
    """
    clusters = load_json(json_path)
    expanded = []
    for compressed_name in df['RNA Data']['Feature']:
        # Strip the "_Compressed" marker to obtain the key used in the JSON data
        feature_key = compressed_name.replace("_Compressed", "")
        members = clusters.get(feature_key)
        if members is not None:
            expanded.extend(members)
            continue
        print(f"Warning: Feature '{feature_key}' not found in cluster data.")
    return expanded

# Expand the robust features of each vaccine using its own cluster JSON file
measles_rna_features = {*decompress_features(measles_robust, "../data/Measles/clusters/RNA1.json")}
print(measles_rna_features)

hepatitis_rna_features = {*decompress_features(hepatitis_robust, "../data/Hepatitis B/clusters/RNA1.json")}
print(hepatitis_rna_features)

# Expanded features present in both datasets
common_global = measles_rna_features & hepatitis_rna_features
print("Global common features between Measles and Hepatitis:")
print(common_global)

{'Inflammation.10', 'Inflammation', 'Cell cycle.5', 'Inflammation.5', 'Protein synthesis.17', 'Plasma cells.4', 'Cytotoxic lymphocytes', 'Leukocyte activation', 'Monocytes.4', 'Erythroid cells.14', 'Plasma cells.3', 'Monocytes.3', 'Neutrophils', 'Gene transcription.15', 'Monocytes', 'Inflammation.6', 'Inflammation.7', 'Cytokines/chemokines.2', 'Inflammation.12', 'Platelet', 'Neutrophils.2'}
{'Gene transcription.12', 'Erythroid cells.19', 'Lymphocytes', 'Protein modification.17', 'TGF-beta', 'Cell cycle.9', 'Protein synthesis.12', 'Monocytes.1', 'Gene transcription.1', 'Protein synthesis.8', 'Platelet/Prostaglandin', 'Monocytes.4', 'Complement', 'Protein modification.2', 'Monocytes.6', 'Gene transcription.3', 'Erythroid cells.18', 'Neutrophils.1', 'Gene transcription.19', 'Protein modification.10', 'Cell cycle.7', 'Protein modification.3', 'Oxidative phosphorylation.1', 'Oxidative stress.2', 'Protein modification', 'Protein synthesis.13', 'Gene transcription.11', 'Protein synthesis.4', 