In [1]:
# Jupyter notebook in Jupytext format

## Feature Importance Analysis Notebook

This notebook performs comprehensive analysis of feature importance results from the consensus analysis, including:
- Jaccard stability similarity analysis comparing SHAP, MDI, and feature selection scores
- Convergence analysis with AUC calculation for tolerance drop curves
- SHAP signed values analysis for directional effects

## Initialisation

In [13]:
import os

path = os.getcwd()
# find the string 'project' in the path, return index
index_project = path.find("project")
# slice the path from the index of 'project' to the end
project_path = path[: index_project + 7]
# set the working directory
os.chdir(project_path)
print(f"Project path set to: {os.getcwd()}")

Project path set to: c:\Github\ode-biomarker-project


In [14]:
from PathLoader import PathLoader  # noqa: E402

path_loader = PathLoader("data_config.env", "current_user.env")

In [15]:
from DataLink import DataLink  # noqa: E402

data_link = DataLink(path_loader, "data_codes.csv")

In [16]:
folder_name = "ThesisResult-FeatureImportanceConsensus"
exp_id = "v2_rf_k500_network_d3_split0.3"  # Without _importance_consensus suffix

# Create both the main folder and exp_id subfolder
main_folder = f"{path_loader.get_data_path()}data/results/{folder_name}"
exp_folder = f"{main_folder}/{exp_id}"

if not os.path.exists(main_folder):
    os.makedirs(main_folder)
if not os.path.exists(exp_folder):
    os.makedirs(exp_folder)

file_save_path = f"{exp_folder}/"



In [17]:
import pandas as pd
import numpy as np

clinical_genes_df = pd.read_excel("gene_to_uniprot_mapping.xlsx", sheet_name="Sheet1")

clinical_genes_df.head()

Unnamed: 0,Gene Symbol,UniProt ID,Protein Name
0,EGFR,P00533,Epidermal growth factor receptor (EC 2.7.10.1)...
1,MYC,P01106,Myc proto-oncogene protein (Class E basic heli...
2,TK1,P04183,"Thymidine kinase, cytosolic (EC 2.7.1.21)"
3,ERBB2,P04626,Receptor tyrosine-protein kinase erbB-2 (EC 2....
4,IGF1,P05019,Insulin-like growth factor 1 (Insulin-like gro...


In [18]:
# Load proteomics data 
print("## Data Loading and Preparation")
print("Loading proteomics data...")
loading_code = "goncalves-gdsc-2-Palbociclib-LN_IC50-sin"
proteomic_feature_data, proteomic_label_data = data_link.get_data_using_code(loading_code)

print(f"Proteomic feature data shape: {proteomic_feature_data.shape}")
print(f"Proteomic label data shape: {proteomic_label_data.shape}")

print("Preparing and aligning data...")
proteomic_feature_data = proteomic_feature_data.select_dtypes(include=[np.number])

# Align indices
common_indices = sorted(
    set(proteomic_feature_data.index) & set(proteomic_label_data.index)
)
feature_data = proteomic_feature_data.loc[common_indices]
label_data = proteomic_label_data.loc[common_indices]

print(f"Final aligned dataset shape: {feature_data.shape}")
print(f"Final aligned label shape: {label_data.shape}")

## Data Loading and Preparation
Loading proteomics data...
Proteomic feature data shape: (737, 6692)
Proteomic label data shape: (737,)
Preparing and aligning data...
Final aligned dataset shape: (737, 6692)
Final aligned label shape: (737,)


## Cross-Matching Clinical Genes with Proteomics Features

In [19]:
# Examine proteomics feature names to understand the format
print("\n## Examining Proteomics Feature Names")
print("Sample feature names:")
for i, col in enumerate(feature_data.columns[:10]):
    print(f"{i+1}. {col}")

print(f"\nTotal number of proteomics features: {len(feature_data.columns)}")

# Extract Uniprot IDs from proteomics feature names
def extract_uniprot_id(feature_name):
    """
    Extract Uniprot ID from proteomics feature name format: [Gene][UniprotID]:HUMAN
    Example: 'EGFRP00533:HUMAN' -> 'P00533'
    """
    # Look for pattern: any characters followed by P followed by 5 digits
    import re
    match = re.search(r'([A-Z][0-9]{5})', feature_name)
    if match:
        return match.group(1)
    return None

# Extract Uniprot IDs from all feature names
feature_uniprot_ids = {}
for col in feature_data.columns:
    uniprot_id = extract_uniprot_id(col)
    if uniprot_id:
        feature_uniprot_ids[col] = uniprot_id

print(f"\nNumber of features with extractable Uniprot IDs: {len(feature_uniprot_ids)}")

# Get clinical gene Uniprot IDs
clinical_uniprot_ids = set(clinical_genes_df['UniProt ID'].tolist())
print(f"Number of clinical genes: {len(clinical_uniprot_ids)}")

# Find overlapping Uniprot IDs
overlapping_ids = clinical_uniprot_ids.intersection(set(feature_uniprot_ids.values()))
print(f"Number of overlapping Uniprot IDs: {len(overlapping_ids)}")

# Create mapping of clinical genes to proteomics features
clinical_to_proteomics = {}
for clinical_id in overlapping_ids:
    matching_features = [feature for feature, uniprot_id in feature_uniprot_ids.items() 
                        if uniprot_id == clinical_id]
    clinical_to_proteomics[clinical_id] = matching_features

print("\n## Matching Results")
print(f"Clinical genes found in proteomics dataset: {len(overlapping_ids)}")
print("Matching Uniprot IDs:")
for uniprot_id in sorted(overlapping_ids):
    gene_symbol = clinical_genes_df[clinical_genes_df['UniProt ID'] == uniprot_id]['Gene Symbol'].iloc[0]
    matching_features = clinical_to_proteomics[uniprot_id]
    print(f"  {gene_symbol} ({uniprot_id}): {len(matching_features)} feature(s)")
    for feature in matching_features:
        print(f"    - {feature}")

# Create filtered dataset with only clinically relevant features
clinical_feature_columns = []
for features_list in clinical_to_proteomics.values():
    clinical_feature_columns.extend(features_list)

clinical_feature_data = feature_data[clinical_feature_columns]
print(f"\nFiltered dataset shape (clinical features only): {clinical_feature_data.shape}")


## Examining Proteomics Feature Names
Sample feature names:
1. P37108;SRP14_HUMAN
2. Q96JP5;ZFP91_HUMAN
3. Q9Y4H2;IRS2_HUMAN
4. P36578;RL4_HUMAN
5. Q6SPF0;SAMD1_HUMAN
6. O76031;CLPX_HUMAN
7. Q8WUQ7;CATIN_HUMAN
8. A6NIH7;U119B_HUMAN
9. Q9BTD8;RBM42_HUMAN
10. Q9P258;RCC2_HUMAN

Total number of proteomics features: 6692

Number of features with extractable Uniprot IDs: 3525
Number of clinical genes: 26
Number of overlapping Uniprot IDs: 16

## Matching Results
Clinical genes found in proteomics dataset: 16
Matching Uniprot IDs:
  EGFR (P00533): 1 feature(s)
    - P00533;EGFR_HUMAN
  TK1 (P04183): 1 feature(s)
    - P04183;KITH_HUMAN
  ERBB2 (P04626): 1 feature(s)
    - P04626;ERBB2_HUMAN
  RB1 (P06400): 1 feature(s)
    - P06400;RB_HUMAN
  MET (P08581): 1 feature(s)
    - P08581;MET_HUMAN
  AR (P10275): 1 feature(s)
    - P10275;ANDR_HUMAN
  CDK4 (P11802): 1 feature(s)
    - P11802;CDK4_HUMAN
  FGFR2 (P21802): 1 feature(s)
    - P21802;FGFR2_HUMAN
  CDK2 (P24941): 1 feature(s)
    - P249