In [1]:
#Load libraries
import sys
from pathlib import Path

#Set paths
#Project root: this is the only location to edit when running on another machine.
#It stays a plain string here because it is appended to sys.path further down.
working_dir="/Users/diane/Documents/2025/Fungi/github" #Working directory

#All data files live in <working_dir>/data, so they are derived from the root
#instead of repeating the absolute prefix on every line.
#The values are converted back to str so downstream code receives the same type as before.
_input_data_dir = Path(working_dir) / "data"
original_data_path = str(_input_data_dir / "fungi_DRIAMS_AC_SMILE.csv") #Path to original fungal data (DRIAMS)
processed_data_path = str(_input_data_dir / "fungi_DRIAMS_AC_SMILE_processed.csv") #Path where the processed fungal data (DRIAMS) is saved
nested_folds_path = str(_input_data_dir / "nested_folds.pkl") #Nested cross validation split
deployment_path = str(_input_data_dir / "data_deployement.csv") #Test data
external_validation = str(_input_data_dir / "MS_UMG_fungi_mergedAC_full_SMILE_preprocess.csv") #MS UMG data

#Make the project's helper module importable from the working directory, then load it
sys.path.append(working_dir)
import functions

#From this point on the working directory is handled as a Path object
working_dir = Path(working_dir)

#Sub-folders of the working directory that the following cells read from / write to
data_dir = working_dir.joinpath("data")
intermediate_results_dir = working_dir.joinpath("intermediate_results")
plot_dir = working_dir.joinpath("plot")
results_directory = working_dir.joinpath("results")

#Create additional folders for results
dirs = functions.create_output_directories(working_dir)

In [None]:
#Preprocess the original data and create the splits:
#training/test, and nested cross-validation within the training set
functions.full_fungi_data_pipeline(
    final_data_dir=data_dir,
    nested_folds_save_path=nested_folds_path,
    plots_dir=plot_dir,
    processed_data_save_path=processed_data_path,
    file_path=original_data_path,
)

In [None]:
#Nested cross validation in Set 1 with all preprocessing techniques and models mentioned in the paper
#(Note: Can take a long time)
columns_to_remove = [
    'sample_id', 'dataset', 'response', 'drug',
    'species', 'drug_class', 'year', 'pathogen_class',
]
functions.run_nested_cv_pipeline(
    functions=functions,
    data_path=processed_data_path,
    nested_folds_path=nested_folds_path,
    results_dir=intermediate_results_dir,
    feature_removals=columns_to_remove,
    model_grids='default',
    preprocessing_options='default',
)

In [None]:
#Example with only one preprocessing technique and one ML model (nested cross validation in Set 1)
#NOTE(review): this call is given the same results_dir as the full run in the previous cell;
#confirm that the two runs do not overwrite or mix each other's results.
functions.run_nested_cv_pipeline(
    data_path=processed_data_path,
    nested_folds_path=nested_folds_path,
    results_dir=intermediate_results_dir,
    feature_removals=['sample_id', 'dataset', 'response', 'drug', 'species', 'drug_class', 'year', 'pathogen_class'],
    #One preprocessing technique: PCA, with two candidate values for n_components
    preprocessing_options={
        'PCA': {
            'func': functions.pca_MS,
            'params': {'n_components': [0.95, 0.99]}
        }
    },
    #One model: a neural network with a small hyper-parameter grid (only the activation varies)
    model_grids={
        'Neural Network': {
            #Seeded with the same random_state as the other MLPClassifier instances in this
            #notebook, so that re-running the example gives the same results
            'model': functions.MLPClassifier(random_state=42),
            'params': {
                'model__hidden_layer_sizes': [(50, 50)],
                'model__activation': ['relu', 'tanh'],
                'model__alpha': [0.0001],
                'model__max_iter': [1000]
            }
        }
    },
    functions=functions
)

In [None]:
#Summarize the intermediate results and plot them
functions.summarize_and_plot_results(
    functions=functions,
    intermediate_results_dir=intermediate_results_dir,
    plot_dir=plot_dir,
    results_dir=results_directory,
)

In [None]:
#Run and compare models per drug, species, drug-species and unified

#Columns passed as feature_removals
excluded_columns = [
    'sample_id', 'dataset', 'response', 'drug',
    'species', 'drug_class', 'year', 'pathogen_class',
]
#Same columns in the same order, except that 'drug_class' is not removed
excluded_columns_without_drug_class = [col for col in excluded_columns if col != 'drug_class']

#Model and preprocessing configuration passed as the "best" setup
selected_model = functions.MLPClassifier(
    hidden_layer_sizes=(50, 50),
    max_iter=1000,
    activation='tanh',
    alpha=0.0001,
    random_state=42,
)

functions.run_and_evaluate_model_across_views(
    data_path=processed_data_path,
    nested_folds_path=nested_folds_path,
    best_model=selected_model,
    best_preproc_func=functions.pca_MS,
    best_preproc_params={'n_components': 0.99},
    feature_removals=excluded_columns,
    feature_removals_withoutDrugClass=excluded_columns_without_drug_class,
    min_samples=0,
    results_directory=results_directory,
    functions=functions,
)


In [None]:
#Plot the results (species/drug performance, using an MCC threshold of 0.4)
results = functions.analyze_species_drug_performance(
    functions=functions,
    mcc_threshold=0.4,
    results_directory=results_directory,
)

In [None]:
#Biological markers

#Species/drug pairs passed as the filter; the commented-out pairs are currently disabled
selected_cases = [
    #("candida albicans", "Micafungin"),
    #("candida parapsilosis", "Fluconazole"),
    ("saccharomyces cerevisiae", "Itraconazole"),
    #("saccharomyces cerevisiae", "Fluconazole"),
    ("candida tropicalis", "Posaconazole"),
]

#Model configuration passed as best_model
shap_model = functions.MLPClassifier(
    hidden_layer_sizes=(50, 50),
    max_iter=1000,
    activation='tanh',
    alpha=0.0001,
    random_state=42,
)

functions.generate_shap_explanations_for_selected_cases(
    data_path=processed_data_path,
    deployment_path=deployment_path,
    output_dir=results_directory,
    functions=functions,
    best_model=shap_model,
    best_preproc_func=functions.pca_MS,
    best_preproc_params={'n_components': 0.99},
    feature_removals=[
        'sample_id', 'dataset', 'response', 'drug',
        'species', 'drug_class', 'year', 'pathogen_class',
    ],
    species_drug_filter=selected_cases,
    top_n_features=500,
)