In [1]:
# Load libraries
import os
import sys
from pathlib import Path

# Set path.
# The project root can be overridden by exporting FUNGI_WORKING_DIR before the
# kernel starts; when the variable is unset or empty, the original location is used.
# It stays a plain string at this point because it is added to sys.path below.
working_dir = os.environ.get("FUNGI_WORKING_DIR") or "/Users/diane/Documents/2025/Fungi/github"  # Working directory

# Default paths (strings built from the working directory)
original_data_path = f"{working_dir}/data/fungi_DRIAMS_AC_SMILE.csv"  # Raw input dataset
processed_data_path = f"{working_dir}/data/fungi_DRIAMS_AC_SMILE_processed.csv"  # Preprocessed dataset
nested_folds_path = f"{working_dir}/data/nested_folds.pkl"  # File storing validation split
# NOTE(review): the file name below is spelled "deployement" -- confirm it matches the file on disk.
deployment_path = f"{working_dir}/data/data_deployement.csv"  # Test dataset for deployment

# Load user-defined functions from the 'functions.py' module in the project directory.
# The membership check keeps sys.path free of duplicate entries when this cell is re-run.
if working_dir not in sys.path:
    sys.path.append(working_dir)
import functions

# Define additional paths for storing results and plots.
# From here on, working_dir is a pathlib.Path rather than a string.
working_dir = Path(working_dir)
data_dir = working_dir / "data"
intermediate_results_dir = working_dir / "intermediate_results"
plot_dir = working_dir / "plot"
results_directory = working_dir / "results"

# Create output directories; `dirs` keeps whatever the helper returns.
dirs = functions.create_output_directories(working_dir)

In [None]:
# Unzip and merge input data.
# The arguments are collected in one mapping first and then handed to the
# merge helper as keyword arguments.
merge_inputs = {
    "work_dir": data_dir,
    "albican_zip": "data_albican.parquet.zip",
    "non_albican_zip": "data_non_albican.parquet.zip",
    "maccs_file": "maccs.csv",
    "smiles_file": "smiles.csv",
    "output_maccs_csv": "fungi_DRIAMS_AC_MACCS.csv",
    "output_smiles_csv": "fungi_DRIAMS_AC_SMILE.csv",
}
functions.merge_fungi_data(**merge_inputs)

In [None]:
# Preprocess the original data: cleaning, then splitting into train/test
# with the nested cross-validation setup.
# Inputs come first, output locations afterwards.
functions.full_fungi_data_pipeline(
    file_path=original_data_path,
    final_data_dir=data_dir,
    plots_dir=plot_dir,
    processed_data_save_path=processed_data_path,
    nested_folds_save_path=nested_folds_path,
)

In [None]:
# Model training with nested cross-validation.
# Trains and evaluates all models mentioned in the paper, using the default
# preprocessing options and hyperparameter grids. Results are written to the
# intermediate results directory.
# Note: this process can take several days. See the next cell for a shorter example.

# Columns passed as feature_removals.
full_run_removed_columns = [
    'sample_id', 'dataset', 'response', 'drug',
    'species', 'drug_class', 'year', 'pathogen_class',
]

functions.run_nested_cv_pipeline(
    data_path=processed_data_path,
    nested_folds_path=nested_folds_path,
    results_dir=intermediate_results_dir,
    feature_removals=full_run_removed_columns,
    preprocessing_options='default',  # all default preprocessing steps
    model_grids='default',            # all default model settings and hyperparameter grids
    functions=functions,              # the imported functions module itself
)

In [None]:
# Run nested cross-validation using a specific preprocessing technique (PCA) and a specific ML model (Neural Network).
# This is the shorter alternative to the full default run.
# The classifier is created with random_state=42 -- the same seed used for the
# best-model definitions in the later cells -- so this example is reproducible.
# NOTE(review): confirm run_nested_cv_pipeline does not override the seed itself.
functions.run_nested_cv_pipeline(
    data_path=processed_data_path,
    nested_folds_path=nested_folds_path,
    results_dir=intermediate_results_dir,
    feature_removals=['sample_id', 'dataset', 'response', 'drug', 'species', 'drug_class', 'year', 'pathogen_class'], # Columns to exclude
    preprocessing_options={
        'PCA': {
            'func': functions.pca_MS, # PCA function
            'params': {'n_components': [0.95, 0.99]} # Retain 95% or 99% variance
        }
    },
    model_grids={
        'Neural Network': {
            'model': functions.MLPClassifier(random_state=42), # Multi-layer perceptron, fixed seed
            'params': {
                'model__hidden_layer_sizes': [(50, 50)],
                'model__activation': ['relu', 'tanh'],
                'model__alpha': [0.0001],
                'model__max_iter': [1000]
            }
        }
    },
    functions=functions # Pass the imported functions module
)

In [None]:
# Summarize model performance metrics and generate plots.
# Reads from the intermediate results directory; the results and plot
# directories are passed alongside it.
summary_inputs = dict(
    intermediate_results_dir=intermediate_results_dir,
    results_dir=results_directory,
    plot_dir=plot_dir,
    functions=functions,
)
functions.summarize_and_plot_results(**summary_inputs)

In [None]:
# Run and compare models per drug, species, drug-species, or leveraging all observations.

# Classifier configuration passed as best_model.
views_model = functions.MLPClassifier(
    hidden_layer_sizes=(50, 50),
    activation='tanh',
    alpha=0.0001,
    max_iter=1000,
    random_state=42,
)

# Columns passed as feature_removals, and the same list with 'drug_class' left out
# for feature_removals_withoutDrugClass.
views_removed_columns = ['sample_id', 'dataset', 'response', 'drug', 'species', 'drug_class', 'year', 'pathogen_class']
views_removed_columns_without_drug_class = [
    column for column in views_removed_columns if column != 'drug_class'
]

functions.run_and_evaluate_model_across_views(
    data_path=processed_data_path,
    nested_folds_path=nested_folds_path,
    best_model=views_model,
    best_preproc_func=functions.pca_MS,
    best_preproc_params={'n_components': 0.99},
    feature_removals=views_removed_columns,
    feature_removals_withoutDrugClass=views_removed_columns_without_drug_class,
    min_samples=0,
    results_directory=results_directory,
    functions=functions,
)


In [None]:
# Performance analysis across species-drug combinations.
# Generates plots and highlights cases above the MCC threshold given below;
# the return value is kept in `results`.
results = functions.analyze_species_drug_performance(
    mcc_threshold=0.4,
    results_directory=results_directory,
    functions=functions,
)

In [None]:
# Generate SHAP values for selected species-drug cases.

# (species, drug) pairs passed as species_drug_filter.
shap_cases = [
    ("candida albicans", "Micafungin"),
    ("candida parapsilosis", "Fluconazole"),
    ("saccharomyces cerevisiae", "Itraconazole"),
    ("saccharomyces cerevisiae", "Fluconazole"),
    ("candida tropicalis", "Posaconazole"),
]

# Classifier configuration passed as best_model.
shap_model = functions.MLPClassifier(
    hidden_layer_sizes=(50, 50),
    activation='tanh',
    alpha=0.0001,
    max_iter=1000,
    random_state=42,
)

functions.generate_shap_explanations_for_selected_cases(
    data_path=processed_data_path,
    deployment_path=deployment_path,
    output_dir=results_directory,
    functions=functions,
    best_model=shap_model,
    best_preproc_func=functions.pca_MS,
    best_preproc_params={'n_components': 0.99},
    feature_removals=['sample_id', 'dataset', 'response', 'drug', 'species', 'drug_class', 'year', 'pathogen_class'],
    species_drug_filter=shap_cases,
    top_n_features=500,
)