In [9]:
import os
import pandas as pd
from typing import List, Dict, Tuple

# Directory Setup

In [10]:
# Root folder of the project; every results path below is built beneath it.
base_dir = '/Users/Avi/Dissertation'

# Folder name used on disk for each model key that appears in the results_dirs keys.
_model_folders = {'RF': 'Random_Forest', 'ChemProp': 'ChemProp'}

# Maps "<data_type>_<model>_<task>_Results" to the folder holding that
# combination's performance files.
results_dirs = {
    f"{data_type}_{model}_{task}_Results": os.path.join(
        base_dir, f'Results/Model_Performance/{data_type}/{folder}/{task}'
    )
    for data_type in ('Curated', 'Non_Curated')
    for model, folder in _model_folders.items()
    for task in ('Classification', 'Regression')
}

In [11]:
# Metric column names (lower-case) aggregated for each task type.
metrics = {
    'Classification': [
        'accuracy',
        'precision',
        'recall',
        'f1',
        'auroc',
        'auprc',
    ],
    'Regression': [
        'mse',
        'rmse',
        'r2',
        'mae',
    ],
}

# Fingerprint labels looked up (case-insensitively) in the 'fingerprint' column for RQ3.
fingerprint_types = [
    'ECFP',
    'MACCS KEYS',
    'AP2',
    'AP3',
    'AP2+AP3',
]

# Defining Functions

In [12]:
def read_csv_files(directory: str) -> pd.DataFrame:
    """Load every ``.csv`` file in ``directory`` into one frame, dropping training rows.

    Args:
        directory: Folder whose ``.csv`` files are read (non-recursive).

    Returns:
        The row-wise concatenation of all files, minus the rows whose ``split``
        value contains "train" (case-insensitive), with a fresh 0..n-1 index.
        Rows with a missing ``split`` value are kept.

    Raises:
        ValueError: If ``directory`` contains no ``.csv`` files.
        KeyError: If the combined data has no ``split`` column.
    """
    # Sort so the row order does not depend on the filesystem's listing order.
    csv_names = sorted(f for f in os.listdir(directory) if f.endswith('.csv'))
    if not csv_names:
        # pd.concat([]) would raise a ValueError that does not say which folder was empty.
        raise ValueError(f"No .csv files found in {directory}")

    df = pd.concat(
        [pd.read_csv(os.path.join(directory, f)) for f in csv_names],
        ignore_index=True,
    )
    # na=False keeps the mask strictly boolean: a missing split label is treated
    # as "not a training row" instead of putting NaN into the mask, which `~`
    # cannot negate correctly.
    is_train = df['split'].str.lower().str.contains('train', na=False)
    return df[~is_train].reset_index(drop=True)

def aggregate_metrics(df: pd.DataFrame, metrics: List[str]) -> Dict[str, Dict[str, float]]:
    """Compute the mean and standard deviation of each requested metric column.

    Column labels are matched case-insensitively by lower-casing them, so the
    names in ``metrics`` are expected to be lower-case. Metrics with no matching
    column are silently left out of the result.

    Args:
        df: Frame holding one column per metric; it is not modified.
        metrics: Lower-case metric names to summarise.

    Returns:
        ``{metric: {"mean": ..., "std": ...}}`` for every metric found, using
        pandas' default ``Series.mean`` and ``Series.std``.
    """
    # Rename on a copy: assigning to df.columns would silently change the
    # caller's frame as a side effect of what looks like a read-only summary.
    normalized = df.rename(columns=lambda label: str(label).lower())
    return {metric: {"mean": normalized[metric].mean(), "std": normalized[metric].std()}
            for metric in metrics if metric in normalized.columns}

def save_results(results: pd.DataFrame, output_dir: str, file_name: str):
    """Write ``results`` to ``<output_dir>/<file_name>.csv`` and report the path.

    Args:
        results: Table to write; its index is included in the file.
        output_dir: Destination folder, created (with parents) if it is missing.
        file_name: File name without the ``.csv`` extension.
    """
    destination = os.path.join(output_dir, f"{file_name}.csv")
    # Create the folder first so to_csv does not fail on a missing directory.
    os.makedirs(output_dir, exist_ok=True)
    results.to_csv(destination)
    print(f"Results saved to {destination}")

# RQ1: Curated Vs. Non-Curated

In [13]:
def process_rq1(output_dir: str):
    """Summarise performance per data type (curated vs. non-curated) for each task.

    For every task and data type, the RF and ChemProp result files are pooled,
    the task's metrics are aggregated over the pooled rows, and the table is
    saved as ``RQ1_<data_type>_<task>.csv`` in ``output_dir``.
    """
    for task in ('Classification', 'Regression'):
        for data_type in ('Curated', 'Non_Curated'):
            # Both model families contribute rows to the same summary.
            pooled = pd.concat([
                read_csv_files(results_dirs[f"{data_type}_{model}_{task}_Results"])
                for model in ('RF', 'ChemProp')
            ])
            summary = aggregate_metrics(pooled, metrics[task])

            # Transpose so each metric is a row with 'mean' and 'std' columns.
            save_results(pd.DataFrame(summary).T, output_dir, f"RQ1_{data_type}_{task}")

# Execute RQ1: write the curated vs. non-curated summaries under
# Results/Performance_Comparison/RQ1_Curated_vs_Non_Curated inside base_dir.
rq1_output_dir = os.path.join(base_dir, 'Results/Performance_Comparison/RQ1_Curated_vs_Non_Curated')
process_rq1(rq1_output_dir)

Results saved to /Users/Avi/Dissertation/Results/Performance_Comparison/RQ1_Curated_vs_Non_Curated/RQ1_Curated_Classification.csv
Results saved to /Users/Avi/Dissertation/Results/Performance_Comparison/RQ1_Curated_vs_Non_Curated/RQ1_Non_Curated_Classification.csv
Results saved to /Users/Avi/Dissertation/Results/Performance_Comparison/RQ1_Curated_vs_Non_Curated/RQ1_Curated_Regression.csv
Results saved to /Users/Avi/Dissertation/Results/Performance_Comparison/RQ1_Curated_vs_Non_Curated/RQ1_Non_Curated_Regression.csv


# RQ2: Scaffold Split Vs. Random Split

In [14]:
def process_rq2(output_dir: str):
    """Summarise performance per split type (scaffold vs. random) for each task.

    For every task, the result files of all data types and models are pooled,
    the rows are partitioned by an exact, case-insensitive match on the
    ``split`` column, and one table per split is saved as
    ``RQ2_<task>_<split_type>.csv`` in ``output_dir``.
    """
    for task in ('Classification', 'Regression'):
        frames = []
        for data_type in ('Curated', 'Non_Curated'):
            for model in ('RF', 'ChemProp'):
                frames.append(read_csv_files(results_dirs[f"{data_type}_{model}_{task}_Results"]))
        pooled = pd.concat(frames)

        # Lower-case the labels once; both split types compare against the same values.
        split_labels = pooled['split'].str.lower()

        # Aggregate every split type before anything is written to disk.
        per_split = {
            split_type: aggregate_metrics(pooled[split_labels == split_type.lower()], metrics[task])
            for split_type in ('Scaffold', 'Random')
        }

        for split_type, split_metrics in per_split.items():
            save_results(pd.DataFrame(split_metrics).T, output_dir, f"RQ2_{task}_{split_type}")

# Execute RQ2: write the scaffold vs. random split summaries under
# Results/Performance_Comparison/RQ2_Scaffold_vs_Random_Split inside base_dir.
rq2_output_dir = os.path.join(base_dir, 'Results/Performance_Comparison/RQ2_Scaffold_vs_Random_Split')
process_rq2(rq2_output_dir)

Results saved to /Users/Avi/Dissertation/Results/Performance_Comparison/RQ2_Scaffold_vs_Random_Split/RQ2_Classification_Scaffold.csv
Results saved to /Users/Avi/Dissertation/Results/Performance_Comparison/RQ2_Scaffold_vs_Random_Split/RQ2_Classification_Random.csv
Results saved to /Users/Avi/Dissertation/Results/Performance_Comparison/RQ2_Scaffold_vs_Random_Split/RQ2_Regression_Scaffold.csv
Results saved to /Users/Avi/Dissertation/Results/Performance_Comparison/RQ2_Scaffold_vs_Random_Split/RQ2_Regression_Random.csv


# RQ3: Comparing Fingerprint Descriptor Performance

In [15]:
def process_rq3(output_dir: str):
    """Summarise Random Forest performance per fingerprint type for each task.

    For every task, the curated and non-curated RF result files are pooled and
    filtered by an exact, case-insensitive match on the ``fingerprint`` column.
    Fingerprints with no matching rows are skipped; the others are saved as
    ``RQ3_RF_<task>_<fingerprint>.csv`` (spaces replaced by underscores) in
    ``output_dir``.
    """
    for task in ('Classification', 'Regression'):
        rf_frames = []
        for data_type in ('Curated', 'Non_Curated'):
            rf_frames.append(read_csv_files(results_dirs[f"{data_type}_RF_{task}_Results"]))
        rf_rows = pd.concat(rf_frames)

        # Aggregate every fingerprint before anything is written to disk.
        per_fingerprint = {}
        for fp_type in fingerprint_types:
            matches = rf_rows['fingerprint'].str.lower() == fp_type.lower()
            fp_rows = rf_rows[matches]
            if fp_rows.empty:
                continue
            per_fingerprint[fp_type] = aggregate_metrics(fp_rows, metrics[task])

        for fp_type, fp_metrics in per_fingerprint.items():
            safe_label = fp_type.replace(' ', '_')
            save_results(pd.DataFrame(fp_metrics).T, output_dir, f"RQ3_RF_{task}_{safe_label}")

# Execute RQ3: write the per-fingerprint Random Forest summaries under
# Results/Performance_Comparison/RQ3_Fingerprint_Performance inside base_dir.
rq3_output_dir = os.path.join(base_dir, 'Results/Performance_Comparison/RQ3_Fingerprint_Performance')
process_rq3(rq3_output_dir)

Results saved to /Users/Avi/Dissertation/Results/Performance_Comparison/RQ3_Fingerprint_Performance/RQ3_RF_Classification_ECFP.csv
Results saved to /Users/Avi/Dissertation/Results/Performance_Comparison/RQ3_Fingerprint_Performance/RQ3_RF_Classification_MACCS_KEYS.csv
Results saved to /Users/Avi/Dissertation/Results/Performance_Comparison/RQ3_Fingerprint_Performance/RQ3_RF_Classification_AP2.csv
Results saved to /Users/Avi/Dissertation/Results/Performance_Comparison/RQ3_Fingerprint_Performance/RQ3_RF_Classification_AP3.csv
Results saved to /Users/Avi/Dissertation/Results/Performance_Comparison/RQ3_Fingerprint_Performance/RQ3_RF_Classification_AP2+AP3.csv
Results saved to /Users/Avi/Dissertation/Results/Performance_Comparison/RQ3_Fingerprint_Performance/RQ3_RF_Regression_ECFP.csv
Results saved to /Users/Avi/Dissertation/Results/Performance_Comparison/RQ3_Fingerprint_Performance/RQ3_RF_Regression_MACCS_KEYS.csv
Results saved to /Users/Avi/Dissertation/Results/Performance_Comparison/RQ3_Fi

# RQ4: RF Vs. ChemProp

In [16]:
def process_rq4(output_dir: str):
    """Summarise performance per model (Random Forest vs. ChemProp) for each task.

    For every task and model, the curated and non-curated result files are
    pooled, the task's metrics are aggregated, and a table with ``Mean`` and
    ``Std`` columns is saved as ``RQ4_<model_name>_<task>.csv`` in
    ``output_dir``.
    """
    # Model key used in results_dirs -> label used in the output file name.
    model_labels = {'RF': 'Random_Forest', 'ChemProp': 'ChemProp'}

    for task in ('Classification', 'Regression'):
        for model, model_name in model_labels.items():
            frames = []
            for data_type in ('Curated', 'Non_Curated'):
                frames.append(read_csv_files(results_dirs[f"{data_type}_{model}_{task}_Results"]))
            pooled = pd.concat(frames)

            # One row per metric, with 'mean' and 'std' columns.
            summary = pd.DataFrame(aggregate_metrics(pooled, metrics[task])).T
            results_df = pd.DataFrame({'Mean': summary['mean'], 'Std': summary['std']})

            save_results(results_df, output_dir, f"RQ4_{model_name}_{task}")

# Execute RQ4: write the Random Forest vs. ChemProp summaries under
# Results/Performance_Comparison/RQ4_RF_vs_ChemProp inside base_dir.
rq4_output_dir = os.path.join(base_dir, 'Results/Performance_Comparison/RQ4_RF_vs_ChemProp')
process_rq4(rq4_output_dir)

Results saved to /Users/Avi/Dissertation/Results/Performance_Comparison/RQ4_RF_vs_ChemProp/RQ4_Random_Forest_Classification.csv
Results saved to /Users/Avi/Dissertation/Results/Performance_Comparison/RQ4_RF_vs_ChemProp/RQ4_ChemProp_Classification.csv
Results saved to /Users/Avi/Dissertation/Results/Performance_Comparison/RQ4_RF_vs_ChemProp/RQ4_Random_Forest_Regression.csv
Results saved to /Users/Avi/Dissertation/Results/Performance_Comparison/RQ4_RF_vs_ChemProp/RQ4_ChemProp_Regression.csv
