In [14]:
import pandas as pd
from sklearn.model_selection import train_test_split
# Load the three fingerprint tables (Morgan, AP, RDK5) in one pass.
# pandas expands the leading "~" in each path itself.
morgan_dataset, ap_dataset, rdk5_dataset = map(
    pd.read_csv,
    (
        "~/datasets/morgan_dataset.csv",
        "~/datasets/ap_dataset.csv",
        "~/datasets/rdk5_dataset.csv",
    ),
)

In [15]:
# Pairwise inner alignment on the row index: morgan is aligned with ap first,
# then with rdk5, so morgan ends up holding only row labels present in all three.
morgan_dataset, ap_dataset = morgan_dataset.align(ap_dataset, join='inner', axis=0)
morgan_dataset, rdk5_dataset = morgan_dataset.align(rdk5_dataset, join='inner', axis=0)

# The 80/20 split is drawn once, on the Morgan table ...
morgan_train, morgan_test = train_test_split(morgan_dataset, test_size=0.2, random_state=42)

# ... and the other two tables reuse exactly the same row labels, so every
# fingerprint type is trained and evaluated on the same rows.
ap_train, ap_test = ap_dataset.loc[morgan_train.index], ap_dataset.loc[morgan_test.index]
rdk5_train, rdk5_test = rdk5_dataset.loc[morgan_train.index], rdk5_dataset.loc[morgan_test.index]


def _activity_features_and_target(split_frame):
    """Return ``(X, y)`` for one split: every column except 'activity' and
    'pIC50' as features, and the 'activity' column as the target."""
    return split_frame.drop(columns=['activity', 'pIC50']), split_frame['activity']


# 'activity' is the classification target for every fingerprint type.
X_morgan_train_activity, y_morgan_train_activity = _activity_features_and_target(morgan_train)
X_morgan_test_activity, y_morgan_test_activity = _activity_features_and_target(morgan_test)

X_ap_train_activity, y_ap_train_activity = _activity_features_and_target(ap_train)
X_ap_test_activity, y_ap_test_activity = _activity_features_and_target(ap_test)

X_rdk5_train_activity, y_rdk5_train_activity = _activity_features_and_target(rdk5_train)
X_rdk5_test_activity, y_rdk5_test_activity = _activity_features_and_target(rdk5_test)

In [16]:
import os
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score
)
from typing import Tuple

def run_classifiers(
    X_train: pd.DataFrame,
    X_test: pd.DataFrame,
    y_train: pd.Series,
    y_test: pd.Series,
    num_trials: int,
    save_dir: str
) -> None:
    """
    Train a fixed set of classifiers for several trials and save their test-set metrics.

    For every trial, each classifier is fitted on the training data and scored
    on the test data. One row per (trial, model) is collected and the full
    table is written to ``<save_dir>/classifier_metrics_results.csv``.

    Metrics include:
    - Accuracy
    - Precision (macro)
    - Recall (macro)
    - F1-score (macro)
    - ROC AUC (only when ``y_test`` has exactly two classes and the model
      exposes ``predict_proba``; otherwise ``None``)

    Seeding: the seeded estimators (Random Forest, SVM, Gradient Boosting) use
    ``42 + (trial - 1)`` as their ``random_state``. Trial 1 therefore uses seed
    42, and later trials produce independent, reproducible fits instead of
    repeating the same result. Logistic Regression and KNN are built with
    default arguments and receive no seed.

    Args:
        X_train (pd.DataFrame): Training feature set.
        X_test (pd.DataFrame): Testing feature set.
        y_train (pd.Series): Training labels.
        y_test (pd.Series): Testing labels.
        num_trials (int): Number of times to train each classifier.
        save_dir (str): Directory for the results CSV. A leading ``~`` is
            expanded to the user's home directory, and the directory is
            created if it does not exist.

    Raises:
        ValueError: If num_trials is less than 1.
    """
    if num_trials < 1:
        raise ValueError("num_trials must be at least 1.")

    # os.path and os.makedirs treat "~" as a literal directory name, so expand
    # it here; otherwise results would silently end up under "./~/...".
    save_dir = os.path.expanduser(save_dir)
    os.makedirs(save_dir, exist_ok=True)

    base_seed = 42
    # Whether ROC AUC applies depends only on the test labels, so decide once.
    is_binary = len(y_test.unique()) == 2

    results = []

    for trial in range(1, num_trials + 1):
        print(f"Running Trial {trial}...")

        # Fresh estimators with a trial-dependent seed: refitting one estimator
        # with a fixed integer random_state on the same data would return the
        # same metrics in every trial.
        seed = base_seed + trial - 1
        classifiers = {
            "Logistic Regression": LogisticRegression(),
            "Random Forest": RandomForestClassifier(n_estimators=100, random_state=seed),
            "SVM": SVC(kernel="linear", random_state=seed, probability=True),
            "Gradient Boosting": GradientBoostingClassifier(random_state=seed),
            "KNN": KNeighborsClassifier(),
        }

        for name, model in classifiers.items():
            print(f"Training {name}...")
            model.fit(X_train, y_train)
            y_pred = model.predict(X_test)

            # zero_division=0 on all three scores: a class that is never
            # predicted contributes 0 without emitting a warning each time.
            metrics = {
                "Trial": trial,
                "Model": name,
                "Accuracy": accuracy_score(y_test, y_pred),
                "Precision": precision_score(y_test, y_pred, average="macro", zero_division=0),
                "Recall": recall_score(y_test, y_pred, average="macro", zero_division=0),
                "F1-score": f1_score(y_test, y_pred, average="macro", zero_division=0),
            }

            # Compute ROC AUC only if binary classification
            if is_binary and hasattr(model, "predict_proba"):
                y_prob = model.predict_proba(X_test)[:, 1]  # Get probabilities for positive class
                metrics["ROC AUC"] = roc_auc_score(y_test, y_prob)
            else:
                metrics["ROC AUC"] = None

            print(f"{name} Results (Trial {trial}): {metrics}")
            results.append(metrics)

    # Convert results to DataFrame and save
    results_df = pd.DataFrame(results)
    results_csv_path = os.path.join(save_dir, "classifier_metrics_results.csv")
    results_df.to_csv(results_csv_path, index=False)

    print(f"All trials completed. Results saved to {results_csv_path}.")


In [17]:
import os
# Create the results directory tree (one sub-directory per fingerprint type).
# exist_ok=True makes the cell safe to re-run.
for control_dir in (
    "~/datasets/control",
    "~/datasets/control/morgan",
    "~/datasets/control/ap",
    "~/datasets/control/rdk5",
):
    os.makedirs(os.path.expanduser(control_dir), exist_ok=True)


In [18]:
# Run all classifiers on the Morgan features (40 trials).
# The path is expanded explicitly because os.path / os.makedirs treat "~" as a
# literal directory name; unexpanded, the CSV would be written under
# "./~/datasets/control/morgan" instead of the home-directory folder.
run_classifiers(
    X_morgan_train_activity,
    X_morgan_test_activity,
    y_morgan_train_activity,
    y_morgan_test_activity,
    num_trials=40,
    save_dir=os.path.expanduser("~/datasets/control/morgan")
)

Running Trial 1...
Training Logistic Regression...
Logistic Regression Results (Trial 1): {'Trial': 1, 'Model': 'Logistic Regression', 'Accuracy': 0.9170731707317074, 'Precision': 0.8210141813493769, 'Recall': 0.8108614232209738, 'F1-score': 0.8158131176999102, 'ROC AUC': 0.9453287557220141}
Training Random Forest...
Random Forest Results (Trial 1): {'Trial': 1, 'Model': 'Random Forest', 'Accuracy': 0.926829268292683, 'Precision': 0.8463888888888889, 'Recall': 0.8243341656263005, 'F1-score': 0.8348195100988397, 'ROC AUC': 0.9552642530170619}
Training SVM...
SVM Results (Trial 1): {'Trial': 1, 'Model': 'SVM', 'Accuracy': 0.9048780487804878, 'Precision': 0.7870197119359471, 'Recall': 0.8431127756970453, 'F1-score': 0.810945979498457, 'ROC AUC': 0.9316999583853517}
Training Gradient Boosting...
Gradient Boosting Results (Trial 1): {'Trial': 1, 'Model': 'Gradient Boosting', 'Accuracy': 0.9195121951219513, 'Precision': 0.8341539182931833, 'Recall': 0.7965563878485227, 'F1-score': 0.81368512

In [19]:
# Run all classifiers on the AP features (40 trials).
# The path is expanded explicitly because os.path / os.makedirs treat "~" as a
# literal directory name; unexpanded, the CSV would be written under
# "./~/datasets/control/ap" instead of the home-directory folder.
run_classifiers(
    X_ap_train_activity,
    X_ap_test_activity,
    y_ap_train_activity,
    y_ap_test_activity,
    num_trials=40,
    save_dir=os.path.expanduser("~/datasets/control/ap")
)

Running Trial 1...
Training Logistic Regression...


Logistic Regression Results (Trial 1): {'Trial': 1, 'Model': 'Logistic Regression', 'Accuracy': 0.9146341463414634, 'Precision': 0.8144125574758205, 'Recall': 0.8094569288389513, 'F1-score': 0.8119044186076995, 'ROC AUC': 0.9386704119850189}
Training Random Forest...
Random Forest Results (Trial 1): {'Trial': 1, 'Model': 'Random Forest', 'Accuracy': 0.9195121951219513, 'Precision': 0.8378995433789954, 'Recall': 0.788701622971286, 'F1-score': 0.8104484512251502, 'ROC AUC': 0.9547440699126092}
Training SVM...
SVM Results (Trial 1): {'Trial': 1, 'Model': 'SVM', 'Accuracy': 0.9073170731707317, 'Precision': 0.794817789968652, 'Recall': 0.8130982105701208, 'F1-score': 0.8035209846650524, 'ROC AUC': 0.9147419891801913}
Training Gradient Boosting...
Gradient Boosting Results (Trial 1): {'Trial': 1, 'Model': 'Gradient Boosting', 'Accuracy': 0.9146341463414634, 'Precision': 0.8291299664153096, 'Recall': 0.7701831044527674, 'F1-score': 0.7953829262380402, 'ROC AUC': 0.9525593008739076}
Training K

In [None]:
# Run all classifiers on the RDK5 features (40 trials).
# The path is expanded explicitly because os.path / os.makedirs treat "~" as a
# literal directory name; unexpanded, the CSV would be written under
# "./~/datasets/control/rdk5" instead of the home-directory folder.
run_classifiers(
    X_rdk5_train_activity,
    X_rdk5_test_activity,
    y_rdk5_train_activity,
    y_rdk5_test_activity,
    num_trials=40,
    save_dir=os.path.expanduser("~/datasets/control/rdk5")
)