In [1]:
import numpy as np
import matplotlib.pyplot as plt

from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.calibration import CalibratedClassifierCV
from sklearn.calibration import calibration_curve
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
#import matplotlib.gridspec
import matplotlib.gridspec as gridspec
from sklearn.calibration import CalibratedClassifierCV, CalibrationDisplay
import numpy as np
from sklearn.svm import LinearSVC
from scanpy import read_h5ad
from sklearn.preprocessing import MinMaxScaler
import logging
import pandas as pd

print("Setup ML tracking packages")
import dagshub
import mlflow
import os


Setup ML tracking packages


In [2]:
class NaivelyCalibratedLinearSVC(LinearSVC):
    """LinearSVC with `predict_proba` method that naively scales
    `decision_function` output for binary classification."""

    def fit(self, X, y):
        super().fit(X, y)
        df = self.decision_function(X)
        self.df_min_ = df.min()
        self.df_max_ = df.max()

    def predict_proba(self, X):
        """Min-max scale output of `decision_function` to [0, 1]."""
        df = self.decision_function(X)
        calibrated_df = (df - self.df_min_) / (self.df_max_ - self.df_min_)
        proba_pos_class = np.clip(calibrated_df, 0, 1)
        proba_neg_class = 1 - proba_pos_class
        proba = np.c_[proba_neg_class, proba_pos_class]
        return proba
    
svc = NaivelyCalibratedLinearSVC(max_iter=10_000, dual="auto")
svc_isotonic = CalibratedClassifierCV(svc, cv=3, method="isotonic")
svc_sigmoid = CalibratedClassifierCV(svc, cv=3, method="sigmoid")

clf_list = [
    (svc, "SVC"),
    (svc_isotonic, "SVC + Isotonic"),
    (svc_sigmoid, "SVC + Sigmoid"),
]

In [3]:
#Loading in the reference dataset

# Used references:
# reference = "/mnt/e/surfdrivesync/Radboud/data/python/jupyter_notebooks/data/cma_meta_atlas.h5ad" #This is the object with first round of RFE; SVM_test_img
# reference = "/mnt/e/surfdrivesync/Radboud/data/python/jupyter_notebooks/SHAP_cpredictor_032/cma_meta_atlas.h5ad" #This is the object after RFE 1 needed for RFE 2; SVM_calibration_RFE1
# reference = "/mnt/e/surfdrivesync/Radboud/data/python/jupyter_notebooks/SHAP_cpredictor_032_100hvg/cma_meta_atlas.h5ad" # shap_only/ other option #This is the object after RFE 2 needed for RFE 3; SVM_calibration_RFE2_100hvg
# reference = "/mnt/e/surfdrivesync/Radboud/data/python/jupyter_notebooks/SHAP_cpredictor_032_50hvg/selected/cma_meta_atlas.h5ad" # shap_only/ other option #This is the object after RFE 3 needed for RFE 4; SVM_calibration_RFE2_50hvg
reference = "/mnt/e/surfdrivesync/Radboud/data/python/jupyter_notebooks/SHAP_cpredictor_032_100hvg/cma_meta_atlas.h5ad" # best option to keep all genes, calibration slightly better on 1000 iter instead of 2500
labels = "/mnt/e/surfdrivesync/Radboud/data/python/jupyter_notebooks/data/training_labels_meta.csv"
outdir_unit = "/mnt/e/surfdrivesync/Radboud/data/python/jupyter_notebooks/test_output_unit/"
colord_tsv= "/mnt/e/surfdrivesync/Radboud/data/python/jupyter_notebooks/data/colord.tsv"

In [4]:
class CpredictorClassifier():
    def __init__(self, Threshold_rej, rejected, OutputDir):
        self.scaler = MinMaxScaler()
        self.Classifier = LinearSVC(dual = False, random_state = 42, class_weight = 'balanced', max_iter = 1000) # was 2500
        self.threshold = Threshold_rej
        self.rejected = rejected
        self.output_dir = OutputDir
        self.expression_treshold = 162
        self.kf = StratifiedKFold(n_splits = 3, shuffle = True, random_state = 42)

    def expression_cutoff(self, Data, LabelsPath):
        logging.info(f'Selecting genes based on an summed expression threshold of minimally {self.expression_treshold} in each cluster')
        labels = pd.read_csv(LabelsPath,index_col=False)
        h5ad_object = Data.copy()
        cluster_id = 'labels'
        h5ad_object.obs[cluster_id] = labels.iloc[:, 0].tolist()
        res = pd.DataFrame(columns=h5ad_object.var_names.tolist(), index=h5ad_object.obs[cluster_id].astype("category").unique())
        
        ## Set up scanpy object based on expression treshold
        for clust in h5ad_object.obs[cluster_id].astype("category").unique():
            if h5ad_object.raw is not None:
                res.loc[clust] = h5ad_object[h5ad_object.obs[cluster_id].isin([clust]),:].raw.X.sum(0)
            else:
                res.loc[clust] = h5ad_object[h5ad_object.obs[cluster_id].isin([clust]),:].X.sum(0)
        res.loc["sum"]=np.sum(res,axis=0).tolist()
        res=res.transpose()
        res=res.loc[res['sum'] > self.expression_treshold]
        genes_expressed = res.index.tolist()
        logging.info("Amount of genes that remain: " + str(len(genes_expressed)))
        h5ad_object = h5ad_object[:, genes_expressed]
        Data = h5ad_object
        del res, h5ad_object

        return Data
        
    def preprocess_data_train(self, data_train):
        logging.info('Log normalizing the training data')
        np.log1p(data_train, out=data_train)
        logging.info('Scaling the training data')
        data_train = self.scaler.fit_transform(data_train)
        return data_train

    def preprocess_data_test(self, data_test):
        logging.info('Log normalizing the testing data')
        np.log1p(data_test, out=data_test)
        logging.info('Scaling the testing data')
        data_test = self.scaler.fit_transform(data_test)
        return data_test

    def fit_and_predict_svmrejection(self, labels_train, threshold, output_dir, data_train, data_test):
        self.rejected = True
        self.threshold = threshold
        self.output_dir = output_dir
        logging.info('Running SVMrejection')
        kf = StratifiedKFold(n_splits = 3, shuffle = True, random_state = 42)
        clf = CalibratedClassifierCV(self.Classifier, cv=kf)
        clf.fit(data_train, labels_train.ravel())
        predicted = clf.predict(data_test)
        prob = np.max(clf.predict_proba(data_test), axis = 1)
        unlabeled = np.where(prob < self.threshold)

        # For unlabeled values from the SVMrejection put values of strings and integers
        try:
            predicted[unlabeled] = 'Unlabeled'
        except ValueError:
            unlabeled = list(unlabeled[0])
            predicted[unlabeled] = 999999
        self.predictions = predicted
        self.probabilities = prob
        self.save_results(self.rejected)

    def fit_and_predict_svm(self, labels_train, output_dir, data_train, data_test):
        self.rejected = False
        self.output_dir = output_dir
        logging.info('Running SVM')
        self.Classifier.fit(data_train, labels_train.ravel())
        self.predictions = self.Classifier.predict(data_test)
        self.save_results(self.rejected)

    def save_results(self, rejected):
        self.rejected = rejected
        self.predictions = pd.DataFrame(self.predictions)
        if self.rejected is True:
            self.probabilities = pd.DataFrame(self.probabilities)
            self.predictions.to_csv(f"{self.output_dir}/SVMrej_Pred_Labels.csv", index=False)
            self.probabilities.to_csv(f"{self.output_dir}/SVMrej_Prob.csv", index=False)
        else:
            self.predictions.to_csv(f"{self.output_dir}/SVM_Pred_Labels.csv", index=False)

# Child class for performance from the CpredictorClassifier class        
class CpredictorClassifierPerformance(CpredictorClassifier):
    def __init__(self, Threshold_rej, rejected, OutputDir):
        super().__init__(Threshold_rej, rejected, OutputDir)

    def fit_and_predict_svmrejection(self, labels_train, threshold, output_dir, data_train, data_test):
        # Calls the function from parent class and extends it for the child
        super().fit_and_predict_svmrejection(labels_train, threshold, output_dir, data_train, data_test)
        return self.predictions, self.probabilities

    def fit_and_predict_svm(self, labels_train, OutputDir, data_train, data_test):
        # Calls the function from parent class and extends it for the child
        super().fit_and_predict_svm(labels_train, OutputDir, data_train, data_test)
        return self.predictions

In [14]:
cpredictorperf = CpredictorClassifier(0.7,False,outdir_unit)
cpredictorperf.Classifier = LinearSVC(dual = False, random_state = 42, class_weight = 'balanced', C = 0.1)
print(str(getattr(cpredictorperf,"Classifier")))
print(getattr(cpredictorperf,"kf"))
print(np.logspace(-3, 3, 7))

LinearSVC(C=0.1, class_weight='balanced', dual=False, random_state=42)
StratifiedKFold(n_splits=3, random_state=42, shuffle=True)
[1.e-03 1.e-02 1.e-01 1.e+00 1.e+01 1.e+02 1.e+03]


In [19]:
from sklearn.metrics import accuracy_score, roc_auc_score

def SVM_hyperparam(reference_H5AD, LabelsPath, OutputDir, rejected=True, Threshold_rej=0.7):
    '''
    Tests hyperparam optimization for "C" of SVM model 
    
    Parameters:
    reference_H5AD : H5AD file of datasets of interest.
    OutputDir : Output directory defining the path of the exported SVM_predictions.
    SVM_type: Type of SVM prediction, SVM or SVMrej (default).
    '''
    logging.basicConfig(level=logging.DEBUG, 
                        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s', datefmt='%d/%m/%Y %H:%M:%S',
                        filename='cPredictor_performance.log', filemode='w')

    # Using the child class of the CpredictorClassifier to process the data
    cpredictorperf = CpredictorClassifierPerformance(Threshold_rej, rejected, OutputDir)

    logging.info('Reading in the data')
    Data = read_h5ad(reference_H5AD)
        
    Data = cpredictorperf.expression_cutoff(Data, LabelsPath)

    data_train = pd.DataFrame.sparse.from_spmatrix(Data.X, index=list(Data.obs.index.values), columns=list(Data.var.index.values))
    data_train = data_train.to_numpy(dtype="float16")
        
    data_train = cpredictorperf.preprocess_data_train(data_train)
        
    # Do label encoding
    labels = pd.read_csv(LabelsPath, header=0,index_col=None, sep=',') #, usecols = col
    label_encoder = LabelEncoder()
        
    y = label_encoder.fit_transform(labels.iloc[:,0].tolist())

    # Generate a dictionary to map values to strings
    res = dict(zip(label_encoder.inverse_transform(y),y))
    res['Unlabeled'] = 999999
    res = {v: k for k, v in res.items()}
    res

    # CV 3
    y_binaries = []
    for cls in range(len(np.unique(y))):
        y_binary = np.where(y == cls, 1, 0)
        y_binaries.append(y_binary)
    print(y_binaries)

    # Setting the C param in the classifier
    # Define the range of the hyperparameter
    # 'C' is the inverse of regularization strength (smaller values specify stronger regularization)
    C_range = np.logspace(-3, 3, 7)
    iter_options = [500, 1000, 1500, 2000, 2500]

    # Initialize variables to store the best score and hyperparameters
    best_score = 0
    best_hyperparams = None
    best_acc = 0
    best_hyperparams_acc = None

    for i, y_binary in enumerate(y_binaries):
        name_cond=''.join(list(label_encoder.inverse_transform([i])))
        print(name_cond)
        X_train, X_test, y_train, y_test = train_test_split(data_train, y_binaries[i], test_size=0.2, random_state=42)
        for _ in range(20):  
        # Randomly select hyperparameters from the defined range
            C_par = np.random.choice(C_range)
            numiter= np.random.choice(iter_options)
        
            cpredictorperf.Classifier = LinearSVC(dual = False, random_state = 42, class_weight = 'balanced', max_iter = numiter, C = C_par)

            # Setup cpredictors params from its class
            kf = StratifiedKFold(n_splits = 3, shuffle = True, random_state = 42)
            cpredictormodel = getattr(cpredictorperf,"Classifier")
            svc_cpredictor = CalibratedClassifierCV(cpredictormodel, cv=kf)

            svc_cpredictor.fit(X_train, y_train)
            y_pred = svc_cpredictor.predict(X_test)
            # Calculate the accuracy and ROC-AUC
            accuracy = accuracy_score(y_test, y_pred)
            roc_auc = roc_auc_score(y_test, y_pred)

        # If this model's ROC-AUC is the best so far, store its score and hyperparameters
            if roc_auc > best_score:
                best_score = roc_auc
                best_hyperparams = {'C': C_par, 'max_iter': numiter}

            if roc_auc == best_score:
                print("Monte Carlo Method " + f'hyperparameter C: {C_par} and iter: {numiter} same as best {best_hyperparams} for roc_auc')

        # If this model's accuracy is the best so far, store its score and hyperparameters
            if accuracy > best_acc:
                best_acc = accuracy
                best_hyperparams_acc = {'C': C_par, 'max_iter': numiter}

            if accuracy == best_acc:
                print("Monte Carlo Method " + f'hyperparameter C: {C_par} and iter: {numiter} same as best {best_hyperparams_acc} for accuracy')

        # Print the best score and hyperparameters
        print("Monte Carlo Method " + f'Best ROC-AUC: {best_score}')
        print("Monte Carlo Method " + f'Best hyperparameters: {best_hyperparams}')
        print("Monte Carlo Method " + f'Best accuracy: {best_acc}')
        print("Monte Carlo Method " + f'Best hyperparameters: {best_hyperparams_acc}')

In [20]:
# 100 hvg + 100 shap
SVM_hyperparam(reference_H5AD=reference,LabelsPath=labels,OutputDir=outdir_unit, rejected=False)

[array([0, 0, 0, ..., 0, 0, 0]), array([0, 0, 0, ..., 0, 0, 1]), array([1, 1, 1, ..., 0, 0, 0]), array([0, 0, 0, ..., 0, 0, 0]), array([0, 0, 0, ..., 0, 0, 0]), array([0, 0, 0, ..., 0, 0, 0]), array([0, 0, 0, ..., 0, 0, 0]), array([0, 0, 0, ..., 1, 1, 0]), array([0, 0, 0, ..., 0, 0, 0]), array([0, 0, 0, ..., 0, 0, 0]), array([0, 0, 0, ..., 0, 0, 0]), array([0, 0, 0, ..., 0, 0, 0]), array([0, 0, 0, ..., 0, 0, 0]), array([0, 0, 0, ..., 0, 0, 0]), array([0, 0, 0, ..., 0, 0, 0])]
CE
Monte Carlo Method hyperparameter C: 1.0 and iter: 500 same as best {'C': 1.0, 'max_iter': 500} for roc_auc
Monte Carlo Method hyperparameter C: 1.0 and iter: 500 same as best {'C': 1.0, 'max_iter': 500} for accuracy
Monte Carlo Method hyperparameter C: 0.01 and iter: 2500 same as best {'C': 0.01, 'max_iter': 2500} for roc_auc
Monte Carlo Method hyperparameter C: 0.01 and iter: 2500 same as best {'C': 0.01, 'max_iter': 2500} for accuracy
Monte Carlo Method hyperparameter C: 0.01 and iter: 2500 same as best {'C'



Monte Carlo Method Best ROC-AUC: 1.0
Monte Carlo Method Best hyperparameters: {'C': 0.01, 'max_iter': 1000}
Monte Carlo Method Best accuracy: 1.0
Monte Carlo Method Best hyperparameters: {'C': 0.01, 'max_iter': 1000}
LE




Monte Carlo Method Best ROC-AUC: 1.0
Monte Carlo Method Best hyperparameters: {'C': 0.01, 'max_iter': 1000}
Monte Carlo Method Best accuracy: 1.0
Monte Carlo Method Best hyperparameters: {'C': 0.01, 'max_iter': 1000}
LESC




Monte Carlo Method Best ROC-AUC: 1.0
Monte Carlo Method Best hyperparameters: {'C': 0.01, 'max_iter': 1000}
Monte Carlo Method Best accuracy: 1.0
Monte Carlo Method Best hyperparameters: {'C': 0.01, 'max_iter': 1000}
LSC




Monte Carlo Method Best ROC-AUC: 1.0
Monte Carlo Method Best hyperparameters: {'C': 0.01, 'max_iter': 1000}
Monte Carlo Method Best accuracy: 1.0
Monte Carlo Method Best hyperparameters: {'C': 0.01, 'max_iter': 1000}
MC




Monte Carlo Method Best ROC-AUC: 1.0
Monte Carlo Method Best hyperparameters: {'C': 0.01, 'max_iter': 1000}
Monte Carlo Method Best accuracy: 1.0
Monte Carlo Method Best hyperparameters: {'C': 0.01, 'max_iter': 1000}
Mel




Monte Carlo Method Best ROC-AUC: 1.0
Monte Carlo Method Best hyperparameters: {'C': 0.01, 'max_iter': 1000}
Monte Carlo Method Best accuracy: 1.0
Monte Carlo Method Best hyperparameters: {'C': 0.01, 'max_iter': 1000}
SK
Monte Carlo Method Best ROC-AUC: 1.0
Monte Carlo Method Best hyperparameters: {'C': 0.01, 'max_iter': 1000}
Monte Carlo Method Best accuracy: 1.0
Monte Carlo Method Best hyperparameters: {'C': 0.01, 'max_iter': 1000}
TSK
Monte Carlo Method Best ROC-AUC: 1.0
Monte Carlo Method Best hyperparameters: {'C': 0.01, 'max_iter': 1000}
Monte Carlo Method Best accuracy: 1.0
Monte Carlo Method Best hyperparameters: {'C': 0.01, 'max_iter': 1000}
Ves




Monte Carlo Method Best ROC-AUC: 1.0
Monte Carlo Method Best hyperparameters: {'C': 0.01, 'max_iter': 1000}
Monte Carlo Method Best accuracy: 1.0
Monte Carlo Method Best hyperparameters: {'C': 0.01, 'max_iter': 1000}
nm-cSC




Monte Carlo Method hyperparameter C: 1.0 and iter: 2500 same as best {'C': 0.01, 'max_iter': 1000} for roc_auc
Monte Carlo Method hyperparameter C: 1.0 and iter: 2500 same as best {'C': 0.01, 'max_iter': 1000} for accuracy
Monte Carlo Method hyperparameter C: 100.0 and iter: 1500 same as best {'C': 0.01, 'max_iter': 1000} for roc_auc
Monte Carlo Method hyperparameter C: 100.0 and iter: 1500 same as best {'C': 0.01, 'max_iter': 1000} for accuracy
Monte Carlo Method hyperparameter C: 0.01 and iter: 1500 same as best {'C': 0.01, 'max_iter': 1000} for roc_auc
Monte Carlo Method hyperparameter C: 0.01 and iter: 1500 same as best {'C': 0.01, 'max_iter': 1000} for accuracy
Monte Carlo Method hyperparameter C: 0.001 and iter: 2000 same as best {'C': 0.01, 'max_iter': 1000} for roc_auc
Monte Carlo Method hyperparameter C: 0.001 and iter: 2000 same as best {'C': 0.01, 'max_iter': 1000} for accuracy




Monte Carlo Method hyperparameter C: 1.0 and iter: 1500 same as best {'C': 0.01, 'max_iter': 1000} for roc_auc
Monte Carlo Method hyperparameter C: 1.0 and iter: 1500 same as best {'C': 0.01, 'max_iter': 1000} for accuracy




Monte Carlo Method hyperparameter C: 1.0 and iter: 2500 same as best {'C': 0.01, 'max_iter': 1000} for roc_auc
Monte Carlo Method hyperparameter C: 1.0 and iter: 2500 same as best {'C': 0.01, 'max_iter': 1000} for accuracy
Monte Carlo Method hyperparameter C: 0.01 and iter: 2000 same as best {'C': 0.01, 'max_iter': 1000} for roc_auc
Monte Carlo Method hyperparameter C: 0.01 and iter: 2000 same as best {'C': 0.01, 'max_iter': 1000} for accuracy
Monte Carlo Method hyperparameter C: 0.01 and iter: 500 same as best {'C': 0.01, 'max_iter': 1000} for roc_auc
Monte Carlo Method hyperparameter C: 0.01 and iter: 500 same as best {'C': 0.01, 'max_iter': 1000} for accuracy
Monte Carlo Method hyperparameter C: 0.1 and iter: 2000 same as best {'C': 0.01, 'max_iter': 1000} for roc_auc
Monte Carlo Method hyperparameter C: 0.1 and iter: 2000 same as best {'C': 0.01, 'max_iter': 1000} for accuracy
Monte Carlo Method hyperparameter C: 100.0 and iter: 2000 same as best {'C': 0.01, 'max_iter': 1000} for r



Monte Carlo Method hyperparameter C: 1.0 and iter: 1000 same as best {'C': 0.01, 'max_iter': 1000} for roc_auc
Monte Carlo Method hyperparameter C: 1.0 and iter: 1000 same as best {'C': 0.01, 'max_iter': 1000} for accuracy
Monte Carlo Method hyperparameter C: 100.0 and iter: 1000 same as best {'C': 0.01, 'max_iter': 1000} for roc_auc
Monte Carlo Method hyperparameter C: 100.0 and iter: 1000 same as best {'C': 0.01, 'max_iter': 1000} for accuracy
Monte Carlo Method hyperparameter C: 0.1 and iter: 1000 same as best {'C': 0.01, 'max_iter': 1000} for roc_auc
Monte Carlo Method hyperparameter C: 0.1 and iter: 1000 same as best {'C': 0.01, 'max_iter': 1000} for accuracy




Monte Carlo Method hyperparameter C: 1.0 and iter: 1500 same as best {'C': 0.01, 'max_iter': 1000} for roc_auc
Monte Carlo Method hyperparameter C: 1.0 and iter: 1500 same as best {'C': 0.01, 'max_iter': 1000} for accuracy
Monte Carlo Method hyperparameter C: 100.0 and iter: 1000 same as best {'C': 0.01, 'max_iter': 1000} for roc_auc
Monte Carlo Method hyperparameter C: 100.0 and iter: 1000 same as best {'C': 0.01, 'max_iter': 1000} for accuracy
Monte Carlo Method hyperparameter C: 0.001 and iter: 2000 same as best {'C': 0.01, 'max_iter': 1000} for roc_auc
Monte Carlo Method hyperparameter C: 0.001 and iter: 2000 same as best {'C': 0.01, 'max_iter': 1000} for accuracy
Monte Carlo Method hyperparameter C: 0.01 and iter: 2500 same as best {'C': 0.01, 'max_iter': 1000} for roc_auc
Monte Carlo Method hyperparameter C: 0.01 and iter: 2500 same as best {'C': 0.01, 'max_iter': 1000} for accuracy
Monte Carlo Method hyperparameter C: 0.1 and iter: 2000 same as best {'C': 0.01, 'max_iter': 1000}

In [None]:
from sklearn.metrics import accuracy_score, roc_auc_score

def SVM_hyperparam(reference_H5AD, LabelsPath, OutputDir, rejected=True, Threshold_rej=0.7):
    '''
    Tests hyperparam optimization for "C" of SVM model 
    
    Parameters:
    reference_H5AD : H5AD file of datasets of interest.
    OutputDir : Output directory defining the path of the exported SVM_predictions.
    SVM_type: Type of SVM prediction, SVM or SVMrej (default).
    '''
    logging.basicConfig(level=logging.DEBUG, 
                        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s', datefmt='%d/%m/%Y %H:%M:%S',
                        filename='cPredictor_performance.log', filemode='w')

    # Using the child class of the CpredictorClassifier to process the data
    cpredictorperf = CpredictorClassifierPerformance(Threshold_rej, rejected, OutputDir)

    logging.info('Reading in the data')
    Data = read_h5ad(reference_H5AD)
        
    Data = cpredictorperf.expression_cutoff(Data, LabelsPath)

    data_train = pd.DataFrame.sparse.from_spmatrix(Data.X, index=list(Data.obs.index.values), columns=list(Data.var.index.values))
    data_train = data_train.to_numpy(dtype="float16")
        
    data_train = cpredictorperf.preprocess_data_train(data_train)
        
    # Do label encoding
    labels = pd.read_csv(LabelsPath, header=0,index_col=None, sep=',') #, usecols = col
    label_encoder = LabelEncoder()
        
    y = label_encoder.fit_transform(labels.iloc[:,0].tolist())

    # Generate a dictionary to map values to strings
    res = dict(zip(label_encoder.inverse_transform(y),y))
    res['Unlabeled'] = 999999
    res = {v: k for k, v in res.items()}
    res

    # CV 3
    y_binaries = []
    for cls in range(len(np.unique(y))):
        y_binary = np.where(y == cls, 1, 0)
        y_binaries.append(y_binary)
    print(y_binaries)

    # Setting the C param in the classifier
    # Define the range of the hyperparameter
    # 'C' is the inverse of regularization strength (smaller values specify stronger regularization)
    C_range = np.logspace(-3, 3, 7)

    # Initialize variables to store the best score and hyperparameters
    best_score = 0
    best_hyperparams = None
    best_acc = 0
    best_hyperparams_acc = None

    for i, y_binary in enumerate(y_binaries):
        name_cond=''.join(list(label_encoder.inverse_transform([i])))
        print(name_cond)
        X_train, X_test, y_train, y_test = train_test_split(data_train, y_binaries[i], test_size=0.2, random_state=42)
        for _ in range(20):  
        # Randomly select hyperparameters from the defined range
            C_par = np.random.choice(C_range)
        
            cpredictorperf.Classifier = LinearSVC(dual = False, random_state = 42, class_weight = 'balanced', max_iter = 2500, C = C_par)

            # Setup cpredictors params from its class
            kf = StratifiedKFold(n_splits = 3, shuffle = True, random_state = 42)
            cpredictormodel = getattr(cpredictorperf,"Classifier")
            svc_cpredictor = CalibratedClassifierCV(cpredictormodel, cv=kf)

            svc_cpredictor.fit(X_train, y_train)
            y_pred = svc_cpredictor.predict(X_test)
            # Calculate the accuracy and ROC-AUC
            accuracy = accuracy_score(y_test, y_pred)
            roc_auc = roc_auc_score(y_test, y_pred)

        # If this model's ROC-AUC is the best so far, store its score and hyperparameters
            if roc_auc > best_score:
                best_score = roc_auc
                best_hyperparams = {'C': C_par}

        # If this model's accuracy is the best so far, store its score and hyperparameters
            if accuracy > best_acc:
                best_acc = accuracy
                best_hyperparams_acc = {'C': C_par}

        # Print the best score and hyperparameters
        print("Monte Carlo Method " + f'Best ROC-AUC: {best_score}')
        print("Monte Carlo Method " + f'Best hyperparameters: {best_hyperparams}')
        print("Monte Carlo Method " + f'Best accuracy: {best_acc}')
        print("Monte Carlo Method " + f'Best hyperparameters: {best_hyperparams_acc}')

In [29]:
# With original 2500 iterations 50 hvg + 50 shap
SVM_hyperparam(reference_H5AD=reference,LabelsPath=labels,OutputDir=outdir_unit, rejected=False)

[array([0, 0, 0, ..., 0, 0, 0]), array([0, 0, 0, ..., 0, 0, 1]), array([1, 1, 1, ..., 0, 0, 0]), array([0, 0, 0, ..., 0, 0, 0]), array([0, 0, 0, ..., 0, 0, 0]), array([0, 0, 0, ..., 0, 0, 0]), array([0, 0, 0, ..., 0, 0, 0]), array([0, 0, 0, ..., 1, 1, 0]), array([0, 0, 0, ..., 0, 0, 0]), array([0, 0, 0, ..., 0, 0, 0]), array([0, 0, 0, ..., 0, 0, 0]), array([0, 0, 0, ..., 0, 0, 0]), array([0, 0, 0, ..., 0, 0, 0]), array([0, 0, 0, ..., 0, 0, 0]), array([0, 0, 0, ..., 0, 0, 0])]
CE
Monte Carlo Method Best ROC-AUC: 0.9471876443020104
Monte Carlo Method Best hyperparameters: {'C': 0.01}
CF
Monte Carlo Method Best ROC-AUC: 0.990385468220602
Monte Carlo Method Best hyperparameters: {'C': 0.1}
Cj
Monte Carlo Method Best ROC-AUC: 0.990385468220602
Monte Carlo Method Best hyperparameters: {'C': 0.1}
EC
Monte Carlo Method Best ROC-AUC: 1.0
Monte Carlo Method Best hyperparameters: {'C': 0.01}
IC




Monte Carlo Method Best ROC-AUC: 1.0
Monte Carlo Method Best hyperparameters: {'C': 0.01}
LE
Monte Carlo Method Best ROC-AUC: 1.0
Monte Carlo Method Best hyperparameters: {'C': 0.01}
LESC
Monte Carlo Method Best ROC-AUC: 1.0
Monte Carlo Method Best hyperparameters: {'C': 0.01}
LSC




Monte Carlo Method Best ROC-AUC: 1.0
Monte Carlo Method Best hyperparameters: {'C': 0.01}
MC
Monte Carlo Method Best ROC-AUC: 1.0
Monte Carlo Method Best hyperparameters: {'C': 0.01}
Mel
Monte Carlo Method Best ROC-AUC: 1.0
Monte Carlo Method Best hyperparameters: {'C': 0.01}
SK
Monte Carlo Method Best ROC-AUC: 1.0
Monte Carlo Method Best hyperparameters: {'C': 0.01}
TSK
Monte Carlo Method Best ROC-AUC: 1.0
Monte Carlo Method Best hyperparameters: {'C': 0.01}
Ves
Monte Carlo Method Best ROC-AUC: 1.0
Monte Carlo Method Best hyperparameters: {'C': 0.01}
nm-cSC




Monte Carlo Method Best ROC-AUC: 1.0
Monte Carlo Method Best hyperparameters: {'C': 0.01}
qSK
Monte Carlo Method Best ROC-AUC: 1.0
Monte Carlo Method Best hyperparameters: {'C': 0.01}


In [9]:
# With 1000 iterations
SVM_hyperparam(reference_H5AD=reference,LabelsPath=labels,OutputDir=outdir_unit, rejected=False)

[array([0, 0, 0, ..., 0, 0, 0]), array([0, 0, 0, ..., 0, 0, 1]), array([1, 1, 1, ..., 0, 0, 0]), array([0, 0, 0, ..., 0, 0, 0]), array([0, 0, 0, ..., 0, 0, 0]), array([0, 0, 0, ..., 0, 0, 0]), array([0, 0, 0, ..., 0, 0, 0]), array([0, 0, 0, ..., 1, 1, 0]), array([0, 0, 0, ..., 0, 0, 0]), array([0, 0, 0, ..., 0, 0, 0]), array([0, 0, 0, ..., 0, 0, 0]), array([0, 0, 0, ..., 0, 0, 0]), array([0, 0, 0, ..., 0, 0, 0]), array([0, 0, 0, ..., 0, 0, 0]), array([0, 0, 0, ..., 0, 0, 0])]
CE
Monte Carlo Method Best ROC-AUC: 0.9471876443020104
Monte Carlo Method Best hyperparameters: {'C': 0.01}
CF
Monte Carlo Method Best ROC-AUC: 0.990385468220602
Monte Carlo Method Best hyperparameters: {'C': 0.1}
Cj
Monte Carlo Method Best ROC-AUC: 0.990385468220602
Monte Carlo Method Best hyperparameters: {'C': 0.1}
EC
Monte Carlo Method Best ROC-AUC: 1.0
Monte Carlo Method Best hyperparameters: {'C': 0.001}
IC




Monte Carlo Method Best ROC-AUC: 1.0
Monte Carlo Method Best hyperparameters: {'C': 0.001}
LE
Monte Carlo Method Best ROC-AUC: 1.0
Monte Carlo Method Best hyperparameters: {'C': 0.001}
LESC
Monte Carlo Method Best ROC-AUC: 1.0
Monte Carlo Method Best hyperparameters: {'C': 0.001}
LSC




Monte Carlo Method Best ROC-AUC: 1.0
Monte Carlo Method Best hyperparameters: {'C': 0.001}
MC




Monte Carlo Method Best ROC-AUC: 1.0
Monte Carlo Method Best hyperparameters: {'C': 0.001}
Mel
Monte Carlo Method Best ROC-AUC: 1.0
Monte Carlo Method Best hyperparameters: {'C': 0.001}
SK
Monte Carlo Method Best ROC-AUC: 1.0
Monte Carlo Method Best hyperparameters: {'C': 0.001}
TSK
Monte Carlo Method Best ROC-AUC: 1.0
Monte Carlo Method Best hyperparameters: {'C': 0.001}
Ves
Monte Carlo Method Best ROC-AUC: 1.0
Monte Carlo Method Best hyperparameters: {'C': 0.001}
nm-cSC




Monte Carlo Method Best ROC-AUC: 1.0
Monte Carlo Method Best hyperparameters: {'C': 0.001}
qSK
Monte Carlo Method Best ROC-AUC: 1.0
Monte Carlo Method Best hyperparameters: {'C': 0.001}


In [30]:
# Compare to default: C = 1.0