# SVM and SVM rejection 



In [1]:
import os 
import numpy as np
import pandas as pd
import scanpy as sc
import time as tm
import seaborn as sns
from sklearn.svm import LinearSVC
import rpy2.robjects as robjects
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import confusion_matrix
from scanpy import read_h5ad

#os.chdir("")

## Defining the SVM and SVMrej functions:



In [2]:
def run_SVM(matrix_train, matrix_test, LabelsPathTrain, OutputDir):
    '''
    Train a linear SVM on the training matrix and predict labels for the test matrix.

    Both matrices are log1p-transformed, a ``LinearSVC`` with default settings
    is fitted on the training data, and the predicted labels for the test data
    are written to ``SVM_Pred_Labels.csv`` inside ``OutputDir``.
    No cross-validation or timing is performed and nothing is returned.

    Parameters
    ----------
    matrix_train, matrix_test : Data matrix for training and testing data,
        cells-genes matrix (rows are cells, columns are genes). Both must have
        the same columns in the same order.
    LabelsPathTrain : Cell population annotations file path matching the training data (.csv).
        The file is read with a header row; all values are flattened into one
        label vector, so it is expected to hold a single label column.
    OutputDir : Output directory for the exported file. It is created if it
        does not exist; an empty string means the current working directory.
    '''
    # read the labels (first row of the csv is treated as header)
    labels_train = pd.read_csv(LabelsPathTrain, header=0, index_col=None, sep=',')

    # normalise data
    data_train = np.log1p(matrix_train)
    data_test = np.log1p(matrix_test)

    Classifier = LinearSVC()
    Classifier.fit(data_train, labels_train.values.ravel())
    predicted = Classifier.predict(data_test)

    # Join path components instead of concatenating strings so that the result
    # does not depend on OutputDir ending with a path separator.
    out_dir = str(OutputDir)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    pd.DataFrame(predicted).to_csv(os.path.join(out_dir, "SVM_Pred_Labels.csv"), index=False)
    
    
    

In [3]:
def run_SVMrej(matrix_train, matrix_test, LabelsPathTrain, OutputDir, Threshold = 0.7):
    '''
    Train a calibrated linear SVM and predict labels with a rejection option.

    Both matrices are log1p-transformed. A ``LinearSVC`` wrapped in
    ``CalibratedClassifierCV`` (``cv=3``) is fitted on the training data to
    obtain class probabilities for the test data. Cells whose highest class
    probability is below ``Threshold`` are labelled ``'Unknown'``.
    The predicted labels are written to ``SVMrej_Pred_Labels.csv`` and the
    highest class probability per cell to ``SVMrej_Prob.csv``, both inside
    ``OutputDir``. Nothing is returned.

    Parameters
    ----------
    matrix_train, matrix_test : Data matrix for training and testing data,
        cells-genes matrix (rows are cells, columns are genes). Both must have
        the same columns in the same order.
    LabelsPathTrain : Cell population annotations file path matching the training data (.csv).
        The file is read with a header row; all values are flattened into one
        label vector, so it is expected to hold a single label column.
    OutputDir : Output directory for the exported files. It is created if it
        does not exist; an empty string means the current working directory.
    Threshold : Threshold used when rejecting the cells, default is 0.7.
    '''
    # read the labels (first row of the csv is treated as header)
    labels_train = pd.read_csv(LabelsPathTrain, header=0, index_col=None, sep=',')

    # NOTE: Threshold is taken from the argument; it must not be reassigned
    # here, otherwise the caller's value is silently ignored.

    # normalise data
    data_train = np.log1p(matrix_train)
    data_test = np.log1p(matrix_test)

    # LinearSVC has no predict_proba; the calibration wrapper provides it.
    clf = CalibratedClassifierCV(LinearSVC(), cv=3)
    clf.fit(data_train, labels_train.values.ravel())

    prob = np.max(clf.predict_proba(data_test), axis=1)

    # Cast to object so that 'Unknown' can be stored regardless of the label
    # dtype (fixed-width string arrays would truncate it, numeric arrays
    # would reject it).
    predicted = clf.predict(data_test).astype(object)
    predicted[prob < Threshold] = 'Unknown'

    # Join path components instead of concatenating strings so that the result
    # does not depend on OutputDir ending with a path separator.
    out_dir = str(OutputDir)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    pd.DataFrame(predicted).to_csv(os.path.join(out_dir, "SVMrej_Pred_Labels.csv"), index=False)
    pd.DataFrame(prob).to_csv(os.path.join(out_dir, "SVMrej_Prob.csv"), index=False)
    

## Annotate:

Annotate Jos's data (`kera.h5ad`) using the meta-atlas (`test_dat_meta.h5ad`) as training data.

In [4]:
# Load the training and the testing dataset from their .h5ad files.
training = sc.read_h5ad("test_dat_meta.h5ad")
testing = sc.read_h5ad("kera.h5ad")

In [6]:
# Turn the sparse count matrices into sparse DataFrames, labelled with the
# obs index as row names and the var index as column names.

# training data
matrix_train = pd.DataFrame.sparse.from_spmatrix(
    training.X,
    index=training.obs.index.tolist(),
    columns=training.var.index.tolist(),
)

# testing data
matrix_test = pd.DataFrame.sparse.from_spmatrix(
    testing.X,
    index=testing.obs.index.tolist(),
    columns=testing.var.index.tolist(),
)

In [7]:
# Restrict both matrices to the features that are present in the training
# and in the testing data.
# A left merge with indicator=True marks every training feature as 'both'
# (also found in the testing data) or 'left_only' (training only).
df_all = training.var[["features"]].merge(
    testing.var[["features"]].drop_duplicates(),
    on=["features"],
    how="left",
    indicator=True,
)
training1 = df_all[df_all["_merge"] == "both"]
col_one_list = training1["features"].tolist()

# Keep only the shared feature columns in each matrix.
# NOTE(review): assumes the matrix column names (var index) use the same
# identifiers as the 'features' column - confirm.
matrix_test = matrix_test[matrix_test.columns.intersection(col_one_list)]
matrix_train = matrix_train[matrix_train.columns.intersection(col_one_list)]

# Reorder the training columns to match the testing columns so that both
# matrices present the features in the same order to the classifier.
matrix_train = matrix_train[list(matrix_test.columns)]

In [8]:
# Convert the ordered dataframes back to nparrays
# The resulting arrays are what gets passed to run_SVM / run_SVMrej below.
# NOTE(review): presumably this densifies the sparse columns - check memory
# use on large datasets.
matrix_train2 = matrix_train.to_numpy()
matrix_test2 = matrix_test.to_numpy()

In [9]:
# run functions:
# Plain linear SVM (no rejection) on the aligned training/testing arrays.
svm_labels_csv = "/ceph/rimlsfnwi/data/moldevbio/zhou/jarts/R/predictions/20220328/training_labels_meta.csv"
run_SVM(
    matrix_train=matrix_train2,
    matrix_test=matrix_test2,
    LabelsPathTrain=svm_labels_csv,
    OutputDir="predictions_kera/",
)

In [10]:
# Linear SVM with rejection on the same arrays; Threshold is not passed, so
# the default of 0.7 applies.
svmrej_labels_csv = "/ceph/rimlsfnwi/data/moldevbio/zhou/jarts/R/predictions/20220328/training_labels_meta.csv"
run_SVMrej(
    matrix_train=matrix_train2,
    matrix_test=matrix_test2,
    LabelsPathTrain=svmrej_labels_csv,
    OutputDir="predictions_kera/",
)