# SVM classification of keratinocyte subpopulations - Rojahn biopsy data



In [15]:
import os 
import numpy as np
import pandas as pd
import scanpy as sc
import time as tm
import seaborn as sns
from sklearn.svm import LinearSVC
import rpy2.robjects as robjects
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import confusion_matrix
import anndata as ad

# IPython magic: draw matplotlib figures inline in the notebook output.
# (Keep comments off the magic line itself; its trailing text is parsed as arguments.)
%matplotlib inline 
# Set scanpy's figure defaults to 800 dpi.
sc.set_figure_params(dpi=800)

# Make the project directory the working directory: the relative "./Data/..." and
# "./Results/..." paths used further down are resolved against it.
os.chdir("/path/to/directory/") # adjust path to your local project directory

## Defining the SVM function:
Note: SVM function adapted from Abdelaal et al. (2019)


In [16]:
def run_SVM(matrix_train, matrix_test, LabelsPathTrain, OutputDir):
    '''
    Train a linear SVM on one dataset and predict cell labels for a second dataset.

    Both matrices are log1p-transformed, a LinearSVC with default settings is fitted on the
    training matrix using the labels read from LabelsPathTrain, and the labels predicted for
    the testing matrix are written to "SVM_Pred_Labels.csv" inside OutputDir.
    No cross-validation is performed, and neither the true labels nor the computation time
    are exported.

    Parameters:
    matrix_train, matrix_test : Data matrix for training and testing data,
        cells-genes matrix with cell unique barcodes as row names and gene names as column names.
        NOTE(review): the classifier consumes the columns positionally, so this assumes both
        matrices contain the same genes in the same column order - confirm upstream.
    LabelsPathTrain : Cell population annotations file path matching the training data (.csv).
        The file is read with a header row and all of its values are flattened into the label
        vector (presumably a single column with one row per training cell - verify).
    OutputDir : Output directory defining the path of the exported file.
        It is created if it does not exist; a trailing path separator is optional.

    Returns:
    None. The predictions are only written to disk, as a single column with header "0".
    '''

    # read the training labels
    labels_train = pd.read_csv(LabelsPathTrain, header=0, index_col=None, sep=',')

    # Resolve the output location before the (potentially slow) fit so that an unusable
    # directory fails early. os.path.join inserts a separator only when one is missing,
    # so "dir" and "dir/" both end up writing to "dir/SVM_Pred_Labels.csv".
    out_dir = str(OutputDir)
    if out_dir:  # an empty string means "current directory"; makedirs("") would raise
        os.makedirs(out_dir, exist_ok=True)
    pred_path = os.path.join(out_dir, "SVM_Pred_Labels.csv")

    # normalise data
    data_train = np.log1p(matrix_train)
    data_test = np.log1p(matrix_test)

    # fit on the training cells, then predict a label for every testing cell
    Classifier = LinearSVC()
    Classifier.fit(data_train, labels_train.values.ravel())
    predicted = Classifier.predict(data_test)

    # one predicted label per row, in the row order of matrix_test
    pd.DataFrame(predicted).to_csv(pred_path, index=False)

## Apply SVM to the Rojahn biopsy data

In [17]:
# Load both datasets. Each one is stored as three files: a count matrix (.mtx),
# a per-cell metadata table and a per-gene feature table.
def _load_cells_by_genes(count_path, metadata_path, features_path):
    """Read one dataset and return (raw mtx object, cell table, gene table, AnnData, sparse DataFrame)."""
    raw_counts = sc.read_mtx(count_path)
    cell_table = pd.read_table(metadata_path)
    gene_table = pd.read_table(features_path, header=None, index_col=0)
    gene_table.index.name = None

    # The matrix read from disk is transposed before it is paired with the cell table as
    # `obs` and the gene table as `var`.
    adata = ad.AnnData(X=mtx_transposed(raw_counts), obs=cell_table, var=gene_table)

    row_names = list(adata.obs.index.values)
    column_names = list(adata.var.index.values)
    frame = pd.DataFrame.sparse.from_spmatrix(adata.X, index=row_names, columns=column_names)
    return raw_counts, cell_table, gene_table, adata, frame


def mtx_transposed(raw_counts):
    """Return the transposed data matrix of an object produced by ``sc.read_mtx``."""
    return raw_counts.X.T


# Wang et al 2019 training data (adjust paths if needed)
mtx_train, cells_train, genes_train, training, matrix_train = _load_cells_by_genes(
    "./Data/Wang_kc2_count.mtx",
    "./Data/Wang_kc2_metadata.tsv",
    "./Data/Wang_kc2_features.tsv",
)

# Rojahn et al. 2020 testing data (adjust paths if needed)
mtx_test, cells_test, genes_test, testing, matrix_test = _load_cells_by_genes(
    "./Data/Rojahn_kcfiltered2_count.mtx",
    "./Data/Rojahn_kcfiltered2_metadata.tsv",
    "./Data/Rojahn_kcfiltered2_features.tsv",
)

In [18]:
# Train on the Wang et al. data and write the predicted labels for the Rojahn biopsy cells.
train_labels_csv = "./Data/Wang_kc2_labels.csv"  # adjust path if needed
results_dir = "./Results/RojahnBiop/"  # adjust path if needed
run_SVM(
    matrix_train=matrix_train,
    matrix_test=matrix_test,
    LabelsPathTrain=train_labels_csv,
    OutputDir=results_dir,
)