In [33]:
# train a SVM classifier on human data
import scanpy as sc
import numpy as np
import pandas as pd
from matplotlib.colors import Normalize

In [34]:
# Train a SVM model on human dataset

In [35]:
# import data 
train_file = "/home/chutianhao/R/Projects/snRNA_scRNA_hcc/project/svm/data/sc_human_train.h5ad"
adata_train = sc.read_h5ad(train_file)

In [36]:
adata_train

AnnData object with n_obs × n_vars = 3395 × 300
    obs: 'nCount_RNA', 'nFeature_RNA', 'patient_id', 'condition', 'mt_ratio', 'ribo_ratio', 'S.Score', 'G2M.Score', 'Phase', 'clusters', 'clusters_2'
    var: 'vst.mean', 'vst.variance', 'vst.variance.expected', 'vst.variance.standardized'
    obsm: 'X_pca', 'X_scanvi', 'X_scvi', 'X_umap'

In [37]:
# prepare training data
X = adata_train.X.copy()
Y = adata_train.obs.copy()

In [38]:
group = "clusters_2"
Y = Y.loc[:, group]

In [39]:
Y = Y.values.to_list()

In [40]:
# scale the data before training
from sklearn.preprocessing import StandardScaler

In [41]:
X = X.toarray()
scaler = StandardScaler()
X = scaler.fit_transform(X)

In [42]:
# search the best C and gamma values, using grid search

In [43]:
from sklearn.svm import SVC
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import GridSearchCV

In [12]:
C_range = np.logspace(-2, 7, 10)
gamma_range = np.logspace(-9, 0, 10)
param_grid = dict(gamma=gamma_range, C=C_range)
cv = StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=42)
grid = GridSearchCV(SVC(cache_size=400000), param_grid=param_grid, cv=cv)
grid.fit(X, Y)

print(
    "The best parameters are %s with a score of %0.2f"
    % (grid.best_params_, grid.best_score_)
)

The best parameters are {'C': 100.0, 'gamma': 0.001} with a score of 0.85


In [1]:
# train the classifier with RBF kernel
from sklearn import svm 

In [45]:
clf = svm.SVC(kernel='rbf', C = 100, gamma=0.001, cache_size=400000, probability=True)

In [46]:
clf.fit(X, Y)

In [47]:
# test the model on test dataset
adata_test = sc.read_h5ad("/home/chutianhao/R/Projects/snRNA_scRNA_hcc/project/svm/data/sc_human_test.h5ad")

In [48]:
adata_test

AnnData object with n_obs × n_vars = 854 × 300
    obs: 'nCount_RNA', 'nFeature_RNA', 'patient_id', 'condition', 'mt_ratio', 'ribo_ratio', 'S.Score', 'G2M.Score', 'Phase', 'clusters', 'clusters_2'
    var: 'vst.mean', 'vst.variance', 'vst.variance.expected', 'vst.variance.standardized'
    obsm: 'X_pca', 'X_scanvi', 'X_scvi', 'X_umap'

In [49]:
X_test = adata_test.X.copy()
Y_test = adata_test.obs.copy()

In [50]:
X_test = X_test.toarray()
X_test = scaler.fit_transform(X_test)

In [51]:
X_test.shape

(854, 300)

In [52]:
Y_test = Y_test.loc[:, group]

In [53]:
Y_test = Y_test.values.to_list()

In [54]:
Y_pred = clf.predict(X_test)

In [55]:
Y_proba = clf.predict_proba(X_test)

In [56]:
Y_pred = Y_pred.tolist()

In [57]:
match = [i for i, j in zip(Y_test, Y_pred) if i == j]
nomatch = [i for i, j in zip(Y_test, Y_pred) if i != j]

In [58]:
print(len(match), len(nomatch), len(Y_pred))

784 70 854


In [59]:
# calculate the model precision, recall and F1-score
from sklearn import metrics

In [60]:
print(metrics.classification_report(Y_test, Y_pred, digits=3))

                   precision    recall  f1-score   support

          B cells      0.929     0.648     0.763       122
              CAF      0.992     0.967     0.979       122
Endothelial cells      0.983     0.967     0.975       122
      Hepatocytes      0.974     0.934     0.954       122
    Myeloid cells      0.921     0.951     0.935       122
     Plasma cells      0.992     0.984     0.988       122
             T/NK      0.717     0.975     0.826       122

         accuracy                          0.918       854
        macro avg      0.930     0.918     0.917       854
     weighted avg      0.930     0.918     0.917       854



In [61]:
# save the trained svm model
import pickle

In [62]:
filename = "svm_model_landscape.pickle"

In [63]:
pickle.dump(clf, open(filename, "wb"))
# when load the model, use pickle.load(open(filename, "rb"))