In [6]:
import os
import sys
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import pandas as pd
import scanpy as sc
import scipy.sparse as sps

from os.path import join
from pathlib import Path
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from MAT2 import *

import sys
from pathlib import Path
from os.path import join
# Make the directory two levels above the notebook's working directory
# importable, so the project-local imports that follow (`moco`, `scib_eval`)
# can be resolved.
cur_dir = Path(os.getcwd())
repo_root = str(cur_dir.parent.parent.absolute())
# Guarded so that re-running this cell does not keep appending the same
# entry to sys.path.
if repo_root not in sys.path:
    sys.path.append(repo_root)

from moco.kbet import calculate_kbet
from moco.utils import py_read_data, load_meta_txt
from moco.preprocessing import hvgPipe
from scib_eval import scib_eval

# Root of the benchmarking checkout. NOTE(review): this is a machine-specific
# absolute path; it is kept as the single place to edit when running elsewhere.
benchmark_root = '/home/yxh/gitrepo/Batch-effect-removal-benchmarking-master'
# Input datasets (one sub-directory per dataset name).
data_dir = f'{benchmark_root}/Script/sapling/GLOBE/data'
# Destination for figures and other outputs.
out_dir = f'{benchmark_root}/Output'

In [2]:

# ================================
# prepare datasets
# ================================
dno = 'Pancreas'
sc.settings.set_figure_params(dpi=80)
# Derive the figure directory from `out_dir` instead of repeating the absolute
# output path, so the two cannot drift apart.
fig_dir = Path(out_dir) / 'dataset4' / 'MAT'
fig_dir.mkdir(parents=True, exist_ok=True)
sc.settings.figdir = fig_dir

dataset_dir = join(data_dir, dno)
# Column names under which batch and cell-type annotations are stored from
# here on (in `df_meta` and later in `adata.obs`).
batch_key = 'batch'
label_key = 'type'

# Expression matrix plus its gene and cell names. The preprocessing cell
# transposes `sps_x` before pairing rows with `cell_name`, so it is presumably
# stored genes x cells on disk -- TODO confirm against `py_read_data`.
sps_x, gene_name, cell_name = py_read_data(dataset_dir, 'myData_pancreatic_5batches')
df_meta = load_meta_txt(join(dataset_dir, 'mySample_pancreatic_5batches.txt'))
# Expose the dataset-specific annotation columns under the generic keys.
df_meta[label_key] = df_meta['celltype']
df_meta[batch_key] = df_meta['batchlb']

The reading cost time 0.1102 secs


In [3]:
# ================================
# preprocessing
# ================================

min_cells = 3        # passed to sc.pp.filter_genes
scale_factor = 1e4   # target_sum for sc.pp.normalize_total
n_hvgs = 2000        # upper bound on the number of highly variable genes kept

# Transpose so that rows pair up with `cell_name` and columns with `gene_name`.
adata = sc.AnnData(sps.csr_matrix(sps_x.T))
adata.obs_names = cell_name
adata.var_names = gene_name
adata.obs[batch_key] = df_meta.loc[cell_name, batch_key]
adata.obs[label_key] = df_meta.loc[cell_name, label_key]

sc.pp.filter_genes(adata, min_cells=min_cells)
sc.pp.normalize_total(adata, target_sum=scale_factor)
sc.pp.log1p(adata)

# Only `n_top_genes` is passed: scanpy ignores the mean/dispersion cutoffs
# whenever `n_top_genes` is set, so spelling them out had no effect.
sc.pp.highly_variable_genes(adata,
                            n_top_genes=min(adata.shape[1] - 1, n_hvgs),
                            batch_key=batch_key)

adata = adata[:, adata.var.highly_variable].copy()

# Dense genes x cells frame handed to MAT2. `.toarray()` is the explicit
# spelling of the `.A` shorthand and exists on every SciPy sparse container.
data = pd.DataFrame(adata.X.toarray().T, index=adata.var_names, columns=cell_name)

# prepare anchors
anchor = pd.read_csv(join(dataset_dir, 'seuratAnchors.csv'), header=0, index_col=0)

# compute the absolute cell.idx in the X
name2idx = dict(zip(adata.obs_names, np.arange(adata.shape[0])))
for name_col, idx_col in (('name1', 'cell1'), ('name2', 'cell2')):
    # Fail with one descriptive KeyError instead of an opaque per-row lookup
    # error when the anchor file names a cell that is not in `adata`.
    unknown = anchor.loc[~anchor[name_col].isin(adata.obs_names), name_col]
    if len(unknown) > 0:
        raise KeyError(
            f"{len(unknown)} anchor entries in column '{name_col}' are not "
            f"cells of adata, e.g. {unknown.iloc[0]!r}"
        )
    anchor[idx_col] = anchor[name_col].map(name2idx)

... storing 'batch' as categorical
... storing 'type' as categorical
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value)


In [4]:
# ================================
# training Unsupervised model
# ================================
# NOTE(review): the output saved with this cell reports lr 0.0001 and epochs
# 10/20/40/60, which does not match the values below -- it comes from an
# earlier version of these settings. Re-run before relying on saved results.
print('========Unsupervised training========')
lr = 1e-3
eps = [10, 40, 60, 80]

# Constructor arguments shared by every run; only the epoch budget passed to
# `train` differs between runs.
mat2_kwargs = dict(
    data=data,
    metadata=df_meta,
    anchor=anchor,
    num_workers=6,
    use_gpu=True,
    mode='manual',
    latent_num=20,
    learning_rate=lr,
    batch_size=256,
    norm='l1',
    weight_decay=0.01,
)

# One entry per value in `eps`, in the same order.
EMBS = []
for ep in eps:
    print('training params ', lr, ep)
    # A fresh model is built for each epoch budget.
    model_unv = BuildMAT2(**mat2_kwargs)
    model_unv.train(epochs=ep, curve=False, dec_train=True)

    rec_unv = model_unv.evaluate(data)
    EMBS.append(rec_unv)

training params  0.0001 10



Training finish!

training params  0.0001 20



Training finish!

training params  0.0001 40



Training finish!

training params  0.0001 60



Training finish!



NameError: name 'hvgPipe' is not defined

In [8]:
# Score the embedding produced for each epoch budget and gather the metric
# tables side by side (one column per entry of `eps`).
metric_tables = []
for i in range(len(eps)):
    ad_tmp = hvgPipe(EMBS[i], meta=df_meta, scale=True, n_neighbors=15, npcs=50, umap=True)
    # scib_eval is given the PCA coordinates under the 'X_emb' key.
    ad_tmp.obsm['X_emb'] = ad_tmp.obsm['X_pca']

    tmp_res = scib_eval(ad_tmp, batch_key, label_key)
    metric_tables.append(tmp_res)

# Concatenate once instead of merging inside the loop: repeated merges of
# frames that share a column name generate duplicate `_x`/`_y` labels from the
# third merge on, which pandas 2.x rejects with MergeError. `join='inner'`
# keeps only the metrics present in every run, as the inner merge did.
RES = pd.concat(metric_tables, axis=1, join='inner')
# Assumes each table contributes exactly one column -- TODO confirm against
# scib_eval; the relabelling below fails otherwise.
RES.columns = eps

NMI...
ARI...
Silhouette score...
Calculating kbet...
Making the column batch of adata.obs categorical.
NMI...
ARI...
Silhouette score...
Calculating kbet...
Making the column batch of adata.obs categorical.
NMI...
ARI...
Silhouette score...
Calculating kbet...
Making the column batch of adata.obs categorical.
NMI...
ARI...
Silhouette score...
Calculating kbet...
Making the column batch of adata.obs categorical.


In [9]:
# Display the metric table: one row per metric, one column per epoch setting in `eps`.
RES

Unnamed: 0,10,20,40,60
NMI_cluster/label,0.893826,0.89538,0.899471,0.890757
ARI_cluster/label,0.937854,0.940682,0.943553,0.940851
ASW_label,0.569635,0.568234,0.570562,0.59667
ASW_label/batch,0.832243,0.848692,0.841847,0.84175
isolated_label_F1,,,,
isolated_label_silhouette,,,,
graph_conn,,,,
trajectory,,,,
kBET,0.354845,0.371165,0.401571,0.400826
