# TransBE

## 01 scRNA-seq DataArray

In [49]:
from scipy.sparse import coo_matrix
from scipy.io import mmread
import os
import pandas as pd

def mtx2csv(Dir="./",Output="./Output.csv.gz"):
    """Convert a Matrix Market directory into a gzip-compressed CSV table.

    Reads ``matrix.mtx``, ``genes.tsv`` and ``barcodes.tsv`` from ``Dir``.
    The matrix is densified and transposed, so each output row is labelled
    with an entry from the first column of ``barcodes.tsv`` and each output
    column with an entry from the second column of ``genes.tsv``.

    Args:
        Dir: Directory holding the three input files.
        Output: Destination path of the gzip-compressed CSV.
    """
    def _inside(name):
        # Resolve a file name relative to the input directory.
        return os.path.join(Dir, name)

    counts = mmread(_inside("matrix.mtx"))
    genes = pd.read_table(_inside("genes.tsv"), header=None)
    barcodes = pd.read_table(_inside("barcodes.tsv"), header=None)

    table = pd.DataFrame(
        counts.toarray().T,
        index=barcodes[0].to_list(),
        columns=genes[1].to_list(),
    )
    table.to_csv(Output, compression='gzip')

# Convert the GSE125449/S2 Matrix Market directory into ../Data/Sample2.csv.gz.
mtx2csv(
    Dir="../../../Experiment/ICC-Data/RAW/GSE125449/S2/",
    Output="../Data/Sample2.csv.gz",
)

## 02 不同批次数据比较

In [1]:
import pandas as pd
import numpy as np
import scanpy as sc
import warnings
warnings.filterwarnings('ignore')

def Classification(filepath):
    """Cluster the cells in one expression CSV and attach the cluster labels.

    The file is read twice: once with pandas (first column as the index) to
    keep the original table, and once with scanpy to run filtering,
    normalisation, highly-variable-gene selection, scaling, PCA, neighbour
    graph construction and Leiden clustering.

    Args:
        filepath: Path of the CSV file to load.

    Returns:
        The pandas table restricted (inner join on the index) to the cells
        that received a label, with an extra ``"leiden"`` column.
    """
    expression = pd.read_csv(filepath, index_col=0)

    adata = sc.read(filepath)

    # Cell- and gene-level filtering.
    sc.pp.filter_cells(adata, min_genes=200)
    sc.pp.filter_cells(adata, max_genes=30000)
    sc.pp.filter_genes(adata, min_cells=3)

    # Flag genes whose name starts with "MT-" and compute QC metrics for them.
    adata.var['mt'] = adata.var_names.str.startswith('MT-')
    sc.pp.calculate_qc_metrics(
        adata,
        qc_vars=['mt'],
        percent_top=None,
        log1p=False,
        inplace=True,
    )
    adata = adata[adata.obs.n_genes_by_counts < 2500, :]
    adata = adata[adata.obs.pct_counts_mt < 5, :]

    # Normalise, log-transform and keep only the highly variable genes.
    sc.pp.normalize_total(adata, target_sum=1e4)
    sc.pp.log1p(adata)
    sc.pp.highly_variable_genes(adata, min_mean=0.0125, max_mean=3, min_disp=0.5)
    adata = adata[:, adata.var.highly_variable]

    # Scale, embed, build the neighbour graph and cluster.
    sc.pp.scale(adata, max_value=10)
    sc.tl.pca(adata)
    sc.pp.neighbors(adata)
    sc.tl.leiden(adata,resolution=0.95)

    cluster_labels = adata.obs["leiden"].to_frame()
    return pd.concat([expression, cluster_labels], axis=1, join="inner")

# Cluster both samples independently.
Data1 = Classification("../Data/Sample1.csv.gz")
Data2 = Classification("../Data/Sample2.csv.gz")
# Restrict both tables to the columns they share. The intersection follows
# Data1's column order; going through set() would yield an order that can
# change from one interpreter run to the next (string hash randomization).
Keys = [column for column in Data1.columns if column in Data2.columns]
Data1 = Data1[Keys]
Data2 = Data2[Keys]


In [2]:
# Order the rows of both samples by their "leiden" column.
Data1, Data2 = [frame.sort_values(by="leiden") for frame in (Data1, Data2)]

In [3]:
# > Calculate Loss
#   1. Identify the nearest cluster
#   2. Calculate distances
#   3. Divide the in-group distance by the out-group distance

import numpy as np
from sklearn.decomposition import PCA
from tqdm import tqdm


def Calculate_Distance(A, B):
    """Return the pairwise Euclidean distances between the rows of A and B.

    Uses the expansion ``|a - b|^2 = |a|^2 + |b|^2 - 2 a.b`` so the whole
    matrix is computed with one matrix product.

    Args:
        A: 2-D array-like of shape (m, k).
        B: 2-D array-like of shape (n, k).

    Returns:
        An ``np.matrix`` of shape (m, n) whose entry (i, j) is the distance
        between ``A[i]`` and ``B[j]``. The matrix type is kept so existing
        callers see the same return type as before.
    """
    A = np.asarray(A, dtype=float)
    B = np.asarray(B, dtype=float)

    cross = np.dot(A, B.T)                 # (m, n) inner products
    sq_norm_a = np.square(A).sum(axis=1)   # (m,) squared row norms of A
    sq_norm_b = np.square(B).sum(axis=1)   # (n,) squared row norms of B

    sq_dist = sq_norm_a[:, np.newaxis] + sq_norm_b[np.newaxis, :] - 2.0 * cross
    # Rounding can leave tiny negative values for (near-)identical points;
    # clamp them so the square root yields 0 instead of NaN.
    np.maximum(sq_dist, 0.0, out=sq_dist)
    return np.asmatrix(np.sqrt(sq_dist))

def Calculate_PCA(Data, N):
    """Project ``Data`` onto its first ``N`` principal components.

    Args:
        Data: 2-D array-like with one sample per row.
        N: Number of components, passed to ``PCA(n_components=N)``.

    Returns:
        The transformed data returned by ``PCA.fit_transform``.
    """
    # fit_transform already fits the model; a separate fit() beforehand only
    # repeats the decomposition and its result was never used.
    return PCA(n_components=N).fit_transform(Data)

def Calculate_Normalize(Matrix):
    """Min-max scale every row of ``Matrix`` to the range [0, 1].

    The frame is rescaled in place and also returned, so both
    ``Calculate_Normalize(m)`` and ``m = Calculate_Normalize(m)`` work.

    Args:
        Matrix: Numeric pandas DataFrame; each row is scaled independently.

    Returns:
        The same DataFrame object, holding ``(row - min) / (max - min)``.
        A row whose values are all equal becomes all zeros rather than the
        NaN that a 0/0 division would produce.
    """
    values = Matrix.to_numpy(dtype=float)
    row_min = values.min(axis=1, keepdims=True)
    row_span = values.max(axis=1, keepdims=True) - row_min
    # Constant rows have zero span; divide by 1 so they map to 0.0.
    row_span[row_span == 0] = 1.0
    Matrix.iloc[:, :] = (values - row_min) / row_span
    return Matrix

def Calculate_Loss(Data1, Data2):
    """Score how closely the cells of ``Data2`` sit to the clusters of ``Data1``.

    For every cell of ``Data2`` the nearest ``Data1`` cell (in the row-wise
    normalised distance matrix) selects a ``Data1`` cluster. The cell's loss
    is its mean distance to that cluster divided by its mean distance to all
    other ``Data1`` cells.

    Args:
        Data1: DataFrame with one row per cell and a ``"leiden"`` column;
            every other column is used as a feature.
        Data2: DataFrame with the same layout as ``Data1``.

    Returns:
        The mean of the per-cell losses over all cells of ``Data2``.
    """
    # NOTE(review): each sample gets its own PCA fit, so the two embeddings
    # may not share axes - confirm that comparing them directly is intended.
    D1 = Calculate_PCA(np.array(Data1.drop(["leiden"], axis=1)), N=50)
    D2 = Calculate_PCA(np.array(Data2.drop(["leiden"], axis=1)), N=50)

    # Rows follow Data1's cells, columns follow Data2's cells.
    Distance_Matrix = pd.DataFrame(
        np.asarray(Calculate_Distance(D1, D2)),
        index=Data1.index,
        columns=Data2.index,
    )
    # The normaliser rescales the frame in place. Rebinding unconditionally to
    # its return value would replace the matrix with None whenever it returns
    # nothing, so only rebind when a frame actually comes back.
    normalized = Calculate_Normalize(Distance_Matrix)
    if normalized is not None:
        Distance_Matrix = normalized

    labels = Data1["leiden"].to_numpy()
    distances = Distance_Matrix.to_numpy()

    Loss_list = []
    for position in tqdm(range(distances.shape[1])):
        # Positional Series: idxmin() and mean() skip NaN like the label-based
        # pandas operations would, without copying the frame per column.
        column = pd.Series(distances[:, position])
        nearest = column.idxmin()
        in_group = labels == labels[nearest]
        Distance_In_Group = column[in_group].mean()
        Distance_Out_Group = column[~in_group].mean()
        Loss_list.append(Distance_In_Group / Distance_Out_Group)
    return np.mean(Loss_list)

## 03 Transformer 模型构建