In [None]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns
from scipy import sparse
from sklearn.decomposition import PCA

import scanpy as sc

from umap import UMAP

In [None]:
def get_index_from_array(a, b):
    """Look up where the elements of `b` sit inside `a`.

    `a` is wrapped in a Series whose positions are exposed as an 'index'
    column; the values of `a` then become the lookup key and `b` selects
    the matching rows. The selected positions are returned as a numpy array.
    """
    positions = pd.Series(a).reset_index()
    by_value = positions.set_index(0)
    return by_value.loc[b]['index'].values
    

In [None]:
# Load the table stored in P21_atac_umap.csv. Later cells use its '0'/'1'
# columns as plot coordinates, 'label' as the hue and 'cell' to subset the RNA data.
# NOTE(review): absolute, user-specific path — the notebook will not run elsewhere as-is.
dfplot = pd.read_csv('/u/home/f/f7xiesnm/v1_multiome/P21_atac_umap.csv')
dfplot

In [None]:
# Distinct values of the 'label' column, sorted.
unq_types = np.sort(pd.unique(dfplot['label']))
unq_types

In [None]:
# Scatter of dfplot's '0' vs '1' columns, colored by 'label', with equal axis scaling.
fig, ax = plt.subplots()
sns.scatterplot(x='0', y='1', hue='label', data=dfplot, ax=ax, s=3, edgecolor='none')
ax.set_aspect('equal')
plt.show()

# RNA: UMAP computed from the RNA profiles of the same cells

In [None]:
%%time
f = '/u/home/f/f7xiesnm/project-zipursky/v1-bb/v1/data/v1_multiome/superdupermegaRNA_hasraw_P21.h5ad'
adata = sc.read(f) # , backed='r')
adata = adata[adata.obs['Study']=='2023 Multiome']
adata = adata[dfplot['cell'].values]
adata

In [None]:
# Overwrite X with the matrix stored in adata.raw.
# NOTE(review): presumably raw holds unnormalized counts (the next cell treats
# X as counts) and has the same genes as adata — confirm.
adata.X = adata.raw.X

In [None]:
# Gene filter: keep genes with a nonzero count in more than 10 cells.
cond = np.ravel((adata.X>0).sum(axis=0)) > 10
# .copy() makes adata a standalone object instead of a view; a layer is
# assigned at the end of this cell, and writing to a view would otherwise
# trigger an implicit copy with a warning.
adata = adata[:,cond].copy()
genes = adata.var.index.values

# counts
x = adata.X
# NOTE(review): total_counts is read from obs rather than recomputed from x —
# confirm it corresponds to the counts now stored in X.
cov = adata.obs['total_counts'].values

# CP10k: scale every cell (row) by 1e4 / total_counts
xn = (sparse.diags(1/cov).dot(x))*1e4

# log2(CP10k+1) as a dense array
log_xn = np.log2(1+xn.toarray())

adata.layers['lognorm'] = log_xn

In [None]:
# HVG selection: bin genes by their mean, then keep the genes with the
# largest variance/mean ratio inside each bin.
nbin = 20   # number of quantile bins over the gene means
qth = 0.3   # fraction of an average-sized bin kept per bin

# per-gene mean of the CP10k matrix
gm = np.ravel(xn.mean(axis=0))

# per-gene variance computed as E[x^2] - E[x]^2
tmp = xn.copy()
tmp.data = np.power(tmp.data, 2)
gv = np.ravel(tmp.mean(axis=0))-gm**2

# quantile-bin the means and gather the per-gene statistics
lbl = pd.qcut(gm, nbin, labels=np.arange(nbin))
gres = pd.DataFrame({
    'name': genes,
    'lbl': lbl,
    'mean': gm,
    'var': gv,
    'ratio': gv/gm,
})

# top-ratio genes of every bin; level 1 of the result index is the row index of gres
gres_sel = gres.groupby('lbl')['ratio'].nlargest(int(qth*(len(gm)/nbin)))
gsel_idx = np.sort(gres_sel.index.get_level_values(1).values)
assert np.all(gsel_idx != -1)

In [None]:
# Restrict adata to the selected gene columns and record their names.
adata_hvg = adata[:, gsel_idx]
genes_hvg = adata_hvg.var_names.values
print(adata_hvg.shape)

In [None]:
# 20-component PCA on the log-normalized HVG matrix.
# random_state is fixed so the components are reproducible across runs when
# scikit-learn picks a randomized SVD solver.
pca_all = PCA(n_components=20, random_state=0)
pca_all.fit(adata_hvg.layers['lognorm'][...])
pcs_all = pca_all.transform(adata_hvg.layers['lognorm'][...])

In [None]:
%%time
ucs_all = UMAP(n_components=2, n_neighbors=30).fit_transform(pcs_all) #

In [None]:
# UMAP coordinates (columns 0 and 1) plus a 'label' column taken from obs['Subclass'].
dfplot_rna = pd.DataFrame(ucs_all).assign(label=adata_hvg.obs['Subclass'].values)
dfplot_rna

In [None]:
# Scatter of the RNA UMAP (columns 0 and 1 of dfplot_rna), colored by 'label';
# the legend is anchored at the upper-right corner of the axes.
fig, ax = plt.subplots()
sns.scatterplot(x=0, y=1, hue='label', data=dfplot_rna, ax=ax, s=3, edgecolor='none')
ax.legend(bbox_to_anchor=(1,1))
plt.show()

In [None]:
# Display dfplot_rna again (the same frame was already shown after it was built).
dfplot_rna

In [None]:
# Combined table: r1/r2 are the RNA UMAP coordinates, a1/a2 are the '0'/'1'
# coordinates from dfplot, and 'label' is carried over from dfplot_rna.
# Rows are paired by position, which assumes the RNA rows follow the order of
# dfplot['cell'] used to subset adata. The values are assigned positionally so
# the pairing does not depend on both frames having identical indexes.
assert len(dfplot_rna) == len(dfplot), "RNA and ATAC tables must have the same number of cells"
dfplot_rna_atac = dfplot_rna.rename(columns={0: 'r1', 1: 'r2'})
dfplot_rna_atac['a1'] = dfplot['0'].values
dfplot_rna_atac['a2'] = dfplot['1'].values
dfplot_rna_atac

In [None]:
# Write the combined RNA/ATAC coordinate table to CSV.
# NOTE(review): absolute, user-specific output path; the 'l23' suffix in the
# file name is not explained by anything in this notebook — confirm it is intended.
dfplot_rna_atac.to_csv('/u/home/f/f7xiesnm/v1_multiome/rna_atac_umap_p21_l23.csv')