# Imputation and DGE analysis

In [None]:
import os, sys, glob, re, math, pickle
import scprep, magic, phate
import numpy as np
import pandas as pd
from scipy import sparse as sp
import time,random,datetime
import scanpy as sc
import anndata
from typing import Dict, Optional
import tables
import seaborn as sns
from bbknn import bbknn
import matplotlib.pyplot as plt
%matplotlib inline
%load_ext memory_profiler

# reproducibility
# Seeds NumPy's global RNG. np.random.seed() returns None, so `rs` is None;
# the name is kept unchanged in case later cells reference it.
rs = np.random.seed(42)

# fps
dfp = '/gpfs/gibbs/pi/lim_janghoo/cl2292/'
pfp = '/gpfs/gibbs/pi/lim_janghoo/cl2292/SCA1_OL/results/'  # figures and tables are written here
pdfp = '/gpfs/gibbs/pi/lim_janghoo/cl2292/SCA1_OL/data/'    # .h5ad objects are written/read here
sc.settings.figdir = pfp

# settings
plt.rc('font', size=8)
# matplotlib's generic family is spelled 'sans-serif'; 'sans serif' is looked
# up as a literal font name, is not found, and falls back with a warning.
plt.rc('font', family='sans-serif')
# fonttype 42 embeds TrueType fonts so text stays editable in PDF/PS output
plt.rcParams['pdf.fonttype'] = 42
plt.rcParams['ps.fonttype'] = 42
plt.rcParams['text.usetex'] = False
plt.rcParams['legend.frameon'] = False
plt.rcParams['axes.grid'] = False
plt.rcParams['legend.markerscale'] = 0.5
# NOTE(review): sc.set_figure_params applies scanpy's own rcParams and may
# override some of the values set above (e.g. axes.grid) -- confirm the order.
sc.set_figure_params(dpi=300, dpi_save=600,
                     frameon=False,
                     fontsize=8)
plt.rcParams['savefig.dpi'] = 600
sc.settings.verbosity = 2
# Configure parallelism through the public settings instance rather than by
# overwriting the attribute on the ScanpyConfig class.
sc.settings.n_jobs = -1

In [None]:
# loader
# One CellBender output directory per animal; the last four characters of each
# directory name are used as the sample id by the loading cell.
data_folders = [
    #30W
    '/gpfs/gibbs/pi/lim_janghoo/cl2292/SCA1_OL/data/Fl2505',
    '/gpfs/gibbs/pi/lim_janghoo/cl2292/SCA1_OL/data/Fl4431',
    '/gpfs/gibbs/pi/lim_janghoo/cl2292/SCA1_OL/data/Fl4433',
    '/gpfs/gibbs/pi/lim_janghoo/cl2292/SCA1_OL/data/Fl6355',
    '/gpfs/gibbs/pi/lim_janghoo/cl2292/SCA1_OL/data/FlCre2510',
    '/gpfs/gibbs/pi/lim_janghoo/cl2292/SCA1_OL/data/FlCre4430',
    '/gpfs/gibbs/pi/lim_janghoo/cl2292/SCA1_OL/data/FlCre4432',
    '/gpfs/gibbs/pi/lim_janghoo/cl2292/SCA1_OL/data/FlCre6354',
]

# Collect *all* missing folders before failing, so a single run reports every
# bad path (the check previously ran inside the loop and stopped at the first).
files_not_found = [folder for folder in data_folders if not os.path.exists(folder)]
if files_not_found:
    print('Folders not found...')
    for folder in files_not_found:
        print(folder)
    raise IOError('Change path to data')

# wall-clock reference for the whole run
total = time.time()

In [None]:
# Load background-removed samples (CellBender)
# Reads each sample's cellbender_filtered.h5 into `adatas`, keyed by the last
# four characters of the folder name, and reports a running cell total.

running_cellcount = 0
start = time.time()
adatas = {}
n_samples = len(data_folders)
for position, folder in enumerate(data_folders, start=1):
    sample_id = os.path.basename(folder)[-4:]
    print(sample_id)
    print('... storing %s into dict (%d/%d)' % (sample_id, position, n_samples))
    sample_adata = sc.read_10x_h5(folder+'/cellbender_filtered.h5')
    adatas[sample_id] = sample_adata
    n_cells = sample_adata.shape[0]
    running_cellcount += n_cells
    elapsed = time.time() - start
    print('...     read {} cells; total: {} in {:.2f}-s'.format(n_cells, running_cellcount, elapsed))
batch_names = list(adatas)

In [None]:
## Scrublet
# Score doublets separately for every sample (adds `predicted_doublet` to each
# sample's .obs, which is read below), then make gene and cell names unique.

for batch_name in batch_names:
    sc.external.pp.scrublet(adatas[batch_name])
    adatas[batch_name].var_names_make_unique()
    adatas[batch_name].obs_names_make_unique()

# Merge all samples; the sample id is stored in .obs['batch'].
# `concat` is a function of the anndata module -- `adata` does not exist yet
# at this point, so calling `adata.concat` raised a NameError.
adata_list = list(adatas.values())
adata = anndata.concat(adata_list, label='batch', keys=batch_names)

print('Ncells=%d are doublets' % np.sum(adata.obs.predicted_doublet != False))
#adata = adata[adata.obs.predicted_doublet == False, :]
adata

In [None]:
# Concatenate the per-sample objects; the sample id lands in .obs['batch'].
adata_list = [*adatas.values()]
adata = anndata.concat(adata_list, keys=batch_names, label='batch')
adata.obs

# Count cells flagged as doublets (they are removed in a later cell).
doublet_flags = adata.obs.predicted_doublet != False
print('Ncells=%d are doublets' % np.sum(doublet_flags))
#adata = adata[adata.obs.predicted_doublet == False, :]
adata

In [None]:
# annotate metadata
# Adds .obs['genotype'] and .obs['timepoint'] from the sample id stored in
# .obs['batch']; any id missing from the lists below aborts with ValueError.

fl = ['2505', '4431', '4433', '6355']      # 30W
flcre = ['2510', '4430', '4432', '6354']   # 30W

wk30 = ['2505', '4431', '4433', '6355',
        '2510', '4430', '4432', '6354']

# sample id -> genotype; `fl` is merged last so it wins on any overlap,
# matching the order in which the lists are consulted.
genotype_by_sample = {
    **dict.fromkeys(flcre, 'SCA1-fl/NG2-Cre'),
    **dict.fromkeys(fl, 'SCA1-fl/+'),
}

genotype = []
for sample in adata.obs['batch']:
    if sample not in genotype_by_sample:
        raise ValueError('Encountered unclassifiable genotype for sample {}'.format(sample))
    genotype.append(genotype_by_sample[sample])
adata.obs['genotype'] = genotype

wk30_samples = set(wk30)
timepoint = []
for sample in adata.obs['batch']:
    if sample not in wk30_samples:
        raise ValueError('Encountered unclassifiable timepoint for animal {}'.format(sample))
    timepoint.append('30wk')
adata.obs['timepoint'] = timepoint

print(adata)

In [None]:
# Display the per-cell sample id column created by the concatenation step.
adata.obs['batch']

In [None]:
# Keep only cells not flagged as doublets; copy so `adata` is a real object
# rather than a view, then drop the flag column that is no longer needed.
is_singlet = adata.obs.predicted_doublet == False
adata = adata[is_singlet, :].copy()
adata
del adata.obs['predicted_doublet']

In [None]:
# filter cells/genes, transform
# Flag genes whose name starts with 'mt-' and compute QC metrics, then report
# how many cells exceed the thresholds applied in the next cell.
is_mito_gene = adata.var_names.str.startswith('mt-')
adata.var['mt'] = is_mito_gene
sc.pp.calculate_qc_metrics(adata, qc_vars=['mt'], inplace=True)

n_high_mito = np.sum(adata.obs['pct_counts_mt'] > 10)
n_few_genes = np.sum(adata.obs['n_genes_by_counts'] < 500)
print('Ncells=%d have >10 percent mt expression' % n_high_mito)
print('Ncells=%d have <500 genes expressed' % n_few_genes)

In [None]:
# Drop cells with fewer than 500 detected genes and genes seen in fewer than
# 3 cells, then remove cells with more than 10% mitochondrial counts.
sc.pp.filter_cells(adata, min_genes=500)
sc.pp.filter_genes(adata, min_cells=3) # filtering cells gets rid of some genes of interest
# .copy() materializes the subset (as done for the doublet filter); without it
# `adata` stays a view and the next write to .layers triggers an implicit copy.
adata = adata[adata.obs.pct_counts_mt <= 10, :].copy()

In [None]:
# normalization
# Statement order matters here: the count matrix is saved first, X is then
# normalized and transformed in place, and only afterwards snapshotted to .raw.
adata.layers["counts"] = adata.X.copy()  # filtered counts, before any transformation
sc.pp.normalize_total(adata)  # per-cell total-count normalization with scanpy's default target
sc.pp.sqrt(adata,chunked=True,chunk_size=10000)  # square-root transform, processed with chunk_size=10000
adata.raw = adata  # .raw therefore holds the normalized, sqrt-transformed values (not counts)

In [None]:

# calc embeddings for batch corrected
# Pipeline: PCA (100 components) -> bbknn neighbor graph keyed on 'batch'
# -> Leiden clustering -> UMAP.
# NOTE: the cell-type annotation cell hard-codes the Leiden cluster ids
# produced here, so changing any parameter below (or the library versions)
# requires re-deriving that mapping.
start = time.time()  # recorded, but the elapsed time is not reported in this cell
print('starting embeddings...')
sc.tl.pca(adata,n_comps=100)
#sc.external.pp.bbknn(adata,batch_key='batch')
bbknn(adata,batch_key='batch') #pip install bbknn
#     sc.pp.neighbors(adata, n_neighbors=100, n_pcs=100)
sc.tl.leiden(adata,resolution=3) #pip install python-igraph #pip install louvain
sc.tl.umap(adata)

In [None]:
# UMAP coloured by genotype and by timepoint (one panel per key).
sc.pl.umap(adata, color = ['genotype','timepoint'])

In [None]:
# Display the AnnData summary.
adata

In [None]:
# Cells per genotype and per timepoint, separated by one blank line.
genotype_counts = adata.obs['genotype'].value_counts()
timepoint_counts = adata.obs['timepoint'].value_counts()
print(genotype_counts, '', timepoint_counts, sep='\n')

In [None]:
# Earlier, broader marker panel, kept commented out for reference:
# markers = {
#     'Neuron':['Rbfox3','Snap25'],
#     'Astrocyte':['Slc1a2','Slc1a3','Agt','Aldh1l1','Gfap', 'Slc7a10','Aqp4'],
#     'OPC':['Pdgfra','Cspg4','Olig1','Olig2',], 
#     'COP':['Gpr17','Fyn','Tcf7l2'],
#     'OL' : ['Plp1','Mag','Mog','Opalin','Mbp','Mobp','Hapln2','Dpy19l1','Cyp27a1','Rab37','Klk6'],
#     'Microglia':['Cx3cr1','P2ry12','C1qb','Ctss'], #'Trem2','Aif1','Ptprc'
#     'macrophage':['Mrc1', 'Cd74','H2-Ab1'],
#     't cells':['Ptprc','Ms4a4b', 'Cd52','Nkg7','Cd3g'],
#     'Menninge':['Dcn','Col3a1'], #'Slc47a1','Bicc1','Pdzrn3','Col25a1'
#     'Ependymal':['Cfap43','Dnah12',], #'Foxj1', 'Rarres2', 'Ccdc153','Tmem212','Tm4sf1','Mia','Acta2'
#     'Pericytes':['Abcc9','Notch3'],
#     'Endothelial cell':['Flt1','Ly6c1','Cldn5'],

# }

# Active marker panel: cell-type label -> marker genes, passed to the dotplot
# below to guide the manual annotation of Leiden clusters.
# NOTE(review): this panel lists 'Dcn' under 'Endothelial' and 'Flt1' under
# 'Pericytes', whereas the commented-out panel above lists 'Dcn' under
# 'Menninge' and 'Flt1' under 'Endothelial cell' -- confirm which is intended.
markers = {'Granule cell':['Gabra6','Slc17a7'],
                'DCN':['Slc17a6'],
                'UBC':['Eomes'],
                'Purkinje cell':['Atp2a3','Calb1','Car8','Ppp1r17','Slc1a6'],
                'MLI1':['Ptprk','Adgrl3'],
                'MLI2':['Nxph1','Cdh22'],
                'Golgi':['Gad1','Gad2','Lgi2','Nrg1','Pax2'],
                'Astrocyte':['Aldh1l1','Aqp4','Slc1a3'],
                'Bergmann glia':['Gdf10','Hopx','Timp4'],
                'OPC':['Pdgfra','Cspg4','Olig1','Olig2',],
                'OL':['Hapln2','Mag','Mog','Opalin'],
                'Microglia':['C1qb','Cx3cr1','Dock2','P2ry12'],
                'Pericytes':['Flt1','Pdgfrb','Rgs5'],
                'Endothelial':['Dcn','Lum']} 

In [None]:
# Marker expression per Leiden cluster; standard_scale='var' rescales each
# gene, and dendrogram=True orders clusters by the cluster dendrogram.
#sc.tl.dendrogram(adata, groupby ='leiden')
sc.pl.dotplot(adata, markers, groupby = 'leiden', dendrogram = True, standard_scale='var')

In [None]:
# UMAP coloured by Leiden cluster id (used to pick the ids annotated below).
sc.pl.umap(adata, color = ['leiden'])

In [None]:
# annotate cell type
# Manual assignment of Leiden cluster ids (strings) to cell-type labels.
pc = ['38']
gc = ['18', '9', '0', '8', '19', '48', '2', '27', '1', '4', '11', '24', '20', '3',
      '12', '13', '30', '6', '7', '32', '5', '23', '17', '10', '15', '28', '14', '22']
dcn = ['37']
ubc = ['44']
mli1 = ['21']
mli2 = ['39']
goc = ['45']
ast = ['34', '26', '43']
bg = ['35', '16']
opc = ['46']
ol = ['25', '31']
mg = ['47', '50']
per = ['36', '51']
end = ['40']
other = ['49', '33', '42', '29', '41']

# (label, cluster ids) in lookup priority order: if an id were listed twice,
# the first label in this sequence wins.
label_groups = (
    ('PC', pc), ('GC', gc), ('DCN', dcn), ('UBC', ubc),
    ('MLI1', mli1), ('MLI2', mli2), ('GoC', goc), ('AS', ast),
    ('BG', bg), ('OPC', opc), ('OL', ol), ('MG', mg),
    ('PER', per), ('END', end), ('Other', other),
)
label_by_cluster = {}
for label, cluster_ids in label_groups:
    for cluster_id in cluster_ids:
        label_by_cluster.setdefault(cluster_id, label)

# One label per cell; an unlisted cluster id aborts the annotation.
ctype = []
for cluster_id in adata.obs['leiden']:
    if cluster_id not in label_by_cluster:
        raise ValueError('Encountered unclassifiable cell type for sample {}'.format(cluster_id))
    ctype.append(label_by_cluster[cluster_id])
adata.obs['ctype'] = ctype

In [None]:
# UMAP coloured by the assigned cell-type label.
sc.pl.umap(adata, color = ['ctype'])

In [None]:
# Number of cells per assigned cell-type label (including 'Other').
print(adata.obs['ctype'].value_counts())

In [None]:
# Remove cells labelled 'Other'. .copy() materializes the subset (as done for
# the doublet filter) so later writes to .uns/.obs do not hit a view.
adata = adata[adata.obs['ctype']!='Other',:].copy()

In [None]:
# Fixed palettes. Dict order defines both the category order in .obs and the
# colour order stored in .uns['<key>_colors'].
cmap_ctype = {
    'GC': '#FAC18A',
    'DCN': '#AAABAB',
    'UBC': '#BA61BA',
    'PC': '#EE5264',
    'MLI1': '#F9EBAE',
    'MLI2': '#88BB92',
    'GoC': '#46A928',
    'AS': '#F9AEAE',
    'BG': '#AEB7F9',
    'OPC': '#F1815F',
    'OL': '#75A3B7',
    'MG': '#AC5861',
    'PER': '#2D284B',
    'END': '#1C67EE',
}

cmap_genotype = {
    'SCA1-fl/+': '#4683B5',
    'SCA1-fl/NG2-Cre': '#FFA600',
}

# For each annotated column: store the colours, then reorder the (categorical)
# column so its categories follow the palette order.
palette_specs = (
    ('ctype', 'ctype_colors', cmap_ctype),
    ('genotype', 'genotype_colors', cmap_genotype),
)
for obs_key, uns_key, palette in palette_specs:
    adata.uns[uns_key] = np.array(list(palette.values()), dtype=object)
    adata.obs[obs_key] = adata.obs[obs_key].cat.reorder_categories(
        new_categories=list(palette), ordered=True)

In [None]:
# Recompute the UMAP layout after the 'Other' cells were removed.
# NOTE(review): the neighbor graph is not rebuilt in this cell -- presumably
# the subset graph from the earlier bbknn call is reused; confirm.
sc.tl.umap(adata)

In [None]:
# Final UMAPs; `save` is a filename suffix and the files go to sc.settings.figdir.
sc.pl.umap(adata, color = 'genotype',
           save = '_250414_OL-SCA-cKI_genotype.pdf'
          )
# Date prefix corrected from '240414' to '250414' so both figures share the
# date stamp used by the other outputs of this analysis.
sc.pl.umap(adata, color = 'ctype',
           save = '_250414_OL-SCA-cKI_ctype.pdf'
          )

In [None]:
# save data objects
# Write the annotated object to the data directory and log a timestamp.
h5ad_path = os.path.join(pdfp, '250414_OL-SCA1-cKI.h5ad')
adata.write(h5ad_path)
saved_at = datetime.datetime.now().strftime('%y%m%d.%H:%M:%S')
print('saved @'+saved_at)

In [None]:
# Reload the saved object from disk, measuring peak memory (%memit, from the
# memory_profiler extension) and wall-clock load time.
# `if True` is an always-on toggle; set it to False to skip the reload.
if True :
    start = time.time()
    backed=None # None if not
    fname='250414_OL-SCA1-cKI.h5ad' # for full, can maybe get away with ~300G
    %memit adata = sc.read_h5ad(os.path.join(pdfp,fname),backed=backed)
    print('loaded @'+datetime.datetime.now().strftime('%y%m%d.%H:%M:%S'))
    print('took {:.2f}-s to load data'.format(time.time()-start))

In [None]:
# Cell-type composition per sample: cells of each type in each (genotype, batch)
# as a percentage of all cells in that (genotype, batch).
#
# The grouping columns are categorical after the reload, so `observed` is set
# explicitly instead of relying on a default that differs across pandas versions:
#   * totals use observed=True  -> only genotype/batch pairs that actually exist;
#   * counts use observed=False -> cell types absent from a sample are kept as 0.
# The inner merge then discards impossible genotype/batch pairs, which would
# otherwise appear as rows with 0 cells and a NaN (0/0) proportion.
cell_counts = adata.obs.groupby(['ctype','genotype','batch'], observed=False).size().reset_index(name='cell_count')
total_counts = adata.obs.groupby(['genotype','batch'], observed=True).size().reset_index(name='total_cells')
cell_counts = cell_counts.merge(total_counts, on=['genotype','batch'])
cell_counts['proportion(%)'] = cell_counts['cell_count']*100 / cell_counts['total_cells']
cell_counts = cell_counts.sort_values(by=['ctype', 'genotype','batch'])
cell_counts
cell_counts.to_csv(os.path.join(pfp,'250415_OL-SCA1-cKI_ctype proportion.csv'),index=False)

In [None]:
# Per-cell-type QC figures: violin plot of n_genes and bar plot of total_counts.
# The palette is redefined here so this cell also runs right after reloading
# the saved object.
cmap_ctype = {
    'GC': '#FAC18A',
    'DCN': '#AAABAB',
    'UBC': '#BA61BA',
    'PC': '#EE5264',
    'MLI1': '#F9EBAE',
    'MLI2': '#88BB92',
    'GoC': '#46A928',
    'AS': '#F9AEAE',
    'BG': '#AEB7F9',
    'OPC': '#F1815F',
    'OL': '#75A3B7',
    'MG': '#AC5861',
    'PER': '#2D284B',
    'END': '#1C67EE',
}


def _hide_legend_and_tilt_xticks(axis):
    """Hide the hue legend and rotate the x tick labels by 45 degrees."""
    axis.legend().set_visible(False)
    axis.set_xticklabels(axis.get_xticklabels(), rotation=45, ha='right')


# Arguments shared by both plots: one group per cell type, coloured by palette.
by_ctype = dict(x='ctype', data=adata.obs, hue='ctype',
                palette=cmap_ctype, dodge=False)

# --- genes detected per cell ---
fig, ax = plt.subplots(1, 1, figsize=(4, 2))
sns.violinplot(y='n_genes', ax=ax, linewidth=1, **by_ctype)
# ax.set_yscale('log')
_hide_legend_and_tilt_xticks(ax)
ax.set_xlabel('')
#ax.set_style("whitegrid")
fig.savefig(os.path.join(pfp, '250414_n_genes_by_ctype.pdf'))

# --- UMI counts per cell ---
fig, ax = plt.subplots(1, 1, figsize=(4, 2))
sns.barplot(y='total_counts', ax=ax,
            capsize=.2,
            linewidth=1,
            edgecolor='black',
            errwidth=1,
#           order=order_human
            **by_ctype)
# ax.set_ylim([0, 50000])
# ax.set_yscale('log')
_hide_legend_and_tilt_xticks(ax)
#ax.set_style("whitegrid")

fig.savefig(os.path.join(pfp, '250414_umi_count_by_ctype.pdf'))