In [1]:
import pandas as pd
import numpy as np
import os
import sys
import scanpy as sc
import anndata as ad
import scipy
import gget
from sklearn.decomposition import PCA

import matplotlib.pyplot as plt
import seaborn as sns
from mpl_toolkits.axes_grid1.axes_divider import make_axes_locatable

# local 
import utils as ut

In [2]:
# load predictions
fpath = "/nfs/turbo/umms-indikar/shared/projects/MC3R/hypomap/merged_predictions.csv"
df = pd.read_csv(fpath)
print(f"{df.shape=}")

for c in df.columns:
    if not c == 'index':
        df[f'{c}_clean'] = df[c].apply(lambda x: x.split(":")[1]).astype(str)


# load clustering results
fpath = "/nfs/turbo/umms-indikar/shared/projects/MC3R/results/embedding.csv"
cdf = pd.read_csv(fpath)
print(f"{cdf.shape=}")

# merge 'em'
df = pd.merge(df, cdf, 
              how='left',
              left_on='index',
              right_on='index')


# clean-up the cell type
df = df.set_index('index')


print(f"{df.shape=}")
df.head()

df.shape=(43312, 5)
cdf.shape=(43312, 3)
df.shape=(43312, 10)


Unnamed: 0_level_0,reference_embedding_C7_named,reference_embedding_C25_named,C25_named,C7_named,reference_embedding_C7_named_clean,reference_embedding_C25_named_clean,C25_named_clean,C7_named_clean,UMAP1,UMAP2
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
AAACCCACATGGCTGC-1_fasted_m,C7-2: GABA,C25-11: GABA-1,C25-10: GABA-2,C7-2: GABA,GABA,GABA-1,GABA-2,GABA,-6.74196,14.08305
AAACCCAGTATATGGA-1_fasted_m,C7-2: GABA,C25-13: GABA-4,C25-1: GLU-1,C7-2: GABA,GABA,GABA-4,GLU-1,GABA,2.378481,2.956433
AAACCCAGTCAACACT-1_fasted_m,C7-5: Immune,C25-21: Immune,C25-21: Immune,C7-5: Immune,Immune,Immune,Immune,Immune,1.525248,-6.143299
AAACCCATCCCGTGAG-1_fasted_m,C7-4: Oligo+Precursor,C25-19: Oligodendrocytes,C25-19: Oligodendrocytes,C7-4: Oligo+Precursor,Oligo+Precursor,Oligodendrocytes,Oligodendrocytes,Oligo+Precursor,15.965842,14.408034
AAACCCATCGAACTCA-1_fasted_m,C7-1: GLU,C25-2: GLU-2,C25-2: GLU-2,C7-1: GLU,GLU,GLU-2,GLU-2,GLU,-2.045243,7.478702


In [3]:
dirpath = "/nfs/turbo/umms-indikar/shared/projects/MC3R/h5ad_files/"

data = {}

for f in os.listdir(dirpath):
    fullpath = f"{dirpath}{f}"
    key = f.replace(".h5ad", "")

    batch_andata = sc.read(fullpath)
    batch_andata.obs['batch'] = key
    data[key] = batch_andata
    
adata = ad.concat(data, index_unique="_") # combine all the experiments
adata.var_names_make_unique()
adata

AnnData object with n_obs × n_vars = 43312 × 32285
    obs: 'batch'

In [4]:
# add the metadata into the object
adata.obs = adata.obs.join(df, 
                           how='left')

adata.obs['Diet'] = adata.obs['batch'].apply(lambda x: x.split("_")[0])
adata.obs['Sex'] = adata.obs['batch'].apply(lambda x: x.split("_")[1])

adata.obsm['X_umap'] = adata.obs[['UMAP1', 'UMAP2']].copy().to_numpy()

adata.obs.head()

Unnamed: 0,batch,reference_embedding_C7_named,reference_embedding_C25_named,C25_named,C7_named,reference_embedding_C7_named_clean,reference_embedding_C25_named_clean,C25_named_clean,C7_named_clean,UMAP1,UMAP2,Diet,Sex
AAACCCACATGGCTGC-1_fasted_m,fasted_m,C7-2: GABA,C25-11: GABA-1,C25-10: GABA-2,C7-2: GABA,GABA,GABA-1,GABA-2,GABA,-6.74196,14.08305,fasted,m
AAACCCAGTATATGGA-1_fasted_m,fasted_m,C7-2: GABA,C25-13: GABA-4,C25-1: GLU-1,C7-2: GABA,GABA,GABA-4,GLU-1,GABA,2.378481,2.956433,fasted,m
AAACCCAGTCAACACT-1_fasted_m,fasted_m,C7-5: Immune,C25-21: Immune,C25-21: Immune,C7-5: Immune,Immune,Immune,Immune,Immune,1.525248,-6.143299,fasted,m
AAACCCATCCCGTGAG-1_fasted_m,fasted_m,C7-4: Oligo+Precursor,C25-19: Oligodendrocytes,C25-19: Oligodendrocytes,C7-4: Oligo+Precursor,Oligo+Precursor,Oligodendrocytes,Oligodendrocytes,Oligo+Precursor,15.965842,14.408034,fasted,m
AAACCCATCGAACTCA-1_fasted_m,fasted_m,C7-1: GLU,C25-2: GLU-2,C25-2: GLU-2,C7-1: GLU,GLU,GLU-2,GLU-2,GLU,-2.045243,7.478702,fasted,m


In [5]:
"""some simple processing """

adata.layers["counts"] = adata.X.copy() # store the raw counts

""" Add a metadata column """
adata.obs['broad_type'] = adata.obs['reference_embedding_C25_named_clean'].apply(lambda x: x.split("-")[0])

sc.pp.filter_cells(adata, min_genes=500)
sc.pp.filter_genes(adata, min_cells=10)
sc.pp.calculate_qc_metrics(adata)

sc.pp.normalize_total(adata, target_sum=1e4)
sc.pp.log1p(adata)
# sc.pp.combat(adata) # regress out batch effects

sc.pp.highly_variable_genes(adata, 
                            min_mean=0.0125, 
                            max_mean=3, 
                            min_disp=0.5)

sc.tl.pca(adata, n_comps=25)

adata

AnnData object with n_obs × n_vars = 42937 × 25060
    obs: 'batch', 'reference_embedding_C7_named', 'reference_embedding_C25_named', 'C25_named', 'C7_named', 'reference_embedding_C7_named_clean', 'reference_embedding_C25_named_clean', 'C25_named_clean', 'C7_named_clean', 'UMAP1', 'UMAP2', 'Diet', 'Sex', 'broad_type', 'n_genes'
    var: 'n_cells', 'highly_variable', 'means', 'dispersions', 'dispersions_norm'
    uns: 'log1p', 'hvg', 'pca'
    obsm: 'X_umap', 'X_pca'
    varm: 'PCs'
    layers: 'counts'

In [6]:
"""A metdata columns for a few genes """

label_genes = [
    'Sun1',
    'Mc3r',
    'Agrp',
    'Pomc',
    'Lepr',
]

[print(x) for x in label_genes if not x in adata.var_names]

for gene in label_genes:
    mask = np.ravel(adata[:, [gene]].layers['counts'].todense() > 0)
    adata.obs[gene] = np.where(mask, 1, 0)
    adata.obs[gene] = adata.obs[gene].astype('category')
    print(f"-------- {gene} --------")
    print(adata.obs[gene].value_counts())
    print()

print('Done')

-------- Sun1 --------
1    21714
0    21223
Name: Sun1, dtype: int64

-------- Mc3r --------
0    42175
1      762
Name: Mc3r, dtype: int64

-------- Agrp --------
0    25432
1    17505
Name: Agrp, dtype: int64

-------- Pomc --------
0    31697
1    11240
Name: Pomc, dtype: int64

-------- Lepr --------
0    35836
1     7101
Name: Lepr, dtype: int64

Done


In [7]:
key = 'Diet'
# get marker genes
sc.tl.rank_genes_groups(adata, 
                        groupby=key,
                        method='wilcoxon',
                        key_added=key,
                        corr_method='benjamini-hochberg')

deg = sc.get.rank_genes_groups_df(adata, 
                                  group='fasted',
                                  key=key)

deg.head()

... storing 'batch' as categorical
... storing 'reference_embedding_C7_named' as categorical
... storing 'reference_embedding_C25_named' as categorical
... storing 'C25_named' as categorical
... storing 'C7_named' as categorical
... storing 'reference_embedding_C7_named_clean' as categorical
... storing 'reference_embedding_C25_named_clean' as categorical
... storing 'C25_named_clean' as categorical
... storing 'C7_named_clean' as categorical
... storing 'Diet' as categorical
... storing 'Sex' as categorical
... storing 'broad_type' as categorical


Unnamed: 0,names,scores,logfoldchanges,pvals,pvals_adj
0,Srsf10,55.261013,0.84913,0.0,0.0
1,Gm26871,52.240036,0.927264,0.0,0.0
2,Npy,49.007404,2.572269,0.0,0.0
3,Peg3,48.906406,0.622618,0.0,0.0
4,Gabrb3,45.597534,0.632454,0.0,0.0


In [8]:
alpha = 0.05
n_print = 20
pdf = deg.copy()
pdf = pdf.sort_values(by='logfoldchanges', ascending=False)
pdf = pdf[pdf['pvals_adj'] <= alpha]

print(f"----- Upregulated genes in fasted cells -----")
print(pdf[['names', 'logfoldchanges']].head(n_print))

print()
print(f"----- Downregulated genes in fasted cells -----")
print(pdf[['names', 'logfoldchanges']].tail(n_print))


----- Upregulated genes in fasted cells -----
        names  logfoldchanges
5436  Slco1a1        6.442300
2087   Atp1a4        3.649169
6071     Hao1        3.141992
6201  Olfr920        3.001123
2         Npy        2.572269
3172    Meiob        2.544060
4555  Gm47214        2.501366
2346   Cdkn1a        2.418248
4539  Gm31456        2.405923
3771  Gm20647        2.302967
5466    Ccl17        2.301496
5792    Tigit        2.240003
6099  Gm49735        2.205239
6251    Tex35        2.151994
681     Casq1        2.130793
6862  Gm10280        2.045630
1679    Corin        2.039736
5847  Gm13544        2.028910
1040  Gm44577        1.928171
4755    Itgad        1.839876

----- Downregulated genes in fasted cells -----
               names  logfoldchanges
23842        Gm13905       -1.552601
23607          Gpr17       -1.559155
24990           Gjc3       -1.565940
24382        Gm10863       -1.579214
24611          Lpar1       -1.588886
23845        Gm48370       -1.590613
23951        Gm1

# Load ISO Data

In [20]:
gene_path = "/nfs/turbo/umms-indikar/shared/projects/MC3R/isoforms/mc3r_iso/references/transcript_table.tsv"

gf = pd.read_csv(gene_path, sep='\t')
gf.head()

Unnamed: 0,transcript_id,gene_id,gene_name,gene_biotype
0,ENSMUST00000194081.2,ENSMUSG00000104478,Gm38212,TEC
1,ENSMUST00000194393.2,ENSMUSG00000104385,Gm7449,processed_pseudogene
2,ENSMUST00000185509.2,ENSMUSG00000101231,Gm28283,processed_pseudogene
3,ENSMUST00000194605.2,ENSMUSG00000102135,Gm37108,processed_pseudogene
4,ENSMUST00000191703.2,ENSMUSG00000103282,Gm37275,processed_pseudogene


In [24]:
data_dir = "/nfs/turbo/umms-indikar/shared/projects/MC3R/isoforms/mc3r_iso/salmon"

df = []

for subdir in os.listdir(data_dir):
    cond, celltype = subdir.split("_")
    diet = cond[:-1]
    sex = cond[-1]
    fpath = os.path.sep.join([data_dir, subdir, 'quant.sf'])
    tmp = pd.read_csv(fpath, sep='\t')
    tmp['sex'] = sex
    tmp['diet'] = diet
    tmp['cellType'] = celltype
    df.append(tmp)

df = pd.concat(df)
print(f"{df.shape=}")

df = pd.merge(df, gf, how='left',
               left_on='Name',
               right_on='transcript_id')

df['gene_reads'] = df.groupby(['sex', 'diet', 'cellType', 'gene_id'])['NumReads'].transform('sum')
df['n_transcripts'] = df.groupby(['sex', 'diet',  'cellType', 'gene_id'])['Name'].transform('nunique')

print(f"{df.shape=}")
df.head()

df.shape=(1858848, 8)
df.shape=(1858848, 14)


Unnamed: 0,Name,Length,EffectiveLength,TPM,NumReads,sex,diet,cellType,transcript_id,gene_id,gene_name,gene_biotype,gene_reads,n_transcripts
0,ENSMUST00000196221.2,9,1.818,0.0,0.0,m,fed,oligoprecursor,ENSMUST00000196221.2,ENSMUSG00000096749,Trdd1,TR_D_gene,0.0,2.0
1,ENSMUST00000179664.2,11,1.954,0.0,0.0,m,fed,oligoprecursor,ENSMUST00000179664.2,ENSMUSG00000096749,Trdd1,TR_D_gene,0.0,2.0
2,ENSMUST00000177564.2,16,2.119,0.0,0.0,m,fed,oligoprecursor,ENSMUST00000177564.2,ENSMUSG00000096176,Trdd2,TR_D_gene,0.0,1.0
3,ENSMUST00000178537.2,12,2.002,0.0,0.0,m,fed,oligoprecursor,ENSMUST00000178537.2,ENSMUSG00000095668,Trbd1,TR_D_gene,0.0,1.0
4,ENSMUST00000178862.2,14,2.071,0.0,0.0,m,fed,oligoprecursor,ENSMUST00000178862.2,ENSMUSG00000094569,Trbd2,TR_D_gene,0.0,1.0


In [85]:
ct = 'glu'

query_genes = ['Foxp2']

cdf = df[df['cellType'] == ct]

# cdf = cdf[cdf['gene_name'].isin(query_genes)]
cdf = pd.pivot_table(cdf, index=['Name', 'gene_name'], 
                     columns=['diet'],
                     values='TPM').reset_index(drop=False)

cdf['log2foldchange_tx'] = np.log2((cdf['fasted'] + 1) / (cdf['fed'] + 1))

cdf['fed_reads'] = cdf.groupby('gene_name')['fed'].transform('sum')
cdf['fasted_reads'] = cdf.groupby('gene_name')['fasted'].transform('sum')

cdf['log2foldchange_gx'] = np.log2((cdf['fasted_reads'] + 1) / (cdf['fed_reads'] + 1))

cdf.head()

diet,Name,gene_name,fasted,fed,log2foldchange_tx,fed_reads,fasted_reads,log2foldchange_gx
0,ENSMUST00000000001.5,Gnai3,8.045216,9.143902,-0.165386,9.143902,8.045216,-0.165386
1,ENSMUST00000000003.14,Pbsn,0.0,0.0,0.0,0.0,0.0,0.0
2,ENSMUST00000000010.9,Hoxb9,0.0,0.0,0.0,0.0,0.0,0.0
3,ENSMUST00000000028.14,Cdc45,0.4822,0.550263,-0.064773,0.580306,0.571745,-0.007837
4,ENSMUST00000000033.12,Igf2,0.149939,0.09263,0.073751,0.252068,0.149939,-0.122757


In [86]:
test = cdf[cdf['log2foldchange_gx'].abs() < 0.25]
print(f"{test.shape=}")

test = test.sort_values(by='log2foldchange_tx', ascending=False)
test.head(20)


test.shape=(89844, 8)


diet,Name,gene_name,fasted,fed,log2foldchange_tx,fed_reads,fasted_reads,log2foldchange_gx
114652,ENSMUST00000238271.2,Calm2,63.584063,0.0,6.013106,424.192488,504.155276,0.248611
107113,ENSMUST00000227737.2,App,42.378974,1.070325,4.389067,230.348632,221.056942,-0.059139
88008,ENSMUST00000201727.4,Tbc1d19,14.898556,0.0,3.990824,64.120942,74.713161,0.217423
54376,ENSMUST00000147430.8,Tpm3,14.857665,0.0,3.987108,161.297057,164.471848,0.027949
69564,ENSMUST00000169961.3,Ccdc82,9.465868,0.0,3.38762,150.047326,145.86484,-0.040512
62665,ENSMUST00000159300.8,Slf1,9.125018,0.0,3.339853,91.579898,97.020482,0.082384
59589,ENSMUST00000153700.9,Oaz2,8.883006,0.0,3.30495,37.551007,34.862938,-0.104275
70074,ENSMUST00000170862.8,Csnk1a1,7.762659,0.0,3.131369,146.275339,144.859989,-0.013932
92046,ENSMUST00000207428.2,Pet100,7.246181,0.0,3.043726,703.439372,729.368382,0.052149
106430,ENSMUST00000226851.2,Ywhaz,18.08144,1.324051,3.037456,93.891284,105.541721,0.167071


In [87]:
# db = 'GO_Biological_Process_2023'
db = 'GO_Biological_Process_2021'
# db = 'KEGG_2021_Human'
# db = 'TRANSFAC_and_JASPAR_PWMs'

e_alpha = 0.05
n_query = 100

query_genes = test['gene_name'].head(n_query).unique()

edf = gget.enrichr(query_genes, database=db)
pd.set_option('display.max_colwidth', 150)
print(edf[['path_name', 'adj_p_val', 'overlapping_genes']].head(30))
print()

Thu Oct 19 16:37:20 2023 INFO Performing Enichr analysis using database GO_Biological_Process_2021.


                                                                                                                    path_name  \
0                                                                                                 macroautophagy (GO:0016236)   
1                                                                                                phosphorylation (GO:0016310)   
2                                                                                  spliceosomal complex assembly (GO:0000245)   
3                                                                                         plasma membrane repair (GO:0001778)   
4                                                                                    polyamine metabolic process (GO:0006595)   
5                                                       regulation of alternative mRNA splicing, via spliceosome (GO:0000381)   
6                                  positive regulation of establishment of protein localization t

In [64]:
test2 = cdf[cdf['gene_name'] == 'App']
test2

diet,Name,gene_name,fasted,fed,log2foldchange_tx,fed_reads,fasted_reads,log2foldchange_gx
106390,ENSMUST00000226801.2,App,0.0,0.287413,-0.364474,230.348632,221.056942,-0.059139
107705,ENSMUST00000228509.2,App,11.358799,10.128483,0.151282,230.348632,221.056942,-0.059139
107608,ENSMUST00000228375.2,App,0.559773,0.879367,-0.268912,230.348632,221.056942,-0.059139
105938,ENSMUST00000226232.2,App,0.0,0.0,0.0,230.348632,221.056942,-0.059139
735,ENSMUST00000005406.12,App,79.01666,82.178324,-0.055907,230.348632,221.056942,-0.059139
107044,ENSMUST00000227654.2,App,60.697626,111.940214,-0.872272,230.348632,221.056942,-0.059139
107307,ENSMUST00000227990.2,App,2.09847,1.92949,0.080906,230.348632,221.056942,-0.059139
106563,ENSMUST00000227021.2,App,0.361027,0.205599,0.174945,230.348632,221.056942,-0.059139
107113,ENSMUST00000227737.2,App,42.378974,1.070325,4.389067,230.348632,221.056942,-0.059139
107102,ENSMUST00000227723.2,App,20.140481,18.63449,0.106618,230.348632,221.056942,-0.059139


In [9]:
break

SyntaxError: 'break' outside loop (668683560.py, line 1)

In [31]:
important_genes = ['Mc3r', "Agrp", "Npy", "Nr5a1", 
    "Pomc", "Mc4r", "Gnrhr", "Gnrh1", "Ghrh", 
    "Ghrhr", "Sst", "Esr1","Lepr", "Pnoc",
    "Adcyap1","Foxp2", "Dlk1","Kiss1", "Tac2", 
    "Pdyn", "Rprm","Oxtr","Drd2","Gria3", 
    "Grik1", "Grik3", "Gabra5", "Gabra2", 
    "Syndig1","Pak3","Ptk2b", "Plk2",
    "Trh","Glp1r",
]


In [None]:
break

In [None]:
break