In [1]:
import pandas as pd
import numpy as np
import scanpy as sc
import sys
import os
import pysam
import pyranges as pr
import rapidfuzz

import matplotlib.pyplot as plt
import seaborn as sns
from mpl_toolkits.axes_grid1 import make_axes_locatable
from matplotlib.colors import LogNorm, Normalize

import utils as ut

In [2]:
# load marker gene sets
fpath = "/nfs/turbo/umms-indikar/shared/projects/spatial_transcriptomics/data/panglaodb/pandb.tsv.gz"
celltypes = [
    'Hematopoietic stem cells',
    'Fibroblasts',
]
cell_names = [
    'Fib',
    'HSC'
]
markers = ut.get_pangloa(fpath, celltypes, cell_names)
print(f"{markers.shape=}")
markers.head()

markers.shape=(263, 3)


Unnamed: 0,gene_name,Fib,HSC
0,ABCB1,0.0,1.0
1,ABCG2,0.0,1.0
2,ABI3,1.0,0.0
3,ACE,0.0,1.0
4,ACKR3,1.0,0.0


In [3]:
# load the GTF
gtf_path = "/scratch/indikar_root/indikar1/cstansbu/HSC/references/annotations.gtf"
gf = pr.read_gtf(gtf_path)
gdf = gf.df
print(f"{gdf.shape=}")
print(gdf.columns)
gdf.head()

gdf.shape=(3371244, 26)
Index(['Chromosome', 'Source', 'Feature', 'Start', 'End', 'Score', 'Strand',
       'Frame', 'gene_id', 'gene_version', 'gene_name', 'gene_source',
       'gene_biotype', 'transcript_id', 'transcript_version',
       'transcript_name', 'transcript_source', 'transcript_biotype', 'tag',
       'ccds_id', 'exon_number', 'exon_id', 'exon_version', 'protein_id',
       'protein_version', 'transcript_support_level'],
      dtype='object')


Unnamed: 0,Chromosome,Source,Feature,Start,End,Score,Strand,Frame,gene_id,gene_version,...,transcript_source,transcript_biotype,tag,ccds_id,exon_number,exon_id,exon_version,protein_id,protein_version,transcript_support_level
0,1,ensembl_havana,gene,1471764,1497848,.,+,.,ENSG00000160072,20,...,,,,,,,,,,
1,1,ensembl_havana,transcript,1471764,1497848,.,+,.,ENSG00000160072,20,...,ensembl_havana,protein_coding,basic,CCDS30,,,,,,
2,1,ensembl_havana,exon,1471764,1472089,.,+,.,ENSG00000160072,20,...,ensembl_havana,protein_coding,basic,CCDS30,1.0,ENSE00003889014,1.0,,,
3,1,ensembl_havana,CDS,1471884,1472089,.,+,0,ENSG00000160072,20,...,ensembl_havana,protein_coding,basic,CCDS30,1.0,,,ENSP00000500094,1.0,
4,1,ensembl_havana,start_codon,1471884,1471887,.,+,0,ENSG00000160072,20,...,ensembl_havana,protein_coding,basic,CCDS30,1.0,,,,,


In [4]:
# build the version ids
tdf = gdf[gdf['Feature'] == 'transcript']

columns = [
    'gene_name',
    'gene_biotype',
    'Chromosome', 
    'Start', 
    'End',
    'transcript_name', 
    'transcript_biotype',
    'transcript_id',
    'transcript_version',
]

tdf = tdf[columns].drop_duplicates()
tdf['tid'] = tdf['transcript_id'] + "." + tdf['transcript_version']
tdf['transcript_biotype_clean'] = tdf['transcript_biotype'].apply(lambda x: x.replace("_", " "))
print(f"{tdf.shape=}")
tdf.head()

tdf.shape=(251121, 11)


Unnamed: 0,gene_name,gene_biotype,Chromosome,Start,End,transcript_name,transcript_biotype,transcript_id,transcript_version,tid,transcript_biotype_clean
1,ATAD3B,protein_coding,1,1471764,1497848,ATAD3B-206,protein_coding,ENST00000673477,1,ENST00000673477.1,protein coding
38,ATAD3B,protein_coding,1,1478025,1497848,ATAD3B-203,retained_intron,ENST00000472194,6,ENST00000472194.6,retained intron
53,ATAD3B,protein_coding,1,1479048,1482662,ATAD3B-202,processed_transcript,ENST00000378736,3,ENST00000378736.3,processed transcript
58,ATAD3B,protein_coding,1,1483484,1496202,ATAD3B-205,retained_intron,ENST00000485748,5,ENST00000485748.5,retained intron
69,ATAD3B,protein_coding,1,1484568,1496201,ATAD3B-204,retained_intron,ENST00000474481,1,ENST00000474481.1,retained intron


In [5]:
# load the nanocount data
fpath = "/scratch/indikar_root/indikar1/cstansbu/HSC/nanocount/merged/merged_tx_counts.tsv"
df = pd.read_csv(fpath, sep='\t')
print(f"{df.shape=}")
df = df.rename(columns={'transcript_name' : 'tid'})
df.head()

df.shape=(178136, 5)


Unnamed: 0,tid,raw,est_count,tpm,transcript_length
0,ENST00000368719.8,0.029819,1424769.0,29819.135395,665
1,ENST00000245185.5,0.019882,949983.9,19882.314105,786
2,ENST00000331825.10,0.017566,839294.0,17565.673653,878
3,ENST00000501597.3,0.016534,790011.0,16534.224764,469
4,ENST00000496817.5,0.015537,742382.4,15537.400447,673


In [6]:
# merge 'em
pdf = pd.merge(df, tdf, 
               how='left',
               left_on='tid',
               right_on='tid')

# drop unnamed genes
pdf = pdf[pdf['gene_name'].notna()]
pdf = pdf.sort_values(by=['gene_name', 'transcript_name'])

# add percent of total transcription
pdf['n_isoforms'] = pdf.groupby('gene_name')['transcript_name'].transform('nunique')
pdf['gene_tpm'] = pdf.groupby('gene_name')['tpm'].transform('sum')
pdf['tx_prc'] = pdf['tpm'] / pdf['gene_tpm']
pdf['tx_prc'] = pdf['tx_prc'].fillna(0)

print(f"{pdf.shape=}")
pdf.head()

pdf.shape=(127608, 18)


Unnamed: 0,tid,raw,est_count,tpm,transcript_length,gene_name,gene_biotype,Chromosome,Start,End,transcript_name,transcript_biotype,transcript_id,transcript_version,transcript_biotype_clean,n_isoforms,gene_tpm,tx_prc
1088,ENST00000595014.1,8.041462e-05,3842.238755,80.414625,2301,A1BG,protein_coding,19,58346857.0,58353491.0,A1BG-202,retained_intron,ENST00000595014,1,retained intron,4,160.577631,0.500783
73205,ENST00000596924.1,4.185821e-08,2.0,0.041858,2134,A1BG,protein_coding,19,58345177.0,58347634.0,A1BG-203,processed_transcript,ENST00000596924,1,processed transcript,4,160.577631,0.000261
1092,ENST00000598345.1,8.012115e-05,3828.216344,80.121148,475,A1BG,protein_coding,19,58346859.0,58347657.0,A1BG-204,retained_intron,ENST00000598345,1,retained intron,4,160.577631,0.498956
167271,ENST00000600966.1,0.0,0.0,0.0,917,A1BG,protein_coding,19,58350593.0,58353129.0,A1BG-205,protein_coding,ENST00000600966,1,protein coding,4,160.577631,0.0
92853,ENST00000282641.6,6.976369e-09,0.333333,0.006976,9350,A1CF,protein_coding,10,50799420.0,50885675.0,A1CF-201,protein_coding,ENST00000282641,6,protein coding,9,0.020929,0.333333


In [7]:
outpath = "/nfs/turbo/umms-indikar/shared/projects/HSC/data/isoforms/HSC.nanocount.csv"
pdf.to_csv(outpath, index=False)
pdf.head()

Unnamed: 0,tid,raw,est_count,tpm,transcript_length,gene_name,gene_biotype,Chromosome,Start,End,transcript_name,transcript_biotype,transcript_id,transcript_version,transcript_biotype_clean,n_isoforms,gene_tpm,tx_prc
1088,ENST00000595014.1,8.041462e-05,3842.238755,80.414625,2301,A1BG,protein_coding,19,58346857.0,58353491.0,A1BG-202,retained_intron,ENST00000595014,1,retained intron,4,160.577631,0.500783
73205,ENST00000596924.1,4.185821e-08,2.0,0.041858,2134,A1BG,protein_coding,19,58345177.0,58347634.0,A1BG-203,processed_transcript,ENST00000596924,1,processed transcript,4,160.577631,0.000261
1092,ENST00000598345.1,8.012115e-05,3828.216344,80.121148,475,A1BG,protein_coding,19,58346859.0,58347657.0,A1BG-204,retained_intron,ENST00000598345,1,retained intron,4,160.577631,0.498956
167271,ENST00000600966.1,0.0,0.0,0.0,917,A1BG,protein_coding,19,58350593.0,58353129.0,A1BG-205,protein_coding,ENST00000600966,1,protein coding,4,160.577631,0.0
92853,ENST00000282641.6,6.976369e-09,0.333333,0.006976,9350,A1CF,protein_coding,10,50799420.0,50885675.0,A1CF-201,protein_coding,ENST00000282641,6,protein coding,9,0.020929,0.333333


In [8]:
break

SyntaxError: 'break' outside loop (668683560.py, line 1)

In [None]:
test = pdf[pdf["gene_name"] == 'CASP1']
test

In [None]:
break

# look at FB genes

In [None]:
perc_lb = 0.2
perc_ub = 0.8

mgenes = markers[markers['Fib'] == 1]['gene_name'].values 
fdf = pdf[pdf['gene_name'].isin(mgenes)]
print(f"{fdf.shape=}")

fdf = fdf.sort_values(by='gene_tpm', ascending=False)

fdf.head(50)

In [None]:
query = 'HIF1A'

pdx = fdf[fdf['gene_name'] == query]
pdx = pdx.sort_values(by='transcript_name')


plt.rcParams['figure.dpi'] = 200
plt.rcParams['figure.figsize'] = 3, len(pdx) / 4
sns.barplot(data=pdx, 
            x='tx_prc',
            y='transcript_name',
            hue='transcript_biotype_clean',
            ec='k')

sns.despine(bottom=True)
plt.ylabel("")
plt.xlabel('Percent of Expression')
sns.move_legend(plt.gca(), 
                loc='upper right',
                title='Type',
                bbox_to_anchor=(2.3, 1))

# look at HSC genes

# look at repgromming TFs

In [None]:
gene_list = [
    'GATA2',
    'GFI1B',
    'FOS',
    'STAT5A',
    'REL',   
] 

fdf = pdf[pdf['gene_name'].isin(gene_list)]
fdf

In [None]:
query = 'GATA2'

pdx = fdf[fdf['gene_name'] == query]
pdx = pdx.sort_values(by='transcript_name')


plt.rcParams['figure.dpi'] = 200
plt.rcParams['figure.figsize'] = 3, len(pdx) / 4
sns.barplot(data=pdx, 
            x='tx_prc',
            y='transcript_name',
            hue='transcript_biotype_clean',
            ec='k')

sns.despine(bottom=True)
plt.ylabel("")
plt.xlabel('Percent of Expression')
sns.move_legend(plt.gca(), 
                loc='upper right',
                title='Type',
                bbox_to_anchor=(1.9, 1))