In [1]:
import pandas as pd
import numpy as np
import scanpy as sc
import seaborn as sns
import matplotlib.pyplot as plt

# local imports
import utils as ut
import plotting as plt2

# Load the PanglaoDB markers

In [2]:
# Load the PanglaoDB marker table through the project helper `ut.load_pathway`.
fpath = "../resources/PanglaoDB_Augmented_2021.txt"
pang = ut.load_pathway(fpath)

# Each cell-type column is used as a row mask; the index labels of the selected
# rows form that cell type's marker set.
# NOTE(review): presumably boolean columns on a gene-symbol index - confirm in utils.
fb_genes = pang.loc[pang['Fibroblasts']].index
hsc_genes = pang.loc[pang['Hematopoietic Stem Cells']].index

# Report the size of each marker set.
print(f"Number of genes for Fibroblasts: {len(fb_genes)}")
print(f"Number of genes for Hematopoietic Stem Cells: {len(hsc_genes)}")

Number of genes for Fibroblasts: 232
Number of genes for Hematopoietic Stem Cells: 178


# Load isoforms

In [None]:
# Read the merged isoform AnnData file, log current memory usage, then show
# the object summary as the cell output.
fpath = "/scratch/indikar_root/indikar1/cstansbu/HSC/scanpy/merged_isoforms.h5ad"
adata = sc.read_h5ad(fpath)

sc.logging.print_memory_usage()

adata

In [None]:
# First rows of the var table, ordered by descending `gene_count`.
(
    adata.var
    .sort_values(by='gene_count', ascending=False)
    .head()
)

In [None]:
# Highly expressed transcripts: a copy of the var table ordered by descending
# `transcript_count`; `tdf` is reused by later cells.
tdf = (
    adata.var
    .copy()
    .sort_values(by='transcript_count', ascending=False)
)

tdf.head(15)

# depth differences

In [None]:
# Distribution of `n_genes` per dataset.
plt.rcParams.update({
    'figure.dpi': 200,
    'figure.figsize': (2, 2.5),
})

box_style = dict(width=0.5, linecolor='k')
sns.boxplot(data=adata.obs, x='dataset', y='n_genes', hue='dataset', **box_style)
sns.despine()

In [None]:
# Distribution of `total_counts` per dataset.
plt.rcParams.update({
    'figure.dpi': 200,
    'figure.figsize': (2, 2.5),
})

box_style = dict(width=0.5, linecolor='k')
sns.boxplot(data=adata.obs, x='dataset', y='total_counts', hue='dataset', **box_style)
sns.despine()

# distribution of non-zero transcripts

In [None]:
# For every layer stored on `adata`, summarise each barcode (row) and draw one
# boxplot per summary statistic, split by dataset.
for layer in adata.layers.keys():

    # Layer values as a DataFrame (rows: barcodes, columns: features).
    counts = adata.to_df(layer=layer)

    # Per-barcode summary. `percent_nz` is the FRACTION of non-zero columns
    # (non-zero count divided by the number of columns), not a 0-100 percentage.
    df = pd.DataFrame({
        'barcode' : counts.index,
        'dataset' : adata.obs['dataset'].values,
        'sum_counts' : counts.sum(axis=1).values,
        'mean_counts' : counts.mean(axis=1).values,
        'percent_nz' : counts.ne(0).sum(axis=1) / counts.shape[1],
    })

    print(f"{layer=} {df.shape=}")

    plt.rcParams.update({
        'figure.dpi': 200,
        'figure.figsize': (6.5, 2.5),
    })
    fig, axs = plt.subplots(1, 3)

    columns = ['sum_counts', 'mean_counts', 'percent_nz']

    # One panel per summary column, left to right.
    for ax, col in zip(axs, columns):
        sns.boxplot(
            data=df,
            x='dataset',
            y=col,
            hue='dataset',
            width=0.35,
            linecolor='k',
            ax=ax,
        )

    fig.tight_layout()
    # Title is placed above the panels (y > 1) after the layout pass.
    fig.suptitle(layer, y=1.1)
    sns.despine()
    plt.show()

# Some highly expressed transcripts

In [None]:
# Scanpy's built-in highest-expressed-features plot, with default settings.
plt.rcParams.update({
    'figure.dpi': 200,
    'figure.figsize': (4, 5),
})

sc.pl.highest_expr_genes(adata)

# highly expressed genes

In [None]:
# Build `exp_df`: one row per transcript with raw counts summed within each
# dataset, the overall total, the parent gene symbol and per-gene totals.
layer = 'raw_counts'
exp_df = adata.to_df(layer=layer)
exp_df['group'] = adata.obs['dataset'].values
exp_df = exp_df.groupby('group').sum().T  # rows: transcripts, columns: datasets
exp_df['transcript_count'] = exp_df.sum(axis=1)
# NOTE(review): assumes adata.var's index is named 'transcript_name', so that
# reset_index() yields a column with that name - confirm.
exp_df = exp_df.reset_index()

# Gene symbol = transcript name with only the LAST "-<suffix>" removed.
# Splitting on the first hyphen (the previous behaviour) truncated symbols that
# themselves contain hyphens, e.g. "HLA-DRA-201" -> "HLA" instead of "HLA-DRA",
# which corrupted the per-gene sums and the later marker-set lookups.
# NOTE(review): assumes names look like "<gene>-<suffix>" - confirm.
exp_df['gene_name'] = exp_df['transcript_name'].str.rsplit('-', n=1).str[0]

# Per-gene totals, broadcast back onto every transcript of that gene.
cols = ['iHSC', 'scFib']
exp_df[['iHSC_gene_counts', 'scFib_gene_counts']] = exp_df.groupby('gene_name')[cols].transform('sum')

exp_df = exp_df.sort_values(by='transcript_count', ascending=False)

# Some exclusions.
# NOTE(review): these are plain prefix matches, so every gene symbol that
# begins with 'RP', 'MT' or 'S100' is dropped - confirm that breadth is intended.
for prefix in ('RP', 'MT', 'S100'):
    exp_df = exp_df[~exp_df['gene_name'].str.startswith(prefix)]

exp_df.head()

In [None]:
# Stacked bar chart of raw counts (iHSC + scFib) for the first `n_plot` rows of
# `exp_df`, in its current row order.
n_plot = 35
columns = ['transcript_name', 'iHSC', 'scFib']

plt.rcParams.update({
    'figure.dpi': 200,
    'figure.figsize': (8, 4),
})

top_transcripts = exp_df[columns].head(n_plot)
ax = top_transcripts.plot(
    kind='bar',
    x='transcript_name',
    stacked=True,
    color=['deepskyblue', 'firebrick'],
    ec='k',
    zorder=2,
)

# Grid is given a lower zorder than the bars.
ax.grid(True, c='lightgrey', zorder=0)
ax.set_ylabel('Raw Counts')
ax.set_xlabel('')
sns.despine()

sns.move_legend(ax, title='', loc='best')


# Highly expressed HSC genes

In [None]:
# Stacked bar chart of raw counts for transcripts whose gene is in the
# Hematopoietic Stem Cell marker set (`hsc_genes`); first `n_plot` rows only.
is_hsc_marker = exp_df['gene_name'].isin(hsc_genes)
pdf = exp_df[is_hsc_marker].copy()

n_plot = 35
columns = ['transcript_name', 'iHSC', 'scFib']

plt.rcParams.update({
    'figure.dpi': 200,
    'figure.figsize': (8, 4),
})

ax = pdf[columns].head(n_plot).plot(
    kind='bar',
    x='transcript_name',
    stacked=True,
    color=['deepskyblue', 'firebrick'],
    ec='k',
    zorder=2,
)

# Grid is given a lower zorder than the bars.
ax.grid(True, c='lightgrey', zorder=0)
ax.set_ylabel('Raw Counts')
ax.set_xlabel('')
sns.despine()

sns.move_legend(ax, title='', loc='best')

In [None]:
# Highly expressed fibroblast genes: stacked bar chart of raw counts for
# transcripts whose gene is in the Fibroblasts marker set (`fb_genes`);
# first `n_plot` rows only.
pdf = exp_df.copy()
pdf = pdf.loc[pdf['gene_name'].isin(fb_genes)]

n_plot = 35
columns = ['transcript_name', 'iHSC', 'scFib']

plt.rcParams.update({
    'figure.dpi': 200,
    'figure.figsize': (8, 4),
})

ax = pdf[columns].head(n_plot).plot(
    kind='bar',
    x='transcript_name',
    stacked=True,
    color=['deepskyblue', 'firebrick'],
    ec='k',
    zorder=2,
)

# Grid is given a lower zorder than the bars.
ax.grid(True, c='lightgrey', zorder=0)
ax.set_ylabel('Raw Counts')
ax.set_xlabel('')
sns.despine()

sns.move_legend(ax, title='', loc='best')

In [None]:
# Explicit stopping point. The previous bare `break` is not valid outside a
# loop, so it halted execution only by raising a SyntaxError, with no hint of
# why. Raising a descriptive error still stops "Run All" at this cell.
# NOTE(review): presumably placed to keep Run All out of the cells below - confirm.
raise RuntimeError("Intentional stop: delete or skip this cell to run the cells below.")

In [None]:
# Display the AnnData object summary as this cell's output.

adata

In [None]:
# Explicit stopping point. The previous bare `break` is not valid outside a
# loop, so it halted execution only by raising a SyntaxError, with no hint of
# why. Raising a descriptive error still stops "Run All" at this cell.
# NOTE(review): presumably placed to keep Run All out of the cells below - confirm.
raise RuntimeError("Intentional stop: delete or skip this cell to run the cells below.")

# load in the Chen 2014 Data

In [None]:
# Read the Chen 2014 isoform table, keep rows where Model is 'HSC' and
# Gene_biotype is 'protein_coding', and order them by descending log(FC).
# The shape is printed before and after filtering.
fpath = "../resources/chen_2014_isoforms.csv"
df = pd.read_csv(fpath)
print(f"{df.shape=}")

is_hsc_model = df['Model'] == 'HSC'
is_protein_coding = df['Gene_biotype'] == 'protein_coding'
df = (
    df[is_hsc_model & is_protein_coding]
    .sort_values(by='log(FC)', ascending=False)
)
print(f"{df.shape=}")

df[['External_gene_id', 'Model posterior probability']].head(20)

In [None]:
# HSC marker genes that occur in the transcript table's `gene_name` column.
known_genes = tdf['gene_name'].values
hsc_pres = [gene for gene in hsc_genes if gene in known_genes]

# Keep rows for those genes whose `transcript_percent` is below 0.9.
# NOTE: this rebinds `exp_df`, replacing the per-dataset table built earlier.
is_hsc_marker = tdf['gene_name'].isin(hsc_pres)
below_cutoff = tdf['transcript_percent'] < 0.9
exp_df = tdf[is_hsc_marker & below_cutoff]

exp_df.head(15)

In [None]:
# Plot every transcript returned for one gene, using the chosen layer.
# NOTE(review): `get_transcripts` and `plot_transcript` are not defined or
# imported in the cells shown above - confirm where they come from.
gene = 'PTGS1'
layer = 'magic'
transcript_list = get_transcripts(gene, adata=adata)

for transcript in transcript_list:
    # Figure settings are re-applied before each plot.
    plt.rcParams.update({
        'figure.dpi': 200,
        'figure.figsize': (5.5, 5),
    })
    plot_transcript(adata, transcript, layer)