In [8]:
import numpy as np
import pandas as pd
import pyranges as pr
import gget
import glob
import seaborn as sns
import matplotlib.pyplot as plt
from mpl_toolkits.axes_grid1.axes_divider import make_axes_locatable

In [20]:
def load_gtf(fpath: str = "/scratch/indikar_root/indikar1/cstansbu/HSC/references/geneTable.csv") -> pd.DataFrame:
    """
    Read the gene/transcript annotation table and return a de-duplicated copy.

    Only the six transcript- and gene-level annotation columns are read from
    the CSV; any other column in the file is ignored. Rows lacking either a
    transcript ID or a gene ID are removed, and exact duplicate rows (over the
    six retained columns) are collapsed to their first occurrence.

    Args:
        fpath: Location of the gene table CSV file.

    Returns:
        A pandas DataFrame with one row per distinct annotation record. The
        original row labels from the file are kept (the index is not reset).
    """
    # Identifiers that every retained row must have.
    required_ids = ["transcript_id", "gene_id"]

    # Columns pulled from the file; everything else is skipped at parse time.
    wanted_columns = [
        'transcript_id',
        'transcript_name',
        'transcript_biotype',
        'gene_id',
        'gene_name',
        'gene_biotype',
    ]

    annotation = pd.read_csv(fpath, low_memory=False, usecols=wanted_columns)

    # Remove incomplete records first, then collapse exact repeats.
    annotation = annotation.dropna(subset=required_ids)
    return annotation.drop_duplicates()


# Load the gene/transcript annotation table that later cells use to map
# transcript IDs to transcript names.
fpath = "/scratch/indikar_root/indikar1/cstansbu/HSC/references/geneTable.csv"
gdf = load_gtf(fpath=fpath)

# Report the table size, then preview the first rows as the cell output.
print(f"{gdf.shape=}")
gdf.head()

gdf.shape=(251121, 6)


Unnamed: 0,gene_id,gene_name,gene_biotype,transcript_id,transcript_name,transcript_biotype
1,ENSG00000160072,ATAD3B,protein_coding,ENST00000673477,ATAD3B-206,protein_coding
38,ENSG00000160072,ATAD3B,protein_coding,ENST00000472194,ATAD3B-203,retained_intron
53,ENSG00000160072,ATAD3B,protein_coding,ENST00000378736,ATAD3B-202,processed_transcript
58,ENSG00000160072,ATAD3B,protein_coding,ENST00000485748,ATAD3B-205,retained_intron
69,ENSG00000160072,ATAD3B,protein_coding,ENST00000474481,ATAD3B-204,retained_intron


In [18]:
# Per-gene counts for the Gridr3 sample.
fpath = "/scratch/indikar_root/indikar1/cstansbu/HSC/isoquant/Gridr3/Gridr3.gene_counts.tsv"

# The file is parsed as two unnamed tab-separated columns; text from a '#'
# onward is treated as a comment and ignored. Rows are keyed by gene ID.
df = (
    pd.read_csv(
        fpath,
        sep="\t",
        names=['gene_id', 'gene_count'],
        comment="#",
    )
    .set_index('gene_id')
)
df.head()

Unnamed: 0_level_0,gene_count
gene_id,Unnamed: 1_level_1
ENSG00000000457,25.0
ENSG00000000460,4.0
ENSG00000000938,1.0
ENSG00000000971,90.0
ENSG00000001460,47.0


In [5]:
# NOTE(review): `break` outside a loop is a SyntaxError (see the traceback
# recorded for this cell), so this cell always fails. Presumably it is a manual
# stop so that "Run All" halts here - TODO confirm, and remove the cell before
# sharing so the notebook survives Restart & Run All.
break

SyntaxError: 'break' outside loop (668683560.py, line 1)

In [None]:
# Distinct (transcript ID, transcript name) pairs from the annotation table,
# keeping only rows that actually carry a transcript ID.
trx = (
    gdf.loc[:, ['transcript_id', 'transcript_name']]
    .drop_duplicates()
    .dropna(subset=['transcript_id'])
)

# Lookup table: transcript ID -> transcript name.
transcript_ids = trx['transcript_id'].values
transcript_names = trx['transcript_name'].values
trans_map = dict(zip(transcript_ids, transcript_names))

trx.head()

In [None]:
# NOTE(review): `break` outside a loop is a SyntaxError in Python, so this cell
# always fails. Presumably it is a manual stop so that "Run All" halts before
# the cells below - TODO confirm, and remove the cell before sharing so the
# notebook survives Restart & Run All.
break

In [None]:
# Per-transcript TPM values for the Gridr3 sample.
fpath = "/scratch/indikar_root/indikar1/cstansbu/HSC/isoquant/Gridr3/Gridr3.transcript_tpm.tsv"
df = pd.read_csv(fpath, sep='\t')

# Attach the transcript name; IDs that are absent from `trans_map` become NaN.
df['transcript_name'] = df['#feature_id'].map(trans_map)

# Drop transcripts without a name. Left in, every unnamed transcript would be
# stringified to "nan", receive the empty gene name "", and all of them would
# be pooled into one bogus multi-isoform "gene" in the groupbys below.
# `.copy()` makes the result an independent frame so the column assignments
# that follow do not raise SettingWithCopyWarning.
df = df[df['transcript_name'].notna()].copy()

# Gene name = transcript name without its trailing "-<isoform>" suffix.
# Splitting once from the right keeps hyphenated gene names intact
# (e.g. "MT-CO1-201" -> "MT-CO1").
df['gene_name'] = df['transcript_name'].astype(str).str.rsplit("-", n=1).str[0]

# drop zeros
df = df[df['TPM'] > 0].copy()

# Per-gene spread of isoform expression and number of expressed isoforms.
df['gene_var'] = df.groupby('gene_name')['TPM'].transform('std')
df['tx_count'] = df.groupby('gene_name')['TPM'].transform('count')

# drop single isoform genes
df = df[df['tx_count'] > 1]

# Drop mitochondrial and ribosomal genes. Match the conventional prefixes
# ("MT-" for mitochondrially encoded genes, "RPS"/"RPL" for ribosomal
# proteins) rather than bare "MT"/"RP", which would also discard unrelated
# genes such as MTOR, MTHFR, RPA1 or RPE65. `str.match` anchors at the start
# of the name.
is_mito_or_ribo = df['gene_name'].str.match(r"(?:MT-|RP[SL])")
df = df[~is_mito_or_ribo]

# Most variable genes (by isoform TPM standard deviation) first.
df = df.sort_values(by='gene_var', ascending=False)

df.head(20)


# HSC Genes

In [None]:
# Names of the genes annotated as hematopoietic stem cell markers in the
# PanglaoDB table, as an array of gene names.
genes = (
    pd.read_csv("~/git_repositories/ONT-single-cell/config/gene_annotations/panglaodb.csv")
    .loc[lambda tbl: tbl['cell_type'] == 'Hematopoietic stem cells', 'gene_name']
    .values
)
print(len(genes))

In [None]:
# Restrict the transcript table to the HSC marker genes. Filtering first and
# copying afterwards yields an independent frame, so the column assignments
# below do not raise SettingWithCopyWarning.
pdf = df[df['gene_name'].isin(genes)].copy()

# Per-gene total TPM, its log1p, and each isoform's share of its gene's total.
pdf['gene_TPM'] = pdf.groupby('gene_name')['TPM'].transform('sum')
pdf['gene_TPM_log'] = np.log1p(pdf['gene_TPM'])
pdf['percent'] = (pdf['TPM'] / pdf['gene_TPM']) * 100
print(f"{pdf.shape=}")

# Isoform label = text after the LAST hyphen of the transcript name. Taking
# the second hyphen-separated token instead would return part of the gene name
# for hyphenated genes (e.g. "HLA-DRA-201" -> "DRA"), and would disagree with
# how `gene_name` is derived from the transcript name.
pdf['isoform_name'] = pdf['transcript_name'].str.rsplit("-", n=1).str[-1]

# genes x isoforms matrix of percent expression; absent isoforms count as 0.
table = pd.pivot_table(pdf,
                       index='gene_name',
                       columns='isoform_name',
                       values='percent',
                       fill_value=0)

plt.rcParams['figure.dpi'] = 300
plt.rcParams['figure.figsize'] = 6, 5.25

# Main panel: stacked bars of isoform usage per gene. Axes are handled
# explicitly instead of relying on pyplot's "current axes" state.
fig, ax = plt.subplots()
table.plot(kind='bar',
           stacked=True,
           width=0.75,
           cmap='tab20b',
           ec='k',
           ax=ax)

sns.move_legend(ax,
                loc='upper right',
                title='Isoform',
                bbox_to_anchor=(1.2, 1.01))

ax.set_ylabel("Percent of Expression")
ax.set_xlabel("")

# Top panel: total expression per gene, sharing the x axis with the main panel.
ax_divider = make_axes_locatable(ax)
ax2 = ax_divider.append_axes("top",
                             size="25%",
                             pad="5%",
                             sharex=ax)

# One row per gene, sorted by name to line up with the pivot table's (sorted)
# gene index. A separate variable is used so that `pdf` keeps all of its
# columns (including `gene_TPM`) for later cells.
gene_totals = (
    pdf[['gene_name', 'gene_TPM', 'gene_TPM_log']]
    .drop_duplicates()
    .sort_values(by='gene_name')
)

sns.barplot(data=gene_totals,
            x='gene_name',
            y='gene_TPM_log',
            ec='k',
            width=0.75,
            color='lightgrey',
            ax=ax2)

# Hide the gene labels on the top panel only. Overwriting the tick labels with
# empty strings is avoided here because axes created with `sharex` share their
# tick formatter, so that could blank the main panel's labels as well.
ax2.tick_params(axis='x', labelbottom=False)
ax2.set_xlabel("")
ax2.set_ylabel('TPM (log)')

In [None]:
# Total TPM per HSC marker gene, computed directly from the filtered
# transcript table `df` and the marker list `genes`. Selecting `gene_TPM` from
# `pdf` is not reliable here: the plotting cell may already have reduced `pdf`
# to `gene_name` / `gene_TPM_log`, in which case the lookup raises KeyError.
pdf = (
    df[df['gene_name'].isin(genes)]
    .groupby('gene_name', as_index=False)['TPM']
    .sum()
    .rename(columns={'TPM': 'gene_TPM'})
)
pdf