In [1]:
import numpy as np
import pandas as pd
import pyranges as pr
import gget
import glob
import os
import time
import psutil
import re
import seaborn as sns
import matplotlib.pyplot as plt
from importlib import reload

# Load gene mapping

In [2]:
fpath = "/scratch/indikar_root/indikar1/cstansbu/HSC/references/geneTable.csv"

gdf = pd.read_csv(fpath, low_memory=False)

gdf = gdf[gdf['gene_biotype'] == 'protein_coding']
gdf = gdf[gdf['gene_name'].notna()]

gene_map = gdf.drop_duplicates().set_index('gene_id')['gene_name'].to_dict()
transcript_map = gdf.drop_duplicates().set_index('transcript_id')['transcript_name'].to_dict()

print(f"{len(gene_map)=}")
print(f"{len(transcript_map)=}")

len(gene_map)=19401
len(transcript_map)=167915


# Load read assignments

In [3]:
dpath = "/scratch/indikar_root/indikar1/cstansbu/HSC/isoquant/"

columns = ['isoform_id', 'gene_id']  # Removed redundant columns

# Define the extraction pattern
pattern = r'(?P<cell_id>\w+)_(?P<UMI>\w+)#(?P<read_name>[\w-]+).*'

df = []

print("Starting to process runs...")

for run_id in os.listdir(dpath):
    subdir_path = f"{dpath}{run_id}"
    if os.path.isdir(subdir_path) and run_id != 'output':
        read_path = f"{dpath}{run_id}/{run_id}.read_assignments.tsv.gz"

        start_time = time.time()  # Start timing for this run
        print(f"Loading data for run {run_id}...")

        tmp = pd.read_csv(read_path, sep='\t', skiprows=2, low_memory=False)

        print(f"Initial shape: {tmp.shape}, Memory Usage: {tmp.memory_usage(deep=True).sum() / (1024**3):.2f} GB")

        tmp = tmp[tmp['assignment_type'] == 'unique'][columns + ['#read_id']]
        tmp['run_id'] = run_id

        tmp[['cell_id', 'UMI', 'read_name']] = tmp['#read_id'].str.extract(pattern)
        tmp['read_name'] = tmp['read_name'].apply(lambda x: x.split("_")[0])
        
        # map genes 
        tmp['gene_name'] = tmp['gene_id'].map(gene_map)
        tmp['transcript_name'] = tmp['isoform_id'].map(transcript_map)

        # remove uneedded columns
        tmp = tmp.drop(columns=['#read_id'])
        
        # drop unmapped genes
        tmp = tmp[tmp['gene_name'].notna()]
        tmp = tmp[tmp['transcript_name'].notna()]

        print(f"Filtered shape: {tmp.shape}, Memory Usage: {tmp.memory_usage(deep=True).sum() / (1024**3):.2f} GB")

        df.append(tmp)

        end_time = time.time()
        elapsed_time = end_time - start_time
        print(f"Run {run_id} processed in {elapsed_time:.2f} seconds\n")

        # break  # Remove this if you want to process all runs

print("Concatenating results...")
df = pd.concat(df)

print(f"Concatenated shape: {df.shape}")

# dropping duplicates
df = df.drop_duplicates(subset=['isoform_id', 'gene_id', 'cell_id'])

print(f"Final shape: {df.shape}")
print(f"Final Memory Usage: {psutil.virtual_memory().used / (1024**3):.2f} GB")
print(df.head().to_string(index=False))  # Nicer formatting for the output

Starting to process runs...
Loading data for run Run3x0418...
Initial shape: (1030084, 9), Memory Usage: 0.84 GB
Filtered shape: (125453, 8), Memory Usage: 0.07 GB
Run Run3x0418 processed in 7.23 seconds

Loading data for run P2r8a...
Initial shape: (4150537, 9), Memory Usage: 3.27 GB
Filtered shape: (1149118, 8), Memory Usage: 0.62 GB
Run P2r8a processed in 31.04 seconds

Loading data for run Run5x0314...
Initial shape: (99393, 9), Memory Usage: 0.08 GB
Filtered shape: (19497, 8), Memory Usage: 0.01 GB
Run Run5x0314 processed in 0.77 seconds

Loading data for run Gridr5...
Initial shape: (7168486, 9), Memory Usage: 5.69 GB
Filtered shape: (1386851, 8), Memory Usage: 0.75 GB
Run Gridr5 processed in 52.48 seconds

Loading data for run Gridr4...
Initial shape: (13947275, 9), Memory Usage: 11.45 GB
Filtered shape: (2007101, 8), Memory Usage: 1.08 GB
Run Gridr4 processed in 103.95 seconds

Loading data for run Run6x0314...
Initial shape: (102506, 9), Memory Usage: 0.08 GB
Filtered shape: (

# Save the output

In [4]:
outpath = "/scratch/indikar_root/indikar1/cstansbu/HSC/isoforms/isoform_data.parquet"
df.to_parquet(outpath, index=False)
df.head()


Unnamed: 0,isoform_id,gene_id,run_id,cell_id,UMI,read_name,gene_name,transcript_name
53,ENST00000483767,ENSG00000188976,Run3x0418,TGTGGCGGTAACGGTG,AAGTTTACCGGG,b43c7b93-2130-42e0-a51d-2819bedeb593,NOC2L,NOC2L-204
54,ENST00000483767,ENSG00000188976,Run3x0418,CTAACTTGTCGTCATA,TTTTGGATCTTT,5887c2fa-b3bd-4a22-b3f8-4ce31b4e9d0d,NOC2L,NOC2L-204
55,ENST00000483767,ENSG00000188976,Run3x0418,AAGATAGAGTTGGGAC,AACGGGTCCTCT,050743c4-d895-45d6-b59a-c5232b6ac1b1,NOC2L,NOC2L-204
65,ENST00000469563,ENSG00000188976,Run3x0418,TATACCTAGTCGGCCT,AACGGTTAGCCG,f24c150f-2a5a-4581-96a7-081d004d28c6,NOC2L,NOC2L-202
66,ENST00000469563,ENSG00000188976,Run3x0418,TGTAAGCGTGTCATTG,AAACTCTAACTT,166e19dc-cd7c-4ee2-b35e-eb3ceb455285,NOC2L,NOC2L-202


In [5]:
break

SyntaxError: 'break' outside loop (668683560.py, line 1)