In [1]:
import numpy as np
import pandas as pd
import pyranges as pr
import gget
import glob
import os
import time
import psutil
import re
import seaborn as sns
import matplotlib.pyplot as plt
from importlib import reload

# Load gene mapping

In [2]:
# fpath = "/scratch/indikar_root/indikar1/cstansbu/HSC/references/geneTable.csv"
fpath = "/scratch/indikar_root/indikar1/shared_data/single_cell_fibroblast/references/geneTable.csv"

gdf = pd.read_csv(fpath, low_memory=False)

gdf = gdf[gdf['gene_biotype'] == 'protein_coding']
gdf = gdf[gdf['gene_name'].notna()]

gene_map = gdf.drop_duplicates().set_index('gene_id')['gene_name'].to_dict()
transcript_map = gdf.drop_duplicates().set_index('transcript_id')['transcript_name'].to_dict()

print(f"{len(gene_map)=}")
print(f"{len(transcript_map)=}")

len(gene_map)=19401
len(transcript_map)=167915


# Load read assignments

In [3]:
# dpath = "/scratch/indikar_root/indikar1/cstansbu/HSC/isoquant/"
dpath = "/scratch/indikar_root/indikar1/shared_data/single_cell_fibroblast/isoquant/"

columns = ['isoform_id', 'gene_id']  # Removed redundant columns

# Define the extraction pattern
pattern = r'(?P<cell_id>\w+)_(?P<UMI>\w+)#(?P<read_name>[\w-]+).*'

df = []

print("Starting to process runs...")

for run_id in os.listdir(dpath):
    subdir_path = f"{dpath}{run_id}"
    if os.path.isdir(subdir_path) and run_id != 'output':
        read_path = f"{dpath}{run_id}/{run_id}.read_assignments.tsv.gz"

        start_time = time.time()  # Start timing for this run
        print(f"Loading data for run {run_id}...")

        tmp = pd.read_csv(read_path, sep='\t', skiprows=2, low_memory=False)

        print(f"Initial shape: {tmp.shape}, Memory Usage: {tmp.memory_usage(deep=True).sum() / (1024**3):.2f} GB")

        keep_types = [
            'unique',
            'unique_minor_difference',
        ]
        tmp = tmp[tmp['assignment_type'].isin(keep_types)][columns + ['#read_id']]
        tmp['run_id'] = run_id

        tmp[['cell_id', 'UMI', 'read_name']] = tmp['#read_id'].str.extract(pattern)
        tmp['read_name'] = tmp['read_name'].apply(lambda x: x.split("_")[0])
        
        # map genes 
        tmp['gene_name'] = tmp['gene_id'].map(gene_map)
        tmp['transcript_name'] = tmp['isoform_id'].map(transcript_map)

        # remove uneedded columns
        tmp = tmp.drop(columns=['#read_id'])
        
        # drop unmapped genes
        tmp = tmp[tmp['gene_name'].notna()]
        tmp = tmp[tmp['transcript_name'].notna()]

        print(f"Filtered shape: {tmp.shape}, Memory Usage: {tmp.memory_usage(deep=True).sum() / (1024**3):.2f} GB")

        df.append(tmp)

        end_time = time.time()
        elapsed_time = end_time - start_time
        print(f"Run {run_id} processed in {elapsed_time:.2f} seconds\n")

        # break  # Remove this if you want to process all runs

print("Concatenating results...")
df = pd.concat(df)

print(f"Concatenated shape: {df.shape}")

# dropping duplicates
df = df.drop_duplicates(subset=['isoform_id', 'gene_id', 'cell_id', 'UMI'])

print(f"Final shape: {df.shape}")
print(f"Final Memory Usage: {psutil.virtual_memory().used / (1024**3):.2f} GB")
print(df.head().to_string(index=False))  # Nicer formatting for the output

Starting to process runs...
Loading data for run run1...
Initial shape: (8564283, 9), Memory Usage: 6.86 GB
Filtered shape: (2920885, 8), Memory Usage: 1.57 GB
Run run1 processed in 65.02 seconds

Loading data for run Run4b...
Initial shape: (2159243, 9), Memory Usage: 1.72 GB
Filtered shape: (693159, 8), Memory Usage: 0.37 GB
Run Run4b processed in 15.43 seconds

Loading data for run Run6a...
Initial shape: (1570244, 9), Memory Usage: 1.25 GB
Filtered shape: (504017, 8), Memory Usage: 0.27 GB
Run Run6a processed in 11.96 seconds

Loading data for run Run6c...
Initial shape: (1858506, 9), Memory Usage: 1.48 GB
Filtered shape: (595705, 8), Memory Usage: 0.32 GB
Run Run6c processed in 13.71 seconds

Loading data for run Run4a...
Initial shape: (6073985, 9), Memory Usage: 4.83 GB
Filtered shape: (1954435, 8), Memory Usage: 1.05 GB
Run Run4a processed in 46.31 seconds

Loading data for run Run4c...
Initial shape: (2445957, 9), Memory Usage: 1.95 GB
Filtered shape: (780388, 8), Memory Usage

In [4]:
# break

SyntaxError: 'break' outside loop (668683560.py, line 1)

# Save the output

In [5]:
outpath = "/scratch/indikar_root/indikar1/shared_data/single_cell_fibroblast/isoforms/isoform_data.parquet"
df.to_parquet(outpath, index=False)
df.head()


Unnamed: 0,isoform_id,gene_id,run_id,cell_id,UMI,read_name,gene_name,transcript_name
350,ENST00000327044,ENSG00000188976,run1,AAGCATGGTTTACATG,TGTAAGCTGGAC,68e25e7f-74ed-45a7-a7cd-364ab6c81bf4,NOC2L,NOC2L-201
352,ENST00000483767,ENSG00000188976,run1,GGCGGTTCATGATCGG,TTCAAATTGTAG,3a8fa08c-1a08-4457-93c2-ecc7b1de6190,NOC2L,NOC2L-204
353,ENST00000327044,ENSG00000188976,run1,AAGGTTAAGCTAATGT,AAGTCTATATGC,f3bf3e2e-1b6d-4476-b89d-e17993d6a355,NOC2L,NOC2L-201
354,ENST00000483767,ENSG00000188976,run1,TCACACGGTAATCCGT,ATCTAATATACT,282ae65a-0b4e-4d6f-9759-642cfb11e3c3,NOC2L,NOC2L-204
355,ENST00000327044,ENSG00000188976,run1,CTAACCAAGGTGACAG,ACGCATAGATAT,8588b18f-27af-4ff6-8597-4e6b7bdf4e84,NOC2L,NOC2L-201


In [None]:
break