In [1]:
import os
import sys
import pandas as pd
import glob

In [2]:
dpath = "/scratch/indikar_root/indikar1/shared_data/population/align_table/"
file_list = glob.glob(f"{dpath}*")
file_list


['/scratch/indikar_root/indikar1/shared_data/population/align_table/batch04.GRCm39.align_table.parquet',
 '/scratch/indikar_root/indikar1/shared_data/population/align_table/batch02.GRCm39.align_table.parquet',
 '/scratch/indikar_root/indikar1/shared_data/population/align_table/batch03.GRCm39.align_table.parquet',
 '/scratch/indikar_root/indikar1/shared_data/population/align_table/batch01.GRCm39.align_table.parquet']

In [37]:
chrom_path = "/scratch/indikar_root/indikar1/shared_data/higher_order/reference/chrom_sizes.csv"
chrom_df = pd.read_csv(chrom_path)
chroms = chrom_df['chrom'].to_list()
chrom_starts = dict(zip(chrom_df['chrom'].values, chrom_df['bp_start'].values))

In [50]:
read_columns = [
    'read_name',
    'chrom', 
    'ref_start', 
    'ref_end',
    'is_mapped',
]
     
result = []
for fpath in file_list:
    basename = os.path.basename(fpath).split(".")[0]
    df = pd.read_parquet(fpath, columns=read_columns)

    # Filtering & Transformations
    df = (
        df[df['is_mapped']]
        .loc[df['chrom'].isin(chroms)]
        .assign(
            basename        = basename,
            local_position  = lambda df: (((df['ref_end'] - df['ref_start']) // 2) + df['ref_start']).astype(int),
            chrom_start     = lambda df: df['chrom'].map(chrom_starts),
            global_position = lambda df: df['chrom_start'].astype(int) + df['local_position'].astype(int),
        )
        .dropna(subset=['global_position'])
        .drop_duplicates()
        .drop(columns=['is_mapped', 'chrom_start'])
    )
    
    df['ref_start'] = df['ref_start'].astype(int)
    df['ref_end'] = df['ref_end'].astype(int)
    
    # calculate order and drop singletons efficiently
    df['order'] = df.groupby('read_name')['chrom'].transform('count')
    df = df[df['order'] > 1]
    result.append(df)
    
result = pd.concat(result)
print(f"{result.shape=}")
result.head()

df.shape=(5914688, 8)
df.shape=(614841, 8)
df.shape=(3086265, 8)
df.shape=(5194444, 8)
result.shape=(14810238, 8)


Unnamed: 0,read_name,chrom,ref_start,ref_end,basename,local_position,global_position,order
0,00000202-49cf-47b2-83bf-5eb3f6d98373,10,79553913,79554361,batch04,79554137,1473898218,4
1,00000202-49cf-47b2-83bf-5eb3f6d98373,10,79553679,79553895,batch04,79553787,1473897868,4
2,00000202-49cf-47b2-83bf-5eb3f6d98373,10,79553496,79553677,batch04,79553586,1473897667,4
3,00000202-49cf-47b2-83bf-5eb3f6d98373,10,79552809,79553496,batch04,79553152,1473897233,4
6,0000131a-4f27-4dc5-839d-09720b024db9,4,45374671,45374875,batch04,45374773,582029385,4
