In [1]:
import os
import sys
import pandas as pd
import numpy as np
import glob
import time
from scipy.sparse import csr_matrix
import anndata as an
import scanpy as sc
import pyranges as pr
import psutil

# Make the project's helper module importable: resolve ../scripts against the
# current working directory, add it to the module search path, then import it.
source_path = os.path.abspath("../scripts/")
sys.path.append(source_path)
import make_anndata as mk

In [2]:
# Input files read by the cells below.
pore_c_path = "/scratch/indikar_root/indikar1/shared_data/higher_order/pore_c/population_mESC.read_level.parquet"
chrom_path = "/scratch/indikar_root/indikar1/shared_data/higher_order/reference/chrom_sizes.csv"
gene_path = "/scratch/indikar_root/indikar1/shared_data/higher_order/reference/gene_table.parquet"

# Bin size passed to create_chromosome_intervals as `base_resolution`.
resolution = 1_000_000

In [3]:
# Read the read-level Pore-C table, add a constant `value` column of 1 to
# every row, and report the table shape and RAM usage.
mk.print_section_header("Loading Pore-C Data")

df = pd.read_parquet(pore_c_path)
df['value'] = 1

# Row count at load time; compared against df's row count after the gene merge.
n = len(df)

mk.print_data_shape("Pore-C data", df.shape)
mk.print_memory_usage("Load Pore-C data")

------------------------------------------------------------
------------------- Loading Pore-C Data --------------------
------------------------------------------------------------
Pore-C data shape:                  (14877807, 13)
RAM usage at step 'Load Pore-C data': 2.44 GB


In [4]:
# Build the binned interval table at the configured `resolution`.
# create_chromosome_intervals returns two objects: `chrom` (presumably the
# chromosome-size table read from chrom_path - confirm in make_anndata) and
# `intervals`, whose shape is reported below.
mk.print_section_header("Creating Chromosome Intervals")
chrom, intervals = mk.create_chromosome_intervals(chrom_path, base_resolution=resolution)
mk.print_data_shape("Chromosome intervals", intervals.shape)

------------------------------------------------------------
-------------- Creating Chromosome Intervals ---------------
------------------------------------------------------------
intervals.shape=(2642, 6)
Chromosome intervals shape:         (2642, 6)


In [5]:
# Spot-check the first bins whose `chrom` value is '2'.
intervals.loc[intervals['chrom'].eq('2')].head()

Unnamed: 0,chrom,start,end,bin,chrom_bin,bin_name
196,2,0,1000000,196,0,chr2:0
197,2,1000000,2000000,197,1,chr2:1
198,2,2000000,3000000,198,2,chr2:2
199,2,3000000,4000000,199,3,chr2:3
200,2,4000000,5000000,200,4,chr2:4


In [None]:
def merge_genes(df, gene_path):
    """Annotates each alignment with the gene it overlaps the most.

    Runs a left interval join (PyRanges) between the rows of ``df`` and the
    gene table stored at ``gene_path``, then keeps a single row per alignment:
    the one with the largest overlap. Alignments that overlap no gene are kept
    and are flagged ``is_tf == False`` and ``is_pt_gene == False``.

    Args:
      df: DataFrame with columns 'chrom', 'ref_start', and 'ref_end', plus
        'read_name', 'read_start', and 'basename', which identify an
        alignment when duplicate join rows are dropped.
      gene_path: Path to the Parquet file containing gene data. The table is
        passed directly to ``pr.PyRanges`` and must also provide the 'is_tf'
        and 'gene_biotype' columns.

    Returns:
      A DataFrame with merged gene information: the input columns plus the
      gene columns (gene coordinates renamed to 'gene_start' / 'gene_end',
      'length' to 'gene_length'), an 'overlap' column, and the boolean
      columns 'is_tf' and 'is_pt_gene'.
    """
    genes = pd.read_parquet(gene_path)

    # Convert DataFrames to PyRanges for efficient interval joining
    genes_pr = pr.PyRanges(genes)
    reads_pr = pr.PyRanges(df.rename(columns={
        'chrom': 'Chromosome',
        'ref_start': 'Start',
        'ref_end': 'End',
    }))

    # Left join keeps every alignment, including those overlapping no gene.
    merged = reads_pr.join(
        genes_pr,
        strandedness=None,
        how='left',
        report_overlap=True,
    ).df.rename(columns={
        'Chromosome': 'chrom',
        'Start': 'ref_start',
        'End': 'ref_end',
        'Start_b': 'gene_start',
        'End_b': 'gene_end',
        'length': 'gene_length',
        'Overlap': 'overlap',
    })

    # take only the best overlap - since there are overlapping genes
    merged = merged.sort_values(by='overlap', ascending=False)
    merged = merged.drop_duplicates(
        subset=['read_name', 'read_start', 'ref_start', 'basename'],
        keep='first',
    )

    # Alignments without an overlapping gene leave the left join with
    # placeholder gene fields (PyRanges fills them with -1). Casting such a
    # placeholder to bool yields True, so mask 'is_tf' by whether a gene was
    # actually matched (a real gene start is never negative).
    has_gene = merged['gene_start'].ge(0)

    # Ensure correct data types
    merged['is_tf'] = merged['is_tf'].astype(bool) & has_gene
    merged['is_pt_gene'] = (merged['gene_biotype'] == 'protein_coding')
    return merged

# Annotate the Pore-C table with gene information and preview the result.
# NOTE: `df` is overwritten with the merged table, so re-running this cell
# feeds the already-annotated table back into merge_genes.
df = merge_genes(df, gene_path)
print(f"{df.shape=}")
df.head()

In [None]:
# Row-count difference: rows at load time (`n`) minus rows in the current df.
n - df.shape[0]

In [None]:
# Deliberate stopping point for "Run All". An explicit exception is used
# because a bare `break` outside a loop is a SyntaxError.
raise RuntimeError("Intentional stop: execution halted at this cell.")

In [None]:
# Deliberate stopping point for "Run All". An explicit exception is used
# because a bare `break` outside a loop is a SyntaxError.
raise RuntimeError("Intentional stop: execution halted at this cell.")

In [None]:
# List the columns currently present in df.
df.columns

In [None]:
# For every (read_name, read_start) pair, count the distinct gene names it is
# associated with, then inspect the pairs associated with more than one gene.
df['duplicate_maps'] = (
    df.groupby(['read_name', 'read_start'])['gene_name']
    .transform('nunique')
)
test = (
    df.loc[df['duplicate_maps'].gt(1)]
    .sort_values(by=['read_name', 'ref_start'])
)
print(f"{test.shape=}")
test[[
    'read_name',
    'read_start',
    'ref_start',
    'chrom',
    'gene_name',
    'gene_biotype',
    'gene_start',
]].head()

In [None]:
# Deliberate stopping point for "Run All". An explicit exception is used
# because a bare `break` outside a loop is a SyntaxError.
raise RuntimeError("Intentional stop: execution halted at this cell.")

In [None]:
# Number of rows per gene name, most frequent first.
df['gene_name'].value_counts()

In [None]:
# Number of rows per read name, most frequent first.
df['read_name'].value_counts()

In [None]:
# Load the gene table on its own to inspect its shape and first rows.
gdf = pd.read_parquet(gene_path)
print(f"{gdf.shape=}")
gdf.head()

# gdf_pr = pr.PyRanges(gdf)


In [None]:
# Preview the first rows of the current df.
df.head()