In [1]:
import os
import sys
import pandas as pd
import numpy as np
import glob
import time
from scipy.sparse import csr_matrix
import anndata as an
import scanpy as sc
import pyranges as pr
import psutil

# import the script
source_path = os.path.abspath("../scripts/")
sys.path.append(source_path)
import make_anndata as mk

In [28]:
fpath = "/scratch/indikar_root/indikar1/shared_data/higher_order/anndata/population_mESC_1000000_raw.h5ad"

start_time = time.time()  # Record the start time
adata = sc.read_h5ad(fpath)
end_time = time.time()  # Record the end time
print(f"Time taken to read the file: {end_time - start_time:.2f} seconds")
sc.logging.print_memory_usage()
adata

Time taken to read the file: 184.47 seconds
Memory usage: current 21.00 GB, difference +21.00 GB


AnnData object with n_obs × n_vars = 2579 × 2756467
    obs: 'bin_index', 'bin_start', 'bin_end', 'bin', 'chrom_bin', 'degree', 'genes', 'n_genes'
    var: 'read_index', 'basename', 'mean_mapq', 'median_mapq', 'n_chromosomes', 'order', 'n_bins', 'read_length_bp', 'genes', 'n_genes'
    uns: 'base_resolution', 'chrom_sizes', 'gdf', 'gene_map', 'intervals'
    layers: 'H'

In [None]:
break

In [2]:
pore_c_path = "/scratch/indikar_root/indikar1/shared_data/higher_order/pore_c/population_mESC.read_level.parquet"
resolution = 1000000
chrom_path = "/scratch/indikar_root/indikar1/shared_data/higher_order/reference/chrom_sizes.csv"
gene_path = "/scratch/indikar_root/indikar1/shared_data/higher_order/reference/gene_table.parquet"

In [3]:
# Load the Pore-C data
mk.print_section_header("Loading Pore-C Data")
df = pd.read_parquet(pore_c_path)
df['value'] = 1
mk.print_data_shape("Pore-C data", df.shape)
mk.print_memory_usage("Load Pore-C data")

n = df.shape[0]

------------------------------------------------------------
------------------- Loading Pore-C Data --------------------
------------------------------------------------------------
Pore-C data shape:                  (14877807, 13)
RAM usage at step 'Load Pore-C data': 2.41 GB


In [4]:
# Load the chromosome table
mk.print_section_header("Creating Chromosome Intervals")
chrom, intervals = mk.create_chromosome_intervals(chrom_path, base_resolution=resolution)
mk.print_data_shape("Chromosome intervals", intervals.shape)

------------------------------------------------------------
-------------- Creating Chromosome Intervals ---------------
------------------------------------------------------------
intervals.shape=(2642, 6)
Chromosome intervals shape:         (2642, 6)


In [5]:
intervals[intervals['chrom'] == '2'].head()

Unnamed: 0,chrom,start,end,bin,chrom_bin,bin_name
196,2,0,1000000,196,0,chr2:0
197,2,1000000,2000000,197,1,chr2:1
198,2,2000000,3000000,198,2,chr2:2
199,2,3000000,4000000,199,3,chr2:3
200,2,4000000,5000000,200,4,chr2:4


In [6]:
df = mk.merge_genes(df, gene_path)
print(f"{df.shape=}")
df.head()

df.shape=(14877807, 24)


Unnamed: 0,read_name,read_start,read_end,length_on_read,chrom,ref_start,ref_end,mapping_quality,basename,local_position,...,gene_name,gene_source,gene_biotype,gene_start,gene_end,gene_length,midpoint,is_tf,gene_overlap,is_pt_gene
8342354,3891ee6d-53d1-4ee0-ba2f-3d22291d4493,0,4916,4916,9,121048825,121057172,60,batch01,121052998,...,Ulk4,ensembl_havana,protein_coding,120784416,121106263,321847,120945339,False,8347,True
14789315,66953ddf-e76d-4cdf-aaf8-be028a2d7b04,6352,9540,3188,19,26583825,26592064,60,batch01,26587944,...,Smarca2,ensembl_havana,protein_coding,26582449,26755722,173273,26669085,False,8239,True
4110544,ad5b2240-893f-4ed0-a157-c2be66d8d754,0,1919,1919,4,127067225,127074760,60,batch04,127070992,...,Dlgap3,ensembl_havana,protein_coding,127062996,127130815,67819,127096905,False,7535,True
10733063,3f354c45-5e48-4f6d-8c7e-05369432b344,762,5998,5236,12,8680599,8685932,60,batch04,8683265,...,Gm56531,havana_tagene,lncRNA,8649709,8691034,41325,8670371,False,5333,False
13889148,d4626feb-16a2-4aac-8145-53e89b60bf7c,0,1821,1821,17,66243365,66248641,60,batch01,66246003,...,Twsg1,ensembl_havana,protein_coding,66228966,66258221,29255,66243593,False,5276,True


In [7]:
n - df.shape[0]

0

In [8]:
# Add the interval information
mk.print_section_header("Joining Intervals with Pore-C Data")
df = mk.join_intervals_pyranges(df, intervals)
df.head()

------------------------------------------------------------
------------ Joining Intervals with Pore-C Data ------------
------------------------------------------------------------
df.shape=(14877807, 32)


Unnamed: 0,read_name,read_start,read_end,length_on_read,chrom,ref_start,ref_end,mapping_quality,basename,local_position,...,gene_overlap,is_pt_gene,bin_start,bin_end,bin,chrom_bin,bin_name,bin_overlap,read_index,bin_index
7796001,3891ee6d-53d1-4ee0-ba2f-3d22291d4493,0,4916,4916,9,121048825,121057172,60,batch01,121052998,...,8347,True,121000000,122000000,1394,121,chr9:121,8347,0,0
14148112,66953ddf-e76d-4cdf-aaf8-be028a2d7b04,6352,9540,3188,19,26583825,26592064,60,batch01,26587944,...,8239,True,26000000,27000000,2436,26,chr19:26,8239,1,1
3788352,ad5b2240-893f-4ed0-a157-c2be66d8d754,0,1919,1919,4,127067225,127074760,60,batch04,127070992,...,7535,True,127000000,128000000,665,127,chr4:127,7535,2,2
7462945,1cbc7b8c-2ad8-4365-a57c-037fc41af923,1305,6701,5396,8,21218739,21225694,1,batch04,21222216,...,-21218740,False,21000000,22000000,1163,21,chr8:21,6955,3,3
7463448,9af4c1b9-a795-43a1-9669-55aff68e3b9d,426,5786,5360,8,21241017,21247152,60,batch03,21244084,...,-21241018,False,21000000,22000000,1163,21,chr8:21,6135,4,3


In [9]:
n - df.shape[0]

0

In [15]:
def create_var_df(df, var_names):
  """Creates a variable DataFrame from a DataFrame with read information.

  Args:
    df (pandas.DataFrame): DataFrame with read data including 'read_name', 'read_index', 
                           'mapping_quality', 'chrom', 'order', 'bin', and 'length_on_read'.
    var_names (pandas.Index): Index of unique read names.

  Returns:
    pandas.DataFrame: DataFrame containing variable information (read-level summaries).
  """
  var = df.copy()

  gene_list = lambda x: ";".join([i for i in set(x) if i != '-1'])
  n_genes = lambda x: len([i for i in set(x) if i != '-1'])

  var = var.groupby(['basename', 'read_name', 'read_index']).agg(
      mean_mapq=('mapping_quality', 'mean'),
      median_mapq=('mapping_quality', 'median'),
      n_chromosomes=('chrom', 'nunique'),
      order=('order', 'first'),
      n_bins=('bin', 'nunique'),
      read_length_bp=('length_on_read', 'sum'),
      genes=('gene_name', gene_list),
      n_genes=('gene_name', n_genes),
  ).reset_index()

  # Ensure proper sorting using var_names
  var = var.set_index('read_index')
  # var = var.reindex(var_names)
  # var = var.reset_index()
  # var = var.set_index('read_name')

  return var


var = create_var_df(df, "")
var.head()

Unnamed: 0_level_0,basename,read_name,mean_mapq,median_mapq,n_chromosomes,order,n_bins,read_length_bp,genes,n_genes
read_index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2585712,batch01,00001034-f0ab-41c2-885a-da17c2b836ae,34.0,24.0,2,3,2,378,Grm1;Lhfpl4,2
2099807,batch01,00001afb-6b62-4d01-be02-9509485330ab,59.0,59.0,2,2,2,452,Adam12,1
1156249,batch01,00002230-8080-4260-9522-dc39258a0698,60.0,60.0,1,3,1,1695,Fggy;Gm830,2
146652,batch01,00002fb3-034c-4900-aaa9-ca1a08f4fb6b,52.333333,60.0,2,3,2,1892,Csnka2ip;Gnal,2
970720,batch01,00003344-980a-405c-b539-eb0341ada8d2,35.0,35.0,2,2,2,657,Upp2;Scfd2,2


In [20]:
def create_obs_df(df, obs_names):
  """Creates an observation DataFrame from a DataFrame and a sparse matrix.

  Args:
    df (pandas.DataFrame): DataFrame with genomic data and bin information.
    X (scipy.sparse.csr_matrix): Sparse matrix representation of the data.

  Returns:
    pandas.DataFrame: DataFrame containing observation information.
  """
  gene_list = lambda x: ";".join([i for i in set(x) if i != '-1'])
  n_genes = lambda x: len([i for i in set(x) if i != '-1'])
    
  obs = df.groupby('bin_name').agg(
      bin_start=('bin_start', 'first'),
      bin_end=('bin_end', 'first'),
      bin=('bin', 'first'),
      bin_index=('bin_index', 'first'),
      chrom_bin=('chrom_bin', 'first'),
      degree=('read_name', 'nunique'),
      genes=('gene_name', gene_list),
      n_genes=('gene_name', n_genes),
  ).reset_index()


  # Ensure proper sorting using var_names
  obs = obs.set_index('bin_index')
  obs = obs.reindex(obs_names)
  obs = obs.reset_index()
  obs = obs.set_index('bin_name')

  return obs

obs = create_obs_df(df, "")
obs.head()

Unnamed: 0,bin_name,bin_start,bin_end,bin,bin_index,chrom_bin,degree,genes,n_genes
0,chr10:10,10000000,11000000,1408,881,10,3356,Adgb;Gm48324;Grm1;4930567K20Rik;Gm48406;Gm4827...,13
1,chr10:100,100000000,101000000,1498,1431,100,3109,Gm47956;Tmtc3;Gm47631;Gm4301;1700017N19Rik;Gm8...,23
2,chr10:101,101000000,102000000,1499,1505,101,2865,Mgat4c;Gm19233;Gm26278,3
3,chr10:102,102000000,103000000,1500,506,102,3055,Gm47033;Mgat4c;Rassf9;Gm26988;Gm5175;Gm47101;G...,11
4,chr10:103,103000000,104000000,1501,446,103,2988,Gm26923;Gm47228;Gm47221;Gm47224;Gm21293;Gm4722...,14


In [10]:
break

SyntaxError: 'break' outside loop (668683560.py, line 1)

In [21]:
df.head()

Unnamed: 0,read_name,read_start,read_end,length_on_read,chrom,ref_start,ref_end,mapping_quality,basename,local_position,...,gene_overlap,is_pt_gene,bin_start,bin_end,bin,chrom_bin,bin_name,bin_overlap,read_index,bin_index
7796001,3891ee6d-53d1-4ee0-ba2f-3d22291d4493,0,4916,4916,9,121048825,121057172,60,batch01,121052998,...,8347,True,121000000,122000000,1394,121,chr9:121,8347,0,0
14148112,66953ddf-e76d-4cdf-aaf8-be028a2d7b04,6352,9540,3188,19,26583825,26592064,60,batch01,26587944,...,8239,True,26000000,27000000,2436,26,chr19:26,8239,1,1
3788352,ad5b2240-893f-4ed0-a157-c2be66d8d754,0,1919,1919,4,127067225,127074760,60,batch04,127070992,...,7535,True,127000000,128000000,665,127,chr4:127,7535,2,2
7462945,1cbc7b8c-2ad8-4365-a57c-037fc41af923,1305,6701,5396,8,21218739,21225694,1,batch04,21222216,...,-21218740,False,21000000,22000000,1163,21,chr8:21,6955,3,3
7463448,9af4c1b9-a795-43a1-9669-55aff68e3b9d,426,5786,5360,8,21241017,21247152,60,batch03,21244084,...,-21241018,False,21000000,22000000,1163,21,chr8:21,6135,4,3


In [27]:
gene_map = df[['gene_name', 'gene_biotype', 'read_name', 'bin_name']].drop_duplicates()
gene_map = gene_map[gene_map['gene_name'] != '-1']
gene_map = gene_map.reset_index(drop=True)

gene_map.head()

Unnamed: 0,gene_name,gene_biotype,read_name,bin_name
0,Ulk4,protein_coding,3891ee6d-53d1-4ee0-ba2f-3d22291d4493,chr9:121
1,Smarca2,protein_coding,66953ddf-e76d-4cdf-aaf8-be028a2d7b04,chr19:26
2,Dlgap3,protein_coding,ad5b2240-893f-4ed0-a157-c2be66d8d754,chr4:127
3,Gm56531,lncRNA,3f354c45-5e48-4f6d-8c7e-05369432b344,chr12:8
4,Twsg1,protein_coding,d4626feb-16a2-4aac-8145-53e89b60bf7c,chr17:66


In [None]:
break

In [None]:
df.columns

In [None]:
df['duplicate_maps'] = df.groupby(['read_name', 'read_start'])['gene_name'].transform('nunique')
test = df[df['duplicate_maps'] > 1]
test = test.sort_values(by=['read_name', 'ref_start'])
print(f"{test.shape=}")
test[['read_name', 'read_start', 'ref_start', 'chrom', 'gene_name', 'gene_biotype', 'gene_start']].head()

In [None]:
break

In [None]:
df['gene_name'].value_counts()

In [None]:
df['read_name'].value_counts()

In [None]:
gdf = pd.read_parquet(gene_path)
print(f"{gdf.shape=}")
gdf.head()

# gdf_pr = pr.PyRanges(gdf)


In [None]:
df.head()