In [1]:
import os
import sys
import pandas as pd
import numpy as np
import glob
import time
import gget
import scipy
from scipy.sparse import csr_matrix
import anndata as an
import scanpy as sc
import matplotlib.pyplot as plt
import seaborn as sns
import networkx as nx
import random
from importlib import reload
import warnings
import ot
from scipy.spatial.distance import pdist, squareform

from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import MinMaxScaler

"""WARNING: no warnings"""
warnings.filterwarnings("ignore")

# local imports
import anndata_utils as anntools

source_path = os.path.abspath("../source/")
sys.path.append(source_path)
import centrality as central
import matrix
import utils as ut
import plotting as plt2

# Load data

In [2]:
population_path = "/scratch/indikar_root/indikar1/shared_data/higher_order/anndata/population_mESC_100000_features.h5ad"

# Time the read so the cost of loading is visible in the cell output.
start_time = time.time()
adata = sc.read_h5ad(population_path)
elapsed = time.time() - start_time
print(f"Time taken to read the file: {elapsed:.2f} seconds")

# free up some memory
for unused_key in ('gene_map', 'gdf'):
    del adata.uns[unused_key]

sc.logging.print_memory_usage()
adata

Time taken to read the file: 90.61 seconds
Memory usage: current 1.79 GB, difference +1.79 GB


AnnData object with n_obs × n_vars = 25681 × 2756467
    obs: 'bin_index', 'bin_start', 'bin_end', 'bin', 'chrom', 'chrom_bin', 'degree', 'genes', 'n_genes', 'ATACSeq_1', 'ATACSeq_2', 'ATACSeq_3', 'CTCF', 'H3K27ac', 'H3K27me3', 'RNA_1', 'RNA_2', 'RNA_3', 'RNA_4', 'RNA_5', 'RNA_6'
    var: 'read_index', 'basename', 'mean_mapq', 'median_mapq', 'n_chromosomes', 'order', 'n_bins', 'read_length_bp', 'genes', 'n_genes'
    uns: 'base_resolution', 'chrom_sizes', 'intervals'
    layers: 'H'

# QC

In [3]:
def find_outliers_iqr(df_column, lower_q=0.15, upper_q=0.85, k=1.5):
  """
  Flags outliers in a pandas Series using quantile-based fences.

  This is an IQR-style rule, but note that the default fences are built
  from the 15th and 85th percentiles rather than the classic quartiles
  (25th/75th). Pass ``lower_q=0.25, upper_q=0.75`` for the textbook IQR
  method.

  Args:
    df_column: A pandas Series representing the column to analyze.
    lower_q: Lower quantile (in [0, 1]) anchoring the lower fence.
    upper_q: Upper quantile (in [0, 1]) anchoring the upper fence.
    k: Multiple of the inter-quantile spread added beyond each quantile.

  Returns:
    A boolean Series (same index as ``df_column``) with True for values
    strictly below the lower fence or strictly above the upper fence.

  Raises:
    ValueError: If the quantiles do not satisfy 0 <= lower_q < upper_q <= 1.
  """
  if not 0 <= lower_q < upper_q <= 1:
    raise ValueError(
      f"expected 0 <= lower_q < upper_q <= 1, got {lower_q=} and {upper_q=}"
    )

  lower_value = df_column.quantile(lower_q)
  upper_value = df_column.quantile(upper_q)
  spread = upper_value - lower_value

  # Fences sit k spreads beyond the chosen quantiles.
  lower_bound = lower_value - k * spread
  upper_bound = upper_value + k * spread
  return (df_column < lower_bound) | (df_column > upper_bound)

# Flag bins whose degree falls outside the fences computed by find_outliers_iqr.
adata.obs['outlier'] = find_outliers_iqr(adata.obs['degree'])

# Build the flagged subset once, then report its size and preview it.
outlier_view = adata.obs.loc[adata.obs['outlier'], ['bin', 'degree', 'outlier']]
print(outlier_view.shape)
outlier_view.head()

(423, 3)


Unnamed: 0_level_0,bin,degree,outlier
bin_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
chr8:212,11613,1039,True
chr10:579,14526,832,True
chrX:1399,26032,144,True
chrX:1232,25865,96,True
chrX:1233,25866,71,True


In [4]:
# remove outliers
# Collect the obs names of every bin flagged in the 'outlier' column.
remove_bins = adata.obs_names[adata.obs['outlier'].to_numpy()].to_list()
print(f"Removing top {len(remove_bins)} outlier loci: ")
print(remove_bins[:10])

# Keep only the bins that were not flagged; copy so adata is not a view.
keep_mask = ~adata.obs_names.isin(remove_bins)
adata = adata[keep_mask, :].copy()

print('done!')

Removing top 423 outlier loci: 
['chr8:212', 'chr10:579', 'chrX:1399', 'chrX:1232', 'chrX:1233', 'chrX:1239', 'chr8:201', 'chrX:1241', 'chr18:31', 'chr8:211']
done!


# Add features

In [None]:
# add the principal singular value of the incidence matrix
H = adata.to_df().copy()
print(f"Raw: {H.shape=}")
# Transpose, drop duplicate rows, transpose back: removes duplicate columns.
H = H.T.drop_duplicates().T
print(f"De-duped: {H.shape=}")

# TruncatedSVD uses a randomized solver by default; fix the seed so that
# 'singular_vector_1' is reproducible across kernel restarts.
SVD_SEED = 0
svd = TruncatedSVD(n_components=1, n_iter=10, random_state=SVD_SEED)
adata.obs['singular_vector_1'] = ut.min_max(svd.fit_transform(H))

# hypergraph centralities
# Each entry maps an output obs column name to the arguments passed to
# central.nonlinear_eigenvector_centrality. Weighted variants use
# 1 / (signal + 1), aligned to the rows of H.
hge_functions = {
    'hge_logexp_unweighted' : {
        'function' : 'log-exp',
        'weights' : None,
    },
    'hge_logexp_RNA_weighted' : {
        'function' : 'log-exp',
        'weights' : 1 / (adata.obs.loc[H.index, 'RNA_2'].values + 1)
    },
    'hge_logexp_ATAC_weighted' : {
        'function' : 'log-exp',
        'weights' : 1 / (adata.obs.loc[H.index, 'ATACSeq_1'].values + 1)
    },
}


# Names of the centrality columns added to adata.obs, in computation order.
hge_centralities = []

for label, d in hge_functions.items():
    print(f"Starting {label}....")
    start_time = time.time()  # Record start time
    # Only the node-level result is stored; the edge-level result is unused here.
    node, edge = central.nonlinear_eigenvector_centrality(
        H,
        function=d['function'],
        node_weights=d['weights'],
    )

    hge_centralities.append(label)
    adata.obs[label] = ut.min_max(node)

    end_time = time.time()  # Record end time
    print(f"{label} calculation took: {end_time - start_time:.2f} seconds")

adata

# Feature X

In [None]:
# Display the column names currently present in adata.obs.
adata.obs.columns

In [None]:
# NOTE(review): a bare `break` outside a loop is a SyntaxError, so this cell
# always fails. Presumably a deliberate stop so "Run All" halts here — confirm,
# and remove before sharing the notebook.
break

In [None]:
# Columns of adata.obs bundled into a single feature table, in display order.
features = (
    ['degree', 'n_genes']
    + [f'ATACSeq_{rep}' for rep in (1, 2, 3)]
    + ['CTCF', 'H3K27ac', 'H3K27me3']
    + [f'RNA_{rep}' for rep in range(1, 7)]
)

# Store the selected columns under obsm so they travel with the AnnData object.
adata.obsm['X_feature'] = adata.obs.loc[:, features]
adata

In [None]:
# NOTE(review): a bare `break` outside a loop is a SyntaxError, so this cell
# always fails. Presumably a deliberate stop so "Run All" does not reach the
# cells below — confirm, and remove before sharing the notebook.
break

# Clique-expansion

In [None]:
# Run the project-local helper from the `matrix` module on `adata`.
# The return value is discarded, so this presumably updates `adata` in place
# — TODO confirm against matrix.expand_and_normalize_anndata.
matrix.expand_and_normalize_anndata(adata)
adata