## Identify and extract universally closed regions across all cell types in sc-zhang reference matrix
- Identifies regions with low accessibility across all cell types.
- Uses the Zhang scATAC-seq reference matrix.
- Keeps only the regions whose accessibility is at or below a threshold of 0.4 in every cell type.

## Import required libraries

In [None]:
import pandas as pd
import pickle
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler

## Load sc-zhang reference matrix

In [None]:
# Location of the formatted Zhang reference matrix (a pickled pandas object)
reference_matrix_path = '/mnt/DATA3/daniel/project/04_DA_and_reference_building/data/subset_by_celltype.pkl'

# NOTE: unpickling can execute arbitrary code, so only load files from a trusted source
subset = pd.read_pickle(reference_matrix_path)

In [None]:
# Preview the top five rows as a quick sanity check of the loaded matrix
subset.head(n=5)

## Explore summary statistics

In [None]:
# Get summary statistics for numerical columns.
# For numeric data, describe() reports count, mean, std, min, the quartiles
# and max per column; the result is kept in subset_stats for inspection.
subset_stats = subset.describe()

In [None]:
# Print the per-column summary statistics table held in subset_stats
print(subset_stats)

##  Identify regions with universally low accessibility

In [None]:
# Accessibility cutoff: a value counts as "low" when it does not exceed
# this number (adjust as needed)
low_threshold = 0.4

# Flag each row whose values are at or below the cutoff in ALL columns
# (cell types), then keep only the flagged rows.
# Missing values compare as False, so a row containing NaN is not kept.
is_low_everywhere = subset.le(low_threshold).all(axis=1)
low_accessibility_regions = subset.loc[is_low_everywhere]

In [None]:
# Print the rows that passed the low-accessibility filter
print(low_accessibility_regions)

## Extract genomic coordinates

In [None]:
# Parse the region identifiers stored in the index (e.g. "chr1:100-200")
# into separate 'chr', 'start' and 'end' columns.
# An anchored pattern with named groups is used instead of splitting on every
# ':' / '-' so that a chromosome/contig name which itself contains one of
# those characters still yields exactly three fields, and so that the three
# columns exist even when no region passed the filter.
region_ids = low_accessibility_regions.index.to_series()
regions = region_ids.str.extract(
    r"^(?P<chr>.+?)[:-](?P<start>\d+)[:-](?P<end>\d+)$"
)

# Fail early and name the offending identifiers, rather than letting the
# integer conversion below raise an opaque error on missing values.
unparsed = regions.isna().any(axis=1)
if unparsed.any():
    raise ValueError(
        "Could not parse region identifiers (expected 'chr:start-end'): "
        f"{region_ids[unparsed].head().tolist()}"
    )

# Convert start and end to integers
regions["start"] = regions["start"].astype(int)
regions["end"] = regions["end"].astype(int)

In [None]:
# Print the parsed chr/start/end table to verify the coordinate split
print(regions)

## Save as BED file

In [None]:
import os

# Define BED file path
bed_file_path = "/mnt/DATA3/daniel/project/02_cfDNA_preprocessing/data/02_split_chromosomes/universal_closed_regions/universal_closed_regions.bed"

# Make sure the output directory exists: to_csv does not create missing
# parent directories and would otherwise fail with an OSError.
os.makedirs(os.path.dirname(bed_file_path), exist_ok=True)

# Save as a BED file: tab-separated, no header row, and without the index
# (the original region identifier), so only the chr/start/end columns are written.
regions.to_csv(bed_file_path, sep="\t", index=False, header=False)

print(f"Saved BED file at: {bed_file_path}")