# Identification and extraction of universal open chromatin regions from raw ATAC-seq counts
- Loads a raw ATAC-seq count matrix from Gabriel et al.
- Computes the median accessibility for each region
- Keeps regions whose median accessibility is in the top 10% and in which every sample has at least 30 raw counts
- Extracts chromosome, start, and end positions from region names
- Saves the resulting open chromatin regions as a BED file

## Import required libraries

In [None]:
import pandas as pd

## Load raw ATAC-seq count matrix

In [None]:
# Path to the tab-separated raw ATAC-seq count matrix used as input
raw_counts_file = "/mnt/DATA3/daniel/project/01_ATAC_preprocessing/data/gabriel/markers_identification_input_files/raw_counts.txt"

In [None]:
# Read the tab-delimited count matrix: the first row holds the column names
# and the first column (region names) becomes the index.
df_raw_counts = pd.read_table(
    raw_counts_file,
    header=0,
    index_col=0,
)

In [None]:
# Preview the first five rows of the loaded count matrix
print(df_raw_counts.head(n=5))

## Compute median accessibility per region and define filter for universal open regions

In [None]:
# Row-wise median across the columns of the count matrix, stored on the frame itself
df_raw_counts["median_accessibility"] = df_raw_counts.median(axis=1)

# Cut-off for the top 10% of regions: 90th percentile of the per-region medians
median_threshold = df_raw_counts["median_accessibility"].quantile(0.90)

# Minimum raw count that every column of a row must reach for the region to be kept
high_threshold = 30

# Condition 1: the region's median accessibility lies in the top decile
in_top_decile = df_raw_counts["median_accessibility"] >= median_threshold

# Condition 2: the smallest value in the row reaches the count threshold.
# NOTE: the row minimum is taken over the whole frame, including the median column
# added above; a median is never below the smallest sample value, so the result
# equals the minimum over the sample columns alone.
open_in_all_samples = df_raw_counts.min(axis=1) >= high_threshold

# Keep only regions satisfying both conditions
high_accessibility_regions = df_raw_counts[in_top_decile & open_in_all_samples]

In [None]:
# Show the regions that passed both accessibility filters
print(high_accessibility_regions)

## Extract genomic coordinates from region index

In [None]:
# Parse region names of the form "<chr><sep><start><sep><end>" (separator ':' or '-')
# into 'chr', 'start' and 'end' columns.
# An anchored pattern with named groups is used instead of splitting on every
# ':' / '-' so that:
#   * an empty selection still yields an (empty) three-column frame instead of
#     failing when the column names are assigned, and
#   * contig names that themselves contain ':' or '-' still produce exactly
#     three fields (the greedy first group absorbs the extra separators).
region_names = high_accessibility_regions.index.to_series()
regions = region_names.str.extract(
    r"^(?P<chr>.+)[:-](?P<start>\d+)[:-](?P<end>\d+)$"
)

# Fail loudly on names that do not follow the expected layout rather than
# carrying incomplete rows forward.
unparsed = regions.isna().any(axis=1)
if unparsed.any():
    raise ValueError(
        f"Could not parse {int(unparsed.sum())} region name(s), "
        f"e.g. {region_names[unparsed].iloc[0]!r}"
    )

# Convert start and end to integers
regions["start"] = regions["start"].astype(int)
regions["end"] = regions["end"].astype(int)

In [None]:
# Show the parsed chr / start / end table
print(regions)

## Save filtered regions as BED file

In [None]:
# Destination of the BED file holding the universal open regions
bed_file_path = "/mnt/DATA3/daniel/project/02_cfDNA_preprocessing/data/02_split_chromosomes/universal_open_regions/universal_accessible_regions.bed"

# Write the three coordinate columns tab-delimited, with neither a header row
# nor the region-name index.
regions.to_csv(
    bed_file_path,
    header=False,
    index=False,
    sep="\t",
)

# Confirm where the file was written
print(f"Saved BED file at: {bed_file_path}")