# Read counts in reference matrix regions from Gabriel et al. for hepatocytes

## Import required libraries

In [None]:
import pysam
import pandas as pd
import os
import glob

## Define directories

In [None]:
# Define file paths
bed_file = '/mnt/DATA3/daniel/project/01_ATAC_preprocessing/data/original_reference_regions.bed'
output_dir = '/mnt/DATA3/daniel/project/01_ATAC_preprocessing/data/subset_trimmed_data_reference_hepa/'
bam_dir = "/mnt/DATA3/daniel/project/01_ATAC_preprocessing/data/preprocessed_QC__trimmed_data/"

In [None]:
os.makedirs(output_dir, exist_ok=True)  

## Load reference matrix regions and pre-processed bam files

In [None]:
# Load regions from BED file
regions = pd.read_csv(bed_file, sep='\t', header=None, names=['chrom', 'start', 'end'])

In [None]:
print(regions.head())

In [None]:
# Get a list of all BAM files
bam_files = [f for f in glob.glob(os.path.join(bam_dir, "*.final_preprocessed.bam"))]

# Filter only the hepatocytes BAM files
hepatocytes_bam_files = [f for f in bam_files if "hepatocytes" in os.path.basename(f)]

# Print filtered hepatocytes BAM files
print("Hepatocytes BAM files:", hepatocytes_bam_files)

## Read counts in reference matrix regions

In [None]:
# Iterate over each hepatocytes BAM file and count reads per region
for bam_file in hepatocytes_bam_files:  
    sample_name = os.path.splitext(os.path.basename(bam_file))[0]
    print(f"Processing {sample_name}...")

    # Initialize list to store read counts for the current sample
    read_counts = []

    # Open the BAM file and count reads for each region
    with pysam.AlignmentFile(bam_file, "rb") as bam:
        for _, row in regions.iterrows():
            chrom, start, end = row['chrom'], int(row['start']), int(row['end'])
            read_counts.append(bam.count(chrom, start, end))

    # Add the sample's read counts to the DataFrame
    regions[sample_name] = read_counts

    # Define output file path
    output_file = os.path.join(output_dir, f"{sample_name}_read_counts.csv")

    # Save the updated DataFrame with the new column
    regions[['chrom', 'start', 'end', sample_name]].to_csv(output_file, index=False)

    print(f"Read counts saved for {sample_name} to {output_file}")

print("All hepatocytes samples processed.")
