# Count reads in new cell-type marker regions

## Import required libraries

In [None]:
import pysam
import pandas as pd
import os
import glob

## Define directories

In [None]:
# Input/output locations used by the rest of the notebook.
# bed_file   : BED file listing the cell-type marker regions to count reads in.
# output_dir : directory where the per-sample read-count CSVs are written.
# bam_dir    : directory searched for the pre-processed BAM files.
bed_file = '/mnt/DATA3/daniel/project/01_ATAC_preprocessing/data/new_pairwise_cell_types_markers.bed'
output_dir = '/mnt/DATA3/daniel/project/01_ATAC_preprocessing/data/subset_trimmed_data_pairwise_markers/'
# The original literal opened with a double quote and closed with a single
# quote, which is a syntax error; both delimiters are now single quotes.
bam_dir = '/mnt/DATA3/daniel/project/01_ATAC_preprocessing/data/preprocessed_QC__trimmed_data/'

In [None]:
# Make sure the output directory exists before any CSV is written;
# exist_ok=True makes this a no-op when the directory is already there.
os.makedirs(name=output_dir, exist_ok=True)

## Load marker regions and pre-processed bam files

In [None]:
# Load the marker regions from the BED file once; every sample reuses them.
# - usecols=[0, 1, 2]: keep only the first three fields (chrom, start, end).
#   Without it, a BED file carrying extra fields (name, score, strand, ...)
#   has more columns than `names`, and pandas then treats the leading
#   columns as the index, shifting the data under the wrong labels.
# - dtype={'chrom': str}: keep contig names as strings even when they are all
#   numeric (e.g. "1", "2"), since they are later passed to pysam as
#   contig names.
regions = pd.read_csv(
    bed_file,
    sep='\t',
    header=None,
    names=['chrom', 'start', 'end'],
    usecols=[0, 1, 2],
    dtype={'chrom': str},
)

In [None]:
# Collect every BAM file in bam_dir whose name matches
# "*.final_preprocessed.bam" (note the dot before "final": this is the pattern
# the code actually uses).
# glob returns paths in arbitrary order, so sort them to make the processing
# order reproducible between runs.
bam_files = sorted(glob.glob(os.path.join(bam_dir, "*.final_preprocessed.bam")))

# Print filtered BAM files
print("BAM files to be processed:", bam_files)

## Read counts in cell-type marker regions

In [None]:
# Count the reads overlapping each marker region, one BAM file at a time, and
# write one CSV per sample (chrom, start, end, <sample_name>) into output_dir.

# The intervals are identical for every sample, so extract them once instead of
# re-running iterrows() per BAM file over a DataFrame that gains one column per
# sample. Contig names are coerced to str and coordinates to plain ints before
# being handed to pysam.
intervals = [
    (str(chrom), int(start), int(end))
    for chrom, start, end in zip(regions['chrom'], regions['start'], regions['end'])
]

for bam_file in bam_files:
    # Sample name = file name without its final extension (".bam").
    sample_name = os.path.splitext(os.path.basename(bam_file))[0]
    print(f"Processing {sample_name}...")

    # Open the BAM file and count reads for each region.
    # NOTE(review): region-based counting presumably requires an index file
    # next to each BAM — confirm the inputs are indexed.
    with pysam.AlignmentFile(bam_file, "rb") as bam:
        read_counts = [bam.count(chrom, start, end) for chrom, start, end in intervals]

    # Add the sample's read counts to the shared DataFrame (one new column per
    # sample), then save only this sample's column alongside the coordinates.
    regions[sample_name] = read_counts
    output_file = os.path.join(output_dir, f"{sample_name}_read_counts.csv")
    regions[['chrom', 'start', 'end', sample_name]].to_csv(output_file, index=False)

    print(f"Read counts saved for {sample_name} to {output_file}")

print("All samples processed.")