# Weighted accessibility scores from Corces bigwig files in Gfeller reference matrix regions
- Calculating weighted ATAC-seq scores per genomic region across samples

## Import required libraries

In [None]:
import subprocess
import os
import pandas as pd

## Define directories and load files

In [None]:
# Path to the original Gfeller regions BED file.
# It is passed to `bedtools intersect` as the `-b` input for every sample.
gfeller_bed = "/mnt/DATA3/daniel/project/01_ATAC_preprocessing/data/original_reference_regions.bed"

# Directory containing the per-sample BedGraph files (one `*.bedGraph` file per sample)
bedgraph_dir = "/mnt/DATA3/daniel/project/01_ATAC_preprocessing/data/corces/LUSC_merged_samples/"

# Output directory for the per-sample result CSVs (and temporary intersect files).
# exist_ok=True lets this cell be re-run without failing when the directory already exists.
output_dir = "/mnt/DATA3/daniel/project/01_ATAC_preprocessing/data/corces/LUSC_per_sample_scores/"
os.makedirs(output_dir, exist_ok=True)

In [None]:
# Helper function to check if a value is a valid float
def is_float(value):
    """Return True if ``value`` can be converted with ``float()``, else False.

    ``float()`` raises ``ValueError`` for non-numeric strings, ``TypeError``
    for unsupported types such as ``None`` or lists, and ``OverflowError`` for
    integers too large to represent. All three are reported as ``False`` so
    the helper never raises on bad input.
    """
    try:
        float(value)
    except (TypeError, ValueError, OverflowError):
        return False
    return True

## Process each sample

In [None]:
def _scan_overlap_lines(overlap_bed):
    """Validate the intersect output and report which lines are unusable.

    Prints one diagnostic per rejected line and returns a tuple
    ``(bad_lines, total_lines)`` where ``bad_lines`` holds the 0-based indices
    of lines that have fewer than 7 tab-separated fields, non-integer
    coordinates, or a non-numeric score.
    """
    bad_lines = []
    total_lines = 0
    with open(overlap_bed, 'r') as f:
        for i, line in enumerate(f, start=1):
            total_lines = i
            # Strip only the line terminator so an empty leading field cannot shift columns.
            fields = line.rstrip('\r\n').split('\t')
            if len(fields) < 7:
                print(f"Line {i} has insufficient columns: {line.strip()}")
                bad_lines.append(i - 1)
                continue
            _, start, end, score, _, gf_start, gf_end = fields[:7]
            # Region coordinates are parsed as int later on, so check them as well.
            if not all(coord.isdigit() for coord in (start, end, gf_start, gf_end)):
                print(f"Line {i} has invalid coordinates: {line.strip()}")
                bad_lines.append(i - 1)
                continue
            if not is_float(score):
                print(f"Line {i} has invalid score: {line.strip()}")
                bad_lines.append(i - 1)
    return bad_lines, total_lines


def _weighted_average_per_region(overlap_df):
    """Collapse interval/region overlaps into one weighted average score per region.

    For each region the result is ``sum(score * overlap_bp) / sum(overlap_bp)``,
    rounded to 4 decimals. The denominator is the number of overlapping bases,
    not the full region length.
    """
    # `bedtools intersect -wa -wb` reports the full original intervals, so the
    # actual intersection has to be recomputed from both pairs of coordinates.
    overlap_start = overlap_df[['BedGraph_Start', 'Gfeller_Start']].max(axis=1)
    overlap_end = overlap_df[['BedGraph_End', 'Gfeller_End']].min(axis=1)
    scored = overlap_df.assign(
        Overlap_Start=overlap_start,
        Overlap_End=overlap_end,
        Overlap_Length=overlap_end - overlap_start,
    )
    # .copy() makes the filtered frame independent, avoiding SettingWithCopyWarning below.
    scored = scored[scored['Overlap_Length'] > 0].copy()
    scored['Weighted_Score'] = scored['BedGraph_Score'] * scored['Overlap_Length']

    grouped = scored.groupby(['Gfeller_Chromosome', 'Gfeller_Start', 'Gfeller_End']).agg(
        total_weighted_score=('Weighted_Score', 'sum'),
        total_overlap_length=('Overlap_Length', 'sum')
    ).reset_index()

    grouped['Weighted_Average_Score'] = (
        grouped['total_weighted_score'] / grouped['total_overlap_length']
    ).round(4)

    return grouped[
        ['Gfeller_Chromosome', 'Gfeller_Start', 'Gfeller_End', 'Weighted_Average_Score']
    ].rename(columns={
        'Gfeller_Chromosome': 'Chromosome',
        'Gfeller_Start': 'Start',
        'Gfeller_End': 'End',
    })


def process_single_sample(sample_file, gfeller_bed, bedgraph_dir, output_dir):
    """Compute per-region weighted average scores for one BedGraph sample.

    Runs ``bedtools intersect`` between the sample's BedGraph file and the
    reference regions, computes an overlap-length-weighted average score for
    every region that is overlapped, and writes the result to
    ``<output_dir>/<sample_name>_weighted_scores.csv`` with the columns
    ``Chromosome, Start, End, Weighted_Average_Score``.

    Args:
        sample_file: File name (not path) of the BedGraph file inside ``bedgraph_dir``.
        gfeller_bed: Path to the BED file with the reference regions.
        bedgraph_dir: Directory that contains ``sample_file``.
        output_dir: Directory for the result CSV and the temporary intersect file.

    Returns:
        None. Progress and problems are reported with ``print``. Nothing is
        written when bedtools fails or no usable overlap lines exist.

    Notes:
        The intersect output is read as 7 columns: 4 from the BedGraph file
        (chrom, start, end, score) followed by the first 3 of the regions file.
        Extra columns in the regions file are ignored. The temporary
        ``<sample_name>_overlap.bed`` file is removed on every exit path.
    """
    sample_name = os.path.splitext(sample_file)[0]
    bedgraph_path = os.path.join(bedgraph_dir, sample_file)

    overlap_bed = os.path.join(output_dir, f"{sample_name}_overlap.bed")

    # Bedtools intersect command
    intersect_cmd = [
        "bedtools", "intersect",
        "-a", bedgraph_path,
        "-b", gfeller_bed,
        "-wa", "-wb"
    ]

    column_dtypes = {
        'BedGraph_Chromosome': str,
        'BedGraph_Start': int,
        'BedGraph_End': int,
        'BedGraph_Score': float,
        'Gfeller_Chromosome': str,
        'Gfeller_Start': int,
        'Gfeller_End': int
    }

    try:
        # Run bedtools intersect
        try:
            with open(overlap_bed, 'w') as out_bed:
                subprocess.run(intersect_cmd, stdout=out_bed, check=True)
        except subprocess.CalledProcessError as e:
            print(f"Error intersecting {sample_file}: {e}")
            return

        # Check if overlap file is empty
        if os.path.getsize(overlap_bed) == 0:
            print(f"No overlaps found for {sample_name}.")
            return

        # Validate lines; rejected lines are skipped when loading instead of
        # crashing the strict int/float parsing below.
        bad_lines, total_lines = _scan_overlap_lines(overlap_bed)
        if len(bad_lines) == total_lines:
            print(f"No valid overlap lines found for {sample_name}.")
            return

        # Read into a DataFrame
        overlap_df = pd.read_csv(
            overlap_bed,
            sep='\t',
            header=None,
            names=list(column_dtypes),
            # Without usecols, extra columns in the regions file would make
            # pandas treat the leading columns as an index and misalign fields.
            usecols=list(range(7)),
            skiprows=bad_lines,
            # Keep pandas' line numbering identical to the indices collected above.
            skip_blank_lines=False,
            dtype=column_dtypes
        )

        final_df = _weighted_average_per_region(overlap_df)

        # Save results for this sample
        sample_output_csv = os.path.join(output_dir, f"{sample_name}_weighted_scores.csv")
        final_df.to_csv(sample_output_csv, index=False)
        print(f"Saved per-region weighted average scores for {sample_name} to {sample_output_csv}")
    finally:
        # Clean up intermediate file on every exit path (success, early return, or error)
        if os.path.exists(overlap_bed):
            os.remove(overlap_bed)

In [None]:
# List all BedGraph files in the directory.
# os.listdir() returns entries in arbitrary order, so sort them to make the
# "first 10" selection below reproducible between runs and machines.
bedgraph_files = sorted(f for f in os.listdir(bedgraph_dir) if f.endswith('.bedGraph'))

# Filter for LUSC samples (assuming filenames start with LUSC)
lusc_files = [f for f in bedgraph_files if f.startswith('LUSC')]
lusc_files = lusc_files[:10]  # Take the first 10 LUSC samples (alphabetical order)

# Backward-compatible alias: the processing loop still refers to the old name,
# even though the selected samples are LUSC, not BRCA.
brca_files = lusc_files

print(f"Processing {len(lusc_files)} LUSC samples...")

In [None]:
# Run the per-sample pipeline on every selected BedGraph file, one after another
for sample_file in brca_files:
    process_single_sample(sample_file, gfeller_bed, bedgraph_dir, output_dir)