# Feature Classification


- Merging criteria control: the merging criteria should use the reference location to decide whether features should be merged.
    I adapted the merging criteria so that they refer to the position on the reference (instead of only to adjacent features).

In [1]:
import os
from Bio import SeqIO
import matplotlib.pyplot as plt

from collections import defaultdict
from Bio.SeqFeature import SeqFeature, FeatureLocation

## 1. Reference Fasta Parsing 

In [2]:
def parse_reference_fasta(ref_fasta):
    """
    Map every record id in a reference FASTA to the length of its sequence.

    When both a "Transgene" and a "3UTR" record are present, an extra
    "Insert" entry holding the sum of their lengths is added. Otherwise a
    warning is printed and no "Insert" entry is created.

    Parameters:
      - ref_fasta: path (or handle) passed straight to SeqIO.parse(..., "fasta")

    Return:
      - dict of {record id: sequence length}, plus "Insert" when computable
    """
    lengths_by_id = {
        entry.id: len(entry.seq) for entry in SeqIO.parse(ref_fasta, "fasta")
    }

    # The combined insert length needs both of its components.
    insert_parts = ("Transgene", "3UTR")
    if all(part in lengths_by_id for part in insert_parts):
        lengths_by_id["Insert"] = lengths_by_id["Transgene"] + lengths_by_id["3UTR"]
    else:
        print("Warning: Missing feature lengths for Transgene and/or 3UTR.")

    return lengths_by_id

# Reference FASTA holding one record per annotated feature.
ref_fasta_path = "00-Data/references/ref_28s_features.fasta"
# Module-level lookup of feature name -> reference length; it is read as a
# global by merge_all_adjacent_features further down in this notebook.
ref_lengths = parse_reference_fasta(ref_fasta_path)
# NOTE: a bare expression that is not the last line of the cell is not displayed.
ref_lengths
# Raises KeyError if the reference FASTA has no record with id "Transgene".
ref_Transgene_len = ref_lengths["Transgene"]

## 2. RepeatMasker Reference Location Parser

Get the location of each read-feature pair relative to the reference/database.

In [3]:
# Reference coordinates of every RepeatMasker hit, stored as
# (read_id, feature_type) -> [(ref_start, ref_end, strand), ...]
read_ref_positions = defaultdict(list)

# `with` closes the file handle even if a row fails to parse.
with open("00-Data/repeatmaskered/sample-cutadapt_sup-filtered.fasta.out.xm") as xm_handle:
    for raw_line in xm_handle:
        # Columns are whitespace separated.
        fields = raw_line.split()

        # Blank or truncated rows cannot carry the columns read below.
        if len(fields) < 12:
            continue

        read_id = fields[4]
        strand = fields[8]
        # Column 10 looks like "<feature>#<class>"; only the feature name is kept.
        feature_type = fields[9].split("#")[0]

        # Pick start/end depending on strandedness: complement ("C") rows
        # take the start from column 13 and the end from column 12.
        if strand == "+":
            start = int(fields[10])
            end = int(fields[11])
        elif strand == "C" and len(fields) > 12:
            start = int(fields[12])
            end = int(fields[11])
        else:
            # Unknown strand marker: skip the row instead of silently reusing
            # the coordinates parsed from the previous row.
            continue

        read_ref_positions[(read_id, feature_type)].append((start, end, strand))

# Spot check for one read (note: indexing a defaultdict creates the key if absent).
read_ref_positions[("e6a618ab-0e63-4525-b142-63ee327895e0", "Transgene")]

[(1123, 1402, 'C'), (1, 1362, 'C')]

## 3. Parse Genbank File 

In [4]:

##################################################
# 3. PARSE GENBANK READS
##################################################

def get_feature_info(read_record, allowed_gap=10):
    """
    Summarise the annotated features of one read.

    Parameters:
      - read_record: object with a `features` iterable; every feature must
        expose `type`, `strand`, `location.start` and `location.end`
        (e.g. a record parsed from GenBank).
      - allowed_gap: two Transgene/3UTR features are treated as one contiguous
        insertion when the next one starts at most this many bases after the
        end of the current insertion segment.

    Return:
      - feature_dict: defaultdict(list) mapping feature type to a list of
        {"start", "end", "length", "strand"} dicts, one per occurrence
      - max_insertion_len: length of the longest contiguous Transgene+3UTR
        stretch (other feature types are ignored); 0 if none is present
      - longest_inter_gap: largest distance between the end of the sequence
        covered so far and the start of the next feature (a single number,
        not a list). 0 with fewer than two features; can be negative when
        every feature overlaps its predecessor.
    """
    # Per-type bookkeeping plus a flat list used for the positional scans below.
    feature_dict = defaultdict(list)
    feature_positions = []

    for feat in read_record.features:
        # E.g., "3end_200nt", "5end_200nt", "3UTR", "Transgene", etc.
        start = int(feat.location.start)
        end = int(feat.location.end)
        feature_dict[feat.type].append({
            "start": start,
            "end": end,
            "length": end - start,
            "strand": feat.strand,
        })
        feature_positions.append((start, end, feat.type))

    # Both scans below walk the features in read order.
    feature_positions.sort(key=lambda position: position[0])

    # --- Longest contiguous "Transgene + 3UTR" region -----------------------
    valid_types = {"Transgene", "3UTR"}  # Adjust if your naming differs
    longest_contig_length = 0
    current_start, current_end = None, None

    for start, end, ftype in feature_positions:
        # Ignore features that are not part of the insertion region
        if ftype not in valid_types:
            continue

        # The first insertion feature opens the first segment
        if current_start is None:
            current_start, current_end = start, end
            continue

        # A feature lying entirely inside the current segment adds nothing
        if start >= current_start and end <= current_end:
            continue

        if start <= current_end + allowed_gap:
            # Close enough: extend the current segment
            current_end = max(current_end, end)
        else:
            # Too far away: record the finished segment and open a new one
            longest_contig_length = max(longest_contig_length, current_end - current_start)
            current_start, current_end = start, end

    if current_start is None:
        max_insertion_len = 0
    else:
        # The last open segment has not been compared yet
        max_insertion_len = max(longest_contig_length, current_end - current_start)

    # --- Inter-feature gaps --------------------------------------------------
    # Measure each gap from the furthest end seen so far, not from the end of
    # the immediately preceding feature: otherwise a short feature nested in a
    # longer one makes the already-annotated remainder of the longer feature
    # look like an unannotated gap.
    longest_inter_gap = 0
    if len(feature_positions) > 1:
        inter_gaps = []
        covered_end = feature_positions[0][1]
        for start, end, _ in feature_positions[1:]:
            inter_gaps.append(start - covered_end)
            covered_end = max(covered_end, end)
        longest_inter_gap = max(inter_gaps)

    return feature_dict, max_insertion_len, longest_inter_gap

# Test run the function: print the per-read summary returned by
# get_feature_info for every record in the annotated GenBank file.
gb_file = "00-Data/feature_annotated/Repeatmasker_transgene_filtered.gb"
for read_record in SeqIO.parse(gb_file, "genbank"):
    feature_dict, max_insertion_len, max_inter_gap = get_feature_info(read_record)
    print(f"Read {read_record.id}:")    
    print(f"  Feature lengths: {feature_dict}")
    print(f"  Insertion length: {max_insertion_len}")
    print(f"  Longest inter-feature gap: {max_inter_gap}")
    # break  (uncomment to inspect only the first read)

Read 0702e822-673e-4cf2-a935-b9c8fe1fff04:
  Feature lengths: defaultdict(<class 'list'>, {'5end_400nt': [{'start': 0, 'end': 205, 'length': 205, 'strand': 1}], '5end_200nt': [{'start': 205, 'end': 405, 'length': 200, 'strand': 1}], 'Transgene': [{'start': 405, 'end': 1811, 'length': 1406, 'strand': 1}], '3UTR': [{'start': 1811, 'end': 1912, 'length': 101, 'strand': 1}], '3end_200nt': [{'start': 1912, 'end': 2112, 'length': 200, 'strand': 1}], '3end_400nt': [{'start': 2112, 'end': 2311, 'length': 199, 'strand': 1}]})
  Insertion length: 1507
  Longest inter-feature gap: 0
Read cbfe3b25-179c-40c2-9f86-08dd1038c4ae:
  Feature lengths: defaultdict(<class 'list'>, {'3end_400nt': [{'start': 2103, 'end': 2304, 'length': 201, 'strand': 1}], '3end_200nt': [{'start': 1903, 'end': 2103, 'length': 200, 'strand': 1}], '3UTR': [{'start': 1803, 'end': 1903, 'length': 100, 'strand': 1}], 'Transgene': [{'start': 401, 'end': 1803, 'length': 1402, 'strand': 1}], '5end_200nt': [{'start': 201, 'end': 401,

## Helper Function 

## 4. Classification Logic

In [5]:

##################################################
# 4. CLASSIFICATION LOGIC
##################################################

def classify_read(
    read_id,
    feature_dict,
    max_insertion_len,
    max_inter_gap,
    ref_lengths,
    random_integration_threshold=50,
    indels=10,
    min_gap_distance=20,
    ref_positions=None,
):
    """
    Assign a read to a category and return it as (number, label).

    Categories produced by the current rules, checked in this order:
      7) RANDOM_INSERT         - no feature type occurs twice and the longest
                                 inter-feature gap is >= random_integration_threshold
      2) FivePrimeTruncations  - any other read without a duplicated feature type
      4) SNAPBACK              - a duplicated feature type occurs on both strands
      5) Jumps                 - a duplicated Transgene/3UTR whose copies are close
                                 together on the read and whose reference
                                 intervals do not overlap
      6) DUPLICATION_ROI       - every remaining read (it has a duplicated feature)

    Category 1 (COMPLETE) is not produced: its rule is disabled, so complete
    integrations currently end up in category 2. Categories 3 and 8 are never
    returned either.

    Parameters:
      - read_id: id used to look up reference coordinates in `ref_positions`
      - feature_dict: feature type -> list of {"start", "end", "length", "strand"}
        dicts, as returned by get_feature_info
      - max_inter_gap: longest inter-feature gap of the read
      - random_integration_threshold: gap size (bases) used both for the
        RANDOM_INSERT rule and for the proximity test of the Jumps rule
      - max_insertion_len, ref_lengths, indels, min_gap_distance: accepted for
        interface compatibility; not used by the currently active rules
      - ref_positions: mapping (read_id, feature type) -> [(start, end, strand), ...]
        with reference coordinates. Defaults to the module-level
        `read_ref_positions`. The mapping is only read, never modified.

    Return:
      - (category number, category label)
    """
    if ref_positions is None:
        # Fall back to the table built earlier in the notebook.
        ref_positions = read_ref_positions

    # A feature type is "duplicated" when it is annotated at least twice.
    any_feature_duplicated = any(len(segments) >= 2 for segments in feature_dict.values())

    if not any_feature_duplicated:
        # [TYPE] 7. Random insertion: large non-annotated stretch
        if max_inter_gap >= random_integration_threshold:
            return 7, "RANDOM_INSERT"
        # [TYPE] 2. FivePrimeTruncations. The length-based COMPLETE rule
        # (insertion length within +/- indels of the reference insert) is
        # disabled, so all remaining non-duplicated reads land here.
        return 2, "FivePrimeTruncations"

    # [TYPE] 4. Snapback: a duplicated feature type present in both orientations
    for segments in feature_dict.values():
        if len(segments) <= 1:
            continue
        strands = {segment["strand"] for segment in segments}
        if 1 in strands and -1 in strands:
            return 4, "SNAPBACK"

    # [TYPE] 5. Jumps within the insert
    for ftype in ("Transgene", "3UTR"):
        segments = feature_dict.get(ftype, [])
        if len(segments) <= 1:
            continue

        # Copies must follow each other closely on the read.
        by_read_start = sorted(segments, key=lambda segment: segment["start"])
        duplicate_in_proximity = all(
            later["start"] - earlier["end"] < random_integration_threshold
            for earlier, later in zip(by_read_start, by_read_start[1:])
        )

        # Reference intervals of the same feature type must not overlap.
        # `.get` + `sorted` leave the shared lookup table untouched (indexing a
        # defaultdict would insert an empty entry, `.sort()` would reorder it).
        # NOTE(review): when no reference hits are found for this read id the
        # test below is vacuously true; confirm that GenBank record ids and
        # RepeatMasker read ids really match (e.g. no "_trimmed" suffix on one side).
        hits = sorted(ref_positions.get((read_id, ftype), ()), key=lambda hit: hit[0])
        has_no_overlap = all(
            later[0] >= earlier[1] for earlier, later in zip(hits, hits[1:])
        )

        if has_no_overlap and duplicate_in_proximity:
            return 5, "Jumps"

    # [TYPE] 6. Duplication ROI: only reads with a duplicated feature reach
    # this point, so no further condition is needed.
    # TODO CHECK if this should additionally require overlapping reference hits
    return 6, "DUPLICATION_ROI"


# Spot check: classify one specific read and stop after it has been found.
gb_file = "00-Data/feature_annotated/Repeatmasker_transgene_filtered.gb"
for read_record in SeqIO.parse(gb_file, "genbank"):
    # Skip every record except the read of interest.
    if read_record.id != "e6a618ab-0e63-4525-b142-63ee327895e0_trimmed":
        continue
    print(read_record.id)
    feature_dict, max_insertion_len, max_inter_gap = get_feature_info(read_record)
    # classify_read returns a (category number, category label) tuple.
    print(classify_read(read_record.id, feature_dict, max_insertion_len, max_inter_gap, ref_lengths))
    break

## Main Pipeline 

In [6]:
import matplotlib.pyplot as plt
from Bio import SeqIO
from collections import defaultdict

def main(ref_fasta, gb_file, out_gb="reads_categorized.gb", indels=20,
         plot_path="00-Data/read_category.png"):
    """
    Classify every read of a GenBank file, write the relabelled records and
    report per-category counts.

    Parameters:
      - ref_fasta: reference FASTA passed to parse_reference_fasta; it must
        contain a "Transgene" record
      - gb_file: GenBank file with the feature-annotated reads
      - out_gb: path of the GenBank file to write; records are ordered by
        category number and their id/name are prefixed with the category label
      - indels: tolerance (bases) forwarded to classify_read and used for the
        Transgene length tally
      - plot_path: where the bar chart of reads per category is saved

    Return:
      - defaultdict mapping category number -> list of classified records

    Side effects: writes `out_gb` and `plot_path`, prints a summary, and
    rewrites id, name and description of every record it keeps.
    """
    # 0) Counters
    # category label -> {"within": n, "less": n}
    transgene_counter_category = defaultdict(lambda: defaultdict(int))
    # Must exist before the loop: the discard branch below increments it.
    discarded_counter = 0

    # 1) Get reference feature lengths
    ref_lengths = parse_reference_fasta(ref_fasta)
    ref_Transgene_len = ref_lengths["Transgene"]

    # 2) Prepare category structure & counters
    categorized_dict = defaultdict(list)  # category_num -> list of SeqRecords
    category_names = {
        1: "COMPLETE",
        2: "FivePrimeTruncations",
        3: "28S_DEL_5END",
        4: "SNAPBACK",
        5: "Jumps",
        6: "DUPLICATION_ROI",
        7: "RANDOM_INSERT",
        8: "OTHER",
    }

    # 3) SINGLE PASS: classify each read, skip discarded, count only if not discarded
    for read_record in SeqIO.parse(gb_file, "genbank"):
        # a) Get feature info
        feature_dict, max_insertion_len, max_inter_gap = get_feature_info(read_record)

        # b) Classify read
        category_num, category_str = classify_read(
            read_record.id,
            feature_dict,
            max_insertion_len,
            max_inter_gap,
            ref_lengths,
            indels=indels
        )

        # c) Check if discarded (-1 is the reserved "discard" category)
        if category_num == -1:
            discarded_counter += 1
            continue  # do not count anything for this read

        # d) Transgene counting, based on the longest Transgene annotation
        max_Transgene_len = 0
        if "Transgene" in feature_dict:
            max_Transgene_len = max(seg["length"] for seg in feature_dict["Transgene"])

        # Reads whose Transgene is longer than the reference + indels are
        # deliberately left out of both tallies.
        if (ref_Transgene_len - indels <= max_Transgene_len <= ref_Transgene_len + indels):
            transgene_counter_category[category_str]["within"] += 1
        elif max_Transgene_len < (ref_Transgene_len - indels):
            transgene_counter_category[category_str]["less"] += 1

        # e) Label read ID and store in category
        read_record.id = f"{category_names[category_num]}-{read_record.id}"
        read_record.name = read_record.id
        read_record.description = f"Category {category_num}: {category_str}"

        categorized_dict[category_num].append(read_record)

    # 4) Build final list of reads, sorted by category
    all_categorized_reads = []
    for cnum in sorted(categorized_dict.keys()):
        all_categorized_reads.extend(categorized_dict[cnum])

    # 5) Write out the final GenBank
    SeqIO.write(all_categorized_reads, out_gb, "genbank")

    # 6) Make a bar chart over the fixed categories 1..8 (empty ones included)
    category_counts = [
        (cnum, len(categorized_dict.get(cnum, []))) for cnum in range(1, 9)
    ]
    labels = [f"{cnum}:{category_names[cnum]}" for cnum, _ in category_counts]
    counts = [count for _, count in category_counts]

    plt.figure(figsize=(10,6))
    bars = plt.bar(labels, counts)
    plt.xticks(rotation=45, ha="right")
    plt.ylabel("Number of Reads")
    plt.title("Reads per Category")
    for bar, count in zip(bars, counts):
        # Label with the integer count itself rather than the bar's height value.
        plt.text(bar.get_x() + bar.get_width()/2, bar.get_height(), str(count),
                 ha='center', va='bottom', fontsize=9)
    plt.tight_layout()
    plt.savefig(plot_path, dpi=150)
    plt.close()

    # 7) Print summary
    print(f"Wrote categorized GenBank to: {out_gb}")
    print("Category counts:")
    for cnum, cnt in category_counts:
        print(f"  {cnum} ({category_names[cnum]}): {cnt}")
    if discarded_counter:
        print(f"Discarded reads: {discarded_counter}")

    print(f"\n=== Transgene Counting Results ===")
    print(f"Total reads where max_Transgene_len is within ±{indels}: {sum(x['within'] for x in transgene_counter_category.values())}")
    print(f"Total reads where max_Transgene_len < (ref_Transgene_len - {indels}): {sum(x['less'] for x in transgene_counter_category.values())}")
    print()
    for category, counts in transgene_counter_category.items():
        print(f"Category {category}:")
        print(f"  Number of reads where max_Transgene_len is within ±{indels}: {counts['within']}")
        print(f"  Number of reads where max_Transgene_len < (ref_Transgene_len - {indels}): {counts['less']}")
        print()

    return categorized_dict

# Run the classification pipeline on the filtered, feature-annotated reads.
if __name__ == "__main__":
    ref_fasta = "00-Data/references/ref_28s_features.fasta"
    gb_file   = "00-Data/feature_annotated/Repeatmasker_transgene_filtered.gb"
    out_gb    = "00-Data/feature_annotated/Repeatmasker_transgene_classified.gb"

    # Overrides main()'s default tolerance of 20.
    indels= 10

    categorized_dict = main(ref_fasta, gb_file, out_gb, indels=indels)


Wrote categorized GenBank to: 00-Data/feature_annotated/Repeatmasker_transgene_classified.gb
Category counts:
  1 (COMPLETE): 0
  2 (FivePrimeTruncations): 755
  3 (28S_DEL_5END): 0
  4 (SNAPBACK): 3
  5 (Jumps): 60
  6 (DUPLICATION_ROI): 25
  7 (RANDOM_INSERT): 3
  8 (OTHER): 0

=== Transgene Counting Results ===
Total reads where max_Transgene_len is within ±10: 488
Total reads where max_Transgene_len < (ref_Transgene_len - 10): 353

Category FivePrimeTruncations:
  Number of reads where max_Transgene_len is within ±10: 482
  Number of reads where max_Transgene_len < (ref_Transgene_len - 10): 268

Category Jumps:
  Number of reads where max_Transgene_len is within ±10: 0
  Number of reads where max_Transgene_len < (ref_Transgene_len - 10): 60

Category DUPLICATION_ROI:
  Number of reads where max_Transgene_len is within ±10: 4
  Number of reads where max_Transgene_len < (ref_Transgene_len - 10): 21

Category RANDOM_INSERT:
  Number of reads where max_Transgene_len is within ±10: 0
  



## Merge Flanking Regions

In [7]:

def combine_qualifiers(feature_list):
    """
    Pool the qualifiers of several features into one new dict.

    For every qualifier key the values of all features are concatenated in
    feature order (duplicates are kept). The input features' own qualifier
    lists are not modified.
    """
    pooled = {}
    for feature in feature_list:
        for key, values in feature.qualifiers.items():
            # A fresh list per key, so the source lists are never shared.
            pooled.setdefault(key, []).extend(values)
    return pooled

def merge_all_adjacent_features(feature_list, feature_types, new_feature_type,
                                reference_lengths=None):
    """
    Merge neighbouring end features and relabel all of them.

    The features are walked in order of their start on the read. The current
    feature absorbs the next one only if all of the following hold:
      - the current feature has type feature_types[0] and the next one has
        type feature_types[1] (the order matters),
      - the next feature starts at most one base after the current one ends,
      - on the reference, the current feature ends at its full reference
        length and the next one starts at reference position 0
        (qualifiers "ref_position_end" / "ref_position_start").
    A merged feature spans earliest start to latest end, keeps the strand of
    the current feature and receives the pooled qualifiers of its members.
    Every returned feature, merged or not, gets type `new_feature_type`.

    Parameters:
      - feature_list: features to process; the list itself is not reordered,
        but the feature objects are updated in place
      - feature_types: (first type, second type) pair that may be merged
      - new_feature_type: type assigned to every returned feature
      - reference_lengths: feature type -> reference length. Defaults to the
        module-level `ref_lengths`.

    Return:
      - list of features in read order (empty for empty input)
    """
    if not feature_list:
        return []  # no features of this type

    def _adjacent_on_reference(left, right):
        # Resolved lazily so the lookup table and the qualifiers are only
        # needed for pairs that are actually merge candidates.
        lengths = ref_lengths if reference_lengths is None else reference_lengths
        # NOTE(review): this expects "ref_position_start" to be 0-based;
        # confirm against how the qualifiers are written.
        return (
            int(left.qualifiers["ref_position_end"][0]) == lengths[left.type]
            and int(right.qualifiers["ref_position_start"][0]) == 0
        )

    # Work on a sorted copy; the caller's list keeps its order.
    ordered = sorted(feature_list, key=lambda feat: int(feat.location.start))

    merged = []
    current_feat = ordered[0]
    overlapping_feats = [current_feat]

    for f in ordered[1:]:
        start_cur = int(current_feat.location.start)
        end_cur = int(current_feat.location.end)
        start_f = int(f.location.start)
        end_f = int(f.location.end)

        can_merge = (
            current_feat.type == feature_types[0]
            and f.type == feature_types[1]
            and start_f <= end_cur + 1
            and _adjacent_on_reference(current_feat, f)
        )

        if can_merge:
            # Grow the current feature over the absorbed one.
            current_feat.location = FeatureLocation(
                min(start_cur, start_f),
                max(end_cur, end_f),
                strand=current_feat.location.strand,
            )
            # Keep track for qualifiers merging
            overlapping_feats.append(f)
        else:
            # Finalize the current group, then start a new one with f.
            current_feat.qualifiers = combine_qualifiers(overlapping_feats)
            current_feat.type = new_feature_type
            merged.append(current_feat)

            current_feat = f
            overlapping_feats = [f]

    # Final flush: the last group is still open.
    current_feat.qualifiers = combine_qualifiers(overlapping_feats)
    current_feat.type = new_feature_type
    merged.append(current_feat)

    return merged

def merge_ends_in_record(record):
    """
    Collapse the 3' and 5' end annotations of one record.

    1. 3end_200nt/3end_400nt features are handed to
       merge_all_adjacent_features and come back typed "3end";
       5end_400nt/5end_200nt features come back typed "5end".
    2. The record's feature list is rebuilt as: merged 5' ends, all other
       features in their original order, merged 3' ends.
    Returns the same record object, updated in place.
    """
    # The order inside each pair is significant: only a feature of the first
    # type followed by one of the second type is eligible for merging.
    feature_types_3end = ("3end_200nt", "3end_400nt")
    feature_types_5end = ("5end_400nt", "5end_200nt")

    buckets = {"3end": [], "5end": [], "other": []}
    for feature in record.features:
        if feature.type in feature_types_3end:
            bucket = "3end"
        elif feature.type in feature_types_5end:
            bucket = "5end"
        else:
            bucket = "other"
        buckets[bucket].append(feature)

    merged_3end = merge_all_adjacent_features(
        buckets["3end"], feature_types_3end, new_feature_type="3end"
    )
    merged_5end = merge_all_adjacent_features(
        buckets["5end"], feature_types_5end, new_feature_type="5end"
    )

    record.features = merged_5end + buckets["other"] + merged_3end
    return record

def rename_and_merge_ends(input_gb, output_gb):
    """
    Apply merge_ends_in_record to every record of 'input_gb' and write the
    results, in the same order, to 'output_gb' in GenBank format.

    All records are read into memory before anything is written.
    """
    parsed_records = list(SeqIO.parse(input_gb, "genbank"))
    merged_records = [merge_ends_in_record(entry) for entry in parsed_records]
    SeqIO.write(merged_records, output_gb, "genbank")

# Merge the end features of the classified reads into a new GenBank file.
if __name__ == "__main__":
    # Input is the file written by the classification step above.
    in_file = "00-Data/feature_annotated/Repeatmasker_transgene_classified.gb"
    out_file = "00-Data/feature_annotated/Repeatmasker_transgene_classified_merged.gb"

    rename_and_merge_ends(in_file, out_file)
    print(f"Updated GenBank file written to: {out_file}")


Updated GenBank file written to: 00-Data/feature_annotated/Repeatmasker_transgene_classified_merged.gb




## Count reads that would have been positive in a ddPCR 

In [8]:

# ------------------------------------------------------------------
# 1. Imports and Setup
# ------------------------------------------------------------------

from Bio import SeqIO

# Define ddPCR primers and probe
# ddPCR oligos, normalised to upper case so they compare against the
# upper-cased read sequences used in the scan below.
FORWARD_PRIMER = "CAGTGCTCTGAATGTCAAAGTG".upper()     # Example: oGE980
REVERSE_PRIMER = "ACTAGTCAATAATCAATGTCAACGG".upper()  # Example: oGE981
PROBE_SEQ = "ACGGCGGGAGTAACTATGACTCTCT".upper()      # Example: oGE982

# Parameters: every (mismatch, distance) combination is evaluated separately.
MISMATCH_THRESHOLDS = [0, 2]        # 0 for exact matches, n for up to n mismatches
DISTANCE_THRESHOLDS = [200, 500]    # Maximum distance between primers in bases

# Path to the FASTQ file (update this path as needed)
FASTQ_FILE = "00-Data/samples_filtered/sample-cutadapt_sup-filtered-trimmed.fastq"

# ------------------------------------------------------------------
# 2. Helper Functions
# ------------------------------------------------------------------

# Built once at import time instead of on every call. Covers both cases so
# that lower-case bases are complemented too instead of being passed through.
_COMPLEMENT_TABLE = str.maketrans("ACGTNacgtn", "TGCANtgcan")

def reverse_complement(seq):
    """
    Return the reverse complement of a DNA sequence.

    A/C/G/T/N are complemented in either case (case is preserved); any other
    character is kept as it is. The result is the complemented string reversed.
    """
    return seq.translate(_COMPLEMENT_TABLE)[::-1]

def hamming_distance(s1, s2):
    """
    Count the positions at which two strings differ.

    The strings are compared pairwise from the start; callers are expected to
    pass strings of equal length. If they differ in length, only the first
    min(len(s1), len(s2)) positions are compared.
    """
    mismatch_count = 0
    for left_char, right_char in zip(s1, s2):
        if left_char != right_char:
            mismatch_count += 1
    return mismatch_count

def find_approximate_matches(target, query, max_mismatches):
    """
    Find all start positions in 'target' where 'query' matches with
    at most 'max_mismatches' substitutions (no insertions or deletions).

    Returns a list of 0-based start indices in ascending order. The list is
    empty when 'query' is longer than 'target'.
    """
    matches = []
    q_len = len(query)
    for start in range(len(target) - q_len + 1):
        mismatches = 0
        for target_char, query_char in zip(target[start:start + q_len], query):
            if target_char != query_char:
                mismatches += 1
                # Stop comparing as soon as the window can no longer qualify.
                if mismatches > max_mismatches:
                    break
        if mismatches <= max_mismatches:
            matches.append(start)
    return matches

# Precompute reverse complements
# Reverse complements of the oligos, computed once up front.
# REVERSE_PRIMER_RC and PROBE_RC are used by the scan below;
# FORWARD_PRIMER_RC is not referenced in the code shown in this notebook.
REVERSE_PRIMER_RC = reverse_complement(REVERSE_PRIMER)
FORWARD_PRIMER_RC = reverse_complement(FORWARD_PRIMER)
PROBE_RC = reverse_complement(PROBE_SEQ)

# ------------------------------------------------------------------
# 3. Main Logic
# ------------------------------------------------------------------

# Initialize counters for each condition
# Structure: positive_reads[(max_mismatches, max_distance)] = set of read_ids
# positive_reads[(max_mismatches, max_distance)] = set of read ids that carry
# both primers within max_distance and the probe in between.
positive_reads = { (mm, dist): set() for mm in MISMATCH_THRESHOLDS for dist in DISTANCE_THRESHOLDS }

# Process each read in the FASTQ file
for record in SeqIO.parse(FASTQ_FILE, "fastq"):
    read_id = record.id
    seq = str(record.seq).upper()

    # Scan the read as sequenced and its reverse complement.
    for sequence in (seq, reverse_complement(seq)):
        for mm in MISMATCH_THRESHOLDS:
            fwd_matches = find_approximate_matches(sequence, FORWARD_PRIMER, mm)
            if not fwd_matches:
                # Without a forward primer site there is no pair to evaluate.
                continue
            rev_matches = find_approximate_matches(sequence, REVERSE_PRIMER_RC, mm)

            # Iterate through all possible primer pairs
            for fwd_start in fwd_matches:
                for rev_start in rev_matches:
                    if fwd_start < rev_start:
                        # Forward primer upstream of the reverse primer site
                        inner_start = fwd_start + len(FORWARD_PRIMER)
                        inner_end = rev_start
                    elif rev_start < fwd_start:
                        # Reverse primer site upstream of the forward primer.
                        # NOTE(review): in this arrangement the two sites face
                        # away from each other on the scanned strand; confirm
                        # that such pairs should count as ddPCR-positive.
                        inner_start = rev_start + len(REVERSE_PRIMER_RC)
                        inner_end = fwd_start
                    else:
                        continue

                    # Distance between the primers (negative if they overlap)
                    distance = inner_end - inner_start
                    satisfied_thresholds = [
                        threshold for threshold in DISTANCE_THRESHOLDS if distance <= threshold
                    ]
                    if not satisfied_thresholds:
                        continue

                    # The probe search does not depend on the distance
                    # threshold, so it runs once per primer pair.
                    between_region = sequence[inner_start:inner_end]
                    probe_found = (
                        find_approximate_matches(between_region, PROBE_SEQ, mm)
                        or find_approximate_matches(between_region, PROBE_RC, mm)
                    )
                    if probe_found:
                        for threshold in satisfied_thresholds:
                            positive_reads[(mm, threshold)].add(read_id)

# ------------------------------------------------------------------
# 4. Summary of Results
# ------------------------------------------------------------------

# Report the number of distinct positive read ids for every
# (mismatch threshold, distance threshold) combination.
print("Number of Positive Reads in ddPCR Assay under Various Conditions:\n")

for mm in MISMATCH_THRESHOLDS:
    for dist in DISTANCE_THRESHOLDS:
        count = len(positive_reads[(mm, dist)])
        # A threshold of 0 is worded as "exact matches" in the report.
        mismatch_desc = f"≤{mm} mismatches" if mm > 0 else "Exact matches (0 mismatches)"
        print(f"- {mismatch_desc} & Distance ≤{dist} bases: {count} reads")

print("\nProcessing Complete.")


Number of Positive Reads in ddPCR Assay under Various Conditions:

- Exact matches (0 mismatches) & Distance ≤200 bases: 470 reads
- Exact matches (0 mismatches) & Distance ≤500 bases: 470 reads
- ≤2 mismatches & Distance ≤200 bases: 519 reads
- ≤2 mismatches & Distance ≤500 bases: 519 reads

Processing Complete.
