# Alignment and Preprocessing 

## Base Calling, Demultiplexing, Adapter Trimming

In [None]:
# Run the dorado basecaller with the dna_r10.4.1_e8.2_400bps_sup@v5.0.0 model on
# the `pod5` input, passing kit name EXP-NBD104, and redirect the emitted FASTQ
# to output_sup.fastq.
!dorado basecaller ../software/dorado-0.9.0-linux-x64/bin/dna_r10.4.1_e8.2_400bps_sup@v5.0.0 pod5 --kit-name EXP-NBD104 --emit-fastq > output_sup.fastq

zsh:1: command not found: dorado


## Snakemake File 

- Read Quality Trimming 
- Minimap2 Alignment (ROI with/without Transgene)
- Collect IDs from both Alignments
- Filter Cutadapt FastQ file based on IDs 
- Transform filtered FastQ file to FASTA 
- RepeatMasker Feature Annotation

## Transform to GenBank file

In [1]:
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqFeature import SeqFeature, FeatureLocation
from Bio.SeqRecord import SeqRecord
from Bio.Seq import Seq
import os
import matplotlib.pyplot as plt
from collections import defaultdict

In [2]:
# Transform to gb (Genbank) file
# Attach every RepeatMasker hit to its read as a SeqFeature and write all
# reads, annotated or not, to a single GenBank file.

# Index the filtered reads by ID; the first record seen for an ID is kept.
read_dict = {}
for seq_record in SeqIO.parse("00-Data/samples_filtered/sample-cutadapt_sup-filtered.fasta", "fasta"):
    if seq_record.id not in read_dict:
        # Biopython's GenBank writer needs a molecule type on each record.
        seq_record.annotations = {"molecule_type": "DNA"}
        read_dict[seq_record.id] = seq_record

# Use a context manager so the annotation file is closed once it is consumed.
with open("00-Data/repeatmaskered/sample-cutadapt_sup-filtered.fasta.out.xm") as repeatmasker_file:
    for line in repeatmasker_file:
        # --- Fetch RepeatMasker information
        fields = line.split()
        if not fields:
            # Blank line: nothing to parse.
            continue
        seq_id = fields[4]

        # Read information
        read_start = int(fields[5]) - 1  # 0-based
        read_end = int(fields[6])
        read_strand = fields[8]

        # Annotated feature information ("name#class/family" -> "name")
        annotated_feature_type = fields[9].split("#")[0]

        # Reference information (pick the right one based on strand).
        # Forward hits list the repeat coordinates as "begin end (left)";
        # any other strand value is read as "(left) end begin", so the begin
        # coordinate sits one column further right.
        if read_strand == "+":
            ref_start = int(fields[10]) - 1  # 0-based
            ref_end = int(fields[11])
        else:
            ref_start = int(fields[12]) - 1  # 0-based
            ref_end = int(fields[11])

        # --- Add feature to SeqRecord ---
        # Skip hits whose read ID is not present in the filtered FASTA.
        seq_record = read_dict.get(seq_id)
        if seq_record is None:
            continue

        # Create SeqFeature
        feature_strand = +1 if read_strand == '+' else -1
        feature = SeqFeature(
            FeatureLocation(read_start, read_end, strand=feature_strand),
            id=annotated_feature_type,
            type=annotated_feature_type,
            qualifiers={"ref_position_start": ref_start, "ref_position_end": ref_end},
        )
        # Add feature back to SeqRecord
        seq_record.features.append(feature)

# Write to Genbank file (the call returns the number of records written)
SeqIO.write(list(read_dict.values()), "00-Data/feature_annotated/Repeatmasker_transgene.gb", "genbank")



26386