# Week 2: Open Reading Frames (ORFs) and Translation

## Overview
This week focuses on connecting DNA sequences to proteins:
- File loading and parsing
- Sequence cleaning
- Transcription (DNA -> RNA)
- ORF detection in 3 frames
- Translating ORFs into proteins
- Demonstration using practice3.txt

In [22]:
def loading(filepath):
    fasta_dict= {}
    with open(filepath, "r") as file:
        header= None
        seq= ""
        for line in file:
            line= line.strip()
            if line.startswith(">"):
                if header:
                    fasta_dict[header] = seq
                header= line[1:]
                seq= ""
            else:
                seq+= line.upper()
        if header:
            fasta_dict[header] = seq
    return fasta_dict


def clean(seq):
    cleaned= "".join([base for base in seq.upper() if base in "ATGC"])
    return cleaned


def transcription(cleaned):
    rna= ""
    for base in cleaned:
        if base == "T":
            rna += "U"
        else:
            rna += base
    return rna


def find_orfs(rna):
    start_codon = "AUG"
    stop_codons = ("UAG", "UGA", "UAA")
    orfs= []
    position= []
    longest_orf= {1:None, 2:None, 3:None}
    
    for frame in range(3):
        i = frame
        max_orf= ""
        while i < len(rna)-2:
            codon = rna[i:i+3]
            if codon == start_codon:
                for j in range(i+3, len(rna)-2, 3):
                    stop_codon = rna[j:j+3]
                    if stop_codon in stop_codons:
                        orf= rna[i:j+3]
                        if len(orf) > len(max_orf):
                            max_orf = orf
                            longest_orf[frame + 1] = max_orf
                            
                        orfs.append(orf)
                        position.append((frame + 1, i+1))
                        i= j+3
                        break
                else:
                    i += 3
            else:
                i += 3
    return position, orfs, longest_orf



def translation(rna):
    
    codon_table = {
    "AUG": "M",   # Start codon (Methionine)
    "UUU": "F", "UUC": "F",  # Phenylalanine
    "UUA": "L", "UUG": "L",  # Leucine
    "UAG": "*", "UGA": "*", "UAA": "*" # stop codons
    }

    codons= [rna[i:i+3] for i in range(0, len(rna)-2, 3)]
    aminoacids= []
    for codon in codons:
        aa= codon_table.get(codon, "?")
        if aa == "*":
            break
        aminoacids.append(aa)

    protein = "".join(aminoacids)
    return protein

# File loading and parsing
sequences= loading("practice3.txt")

for header, seq in sequences.items():
    print(f"Header : {header}\n")
    print(f"Original Sequence: {seq}\n")

# Cleaning
    clean_seq= clean(seq)
    print(f"Cleaned Sequence: {clean_seq}\n")

# Transcription
    transcribe = transcription(clean_seq)
    print(f"Transcribed Sequence(RNA): {transcribe}\n")

# Open Reading Frames(ORFs)
    positn, orfs, Longest_orf = find_orfs(transcribe)
    print(f"Frame number & start position: {positn}\n")
    print(f"ORFS: {orfs}\n")
    print(f"Longest ORF: {Longest_orf}\n")


# Translating ORFs into proteins
    for (frame, start_pos), orf in zip(positn, orfs):
        protein = translation(orf)
        print(f"Frame {frame}, Start {start_pos} -> Protein: {protein}\n")

    for frame, orf in Longest_orf.items():
        if orf:
            longest_protein = translation(orf)
            print(f"Longest Protein Sequence: {longest_protein}\n")
        

Header : Human_sequence

Original Sequence: ATGCTAGCTAGCTAACGNNNATGCTAGCTAGCTGAC

Cleaned Sequence: ATGCTAGCTAGCTAACGATGCTAGCTAGCTGAC

Transcribed Sequence(RNA): AUGCUAGCUAGCUAACGAUGCUAGCUAGCUGAC

Frame number & start position: [(1, 1), (3, 18)]

ORFS: ['AUGCUAGCUAGCUAA', 'AUGCUAGCUAGCUGA']

Longest ORF: {1: 'AUGCUAGCUAGCUAA', 2: None, 3: 'AUGCUAGCUAGCUGA'}

Frame 1, Start 1 -> Protein: M???

Frame 3, Start 18 -> Protein: M???

Longest Protein Sequence: M???

Longest Protein Sequence: M???

Header : Mouse_sequence

Original Sequence: TTGCGCGGATCGTAGCTAGCTAGCTAGCTAATGCXTA

Cleaned Sequence: TTGCGCGGATCGTAGCTAGCTAGCTAGCTAATGCTA

Transcribed Sequence(RNA): UUGCGCGGAUCGUAGCUAGCUAGCUAGCUAAUGCUA

Frame number & start position: []

ORFS: []

Longest ORF: {1: None, 2: None, 3: None}

Header : Plant_sequence

Original Sequence: GCTAGCTAGCATCGATCGTATAGCTAGCTAGCNNN

Cleaned Sequence: GCTAGCTAGCATCGATCGTATAGCTAGCTAGC

Transcribed Sequence(RNA): GCUAGCUAGCAUCGAUCGUAUAGCUAGCUAGC

Frame number & star