## Fasta Parsing Workflow

Functions: FASTA parsing, sequence cleaning, GC content calculation, reverse complement, ORF detection (reading frame starting at the first base only), mutation simulation at unique positions, transcription, and translation.

The workflow pipeline at the end of the cell integrates all of these steps.

In [29]:
import random

def fasta_parsing(file_path):
    """Parse a FASTA file into a ``{header: sequence}`` dictionary.

    Every line starting with ``>`` opens a new record; the text after the
    ``>`` (whitespace-stripped at both ends of the line) is the header.
    All following lines, up to the next header, are stripped and
    concatenated into that record's sequence.

    Args:
        file_path: Path of the FASTA file to read.

    Returns:
        A dict mapping each header to its joined sequence string. Lines
        that appear before the first header are ignored. If the same
        header occurs twice, the later record replaces the earlier one.
    """
    fasta_dict = {}
    header = None
    sequence_lines = []
    with open(file_path, "r") as file:
        for line in file:
            line = line.strip()
            if line.startswith('>'):
                # Compare against None rather than truthiness so that a
                # record with an empty header (a bare ">") is not dropped.
                if header is not None:
                    fasta_dict[header] = "".join(sequence_lines)
                header = line[1:]
                sequence_lines = []
            else:
                sequence_lines.append(line)
    # Flush the final record, which has no following header to trigger it.
    if header is not None:
        fasta_dict[header] = "".join(sequence_lines)
    return fasta_dict

def clean(seq):
    """Return *seq* upper-cased with every character other than A/T/G/C removed."""
    allowed = 'ATGC'
    kept = filter(lambda symbol: symbol in allowed, seq.upper())
    return "".join(kept)

def calculate_gc(seq):
    """Return the GC content of *seq* as a percentage rounded to 2 decimals.

    Only upper-case ``G`` and ``C`` characters are counted; the percentage
    is taken relative to the full length of *seq*.

    Args:
        seq: The sequence string to analyse.

    Returns:
        The GC percentage as a float, or ``0.0`` for an empty sequence
        (instead of raising ``ZeroDivisionError``).
    """
    total = len(seq)
    if total == 0:
        # Nothing to measure; avoid dividing by zero.
        return 0.0
    gc = sum(1 for base in seq if base in 'GC')
    return round(gc / total * 100, 2)


def reverse_complement(dna):
    """Return the reverse complement of *dna*.

    Upper-case A/T/G/C are swapped for their pairing base; any other
    character is carried over unchanged. The complemented string is then
    reversed.
    """
    pairing = {'A': 'T', 'T': 'A', 'G': 'C', 'C': 'G'}
    complemented = "".join(pairing.get(symbol, symbol) for symbol in dna)
    return complemented[::-1]

def orf_frame(seq):
    """Find open reading frames in the reading frame that starts at index 0.

    The sequence is walked codon by codon from the first base. When an
    ``ATG`` is met, the following codons (same frame) are searched for the
    first stop codon (``TAA``, ``TGA`` or ``TAG``). If one is found, the
    span from ``ATG`` through the stop codon is recorded and the walk
    resumes right after that stop; otherwise the walk simply moves on to
    the next codon.

    Returns:
        A tuple ``(longest_orf, orfs)`` where ``longest_orf`` is the longest
        recorded ORF string (the earliest one on ties, ``""`` if none) and
        ``orfs`` is a list of ``(start, end, orf)`` tuples with 1-based
        inclusive coordinates.
    """
    terminators = ('TAA', 'TGA', 'TAG')
    codon_limit = len(seq) - 2  # a codon may start at any index below this
    found = []
    best = ""

    cursor = 0
    while cursor < codon_limit:
        resume_at = cursor + 3
        if seq[cursor:cursor + 3] == 'ATG':
            # First in-frame stop codon after the start codon, if any.
            stop_index = next(
                (k for k in range(cursor + 3, codon_limit, 3)
                 if seq[k:k + 3] in terminators),
                None,
            )
            if stop_index is not None:
                orf_end = stop_index + 3
                fragment = seq[cursor:orf_end]
                if len(fragment) > len(best):
                    best = fragment
                found.append((cursor + 1, orf_end, fragment))
                # Continue scanning after the stop codon just consumed.
                resume_at = orf_end
        cursor = resume_at
    return best, found


def unique_mutation_simulation(seq, n):
    """Return a copy of *seq* with *n* point mutations at distinct positions.

    Each chosen position is replaced by a randomly selected base from
    A/T/G/C that differs from the base currently there, so every chosen
    position really changes.

    Args:
        seq: The sequence to mutate.
        n: Number of distinct positions to mutate. Values of zero or less
            leave the sequence unchanged.

    Returns:
        The mutated sequence as a string.

    Raises:
        ValueError: If *n* exceeds the sequence length. There are not
            enough distinct positions, and a retry loop would never end.
    """
    seq_list = list(seq)
    if n > len(seq_list):
        raise ValueError(
            f"cannot mutate {n} unique positions in a sequence of length {len(seq_list)}"
        )

    bases = ('A', 'T', 'G', 'C')
    # random.sample draws distinct positions in one call, with no retries.
    for position in random.sample(range(len(seq_list)), max(n, 0)):
        original_base = seq_list[position]
        possible_outcomes = [base for base in bases if base != original_base]
        seq_list[position] = random.choice(possible_outcomes)
    return "".join(seq_list)


def transcription(dna):
    """Return *dna* with every upper-case ``T`` replaced by ``U``; all other characters are kept."""
    return "".join('U' if symbol == 'T' else symbol for symbol in dna)


def translation(rna):
    """Translate *rna* into a ``-``-separated chain of amino-acid names.

    Codons are read three bases at a time starting at index 0; one or two
    leftover bases at the end are ignored. A codon that is not in the
    table is reported as ``?``. Translation ends at the first stop codon,
    which is reported as ``*``.
    """
    # Same genetic code as a codon -> residue table, grouped by residue.
    codons_by_residue = {
        'Phenylalanine': ('UUU', 'UUC'),
        'Leucine': ('UUA', 'UUG', 'CUU', 'CUC', 'CUA', 'CUG'),
        'Isoleucine': ('AUU', 'AUC', 'AUA'),
        'Methionine': ('AUG',),  # Start codon
        'Valine': ('GUU', 'GUC', 'GUA', 'GUG'),
        'Serine': ('UCU', 'UCC', 'UCA', 'UCG', 'AGU', 'AGC'),
        'Proline': ('CCU', 'CCC', 'CCA', 'CCG'),
        'Threonine': ('ACU', 'ACC', 'ACA', 'ACG'),
        'Alanine': ('GCU', 'GCC', 'GCA', 'GCG'),
        'Tyrosine': ('UAU', 'UAC'),
        'Histidine': ('CAU', 'CAC'),
        'Glutamine': ('CAA', 'CAG'),
        'Asparagine': ('AAU', 'AAC'),
        'Lysine': ('AAA', 'AAG'),
        'Aspartic acid': ('GAU', 'GAC'),
        'Glutamic acid': ('GAA', 'GAG'),
        'Cysteine': ('UGU', 'UGC'),
        'Tryptophan': ('UGG',),
        'Arginine': ('CGU', 'CGC', 'CGA', 'CGG', 'AGA', 'AGG'),
        'Glycine': ('GGU', 'GGC', 'GGA', 'GGG'),
        'Stop': ('UAG', 'UGA', 'UAA'),  # stop codons
    }
    residue_for = {
        triplet: residue
        for residue, triplets in codons_by_residue.items()
        for triplet in triplets
    }

    chain = []
    for start in range(0, len(rna) - 2, 3):
        residue = residue_for.get(rna[start:start + 3], '?')
        if residue == "Stop":
            chain.append("*")
            break
        chain.append(residue)
    return "-".join(chain)

# Workflow pipeline: parse the FASTA file, clean every record, save the
# cleaned records, then run and print each analysis per record.
sequences= fasta_parsing("practice3.txt")
n= int(input("Number of mutations to occur:"))

# Keep only A/T/G/C for every record before any analysis.
cleaned_seq= {header: clean(seq) for header, seq in sequences.items()}


with open("clean_practice_3.txt", "w") as file: # to save the cleaned file
    for header, seq in cleaned_seq.items():
        file.write(f">{header}\n{seq}\n")

for header, seq in cleaned_seq.items():
    print(f">{header}\n{seq}\n")

    # A record with no A/T/G/C bases has nothing to analyse; report it and
    # move on instead of letting the per-record steps fail on empty input.
    if not seq:
        print("No valid bases after cleaning; skipping analysis.\n")
        continue

    gc = calculate_gc(seq)
    print(f"GC Content: {gc}%\n")

    r_complement= reverse_complement(seq)
    print(f"Reverse Complement: {r_complement}\n")

    # Only the longest ORF is used here; the full ORF list is discarded.
    longest_orf, _= orf_frame(seq)
    longest_orf_rna= transcription(longest_orf)
    print(f"Longest ORF RNA: {longest_orf_rna}\n")

    protein= translation(longest_orf_rna)
    print(f"Protein for the Longest ORF:{protein}\n")

    # Mutations hit distinct positions, so a sequence cannot take more
    # mutations than it has bases; clamp the requested count to that range.
    mutation_count= max(0, min(n, len(seq)))
    mutated_sequence= unique_mutation_simulation(seq, mutation_count)
    print(f"Original: {seq}\nMutated: {mutated_sequence}\nNumber of mutations: {mutation_count}\n")

    # Translate the whole cleaned sequence as well, read from its first base.
    rna= transcription(seq)
    print(f"RNA: {rna}\n")

    protein= translation(rna)
    print(f"Protein: {protein}\n")



Number of mutations to occur: 2


>Human_sequence
ATGCTAGCTAGCTAACGATGCTAGCTAGCTGAC

GC Content: 48.48%

Reverse Complement: GTCAGCTAGCTAGCATCGTTAGCTAGCTAGCAT

Longest ORF RNA: AUGCUAGCUAGCUAA

Protein for the Longest ORF:Methionine-Leucine-Alanine-Serine-*

Original: ATGCTAGCTAGCTAACGATGCTAGCTAGCTGAC
Mutated: ATGCTAGCTAGCTAGCGATGGTAGCTAGCTGAC
Number of mutations: 2

RNA: AUGCUAGCUAGCUAACGAUGCUAGCUAGCUGAC

Protein: Methionine-Leucine-Alanine-Serine-*

>Mouse_sequence
TTGCGCGGATCGTAGCTAGCTAGCTAGCTAATGCTA

GC Content: 50.0%

Reverse Complement: TAGCATTAGCTAGCTAGCTAGCTACGATCCGCGCAA

Longest ORF RNA: 

Protein for the Longest ORF:

Original: TTGCGCGGATCGTAGCTAGCTAGCTAGCTAATGCTA
Mutated: TTGCGAGGATCGCAGCTAGCTAGCTAGCTAATGCTA
Number of mutations: 2

RNA: UUGCGCGGAUCGUAGCUAGCUAGCUAGCUAAUGCUA

Protein: Leucine-Arginine-Glycine-Serine-*

>Plant_sequence
GCTAGCTAGCATCGATCGTATAGCTAGCTAGC

GC Content: 50.0%

Reverse Complement: GCTAGCTAGCTATACGATCGATGCTAGCTAGC

Longest ORF RNA: 

Protein for the Longest ORF:

Original: GCTAGCTAGCAT