In [None]:
def read_fasta(filepath):
    """Parse a FASTA file into a dict mapping each header to its sequence.

    A line starting with '>' begins a new record; the text after the '>'
    (with surrounding whitespace stripped) is the record's key. All following
    lines up to the next header are stripped and concatenated to form the
    sequence. Lines that appear before the first header are ignored. If the
    same header occurs more than once, the last record wins.

    Args:
        filepath: Path of the FASTA file to read.

    Returns:
        dict of header string -> sequence string, in file order.
    """
    sequences = {}
    header = None
    chunks = []
    with open(filepath, 'r') as file:
        for line in file:
            line = line.strip()  # drop the newline and surrounding whitespace
            if line.startswith('>'):
                # Compare against None rather than truthiness so that a
                # record whose header is empty (a bare '>') is still stored.
                if header is not None:
                    sequences[header] = ''.join(chunks)
                header = line[1:]
                chunks = []
            elif header is not None:
                chunks.append(line)
    # Flush the final record, which has no following header to trigger it.
    if header is not None:
        sequences[header] = ''.join(chunks)
    return sequences

def nucleotide_composition(seq):
    """Count occurrences of A, C, G and T in ``seq``.

    Only the exact uppercase symbols 'A', 'C', 'G' and 'T' are counted; any
    other symbol (including lowercase letters) is skipped.

    Returns:
        dict with keys 'A', 'C', 'G', 'T' (in that order) mapped to counts.
    """
    counts = dict.fromkeys('ACGT', 0)
    for base in seq:
        if base not in counts:
            continue  # not one of the four tracked symbols
        counts[base] += 1
    return counts

def dinucleotide_composition(seq):
    """Count every overlapping 2-symbol window of ``seq``.

    Windows start at each index from 0 to len(seq) - 2, so a sequence shorter
    than two symbols yields an empty dict.

    Returns:
        dict of 2-symbol slice -> number of occurrences, keyed in order of
        first appearance.
    """
    tally = {}
    for start in range(len(seq) - 1):
        pair = seq[start:start + 2]
        tally[pair] = tally.get(pair, 0) + 1
    return tally

def trinucleotide_composition(seq):
    """Count every overlapping 3-symbol window of ``seq``.

    Windows start at each index from 0 to len(seq) - 3, so a sequence shorter
    than three symbols yields an empty dict.

    Returns:
        dict of 3-symbol slice -> number of occurrences, keyed in order of
        first appearance.
    """
    tally = {}
    window_starts = range(len(seq) - 2)
    for triplet in (seq[pos:pos + 3] for pos in window_starts):
        tally.setdefault(triplet, 0)
        tally[triplet] += 1
    return tally

def find_orfs(seq):
    """Collect open reading frames from the three forward frames of ``seq``.

    For each frame offset (0, 1, 2) the sequence is walked codon by codon.
    Every 'ATG' codon opens a candidate that extends, in the same frame, up to
    and including the first 'TAA', 'TAG' or 'TGA' codon. A candidate with no
    stop codon before the end of the sequence is discarded. An 'ATG' located
    inside an earlier candidate starts its own candidate as well, so results
    may overlap.

    Returns:
        list of ORF strings (start codon through stop codon inclusive),
        ordered by frame and then by start position.
    """
    starts = ('ATG',)
    stops = ('TAA', 'TAG', 'TGA')
    limit = len(seq) - 2  # exclusive bound on positions holding a full codon
    found = []
    for offset in range(3):
        for begin in range(offset, limit, 3):
            if seq[begin:begin + 3] not in starts:
                continue
            # Scan downstream codons in-frame for the first stop codon.
            for end in range(begin + 3, limit, 3):
                if seq[end:end + 3] in stops:
                    found.append(seq[begin:end + 3])
                    break
    return found

def analyze_fasta(filepath):
    """Run every composition and ORF analysis on each record of a FASTA file.

    Args:
        filepath: Path passed through to ``read_fasta``.

    Returns:
        dict of header -> dict with the keys 'mononucleotide', 'dinucleotide',
        'trinucleotide' and 'orfs', each holding the result of the
        corresponding analysis function for that record's sequence.
    """
    return {
        name: {
            'mononucleotide': nucleotide_composition(bases),
            'dinucleotide': dinucleotide_composition(bases),
            'trinucleotide': trinucleotide_composition(bases),
            'orfs': find_orfs(bases),
        }
        for name, bases in read_fasta(filepath).items()
    }

# Driver: analyze the FASTA file and print a report for every record.
# 'fasta_file.txt' is a relative path, so it is resolved against the current
# working directory when the file is opened.
analysis_results = analyze_fasta('fasta_file.txt')

# One section per record: the header, the three composition tables, the list
# of ORFs, then a dashed separator line.
for header, data in analysis_results.items():
    print(f"Header: {header}")
    print("Mononucleotide Composition:", data['mononucleotide'])
    print("Dinucleotide Composition:", data['dinucleotide'])
    print("Trinucleotide Composition:", data['trinucleotide'])
    print("ORFs:", data['orfs'])
    print("-------------------------------------------------------------------------")

Header: NM_001300425.1 Drosophila melanogaster Akt kinase (Akt), transcript variant E, mRNA
Mononucleotide Composition: {'A': 1695, 'C': 1013, 'G': 987, 'T': 1174}
Dinucleotide Composition: {'CC': 207, 'CA': 389, 'AG': 330, 'GT': 205, 'TT': 361, 'TA': 347, 'AT': 396, 'TC': 225, 'CG': 204, 'GA': 314, 'AA': 645, 'TG': 241, 'GC': 256, 'AC': 324, 'CT': 212, 'GG': 212}
Trinucleotide Composition: {'CCA': 81, 'CAG': 87, 'AGT': 66, 'GTT': 71, 'TTA': 103, 'TAT': 110, 'ATC': 82, 'TCG': 57, 'CGA': 77, 'GAA': 115, 'AAA': 272, 'AAG': 117, 'GTG': 63, 'TGA': 71, 'AGC': 98, 'GCG': 46, 'CGT': 40, 'GTA': 34, 'TAA': 108, 'AAC': 133, 'ACT': 70, 'CTA': 59, 'ACG': 53, 'CGG': 40, 'GGT': 43, 'TAC': 70, 'CTT': 63, 'TTG': 68, 'TGG': 59, 'GGA': 72, 'AAT': 123, 'ATT': 100, 'TTT': 127, 'TGC': 55, 'GCC': 64, 'CCT': 43, 'CTG': 47, 'TGT': 56, 'GCT': 49, 'TTC': 63, 'TCT': 50, 'ATG': 63, 'GCA': 96, 'CAT': 89, 'CAA': 150, 'GAT': 74, 'AGG': 72, 'GGC': 56, 'AGA': 94, 'ACA': 141, 'CAC': 63, 'ACC': 60, 'CGC': 47, 'CCC': 35,