In [5]:

from collections import Counter

def parse_fasta_file(input_filepath, max_lines=5000, *, min_length=300):
    """Lazily yield ``(label, sequence)`` pairs from a FASTA file.

    A record starts at a line beginning with ``>``; the label is the rest
    of that line (after whitespace stripping) and the sequence is the
    concatenation of all following stripped lines up to the next header or
    the end of the file. Only records whose sequence is strictly longer
    than ``min_length`` characters are yielded.

    Args:
        input_filepath: Path of the FASTA file to read.
        max_lines: Accepted for backward compatibility; this function does
            not read it, so no line limit is enforced.
            NOTE(review): decide whether a limit should be applied or the
            parameter retired.
        min_length: Records with ``len(sequence) <= min_length`` are
            skipped. Defaults to 300.

    Yields:
        Tuples of ``(label, sequence)`` in file order.
    """
    with open(input_filepath) as input_file:
        label = None
        chunks = []

        for line in input_file:
            line = line.strip()
            if line.startswith(">"):
                # A new header closes the previous record, if there was one.
                if label is not None:
                    seq = "".join(chunks)
                    if len(seq) > min_length:
                        yield label, seq
                label = line[1:]
                chunks = []
            elif label is not None:
                # Text before the first header belongs to no record.
                chunks.append(line)

        # Flush the final record: no header follows it to trigger a yield.
        if label is not None:
            seq = "".join(chunks)
            if len(seq) > min_length:
                yield label, seq



def calc_gc_content(seq):
    """Return the fraction of characters in ``seq`` that are G or C.

    The comparison is case-insensitive because the sequence is upper-cased
    before counting. An empty sequence yields ``0.0``.
    """
    normalized = seq.upper()
    total = len(normalized)
    if total == 0:
        return 0.0
    return (normalized.count("G") + normalized.count("C")) / total






def analyze_fasta(input_filepath):
    """Print the label, length and GC content of each record in a FASTA file.

    Records come from ``parse_fasta_file``; newline and space characters
    are removed from each sequence before it is measured.
    """
    # Translation table that deletes newline and space characters.
    strip_table = str.maketrans("", "", "\n ")

    print("Here are the results:\n")

    for label, raw_seq in parse_fasta_file(input_filepath, max_lines=5000):
        cleaned = raw_seq.translate(strip_table)
        seq_length = len(cleaned)
        gc = calc_gc_content(cleaned)

        print(f"Label: {label.strip()}")
        print(f"Total length = {seq_length} | GC content: {gc:.2%}")
        print(f"Sequence length (bp): {seq_length}")
       


# Run the report on the annotated FASTA file. The path is relative, so it
# is resolved against the current working directory.
input_filepath = "isolate_A/Isolate_A_annotated.fasta"
analyze_fasta(input_filepath)



Here are the results:

Label: DHA2_10013-t26_1 | organism=Giardia_Assemblage_A2_isolate_DH | product=Hypothetical protein | location=AHGT01000017:3703-4365(+) | length=663 | sequence_SO=contig | SO=protein_coding_gene
Total length = 663 | GC content: 48.11%
Sequence length (bp): 663
Label: DHA2_10016-t26_1 | organism=Giardia_Assemblage_A2_isolate_DH | product=Hypothetical protein | location=AHGT01000017:6342-7277(-) | length=936 | sequence_SO=contig | SO=protein_coding_gene
Total length = 936 | GC content: 50.75%
Sequence length (bp): 936
Label: DHA2_10019-t26_1 | organism=Giardia_Assemblage_A2_isolate_DH | product=Phospholipid-transporting ATPase | location=AHGT01000017:9901-14583(-) | length=4683 | sequence_SO=contig | SO=protein_coding_gene
Total length = 4683 | GC content: 48.45%
Sequence length (bp): 4683
Label: DHA2_10025-t26_1 | organism=Giardia_Assemblage_A2_isolate_DH | product=Hypothetical protein | location=AHGT01000024:115371-116096(+) | length=726 | sequence_SO=contig | SO