## WEEK 1 (August 5-11,2025): Bioinformatics File Formats + Python File Parsing

Mini Project Questions: FASTA Sequence Analyzer

Write a Python program to do the following:

    Read a multi-FASTA file from disk (e.g., "fasta.txt")

    Parse the FASTA file into a dictionary where keys are sequence headers and values are sequences

    Clean sequences to include only valid DNA bases (A, T, G, C)

    Calculate and print the length and GC content (%) of each sequence

    Calculate and print the average length of all sequences combined

    Find and print the header, length, and GC content of the longest sequence

Bonus:

    Print the actual nucleotide sequence of the longest sequence (optional)



In [19]:
fastafile= """
>seq1 Homo sapiens gene A
ATGCGTAGCTAGTCGATCGATCGATCGATCGA
>seq2 Mus musculus gene B
GGCGGCGTTAGCTAGCTAGGCTAGCTAGCTAG"""


with open("sample.fasta", "w") as file:
    file.write(fastafile)


In [22]:
with open("sample.fasta", "r") as file:
    header_count = 0
    for line in file:
        line= line.strip()
        if line.startswith('>'):
            header_count +=1 
    print(header_count)


2


In [25]:
dna= """
>sequence_1
ATGCTAGCTAGCTACGATCGATCGTAGCTAGCTAGCATCG
ATCGATGCTAGCTAGCTAGCATCGATGCTAGCTAGCATCG
>sequence_2
GGCATCGATCGATCGATCGATCGGATCGATCGATCGATCG
CGATCGATCGATCGATCGATCGGATCGATCGATCGATCGA"""

with open("fasta.txt", "w") as file:
    file.write(dna.strip())



In [39]:
def fasta_seq(fastatxt):
    fasta_dict= {}
    with open("fasta.txt", "r") as file:
        header= None
        sequence_lines= []
        for line in file:
            line= line.strip()
            if line.startswith('>'):
                if header:
                    fasta_dict[header]= "".join(sequence_lines)
                header= line[1:]
                sequence_lines= []

            else:
                sequence_lines.append(line)
        if header:
            fasta_dict[header]= "".join(sequence_lines)
    return fasta_dict

def clean(sequence_lines):
    cleaned_sequences= "".join([base for base in sequence_lines if base in 'ACGT'])
    return cleaned_sequences

def calculate_gc(cleaned_sequences):
    gc=0
    for base in cleaned_sequences:
        if base in 'GC':
            gc += 1
    gc_percent = round(gc/ len(cleaned_sequences) * 100, 2)
    return gc_percent

sequences= fasta_seq("fasta.txt")

for header, seq in sequences.items():
    cleaned= clean(seq)
    gc= calculate_gc(cleaned)
    print(f"{header} - Length: {len(cleaned)} - GC%: {gc}")  
     

sequence_1 - Length: 80 - GC%: 50.0
sequence_2 - Length: 80 - GC%: 53.75


In [61]:
def fasta_seq(fastatxt):
    fasta_dict= {}  # dictionary to store the header and seqs merged into a single sequence
    with open("fasta.txt", "r") as file:
        header= None  # this shows that no previous header before sequence_1
        sequence_lines= [] # this is for merging/appending the sequences belonging to a particular 
                            # header into a single sequence
        for line in file:
            line= line.strip()
            if line.startswith('>'):
                if header: # this answers the question if there is a previous header known
                    fasta_dict[header]= "".join(sequence_lines) # this answers what to do to the previous
                                                                # header which is to save it
                header= line[1:]  # this step tells what to do if there is None
                sequence_lines= []
            else:
                sequence_lines.append(line) 

        if header:
            fasta_dict[header]= "".join(sequence_lines) # this steps shows how to add the last header not 
                                                        # to be omitted
    return fasta_dict

def clean(sequence_lines):
    cleaned_sequences= "".join([base for base in sequence_lines if base in 'ATGC'])
    return cleaned_sequences

def calculate_gc(cleaned_sequences):
    gc= 0
    for base in cleaned_sequences.upper():
        if base in 'GC':
            gc +=1
    gc_percent= round(gc/len(cleaned_sequences) *100, 2)
    return gc_percent

def avg_length(fasta_dict):
    lengths= []
    for each_sequence in fasta_dict.values():
        lengths.append(len(each_sequence))
    average= round(sum(lengths)/len(lengths), 2)
    return average

def find_longest_sequence(fasta_dict):
    longest_sequence= ""
    max_length = 0
    for header, seq in fasta_dict.items():
        if len(seq)> max_length:
            max_length = len(seq)
            longest_sequence = header
          
    return longest_sequence, max_length
        
    

sequences= fasta_seq("fasta.txt")

for header, seq in sequences.items():
    cleaned= clean(seq)
    gc = calculate_gc(cleaned)
    print(f"{header} - Length: {len(cleaned)} - GC%: {gc}")


average = avg_length(sequences)
print(f"Average Sequence Length:{average}")

longest_sequence, max_length= find_longest_sequence(sequences)
gc = calculate_gc(cleaned)
print(f"Longest Sequence: \nHeader: {header}\nLength: {max_length}\nGC%: {gc}")


sequence_1 - Length: 80 - GC%: 50.0
sequence_2 - Length: 80 - GC%: 53.75
Average Sequence Length:80.0
Longest Sequence: 
Header: sequence_2
Length: 80
GC%: 53.75
