In [37]:
from Bio import SeqIO
from Bio.SeqUtils.ProtParam import ProteinAnalysis
import matplotlib.pyplot as plt
import os


# Per-record protein statistics for a FASTA file.
#
# For every record this cell:
#   1. guesses whether the sequence is DNA (only A/T/G/C) and, if so,
#      translates it up to the first stop codon;
#   2. computes amino-acid composition, molecular weight and isoelectric
#      point with Bio.SeqUtils.ProtParam.ProteinAnalysis;
#   3. saves a bar chart of the composition under `plot_dir`;
#   4. prints a human-readable summary.
# Records that cannot be analysed (empty, or containing letters that
# molecular_weight() rejects) are reported and skipped instead of aborting
# the whole run.

# Insert file path
file_path = "data/protein_example.fasta"

# Folder that receives one composition plot per record
plot_dir = "plots"
os.makedirs(plot_dir, exist_ok=True)

# Alphabet used for the DNA-vs-protein guess (built once, not per record).
# NOTE: this is a heuristic -- a peptide made only of Ala/Thr/Gly/Cys would
# be classified as DNA.
dna_letters = set("ATGC")

# Iterate through the FASTA file
for record in SeqIO.parse(file_path, "fasta"):

    # Uppercase to make the alphabet test case-insensitive, and drop a
    # trailing "*" (stop marker) that molecular_weight() would reject.
    seq_str = str(record.seq).upper().rstrip("*")

    # An empty record would be mistaken for DNA (the empty set is a subset
    # of anything) and later cause a division by zero, so skip it up front.
    if not seq_str:
        print(f"Skipping {record.id}: empty sequence")
        continue

    # Determine DNA or protein sequence (translate if DNA)
    if set(seq_str) <= dna_letters:

        # \u2192 is the right-arrow character; written as an escape so the
        # message survives files saved with a non-UTF-8 encoding.
        print(f"Detected DNA sequence in {record.id} \u2192 Translating...")

        # Trim any trailing partial codon explicitly: the translation result
        # is unchanged, but Biopython otherwise warns about it.
        coding_len = len(record.seq) - len(record.seq) % 3
        seq_str = str(record.seq[:coding_len].translate(to_stop=True))
        print(f"Translated Protein Length: {len(seq_str)} aa")
    else:
        print(f"Detected Protein sequence in {record.id}")

    # DNA that starts with a stop codon (or is shorter than one codon)
    # translates to nothing; there is nothing to analyse in that case.
    if not seq_str:
        print(f"Skipping {record.id}: no protein sequence to analyse")
        continue

    # Run analysis
    analysis = ProteinAnalysis(seq_str)

    # Use the Bio package to determine statistics
    try:
        mw = analysis.molecular_weight()
    except ValueError as err:
        # Raised for letters without a defined residue weight.
        print(f"Skipping {record.id}: {err}")
        continue
    aa_comp = analysis.get_amino_acids_percent()  # fractions in [0, 1]
    pi = analysis.isoelectric_point()
    length = len(seq_str)

    # FASTA IDs often contain characters such as "|" or "/" that are not
    # safe in file names; replace anything unusual with "_".
    safe_id = "".join(
        ch if ch.isalnum() or ch in "-_." else "_" for ch in record.id
    )

    # Plot amino acid composition and save it under a per-record file name
    fig, ax = plt.subplots(figsize=(10, 4))
    ax.bar(list(aa_comp.keys()), list(aa_comp.values()))
    ax.set_title(f"Amino Acid Composition: {record.id}")
    ax.set_xlabel("Amino Acid")
    ax.set_ylabel("Frequency (Fraction)")
    ax.tick_params(axis="x", labelrotation=90)
    fig.tight_layout()

    fig.savefig(os.path.join(plot_dir, f"{safe_id}_aa_composition.png"))
    # Close explicitly so figures do not accumulate across records.
    plt.close(fig)

    # Print human-readable results
    print("=" * 45)
    print(f"Protein ID: {record.id}")
    print(f"Final Protein Length: {length} aa")
    print(f"Molecular Weight: {mw:.2f} Da")
    print(f"Isoelectric Point (pI): {pi:.2f}")
    print("\nAmino Acid Composition (%):")

    for aa, perc in aa_comp.items():
        print(f"  {aa}: {perc*100:.2f}%")

    print("=" * 45)

Detected Protein sequence in protein_1
Protein ID: protein_1
Final Protein Length: 37 aa
Molecular Weight: 4518.16 Da
Isoelectric Point (pI): 9.70

Amino Acid Composition (%):
  A: 5.41%
  C: 0.00%
  D: 5.41%
  E: 0.00%
  F: 8.11%
  G: 2.70%
  H: 5.41%
  I: 10.81%
  K: 8.11%
  L: 5.41%
  M: 2.70%
  N: 0.00%
  P: 2.70%
  Q: 10.81%
  R: 5.41%
  S: 8.11%
  T: 5.41%
  V: 2.70%
  W: 2.70%
  Y: 8.11%
Detected Protein sequence in protein_2
Protein ID: protein_2
Final Protein Length: 35 aa
Molecular Weight: 3786.08 Da
Isoelectric Point (pI): 8.64

Amino Acid Composition (%):
  A: 0.00%
  C: 0.00%
  D: 2.86%
  E: 0.00%
  F: 2.86%
  G: 17.14%
  H: 20.00%
  I: 2.86%
  K: 2.86%
  L: 2.86%
  M: 2.86%
  N: 2.86%
  P: 5.71%
  Q: 5.71%
  R: 2.86%
  S: 14.29%
  T: 2.86%
  V: 8.57%
  W: 0.00%
  Y: 2.86%
