# Single Point Mutation + GC Analysis Workflow

Functions: FASTA file parsing, sequence cleaning, GC-content calculation, and unique single-point mutation.

Uses Python's `random` module to select the mutation positions and the replacement bases.

Processes a small FASTA-formatted text file (`practice3.txt`).

Stored results in a Pandas DataFrame and exported to CSV.

In [2]:
import pandas as pd
import random

def fasta_parsing(practice3txt):
    """Parse a FASTA-formatted text file into a ``{header: sequence}`` dict.

    Args:
        practice3txt: Path of the FASTA/text file to read.

    Returns:
        A dict mapping each header (the text after the leading ``>``, with
        surrounding whitespace stripped) to its sequence. Sequence lines
        belonging to one record are concatenated in file order. Lines that
        appear before the first header are discarded, and a record whose
        header is empty is skipped. If a header repeats, the later record
        replaces the earlier one.
    """
    fasta_dict = {}
    header = None
    sequence_lines = []
    # Open the path the caller supplied instead of a hard-coded file name,
    # so the function works for any input file.
    with open(practice3txt) as file:
        for line in file:
            line = line.strip()
            if line.startswith('>'):
                # A new record begins: store the one collected so far.
                if header:
                    fasta_dict[header] = "".join(sequence_lines)
                header = line[1:]
                sequence_lines = []
            else:
                sequence_lines.append(line)
    # The final record has no following header to trigger its storage.
    if header:
        fasta_dict[header] = "".join(sequence_lines)
    return fasta_dict


def clean(seq):
    """Return *seq* upper-cased, keeping only the characters A, T, C and G."""
    valid_bases = 'ATCG'
    kept = filter(lambda symbol: symbol in valid_bases, seq.upper())
    return "".join(kept)

def calculate_gc(seq):
    """Return the percentage of G and C characters in *seq*.

    Matching is case-insensitive. The percentage is taken over the full
    length of *seq* and rounded to two decimal places. An empty sequence
    yields 0.
    """
    gc_count = sum(1 for nucleotide in seq.upper() if nucleotide in 'GC')
    total = len(seq)
    if not total > 0:
        # Avoid dividing by zero for an empty sequence.
        return 0
    return round(gc_count / total * 100, 2)

def unique_mutation(seq, n):
    """Return a copy of *seq* with *n* distinct positions point-mutated.

    Each chosen position is replaced by a base drawn at random from
    A, T, C, G, excluding the base currently at that position, so every
    chosen position really changes.

    Args:
        seq: The sequence to mutate (a string).
        n: Number of distinct positions to mutate. Values of zero or less
            leave the sequence unchanged.

    Returns:
        The mutated sequence as a string of the same length as *seq*.

    Raises:
        ValueError: If *n* is greater than ``len(seq)``; there are not
            enough distinct positions, and retrying until enough were
            found would never finish.
    """
    if n > len(seq):
        raise ValueError(
            f"Cannot mutate {n} unique positions in a sequence of length {len(seq)}"
        )
    if n <= 0:
        return "".join(seq)

    bases = ('A', 'T', 'C', 'G')
    seq_list = list(seq)
    # random.sample draws n distinct positions in a single call, so no
    # retry loop or bookkeeping set is needed.
    for position in random.sample(range(len(seq_list)), n):
        original_base = seq_list[position]
        possible_outcomes = [base for base in bases if base != original_base]
        seq_list[position] = random.choice(possible_outcomes)
    return "".join(seq_list)

# Driver: parse the input file, clean each sequence, mutate it, then
# tabulate the results and export them to CSV.
sequences = fasta_parsing("practice3.txt")
n = int(input("Enter the numbers of mutations you want to occur:"))

# Keep only A/T/C/G characters in every record.
cleaned_sequences = {header: clean(seq) for header, seq in sequences.items()}

# A sequence shorter than n cannot supply n distinct mutation positions,
# so reject that request up front with a message naming the records.
too_short = [header for header, seq in cleaned_sequences.items() if len(seq) < n]
if too_short:
    raise ValueError(
        f"Requested {n} mutations, but these sequences are shorter than that: "
        + ", ".join(too_short)
    )

# One row per record: the original sequence and its mutated counterpart,
# each with its GC percentage.
data = []
for header, seq in cleaned_sequences.items():
    mutated_sequence = unique_mutation(seq, n)
    data.append({
        "Header": header,
        "Sequence": seq,
        "Length": len(seq),
        "GC_Content(%)": calculate_gc(seq),
        "Mutated_Sequence": mutated_sequence,
        "Mutated_GC_Content(%)": calculate_gc(mutated_sequence),
    })

df = pd.DataFrame(data)
print(df)

# index=False keeps the DataFrame's row index out of the CSV.
df.to_csv("unique_mutation.csv", index=False)
    

Enter the numbers of mutations you want to occur: 3


Original GC: 48.48
Original GC: 50.0
Original GC: 50.0
           Header                              Sequence  Length  \
0  Human_sequence     ATGCTAGCTAGCTAACGATGCTAGCTAGCTGAC      33   
1  Mouse_sequence  TTGCGCGGATCGTAGCTAGCTAGCTAGCTAATGCTA      36   
2  Plant_sequence      GCTAGCTAGCATCGATCGTATAGCTAGCTAGC      32   

   GC_Content(%)                      Mutated_Sequence  Mutated_GC_Content(%)  
0          48.48     ATGCTAGCTAGCTAACGGTGCTAGCTAGCTAAG                  48.48  
1          50.00  TTGCGAGGATCGTAGATAGGTAGCTAGCTAATGCTA                  44.44  
2          50.00      GCTAGCCAGCATCGATCGTATAGCCAGCTATC                  53.12  
