In [9]:
# Wild-type protein sequence in one-letter amino-acid codes; passed as the
# template (`wt_seq`) to get_mutated_sequence() further down.
seq = 'MDAPRQVVNFGPGPAKLPHSVLLEIQKELLDYKGVGISVLEMSHRSSDFAKIINNTENLVRELLAVPDNYKVIFLQGGGCGQFSAVPLNLIGLKAGRCADYVVTGAWSAKAAEEAKKFGTINIVHPKLGSYTKIPDPSTWNLNPDASYVYYCANETVHGVEFDFIPDVKGAVLVCDMSSNFLSKPVDVSKFGVIFAGAQKNVGSAGVTVVIVRDDLLGFALRECPSVLEYKVQAGNSSLYNTPPCFSIYVMGLVLEWIKNNGGAAAMEKLSSIKSQTIYEIIDNSQGFYVCPVEPQNRSKMNIPFRIGNAKGDDALEKRFLDKALELNMLSLKGHRSVGGIRASLYNAVTIEDVQKLAAFMKKFLEMHQL'


In [10]:
# Python indexing is 0-based, so index 14 is the 15th residue of `seq`.
seq[14]

'A'

In [12]:
import pandas as pd
import re

# 1. WILD TYPE SEQUENCE
# (The wild-type sequence is the variable `seq` defined in an earlier cell; run that cell first.)

# 2. Amino Acid Mapping (3-letter to 1-letter)
aa_map = {
    'Ala': 'A', 'Arg': 'R', 'Asn': 'N', 'Asp': 'D', 'Cys': 'C', 'Gln': 'Q', 'Glu': 'E',
    'Gly': 'G', 'His': 'H', 'Ile': 'I', 'Leu': 'L', 'Lys': 'K', 'Met': 'M', 'Phe': 'F',
    'Pro': 'P', 'Ser': 'S', 'Thr': 'T', 'Trp': 'W', 'Tyr': 'Y', 'Val': 'V'
}

# One substitution written as <3-letter AA><position><3-letter AA>, e.g. "Ala15Val".
# It is applied with fullmatch() so that longer descriptions which merely START
# like a substitution (e.g. a frameshift "Arg97ProfsTer23") are rejected.
_SUBSTITUTION_RE = re.compile(r"([A-Za-z]{3})(\d+)([A-Za-z]{3})")


def get_mutated_sequence(wt_seq, mut_str):
    """Apply a single amino-acid substitution to a wild-type sequence.

    Parameters
    ----------
    wt_seq : str
        Wild-type sequence in one-letter codes.
    mut_str : str
        Substitution in three-letter notation with a 1-based position,
        optionally prefixed with ``p.`` (e.g. ``"p.Ala15Val"``).

    Returns
    -------
    str or None
        The sequence with the residue at the given position replaced, or
        ``None`` when the substitution cannot be applied safely:

        * either argument is not a string (e.g. a NaN cell from pandas),
        * ``mut_str`` is not exactly one three-letter substitution,
        * either residue code is not a key of ``aa_map``,
        * the position is outside ``1..len(wt_seq)``,
        * the stated wild-type residue differs from ``wt_seq`` at that position.
    """
    # Missing values read from a CSV arrive as float NaN, not str.
    if not isinstance(wt_seq, str) or not isinstance(mut_str, str):
        return None

    # Clean the string (remove 'p.' and surrounding whitespace)
    clean_mut = mut_str.replace("p.", "").strip()

    match = _SUBSTITUTION_RE.fullmatch(clean_mut)
    if match is None:
        return None

    orig_aa_3, pos_str, target_aa_3 = match.groups()
    pos = int(pos_str)

    # Slicing would silently accept out-of-range positions (position 0 or
    # anything past the end) and return a sequence of the wrong length.
    if not 1 <= pos <= len(wt_seq):
        return None

    # aa_map keys are capitalised ("Ala"); normalise so "ALA"/"ala" also resolve.
    orig_aa = aa_map.get(orig_aa_3.capitalize())
    new_aa = aa_map.get(target_aa_3.capitalize())
    if orig_aa is None or new_aa is None:
        return None

    # A mismatch here means the mutation is numbered against a different
    # reference than wt_seq, so substituting would yield a wrong variant.
    if wt_seq[pos - 1].upper() != orig_aa:
        return None

    # [Everything before] + [New AA] + [Everything after]; pos-1 converts the
    # 1-based position to a 0-based index.
    return wt_seq[:pos - 1] + new_aa + wt_seq[pos:]

# 3. Read the source table of variants
# ('data_orig.csv' is read from the working directory and must have an 'hgvs_pro' column)
df = pd.read_csv('data_orig.csv')

# 4. Pair every mutation label with the sequence produced for it
results = [
    {'Mutation': mut, 'Full_Sequence': get_mutated_sequence(seq, mut)}
    for mut in df['hgvs_pro']
]

# 5. Write the table out as CSV, leaving out the index column
output_df = pd.DataFrame(results)
output_df.to_csv('mutated_sequences_output.csv', index=False)

print("File 'mutated_sequences_output.csv' has been created successfully.")

File 'mutated_sequences_output.csv' has been created successfully.
