In [6]:
import pandas as pd

# Read the spreadsheet into a DataFrame (first sheet, pandas defaults).
df = pd.read_excel(io="41467_2024_45594_MOESM4_ESM.xlsx")
# Preview the first five rows.
df.head(n=5)

Unnamed: 0,Amino Acid,EGFR Position,Z-score,Intracellular_Extracellular,Domain,p-value
0,I643C,643,6.600609,Extracellular,domain IV,0.194558
1,K642C,642,6.541242,Extracellular,domain IV,0.140137
2,Q218C,218,6.47343,Extracellular,domain II,0.154892
3,E709W,709,6.271532,Intracellular,kinase,0.165503
4,S229C,229,6.162138,Extracellular,domain II,0.063936


In [4]:
# Wild-type reference sequence as a string of one-letter amino-acid codes.
# Mutations from the "Amino Acid" column are applied to this string by
# mutate_seq, which treats positions as 1-indexed into it.
# NOTE(review): presumably the full-length human EGFR protein (the data has an
# "EGFR Position" column) — confirm the source/accession of this sequence.
wt_seq = "MRPSGTAGAALLALLAALCPASRALEEKKVCQGTSNKLTQLGTFEDHFLSLQRMFNNCEVVLGNLEITYVQRNYDLSFLKTIQEVAGYVLIALNTVERIPLENLQIIRGNMYYENSYALAVLSNYDANKTGLKELPMRNLQEILHGAVRFSNNPALCNVESIQWRDIVSSDFLSNMSMDFQNHLGSCQKCDPSCPNGSCWGAGEENCQKLTKIICAQQCSGRCRGKSPSDCCHNQCAAGCTGPRESDCLVCRKFRDEATCKDTCPPLMLYNPTTYQMDVNPEGKYSFGATCVKKCPRNYVVTDHGSCVRACGADSYEMEEDGVRKCKKCEGPCRKVCNGIGIGEFKDSLSINATNIKHFKNCTSISGDLHILPVAFRGDSFTHTPPLDPQELDILKTVKEITGFLLIQAWPENRTDLHAFENLEIIRGRTKQHGQFSLAVVSLNITSLGLRSLKEISDGDVIISGNKNLCYANTINWKKLFGTSGQKTKIISNRGENSCKATGQVCHALCSPEGCWGPEPRDCVSCRNVSRGRECVDKCNLLEGEPREFVENSECIQCHPECLPQAMNITCTGRGPDNCIQCAHYIDGPHCVKTCPAGVMGENNTLVWKYADAGHVCHLCHPNCTYGCTGPGLEGCPTNGPKIPSIATGMVGALLLLLVVALGIGLFMRRRHIVRKRTLRRLLQERELVEPLTPSGEAPNQALLRILKETEFKKIKVLGSGAFGTVYKGLWIPEGEKVKIPVAIKELREATSPKANKEILDEAYVMASVDNPHVCRLLGICLTSTVQLITQLMPFGCLLDYVREHKDNIGSQYLLNWCVQIAKGMNYLEDRRLVHRDLAARNVLVKTPQHVKITDFGLAKLLGAEEKEYHAEGGKVPIKWMALESILHRIYTHQSDVWSYGVTVWELMTFGSKPYDGIPASEISSILEKGERLPQPPICTIDVYMIMVKCWMIDADSRPKFRELIIEFSKMARDPQRYLVIQGDERMHLPSPTDSNFYRALMDEEDMDDVVDADEYLIPQQGFFSSPSTSRTPLLSSLSATSNNSTVACIDRNGLQSCPIKEDSFLQRYSSDPTGALTEDSIDDTFLPVPEYINQSVPKRPAGSVQNPVYHNQPLNPAPSRDPHYQDPHSTAVGNPEYLNTVQPTCVNSTFDSPAHWAQKGSHQISLDNPDYQQDFFPKEAKPNGIFKGSTAENAEYLRVAPQSSEFIGA"

In [10]:
# Store the same wild-type sequence string in every row (scalar broadcast), so
# that each row carries its own reference sequence; the row-wise step that
# builds "mut_seq" reads it back from this column.
df["wt_seq"] = wt_seq

In [11]:
def mutate_seq(mutation, wt_seq):
    """
    Apply a single amino-acid substitution to a wild-type sequence.

    Parameters
    ----------
    mutation : str
        Substitution written as ``<wt_aa><position><mut_aa>``, e.g. ``"I643C"``:
        ``I`` is the wild-type amino acid, ``643`` is the 1-indexed position in
        ``wt_seq`` and ``C`` is the amino acid it is replaced with. Leading and
        trailing whitespace is ignored.
    wt_seq : str
        Wild-type sequence the substitution is applied to.

    Returns
    -------
    str
        A copy of ``wt_seq`` (same length) in which the residue at ``position``
        has been replaced by the mutated amino acid.

    Raises
    ------
    ValueError
        If ``mutation`` is shorter than three characters, if its position part
        is not an integer, if the position is smaller than 1, or if the
        wild-type amino acid named in ``mutation`` does not match ``wt_seq`` at
        that position.
    IndexError
        If the position is larger than ``len(wt_seq)``.
    """
    # Spreadsheet cells often carry stray whitespace; it is never part of the
    # mutation itself.
    mutation = mutation.strip()
    if len(mutation) < 3:
        raise ValueError(
            f"Malformed mutation {mutation!r}: expected <wt_aa><position><mut_aa>, e.g. 'I643C'"
        )

    # Parse mutation string
    wt_aa = mutation[0]  # Wild-type amino acid
    mut_aa = mutation[-1]  # Mutated amino acid
    position_text = mutation[1:-1]
    try:
        position = int(position_text)  # Position (1-indexed)
    except ValueError:
        raise ValueError(
            f"Malformed mutation {mutation!r}: position {position_text!r} is not an integer"
        ) from None

    # Positions are 1-indexed. Without this guard, 0 or a negative number would
    # silently wrap around through Python's negative indexing and could return
    # a corrupted sequence of the wrong length.
    if position < 1:
        raise ValueError(
            f"Position must be >= 1 (1-indexed), got {position} in mutation {mutation!r}"
        )
    if position > len(wt_seq):
        raise IndexError(
            f"Position {position} in mutation {mutation!r} is beyond the sequence length {len(wt_seq)}"
        )

    # Verify wild-type amino acid matches the sequence
    if wt_seq[position-1] != wt_aa:
        raise ValueError(f"Wild-type amino acid at position {position} is {wt_seq[position-1]}, not {wt_aa}")

    # Make the mutation: everything before the position, the new residue,
    # everything after the position.
    return wt_seq[:position - 1] + mut_aa + wt_seq[position:]

In [17]:
# Build the "mut_seq" column: for every row, apply the mutation named in
# "Amino Acid" to that row's "wt_seq". Rows whose "Amino Acid" value is not a
# string (for example a missing value) get None instead.
df["mut_seq"] = [
    mutate_seq(mutation_label, reference_seq) if isinstance(mutation_label, str) else None
    for mutation_label, reference_seq in zip(df["Amino Acid"], df["wt_seq"])
]

In [26]:
# Keep only the columns needed downstream: the wild-type sequence, the mutated
# sequence and the Z-score.
dataset = (
    df[["wt_seq", "mut_seq", "Z-score"]]
    # Drop any row with a missing value in one of the three columns ...
    .dropna()
    # ... and any row where one of the columns holds an empty string.
    .loc[lambda frame: ~frame.isin([""]).any(axis=1)]
)

In [27]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the rows as the test set. random_state pins the shuffle so
# that re-running the notebook produces the same train/test split (and thus
# the same exported CSV files).
train, test = train_test_split(dataset, test_size=0.1, random_state=42)

In [28]:
# Write both splits to CSV in the working directory, without the index column.
train.to_csv(path_or_buf="EGFR_train.csv", index=False)
test.to_csv(path_or_buf="EGFR_test.csv", index=False)