In [1]:
import pandas as pd
from pathlib import Path

# Root directory for this dataset; the raw input is read from, and the
# score set is written to, locations below it.
# NOTE(review): absolute, machine-specific path — adjust here when running elsewhere.
dataset_path = Path("/data/dzeiberg/mave_calibration/data/Kotler_TP53_RelativeFitnessHCT116/")
# Fail fast, and name the offending path instead of raising a bare AssertionError.
assert dataset_path.exists(), f"Dataset directory not found: {dataset_path}"

In [2]:
# Load the raw variant table from the "raw" sub-directory of the dataset.
data = pd.read_excel(dataset_path.joinpath("raw", "mmc5.xlsx"))

In [3]:
# Tally how many rows carry each mutation-type label.
data["Mut_type"].value_counts()

Mut_type
HSSupp                  1019
AASub                    614
HSComb                   452
Sub                      407
Ins                      399
Del                      102
IARC.InDel.GeneVars       65
DelAA                     43
Del2AA                    41
STOP                      31
SNP_HS_Comb               26
Sub2bp                    16
IARC.InDel.SomatMuts       2
wtp53_DBDD_seq             1
Name: count, dtype: int64

In [4]:
# Display the rows whose `Silent` flag is set.
data.loc[data["Silent"]]

Unnamed: 0,Number,Var_seq,Identifier,SubLib,Backbone,Mut_type,Position,Seq_change,Codon_num,AA_change,Sec_mut_position,Sec_seq_change,Sec_codon_num,Sec_AA_change,MutID,IARC_Desc,Silent,RFS_HCT116
41,6460,ATGAACCGGAGGCCCATCCTCACCATCATCACACTGGAAGACTCCA...,400078,DBDD,wt,wtp53_DBDD_seq,,,,,,,,,,,True,-1.146098
93,6512,ATGAATCGGAGGCCCATCCTCACCATCATCACACTGGAAGACTCCA...,400130,DBDD,wt,Sub,741,C>T,247.0,N>N,,,,,,,True,-0.843581
106,6525,ATGAACAGAAGGCCCATCCTCACCATCATCACACTGGAAGACTCCA...,400143,DBDD,wt,AASub,742,CGG>AGA,248.0,R>R,,,,,,,True,-0.650183
150,6573,ATGAACAGGAGACCCATCCTCACCATCATCACACTGGAAGACTCCA...,400191,DBDD,wt,HSComb,742,CGG>AGG,248.0,R>R,745.0,AGG>AGA,249.0,R>R,,,True,-1.190198
155,6578,ATGAACAGGAGGCCCATCCTCACCATCATCACACTGGAAGACTCCA...,400196,DBDD,wt,HSComb,742,CGG>AGG,248.0,R>R,817.0,CGT>CGA,273.0,R>R,,,True,-0.467684
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3040,9587,ATGAACCGGAGGCCCATCCTCACCATCATCACACTGGAAGACTCCA...,403205,DBDD,wt,Sub,855,G>A,285.0,E>E,,,,,,,True,-1.477308
3072,9619,ATGAACCGGAGGCCCATCCTCACCATCATCACACTGGAAGACTCCA...,403237,DBDD,wt,Sub,858,A>G,286.0,E>E,,,,,,,True,-0.994999
3105,9652,ATGAACCGGAGGCCCATCCTCACCATCATCACACTGGAAGACTCCA...,403270,DBDD,wt,Sub,861,G>A,287.0,E>E,,,,,,,True,-1.108044
3141,9689,ATGAACCGGAGGCCCATCCTCACCATCATCACACTGGAAGACTCCA...,403307,DBDD,wt,Sub,864,T>C,288.0,N>N,,,,,,,True,-0.959786


In [5]:
# Keep rows labelled "AASub" or "Sub", plus every row flagged Silent
# regardless of its Mut_type.
# NOTE(review): the Silent flag also pulls in rows of other types (e.g. the
# wild-type row and HSComb rows) — confirm that this is intended.
subset = data.loc[data["Mut_type"].isin(["AASub", "Sub"]) | data["Silent"]]

In [6]:
from Bio.PDB.Polypeptide import protein_letters_3to1
# Invert the imported three-letter -> one-letter table and title-case the
# three-letter codes, so that a one-letter residue looks up its three-letter form.
protein_letters_1to3 = {
    one_letter: three_letter.title()
    for three_letter, one_letter in protein_letters_3to1.items()
}
# "*" is additionally mapped to "Ter".
protein_letters_1to3["*"] = "Ter"
def make_hgvs_pro(r):
    """Build a three-letter protein-level label such as ``p.Asn247Asn`` for one row.

    Parameters
    ----------
    r : row-like object
        Must expose ``AA_change`` (its first and last characters are used as
        the one-letter reference and variant residues, e.g. ``"N>N"``) and
        ``Codon_num`` (the position, cast to ``int``).

    Returns
    -------
    str
        ``"p.<Ref><position><Alt>"``, or ``""`` when ``AA_change`` is not a
        non-empty string or ``Codon_num`` is missing.

    Raises
    ------
    KeyError
        If a residue letter is absent from ``protein_letters_1to3``.
    """
    aa_change = r.AA_change
    codon_num = r.Codon_num
    # Missing cells arrive as NaN/None. Test for them explicitly rather than
    # relying on a TypeError, so that a NaN position paired with a valid
    # AA_change also yields "" instead of a ValueError from int(nan).
    if not isinstance(aa_change, str) or not aa_change or pd.isna(codon_num):
        return ""
    ref_letter, alt_letter = aa_change[0], aa_change[-1]
    return f"p.{protein_letters_1to3[ref_letter]}{int(codon_num)}{protein_letters_1to3[alt_letter]}"

In [7]:
# Add an `hgvs_pro` column by applying make_hgvs_pro to every row.
subset = subset.assign(hgvs_pro=lambda frame: frame.apply(make_hgvs_pro, axis=1))

In [9]:
# Discard rows for which make_hgvs_pro returned the empty string.
subset = subset.loc[subset["hgvs_pro"].ne("")]

In [10]:
# Display rows whose label carries the same three-letter code on both sides
# of the position: characters 2-4 of `hgvs_pro` equal its last three characters.
subset.loc[subset["hgvs_pro"].str[2:5] == subset["hgvs_pro"].str[-3:]]

Unnamed: 0,Number,Var_seq,Identifier,SubLib,Backbone,Mut_type,Position,Seq_change,Codon_num,AA_change,Sec_mut_position,Sec_seq_change,Sec_codon_num,Sec_AA_change,MutID,IARC_Desc,Silent,RFS_HCT116,hgvs_pro
93,6512,ATGAATCGGAGGCCCATCCTCACCATCATCACACTGGAAGACTCCA...,400130,DBDD,wt,Sub,741,C>T,247.0,N>N,,,,,,,True,-0.843581,p.Asn247Asn
106,6525,ATGAACAGAAGGCCCATCCTCACCATCATCACACTGGAAGACTCCA...,400143,DBDD,wt,AASub,742,CGG>AGA,248.0,R>R,,,,,,,True,-0.650183,p.Arg248Arg
150,6573,ATGAACAGGAGACCCATCCTCACCATCATCACACTGGAAGACTCCA...,400191,DBDD,wt,HSComb,742,CGG>AGG,248.0,R>R,745.0,AGG>AGA,249.0,R>R,,,True,-1.190198,p.Arg248Arg
155,6578,ATGAACAGGAGGCCCATCCTCACCATCATCACACTGGAAGACTCCA...,400196,DBDD,wt,HSComb,742,CGG>AGG,248.0,R>R,817.0,CGT>CGA,273.0,R>R,,,True,-0.467684,p.Arg248Arg
156,6579,ATGAACAGGAGGCCCATCCTCACCATCATCACACTGGAAGACTCCA...,400197,DBDD,wt,HSComb,742,CGG>AGG,248.0,R>R,817.0,CGT>CGC,273.0,R>R,,,True,-0.158726,p.Arg248Arg
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3040,9587,ATGAACCGGAGGCCCATCCTCACCATCATCACACTGGAAGACTCCA...,403205,DBDD,wt,Sub,855,G>A,285.0,E>E,,,,,,,True,-1.477308,p.Glu285Glu
3072,9619,ATGAACCGGAGGCCCATCCTCACCATCATCACACTGGAAGACTCCA...,403237,DBDD,wt,Sub,858,A>G,286.0,E>E,,,,,,,True,-0.994999,p.Glu286Glu
3105,9652,ATGAACCGGAGGCCCATCCTCACCATCATCACACTGGAAGACTCCA...,403270,DBDD,wt,Sub,861,G>A,287.0,E>E,,,,,,,True,-1.108044,p.Glu287Glu
3141,9689,ATGAACCGGAGGCCCATCCTCACCATCATCACACTGGAAGACTCCA...,403307,DBDD,wt,Sub,864,T>C,288.0,N>N,,,,,,,True,-0.959786,p.Asn288Asn


In [11]:
# Expose the RFS_HCT116 measurement under the column name "score".
subset = subset.rename({"RFS_HCT116": "score"}, axis="columns")

In [12]:
# Write the table to scoreset.csv in the dataset directory, without the index.
subset.to_csv(dataset_path.joinpath("scoreset.csv"), index=False)