In [2]:
import pandas as pd
from pathlib import Path

dataset_path = Path("/mnt/i/bio/mave_curation/Kotler_TP53_RelativeFitnessHCT116/")

In [3]:
data = pd.read_excel(dataset_path / "raw" / "mmc5.xlsx")

In [4]:
subset = data[(~data.AA_change.isna()) & (data.AA_change.str.len() == 3) & (data.Sec_AA_change.isna()) & (data.Mut_type != "DelAA")]

In [5]:
subset

Unnamed: 0,Number,Var_seq,Identifier,SubLib,Backbone,Mut_type,Position,Seq_change,Codon_num,AA_change,Sec_mut_position,Sec_seq_change,Sec_codon_num,Sec_AA_change,MutID,IARC_Desc,Silent,RFS_HCT116
26,6445,AATAACCGGAGGCCCATCCTCACCATCATCACACTGGAAGACTCCA...,400063,DBDD,wt,AASub,736,ATG>AAT,246.0,M>N,,,,,,,False,0.157110
27,6446,AGTAACCGGAGGCCCATCCTCACCATCATCACACTGGAAGACTCCA...,400064,DBDD,wt,AASub,736,ATG>AGT,246.0,M>S,,,,,,,False,0.422444
28,6447,CAGAACCGGAGGCCCATCCTCACCATCATCACACTGGAAGACTCCA...,400065,DBDD,wt,AASub,736,ATG>CAG,246.0,M>Q,,,,,,,False,0.439412
29,6448,CCGAACCGGAGGCCCATCCTCACCATCATCACACTGGAAGACTCCA...,400066,DBDD,wt,AASub,736,ATG>CCG,246.0,M>P,,,,,,,False,-0.247905
30,6449,GAGAACCGGAGGCCCATCCTCACCATCATCACACTGGAAGACTCCA...,400067,DBDD,wt,AASub,736,ATG>GAG,246.0,M>E,,,,,,,False,0.314707
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3208,9821,ATGAACCGGAGGCCCATCCTCACCATCATCACACTGGAAGACTCCA...,403439,DBDD,wt,Sub,875,A>C,292.0,K>T,,,,,,,False,-0.919863
3209,9822,ATGAACCGGAGGCCCATCCTCACCATCATCACACTGGAAGACTCCA...,403440,DBDD,wt,Sub,875,A>G,292.0,K>R,,,,,,,False,-1.260542
3210,9823,ATGAACCGGAGGCCCATCCTCACCATCATCACACTGGAAGACTCCA...,403441,DBDD,wt,Sub,875,A>T,292.0,K>I,,,,,,,False,-0.790598
3214,9827,ATGAACCGGAGGCCCATCCTCACCATCATCACACTGGAAGACTCCA...,403445,DBDD,wt,Sub,876,A>C,292.0,K>N,,,,,,,False,-0.985578


In [6]:
from Bio.PDB.Polypeptide import protein_letters_3to1
protein_letters_1to3 = {v: k.title() for k, v in protein_letters_3to1.items()}
protein_letters_1to3["*"] = "Ter"
def make_hgvs_pro(r):
    return f"p.{protein_letters_1to3[r.AA_change[0]]}{int(r.Codon_num)}{protein_letters_1to3[r.AA_change[-1]]}"

In [7]:
subset = subset.assign(hgvs_pro=subset.apply(make_hgvs_pro, axis=1))

In [8]:
subset = subset.rename(columns={"RFS_HCT116":"score"})

In [9]:
aggregated = subset.groupby("hgvs_pro").agg({"score":"mean"}).reset_index()

In [10]:
aggregated

Unnamed: 0,hgvs_pro,score
0,p.Ala276Ala,-0.984566
1,p.Ala276Arg,-0.167678
2,p.Ala276Asn,-0.108430
3,p.Ala276Asp,-0.736095
4,p.Ala276Cys,-1.157606
...,...,...
789,p.Val274Ser,0.165487
790,p.Val274Ter,0.474386
791,p.Val274Thr,0.024092
792,p.Val274Tyr,0.388258


In [11]:
aggregated.to_csv(dataset_path / "scoreset.csv", index=False)