In [2]:
from pathlib import Path
import pandas as pd
import json
import mavehgvs
import os
import numpy as np
from tqdm import tqdm
# Register `progress_apply` on pandas objects so long applies show a progress bar.
tqdm.pandas()
# Root directory of the Weng_KRAS dataset. The WENG_KRAS_DATA_DIR environment
# variable overrides the default location so the notebook can run on other machines.
dataset_path = Path(os.environ.get("WENG_KRAS_DATA_DIR", "/data/dzeiberg/mave_calibration/data/Weng_KRAS/"))
# Fail early, and say which directory was looked for.
assert dataset_path.exists(), f"Dataset directory not found: {dataset_path}"

In [3]:
# Load sheet "TableS4" of the workbook stored under the dataset's raw/ folder.
data = pd.read_excel(
    dataset_path.joinpath("raw", "41586_2023_6954_MOESM5_ESM.xlsx"),
    sheet_name="TableS4",
)

In [15]:
from Bio.PDB.Polypeptide import one_to_index, index_to_three
def one_to_three(x):
    """Convert a one-letter amino-acid code to its three-letter code.

    The stop symbol "*" maps to "Ter". Every other code is looked up through
    Biopython's ``one_to_index`` / ``index_to_three`` and returned exactly as
    ``index_to_three`` produces it; callers that need a particular
    capitalisation (e.g. ``.title()``) apply it themselves.
    """
    # Written as a named function rather than a lambda bound to a name, so
    # tracebacks and reprs show `one_to_three` instead of `<lambda>`.
    if x == "*":
        return "Ter"
    return index_to_three(one_to_index(x))

def get_differences(reference: str, alternate: str):
    """Describe the substitutions that turn ``reference`` into ``alternate``.

    The sequences are compared position by position. Each mismatch is rendered
    as ``p.<Ref><pos><Alt>`` with a 1-based position and residue codes taken
    from ``one_to_three(...).title()``.

    Returns:
        The mismatch descriptions joined with ";" (just the single description
        when exactly one position differs). An empty string is returned both
        when the sequences are identical and when their lengths differ, so the
        two cases cannot be told apart from the result alone.
    """
    # Only equal-length sequences are compared; anything else yields "".
    if len(reference) != len(alternate):
        return ""
    substitutions = [
        "p.{}{}{}".format(one_to_three(ref_aa).title(), position, one_to_three(alt_aa).title())
        for position, (ref_aa, alt_aa) in enumerate(zip(reference, alternate), start=1)
        if ref_aa != alt_aa
    ]
    # Joining a one-element list returns that element unchanged.
    return ";".join(substitutions)

In [16]:
# Spot check: describe row 1's sequence relative to row 0's sequence.
get_differences(
    data.loc[0, "aa_seq"],
    data.loc[1, "aa_seq"],
)

'p.Thr1Lys'

In [17]:
# Row 0 supplies the reference sequence; look it up once rather than once per row.
wt_seq = data.loc[0, "aa_seq"]
# Annotate every row with its difference(s) from the reference. Applying over the
# `aa_seq` column avoids materialising a full row object for each record.
data = data.assign(
    hgvs_pro=data["aa_seq"].progress_apply(lambda seq: get_differences(wt_seq, seq))
)

100%|██████████| 191752/191752 [00:04<00:00, 45753.31it/s]


In [19]:
# Rows whose annotation contains no ";" separator, i.e. at most one listed variant.
data.loc[~data["hgvs_pro"].str.contains(";", regex=False)]

Unnamed: 0,block,aa_seq,Nham_aa,WT,fitness,sigma,growthrate,growthrate_sigma,nor_gr,nor_gr_sigma,nor_fitness,nor_fitness_sigma,assay,hgvs_pro
0,block1,TEYKLVVVGAGGVGKSALTIQLIQNHFVDEYDPTIEDSYRKQVVID...,0,1.0,-0.003326,0.024214,0.100360,0.002178,0.100360,0.002178,-0.003326,0.024214,AbundancePCA,
1,block1,KEYKLVVVGAGGVGKSALTIQLIQNHFVDEYDPTIEDSYRKQVVID...,1,,0.100104,0.024718,0.109664,0.002223,0.109664,0.002223,0.100104,0.024718,AbundancePCA,p.Thr1Lys
2,block1,TKYKLVVVGAGGVGKSALTIQLIQNHFVDEYDPTIEDSYRKQVVID...,1,,0.047686,0.171810,0.104949,0.015454,0.104949,0.015454,0.047686,0.171810,AbundancePCA,p.Glu2Lys
3,block1,TNYKLVVVGAGGVGKSALTIQLIQNHFVDEYDPTIEDSYRKQVVID...,1,,-0.525653,0.133890,0.053377,0.012044,0.053377,0.012044,-0.525653,0.133890,AbundancePCA,p.Glu2Asn
4,block1,TTYKLVVVGAGGVGKSALTIQLIQNHFVDEYDPTIEDSYRKQVVID...,1,,-0.034706,0.069509,0.097538,0.006252,0.097538,0.006252,-0.034706,0.069509,AbundancePCA,p.Glu2Thr
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
180188,block1,*EYKLVVVGAGGVGKSALTIQLIQNHFVDEYDPTIEDSYRKQVVID...,1,,-1.031671,0.101248,-0.007733,0.011480,-0.007733,0.011480,-1.031671,0.101248,BindingPCA RAF1RBD coexpression GAP,p.Thr1Ter
180189,block1,YEYKLVVVGAGGVGKSALTIQLIQNHFVDEYDPTIEDSYRKQVVID...,1,,-0.192284,0.088852,0.087439,0.010074,0.087439,0.010074,-0.192284,0.088852,BindingPCA RAF1RBD coexpression GAP,p.Thr1Tyr
180190,block1,WEYKLVVVGAGGVGKSALTIQLIQNHFVDEYDPTIEDSYRKQVVID...,1,,-0.231918,0.045826,0.082945,0.005196,0.082945,0.005196,-0.231918,0.045826,BindingPCA RAF1RBD coexpression GAP,p.Thr1Trp
180191,block1,CEYKLVVVGAGGVGKSALTIQLIQNHFVDEYDPTIEDSYRKQVVID...,1,,-0.016761,0.041797,0.107341,0.004739,0.107341,0.004739,-0.016761,0.041797,BindingPCA RAF1RBD coexpression GAP,p.Thr1Cys


In [9]:
# Row 0 is used as the wild-type reference for every comparison below. Check that
# its `WT` flag is set (1.0 in the loaded table) so a reordered or filtered frame
# fails loudly here instead of silently producing wrong annotations.
assert data.loc[0, "WT"] == 1, "row 0 is expected to be the wild-type sequence"
wt_seq = data.loc[0, "aa_seq"]

In [25]:
from Bio.PDB.Polypeptide import one_to_index,index_to_three

def one2three(c):
    """Return the title-cased three-letter code for one-letter code ``c``.

    The stop symbol "*" is reported as "Ter"; any other code goes through
    ``one_to_index`` / ``index_to_three`` and is then title-cased.
    """
    return "Ter" if c == "*" else index_to_three(one_to_index(c)).title()

def find_variants(wt_seq : str, alt_seq : str):
    """List the substitutions that distinguish ``alt_seq`` from ``wt_seq``.

    Args:
        wt_seq: Reference (wild-type) amino-acid sequence, one letter per residue.
        alt_seq: Sequence to describe relative to ``wt_seq``.

    Returns:
        ``"-"`` when the two sequences differ in length. Otherwise the
        substitutions formatted as ``p.<Ref><pos><Alt>`` (1-based position,
        codes from ``one2three``) and joined with ";"; this is the empty
        string when the sequences are identical.
    """
    # Positional comparison is only meaningful for equal-length sequences.
    if len(wt_seq) != len(alt_seq):
        return "-"
    # Walk the two strings directly; converting each pair to numpy arrays adds
    # per-call overhead without changing the result.
    return ";".join(
        f"p.{one2three(wt_aa)}{position}{one2three(alt_aa)}"
        for position, (wt_aa, alt_aa) in enumerate(zip(wt_seq, alt_seq), start=1)
        if wt_aa != alt_aa
    )

In [26]:
# Recompute the annotation column with find_variants, comparing each row's
# sequence against wt_seq.
data = data.assign(
    hgvs_pro=lambda frame: frame["aa_seq"].progress_apply(
        lambda alt_seq: find_variants(wt_seq, alt_seq)
    )
)

  0%|          | 0/191752 [00:00<?, ?it/s]

100%|██████████| 191752/191752 [00:17<00:00, 10727.22it/s]


In [33]:
# Display the annotated table; pandas abbreviates the rendering to the first and last rows.
data

Unnamed: 0,block,aa_seq,Nham_aa,WT,fitness,sigma,growthrate,growthrate_sigma,nor_gr,nor_gr_sigma,nor_fitness,nor_fitness_sigma,assay,hgvs_pro
0,block1,TEYKLVVVGAGGVGKSALTIQLIQNHFVDEYDPTIEDSYRKQVVID...,0,1.0,-0.003326,0.024214,0.100360,0.002178,0.100360,0.002178,-0.003326,0.024214,AbundancePCA,
1,block1,KEYKLVVVGAGGVGKSALTIQLIQNHFVDEYDPTIEDSYRKQVVID...,1,,0.100104,0.024718,0.109664,0.002223,0.109664,0.002223,0.100104,0.024718,AbundancePCA,p.Thr1Lys
2,block1,TKYKLVVVGAGGVGKSALTIQLIQNHFVDEYDPTIEDSYRKQVVID...,1,,0.047686,0.171810,0.104949,0.015454,0.104949,0.015454,0.047686,0.171810,AbundancePCA,p.Glu2Lys
3,block1,TNYKLVVVGAGGVGKSALTIQLIQNHFVDEYDPTIEDSYRKQVVID...,1,,-0.525653,0.133890,0.053377,0.012044,0.053377,0.012044,-0.525653,0.133890,AbundancePCA,p.Glu2Asn
4,block1,TTYKLVVVGAGGVGKSALTIQLIQNHFVDEYDPTIEDSYRKQVVID...,1,,-0.034706,0.069509,0.097538,0.006252,0.097538,0.006252,-0.034706,0.069509,AbundancePCA,p.Glu2Thr
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
191747,block1,FEYKLVVVGAGGVGKSALTIQLIQNHFVDEYDPTIEDSYRKQVVID...,2,,0.364996,0.107613,0.150625,0.012201,0.150625,0.012201,0.364996,0.107613,BindingPCA RAF1RBD coexpression GAP,p.Thr1Phe;p.Gln60Leu
191748,block1,FEYKLVVVGAGGVGKSALTIQLIQNHFVDEYDPTIECSYRKQVVID...,2,,-0.856652,0.248108,0.012111,0.028131,0.012111,0.028131,-0.856652,0.248108,BindingPCA RAF1RBD coexpression GAP,p.Thr1Phe;p.Asp37Cys
191749,block1,FEYKLVVVGAGGVGKSALTIQLIQNHFVDEYDPTIGDSYRKQVVID...,2,,-1.072136,0.396901,-0.012321,0.045002,-0.012321,0.045002,-1.072136,0.396901,BindingPCA RAF1RBD coexpression GAP,p.Thr1Phe;p.Glu36Gly
191750,block1,FEYKLVVVGAGGVGKSALTIQLIQNHFVDEYDPSIEDSYRKQVVID...,2,,-0.908184,0.200493,0.006268,0.022733,0.006268,0.022733,-0.908184,0.200493,BindingPCA RAF1RBD coexpression GAP,p.Thr1Phe;p.Thr34Ser


In [34]:
# Number of rows recorded for each assay.
data["assay"].value_counts()

assay
AbundancePCA                           27615
BindingPCA RAF1RBD                     26820
BindingPCA RALGDSRBD                   22771
BindingPCA SOS1                        22096
BindingPCA DARPin K27                  22096
BindingPCA DARPin K55                  22096
BindingPCA PIK3CGRBD                   21982
BindingPCA full length RAF1            13402
BindingPCA RAF1RBD coexpression GAP    12874
Name: count, dtype: int64