# htFuncLib 01
## Prepare htFuncLib jobs based on neighborhoods.

#### Welcome to htFuncLib!
The purpose of this notebook is to run the htFuncLib algorithm.
It assumes that you have already downloaded the PSSM, refined structure and resfiles from the [FuncLib](https://funclib.weizmann.ac.il/bin/steps) webserver.

htFuncLib consists of these steps:
1. imports and initial setup
2. setting up initial parameters
3. create the bubbles
4. choose ∆∆G
5. prepare and run design jobs
6. run all jobs and aggregate scores

once finished, switch to 02_htFuncLib_aggregation_and_analysis.ipynb

Note: you will have to run these jobs yourself. This requires Rosetta and a computational cluster. Some changes to the jobs are probably required to accommodate different systems.

## 01 imports and initial setup

In [1]:
import os
import random
from copy import deepcopy
from glob import glob
from itertools import product
from pathlib import Path
from typing import Dict, List, Optional, Tuple

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from Bio.PDB.PDBParser import PDBParser
from tqdm.notebook import tqdm

In /home/labs/fleishman/jonathaw/Code/anaconda3/envs/dees-seq-analysis/lib/python3.9/site-packages/matplotlib/mpl-data/stylelib/_classic_test.mplstyle: 
The text.latex.preview rcparam was deprecated in Matplotlib 3.3 and will be removed two minor releases later.
In /home/labs/fleishman/jonathaw/Code/anaconda3/envs/dees-seq-analysis/lib/python3.9/site-packages/matplotlib/mpl-data/stylelib/_classic_test.mplstyle: 
The mathtext.fallback_to_cm rcparam was deprecated in Matplotlib 3.3 and will be removed two minor releases later.
In /home/labs/fleishman/jonathaw/Code/anaconda3/envs/dees-seq-analysis/lib/python3.9/site-packages/matplotlib/mpl-data/stylelib/_classic_test.mplstyle: Support for setting the 'mathtext.fallback_to_cm' rcParam is deprecated since 3.3 and will be removed two minor releases later; use 'mathtext.fallback : 'cm' instead.
In /home/labs/fleishman/jonathaw/Code/anaconda3/envs/dees-seq-analysis/lib/python3.9/site-packages/matplotlib/mpl-data/stylelib/_classic_test.mplstyle

In [102]:
def display_all(df, n=1000):
    """Show ``df`` with pandas' display limits temporarily raised.

    ``n`` is used as ``display.max_rows``; ``display.max_columns`` is always
    set to 1000. Both options are restored when the function returns.
    """
    raised_limits = pd.option_context(
        "display.max_rows", n, "display.max_columns", 1000
    )
    with raised_limits:
        # `display` is not defined in this file; presumably the IPython/Jupyter
        # builtin available inside a notebook kernel.
        display(df)

In [2]:
# fixed seed, so the random sampling of sequence combinations is reproducible
random.seed(42)

# load basic parameters, this requires no changes
# one-letter codes of the 20 amino acids used throughout the notebook
AAs = list("ACDEFGHIKLMNPQRSTVWY")

# atom names that are skipped when `avoid_bb` is requested in the distance helpers
BB_ATOMS = ["N", "CA", "C", "O"]
# grid of values 0.0, 0.5, ..., 6.0
DDGS = np.arange(0, 6.5, 0.5)
# three-letter codes, listed in the same order as the one-letter codes in AAs
ONE_2_THREE = dict(
    zip(
        AAs,
        (
            "ALA CYS ASP GLU PHE GLY HIS ILE LYS LEU "
            "MET ASN PRO GLN ARG SER THR VAL TRP TYR"
        ).split(),
    )
)
# reverse lookup: three-letter code -> one-letter code
THREE_2_ONE = dict(zip(ONE_2_THREE.values(), ONE_2_THREE.keys()))

In [12]:
# bubble class:
class Bubble:
    """A neighborhood ("bubble") of positions.

    ``nodes`` holds the central position(s) of the bubble and ``extended``
    the additional positions attached to it. In this notebook positions are
    "<residue number><chain>" strings such as ``"42A"``.
    """

    def __init__(
        self,
        name,
        nodes: Optional[List[str]] = None,
        extended: Optional[List[str]] = None,
    ):
        """Create a bubble called ``name``.

        Args:
            name: identifier of the bubble.
            nodes: initial central positions (default: none).
            extended: initial extension positions (default: none).
        """
        self.name = name
        # Always store fresh lists: a literal `[]` default would be shared by
        # every instance created without arguments, and the in-place adds
        # below would otherwise also mutate a list owned by the caller.
        self.nodes = [] if nodes is None else list(nodes)
        self.extended = [] if extended is None else list(extended)

    def __repr__(self) -> str:
        return f"{self.name} {self.nodes} {len(self.nodes)} {self.extended} {len(self.extended)}"

    def add_node(self, node):
        """Add one position, or every position of a list, to ``nodes``."""
        if isinstance(node, list):
            self.nodes.extend(node)
        else:
            self.nodes.append(node)

    def add_extended(self, ext):
        """Add one position, or every position of a list, to ``extended``."""
        if isinstance(ext, list):
            self.extended.extend(ext)
        else:
            self.extended.append(ext)

    def add_extensions(self, extensions: List[str]) -> None:
        """Add every item of ``extensions`` to ``extended``."""
        for ext in extensions:
            self.add_extended(ext)

    def remove_node(self, node):
        """Remove ``node`` from ``nodes``; raises ValueError if it is absent."""
        self.nodes.remove(node)

    def remove_extension(self, extension):
        """Remove ``extension`` from ``extended``; raises ValueError if absent."""
        self.extended.remove(extension)

    def remove_extensions(self, to_remove: List[str]) -> None:
        """Remove every item of ``to_remove`` from ``extended``.

        Raises:
            ValueError: naming the first item that is not in ``extended``.
                Items before it have already been removed at that point.
        """
        for to_ in to_remove:
            try:
                self.remove_extension(to_)
            except ValueError as err:
                raise ValueError(
                    f"could not remove {to_} from {self.extended}"
                ) from err

    def get_all_nodes(self):
        """Return a new list: ``nodes`` followed by ``extended``."""
        return self.nodes + self.extended

In [13]:
# define some utility functions
def create_dir(name):
    """Create directory ``name`` unless something already exists at that path.

    Missing parent directories are created as well, and a directory that
    appears between the existence check and the creation (e.g. made by a
    parallel job) is not treated as an error.

    Args:
        name: path of the directory, as ``str`` or ``Path``.
    """
    target = Path(name)
    if target.exists():
        return
    target.mkdir(parents=True, exist_ok=True)


def get_residue_by_num_chain(pdb, num_chain):
    """Return the first residue of ``pdb`` matching ``num_chain``, else ``None``.

    ``num_chain`` is a string whose last character is compared with the
    residue's ``full_id[2]`` and whose remaining prefix is compared with
    ``str(residue._id[1])`` (e.g. ``"42A"``).
    """
    hits = (
        residue
        for residue in pdb.get_residues()
        if residue.full_id[2] == num_chain[-1]
        and str(residue._id[1]) == num_chain[:-1]
    )
    return next(hits, None)


def calc_min_residue_distance(pdb, num_chain1, num_chain2):
    """Return the smallest atom-atom distance between two residues.

    Both residues are looked up with ``get_residue_by_num_chain``. Every atom
    of the first residue is compared with every atom of the second; the result
    is ``np.inf`` when no atom pair is visited.
    """
    first = get_residue_by_num_chain(pdb, num_chain1)
    second = get_residue_by_num_chain(pdb, num_chain2)
    closest = np.inf
    for atom_a in first.get_atoms():
        for atom_b in second.get_atoms():
            separation = np.linalg.norm(atom_a.get_coord() - atom_b.get_coord())
            if separation < closest:
                closest = separation
    return closest


def count_residues_closer_than_distance(
    pdb, num_chain1, num_chain2, distance_threshold, avoid_bb
):
    """Count atom pairs between two residues that lie within a distance.

    Despite the name, the return value is the number of (atom, atom) pairs,
    one atom from each residue, whose distance is ``<= distance_threshold``.
    When ``avoid_bb`` is true, atoms whose name is listed in ``BB_ATOMS`` are
    left out on both sides.
    """
    first = get_residue_by_num_chain(pdb, num_chain1)
    second = get_residue_by_num_chain(pdb, num_chain2)
    count = 0
    for atom_a in first.get_atoms():
        if avoid_bb and atom_a.name in BB_ATOMS:
            continue
        # tally the partner atoms of the second residue that are close enough
        count += sum(
            1
            for atom_b in second.get_atoms()
            if not (avoid_bb and atom_b.name in BB_ATOMS)
            and np.linalg.norm(atom_a.get_coord() - atom_b.get_coord())
            <= distance_threshold
        )
    return count


def parse_resfile(f):
    """Read a whitespace-separated resfile into a DataFrame.

    The first two lines of the file are skipped (the resfiles written by this
    notebook start with ``nataa`` and ``start``). Each remaining line is read
    as four columns: ``pos``, ``chain``, ``PIKAA`` and ``AAs``.

    Two columns are added:
        len_AAs: number of characters in ``AAs``.
        num_chain: ``pos`` and ``chain`` concatenated, e.g. ``"42A"``.

    Args:
        f: path (or file-like object) of the resfile.

    Returns:
        pandas.DataFrame with the six columns described above.
    """
    df = pd.read_csv(
        f,
        sep=r"\s+",
        skiprows=2,
        names=["pos", "chain", "PIKAA", "AAs"],
        # Amino-acid sets such as "NA" (Asn + Ala) must stay strings; pandas'
        # default NA markers would turn them into NaN and break `len` below.
        keep_default_na=False,
    )
    df["len_AAs"] = df.AAs.map(len)
    df["num_chain"] = df["pos"].astype(str) + df["chain"]
    return df

## 02 setting up the initial parameters. this is where you use the files you got from FuncLib

In [35]:
# fields for user input
# (paths below are specific to the original author's cluster; adapt them to your system)

# path in which to create files and folders
W_PATH = Path("./").absolute()
# path to Rosetta scripts executable
ROSETTA_SCRIPTS = "/home/labs/fleishman/rosaliel/Rosetta/main/source/build/src/release/linux/3.10/64/x86/gcc/5.4/default/rosetta_scripts.default.linuxgccrelease"
ROSETTA_DB = "/home/labs/fleishman/rosaliel/Rosetta/main/database"
# path to the refined PDB, retrieved from FuncLib
PDB_FILE = os.path.abspath(f"{W_PATH}/initial_data/refined.pdb")
# the structure is parsed once here and reused by the distance / residue helpers
pdb = PDBParser().get_structure("", PDB_FILE)
# the PDB chain to work on
CHAIN = "A"
# positions to work on
ALL_POSS = [
    42,
    44,
    46,
    61,
    64,
    68,
    69,
    110,
    112,
    145,
    150,
    163,
    165,
    167,
    181,
    201,
    220,
    224,
    14,
    16,
    18,
    72,
    108,
    119,
    123,
    185,
]
# the same positions as "<number><chain>" strings, e.g. "42A"
ALL_POSS_CHAIN = [f"{a}{CHAIN}" for a in ALL_POSS]
# path to the PSSM, retrieved from FuncLib
PSSM = f"{W_PATH}/initial_data/pssm"

# maps a float key (parsed from the last three characters of each resfile path) to that resfile
# NOTE(review): `str(a)[-3:]` only yields the right key for suffixes that are exactly three
# characters long, such as "3.0"; a suffix like "10.5" or "-0.5" would be cut to "0.5" - confirm
# against the file names delivered by FuncLib.
RESFILES = {
    float(str(a)[-3:]): W_PATH / a
    for a in (W_PATH / "initial_data").glob("designable_aa_resfile.*.*")
}

# NOTE(review): presumably the residue name(s) of the ligand(s); not used in the cells shown here
LIGS = ["GYS"]
# Rosetta flags file; it is passed to every job as @<W_PATH>/initial_data/flags
flags_file = W_PATH / "initial_data/flags"


def parse_flags(flags_file):
    """Print every line of ``flags_file`` with trailing whitespace removed.

    No filtering or parsing is performed: the lines are only echoed so the
    user can inspect the flags. The file is closed before returning.

    Args:
        flags_file: path of the flags file.

    Returns:
        None.
    """
    with open(flags_file) as handle:
        for line in handle:
            print(line.rstrip())


# parse_flags only prints the file and returns nothing, so `flags` is None after this line
flags = parse_flags(flags_file)

-database /home/labs/fleishman/rosaliel/Rosetta/main/database
-pdb_gz
-overwrite
-use_input_sc
-extrachi_cutoff 5
-ignore_unrecognized_res
-chemical:exclude_patches LowerDNA UpperDNA Cterm_amidation SpecialRotamer VirtualBB ShoveBB VirtualDNAPhosphate VirtualNTerm CTermConnect sc_orbitals pro_hydroxylated_case1 pro_hydroxylated_case2 ser_phosphorylated thr_phosphorylated tyr_phosphorylated tyr_sulfated lys_dimethylated lys_monomethylated lys_trimethylated lys_acetylated glu_carboxylated cys_acetylated tyr_diiodinated N_acetylated C_methylamidated MethylatedProteinCterm
-linmem_ig 10
-ignore_zero_occupancy false
-mute all
-no_nstruct_label true
-in:file:native initial_data/refined.pdb
-extra_res_fa initial_data/LG.params
-s initial_data/refined.pdb
-parser:protocol initial_data/mutate.xml
-pdb_gz
-parser:script_vars res_to_fix=94A,96A,121A,148A,203A,205A,222A,1X
-parser:script_vars fix_prefix=<
-parser:script_vars fix_TO=,fix_res
-parser:script_vars cst_full_path=initial_data/coord.cst


## 03 create the bubbles (neighborhoods)

In [19]:
# create bubbles
# one bubble per design position, keyed by its "<number><chain>" name; the position itself is the
# bubble's only (central) node
bubbles = dict((p, Bubble(name=p, nodes=[p], extended=[])) for p in ALL_POSS_CHAIN)

# extend each bubble with every other design position that has at least one pair of non-backbone
# atoms within 6A of the bubble's central position (the helper returns the number of such atom
# pairs, and any non-zero count is accepted).
# you can change the thresholds if you wish
for bub in bubbles.values():
    for p in ALL_POSS_CHAIN:
        if p not in bub.get_all_nodes():
            if count_residues_closer_than_distance(
                pdb, bub.nodes[0], p, 6, avoid_bb=True
            ):
                bub.add_extended(p)

# some bubbles contain too many positions that are not truly interacting with the bubble center.
# you can list those here, and they will be removed.
# (remove_extensions raises ValueError if a listed position is not in that bubble's extensions)
bubbles["42A"].remove_extensions(["69A"])
bubbles["46A"].remove_extensions(["61A", "18A"])


bubbles["68A"].remove_extensions(["16A", "69A", "72A"])
bubbles["69A"].remove_extensions(["42A", "112A", "68A", "224A"])
bubbles["72A"].remove_extensions(["68A", "69A"])

bubbles["112A"].remove_extensions(["69A", "123A"])
bubbles["165A"].remove_extensions(["145A"])
bubbles["224A"].remove_extensions(["150A"])

# use the printed selection statements to select and view the various bubbles in PyMOL.
# use PyMOL to see which positions are in which bubble, so you can safely remove them.
# (the chain letter is stripped from each position, so only residue numbers are printed)
for bub in bubbles.values():
    print(f"select bub_{bub.name}, resi {'+'.join([str(a[:-1]) for a in bub.nodes])}")
    print(
        f"select ext_{bub.name}, resi {'+'.join([str(a[:-1]) for a in bub.extended])}"
    )

select bub_42A, resi 42
select ext_42A, resi 44+46+68+220+224+14+72+119
select bub_44A, resi 44
select ext_44A, resi 42+46+64+68+220+14+16
select bub_46A, resi 46
select ext_46A, resi 42+44+64+68+220+14+16+123
select bub_61A, resi 61
select ext_61A, resi 46+64+145+167+220
select bub_64A, resi 64
select ext_64A, resi 44+46+61+220+16+18+123
select bub_68A, resi 68
select ext_68A, resi 42+44+46+112+14+119+123
select bub_69A, resi 69
select ext_69A, resi 150+163+165+201+72+185
select bub_110A, resi 110
select ext_110A, resi 112+18+108+123
select bub_112A, resi 112
select ext_112A, resi 68+110+14+119+185
select bub_145A, resi 145
select ext_145A, resi 61+165+167+181+220
select bub_150A, resi 150
select ext_150A, resi 69+163+165+167+181+201+224
select bub_163A, resi 163
select ext_163A, resi 69+150+165+181+201+185
select bub_165A, resi 165
select ext_165A, resi 69+150+163+167+181
select bub_167A, resi 167
select ext_167A, resi 61+145+150+165+181
select bub_181A, resi 181
select ext_181A, res

In [30]:
# create bubble resfiles
# each bubble will have its own resfile for each ∆∆G threshold
# BUB_RESFILES[threshold][bubble name] -> path of the resfile written for that bubble
BUB_RESFILES = {}
bubb_resfiles_path = W_PATH / "bubb_resfiles"
create_dir(bubb_resfiles_path)
# one-letter code of the residue found in the PDB at every design position
wt_pos_aa = dict(
    (p, THREE_2_ONE[get_residue_by_num_chain(pdb, p).resname]) for p in ALL_POSS_CHAIN
)
for fs, ori_rf_file in RESFILES.items():
    ori_rf_main = parse_resfile(ori_rf_file)
    BUB_RESFILES[fs] = {}
    for bub in bubbles.values():
        ori_rf = deepcopy(ori_rf_main)

        # allowed amino acids are taken from the original resfile only for the bubble's extensions
        # NOTE(review): the central node (bub.nodes) is not in bub.extended, so it is written with
        # its PDB identity only, whereas the design-job cell uses get_all_nodes() - confirm intended.
        resfile = {
            p: aas for p, aas in zip(ori_rf.num_chain, ori_rf.AAs) if p in bub.extended
        }
        # every other design position is restricted to the identity found in the PDB
        for pos, aa in wt_pos_aa.items():
            if pos not in resfile.keys():
                resfile[pos] = aa
        # skip bubbles in which every design position has a single allowed identity
        if all(
            [len(resfile[p]) == 1 for p in ori_rf.num_chain if int(p[:-1]) in ALL_POSS]
        ):
            continue
        # order the entries by residue number
        resfile = dict(sorted(resfile.items(), key=lambda x: int(x[0][:-1])))

        rf_file = f"{str(bubb_resfiles_path)}/bub{bub.name}_{fs}.resfile"
        with open(rf_file, "w+") as fout:
            # two header lines; parse_resfile skips exactly two lines when reading the file back
            fout.write("nataa\nstart\n")
            for k, v in resfile.items():
                fout.write(f"{k[:-1]: <8}{CHAIN}       PIKAA   {v}\n")
        BUB_RESFILES[fs][bub.name] = rf_file

In [31]:
# running this block will print how many jobs will be required to cover each bubble for every ∆∆G
# threshold.
# count_by_ddg[threshold] accumulates the number of sequence combinations over all bubbles
count_by_ddg = {ddg: 0 for ddg in sorted(BUB_RESFILES)}
for ddg in count_by_ddg:
    jobs_per_ddg = 0
    for bub in bubbles.values():
        # bubbles without any diversity at this threshold have no resfile
        if bub.name not in BUB_RESFILES[ddg]:
            continue
        # number of combinations = product of the number of allowed identities per position
        # (np.prod: the np.product alias no longer exists in NumPy 2.0)
        len_perms = np.prod(
            parse_resfile(BUB_RESFILES[ddg][bub.name]).len_AAs.tolist()
        )
        count_by_ddg[ddg] += len_perms
        # same rule as the cell that creates the design jobs: bubbles with more than 1000
        # combinations are sub-sampled to 15%, all others are enumerated in full
        jobs = len_perms if len_perms <= 1000 else int(0.15 * len_perms)
        jobs_per_ddg += jobs
        print(
            f"bubble {bub.name} ext {len(bub.extended)} dG {ddg:.1f} perms: {len_perms} jobs: {jobs}"
        )
    print(f"∆∆G {ddg:.1f} {count_by_ddg[ddg]} #jobs {jobs_per_ddg}")
    print("*****************************************************************")
    # stop once a threshold already needs more than 10^7 combinations
    if count_by_ddg[ddg] > 10000000:
        break

bubble 42A ext 8 dG 0.0 perms: 8 jobs: 8
bubble 44A ext 7 dG 0.0 perms: 4 jobs: 4
bubble 46A ext 8 dG 0.0 perms: 8 jobs: 8
bubble 61A ext 5 dG 0.0 perms: 2 jobs: 2
bubble 64A ext 7 dG 0.0 perms: 8 jobs: 8
bubble 68A ext 7 dG 0.0 perms: 2 jobs: 2
bubble 69A ext 6 dG 0.0 perms: 8 jobs: 8
bubble 110A ext 4 dG 0.0 perms: 20 jobs: 20
bubble 112A ext 5 dG 0.0 perms: 2 jobs: 2
bubble 145A ext 5 dG 0.0 perms: 2 jobs: 2
bubble 150A ext 7 dG 0.0 perms: 4 jobs: 4
bubble 163A ext 6 dG 0.0 perms: 8 jobs: 8
bubble 165A ext 5 dG 0.0 perms: 8 jobs: 8
bubble 167A ext 5 dG 0.0 perms: 8 jobs: 8
bubble 181A ext 5 dG 0.0 perms: 4 jobs: 4
bubble 201A ext 5 dG 0.0 perms: 16 jobs: 16
bubble 220A ext 6 dG 0.0 perms: 2 jobs: 2
bubble 224A ext 4 dG 0.0 perms: 8 jobs: 8
bubble 14A ext 7 dG 0.0 perms: 4 jobs: 4
bubble 16A ext 7 dG 0.0 perms: 8 jobs: 8
bubble 18A ext 6 dG 0.0 perms: 20 jobs: 20
bubble 108A ext 3 dG 0.0 perms: 4 jobs: 4
bubble 119A ext 4 dG 0.0 perms: 2 jobs: 2
bubble 123A ext 8 dG 0.0 perms: 40 job

## 04 now it's time to choose a ∆∆G threshold
we recommend sticking to <10^6 jobs, so you don't wait too long.
the above blocks calculate the number of jobs required for each ∆∆G threshold.
use their output to choose.
if you are unsatisfied with the sequence space, try pruning more positions from the bubbles that require the most jobs to model.

In [113]:
# the ∆∆G threshold to design with; it must be one of the keys of RESFILES
CHOSEN_DDG = 3.0
# table of the original resfile for that threshold
chosen_rf_df = parse_resfile(RESFILES[CHOSEN_DDG])
chosen_rf_df

Unnamed: 0,pos,chain,PIKAA,AAs,len_AAs,num_chain
0,14,A,PIKAA,I,1,14A
1,16,A,PIKAA,VI,2,16A
2,18,A,PIKAA,LF,2,18A
3,42,A,PIKAA,LIV,3,42A
4,44,A,PIKAA,LIM,3,44A
5,46,A,PIKAA,F,1,46A
6,61,A,PIKAA,VMT,3,61A
7,64,A,PIKAA,L,1,64A
8,66,A,PIKAA,YFLM,4,66A
9,68,A,PIKAA,VALMT,5,68A


## 05 prepare and run the design jobs

In [38]:
# For every bubble, write <DESIGN_PATH>/<bubble>/selected_combinations.txt with one
# rosetta_scripts command line per job: first a wild-type job, then one job per selected
# combination of identities over the bubble's positions.
DESIGN_PATH = "%s/bubb_design" % W_PATH
create_dir(DESIGN_PATH)

# parts of the command line that are the same for every job
base_cmd = f"{ROSETTA_SCRIPTS} -database {ROSETTA_DB} @{W_PATH}/initial_data/flags"
all_ress_wt = ",".join(f"{p}{CHAIN}" for p in sorted(ALL_POSS))
# NOTE(review): position 66 is excluded for design jobs only (it is not in ALL_POSS as shipped,
# so this has no effect here) - looks like a leftover from a specific system; confirm.
all_ress_design = ",".join(f"{p}{CHAIN}" for p in sorted(ALL_POSS) if p != 66)

total = 0
for bub in tqdm(bubbles.values()):
    # all positions of the bubble, ordered by residue number
    bub_nodes = sorted(bub.get_all_nodes(), key=lambda x: int(x[:-1]))

    # allowed identities per position, taken from the resfile of the chosen ∆∆G threshold
    combs = [
        list(chosen_rf_df.loc[chosen_rf_df["num_chain"] == n, "AAs"].values[0])
        for n in bub_nodes
    ]
    perms = list(product(*combs))
    if len(perms) > 1000:
        # sample 15% WITHOUT replacement: drawing with replacement repeats combinations, and
        # repeated jobs write to the same score file, so fewer distinct designs are modelled
        selected_combs = random.sample(perms, k=int(0.15 * len(perms)))
    else:
        selected_combs = perms
    print(f"for bubble {bub.name}, creating {len(selected_combs)} jobs")
    total += len(selected_combs)
    create_dir(f"{DESIGN_PATH}/{bub.name}")
    scores_dir = f"{DESIGN_PATH}/{bub.name}/scores"
    create_dir(scores_dir)

    with open(f"{DESIGN_PATH}/{bub.name}/selected_combinations.txt", "w") as fout:
        # wild-type job: no mutations are passed
        wt_tokens = [
            base_cmd,
            "-out:prefix wt_",
            f"-out:file:score_only {scores_dir}/wt.sc",
            "-parser:script_vars",
            f"all_ress={all_ress_wt}",
        ]
        fout.write(" ".join(wt_tokens) + " \n")

        for perm in selected_combs:
            # e.g. "14AI_42AL_..." - used for both the output prefix and the score file name
            label = "_".join(p + aa for p, aa in zip(bub_nodes, perm))
            tokens = [
                base_cmd,
                f"-out:prefix {label}_",
                f"-out:file:score_only {scores_dir}/{label}.sc",
                "-parser:script_vars",
                f"all_ress={all_ress_design}",
            ]
            # one new_res<i>/target<i> pair per bubble position, numbered from 1
            tokens += [
                f"new_res{i}={ONE_2_THREE[aa]} target{i}={pos}"
                for i, (pos, aa) in enumerate(zip(bub_nodes, perm), start=1)
            ]
            fout.write(" ".join(tokens) + " \n")
print(f"\nCreated a total of {total} jobs.")

  0%|          | 0/26 [00:00<?, ?it/s]

for bubble 42A, creating 945 jobs
for bubble 44A, creating 270 jobs
for bubble 46A, creating 202 jobs
for bubble 61A, creating 72 jobs
for bubble 64A, creating 540 jobs
for bubble 68A, creating 675 jobs
for bubble 69A, creating 252 jobs
for bubble 110A, creating 162 jobs
for bubble 112A, creating 135 jobs
for bubble 145A, creating 360 jobs
for bubble 150A, creating 400 jobs
for bubble 163A, creating 180 jobs
for bubble 165A, creating 200 jobs
for bubble 167A, creating 480 jobs
for bubble 181A, creating 160 jobs
for bubble 201A, creating 560 jobs
for bubble 220A, creating 648 jobs
for bubble 224A, creating 420 jobs
for bubble 14A, creating 270 jobs
for bubble 16A, creating 300 jobs
for bubble 18A, creating 720 jobs
for bubble 72A, creating 42 jobs
for bubble 108A, creating 360 jobs
for bubble 119A, creating 45 jobs
for bubble 123A, creating 1620 jobs
for bubble 185A, creating 90 jobs

Created a total of 10108 jobs.


## 06 This is where you run all jobs and aggregate the resulting scores.

### parse all score files

In [74]:
# Collect the score lines of all jobs: one file per bubble (all_<bubble>.sc) plus one file with
# everything (all_scores.sc). In each score file, line index 1 is used as the column header and
# line index 2 as the score line.
with open(Path(DESIGN_PATH) / "all_scores.sc", "w+") as all_fout:
    for i, bub in tqdm(enumerate(bubbles.values()), total=len(bubbles)):
        bub_dir = Path(DESIGN_PATH) / bub.name
        scores_dir = bub_dir / "scores"
        print(f'for bubble {bub.name} found {len(list(scores_dir.glob("*")))} scores')
        with open(bub_dir / f"all_{bub.name}.sc", "w+") as fout:
            # read_text opens and closes the file, so no handles are left open
            header = (scores_dir / "wt.sc").read_text().split("\n")[1]
            fout.write(header + "\n")
            # the combined file gets the header only once
            if i == 0:
                all_fout.write(header + "\n")
            for f in scores_dir.glob("*.sc"):
                txt = f.read_text().split("\n")
                # a job that died early leaves an empty or truncated score file; report and skip
                # it instead of failing with an IndexError that does not name the file
                if len(txt) < 3 or not txt[2].strip():
                    print(f"skipping incomplete score file {f}")
                    continue
                fout.write(txt[2] + "\n")
                all_fout.write(txt[2] + "\n")

  0%|          | 0/26 [00:00<?, ?it/s]

for bubble 42A found 946 scores
for bubble 44A found 271 scores
for bubble 46A found 184 scores
for bubble 61A found 73 scores
for bubble 64A found 541 scores
for bubble 68A found 676 scores
for bubble 69A found 240 scores
for bubble 110A found 154 scores
for bubble 112A found 136 scores
for bubble 145A found 361 scores
for bubble 150A found 401 scores
for bubble 163A found 168 scores
for bubble 165A found 201 scores
for bubble 167A found 481 scores
for bubble 181A found 161 scores
for bubble 201A found 561 scores
for bubble 220A found 649 scores
for bubble 224A found 421 scores
for bubble 14A found 271 scores
for bubble 16A found 301 scores
for bubble 18A found 721 scores
for bubble 72A found 43 scores
for bubble 108A found 361 scores
for bubble 119A found 46 scores
for bubble 123A found 1115 scores
for bubble 185A found 91 scores


In [75]:
# load the combined, whitespace-separated score table
# (raw string: "\s" in a normal string literal is an invalid escape sequence)
score_df = pd.read_csv(Path(DESIGN_PATH) / "all_scores.sc", sep=r"\s+")
score_df

Unnamed: 0,SCORE:,total_score,coordinate_constraint,designable,dslf_fa13,fa_atr,fa_dun,fa_elec,fa_intra_rep,fa_intra_sol_xover4,...,hbond_sc,hbond_sr_bb,lk_ball_wtd,omega,p_aa_pp,pro_close,rama_prepro,ref,yhh_planarity,description
0,SCORE:,-723.948,1.034,126.0,0.0,-1368.315,288.487,-399.611,2.539,46.396,...,-51.317,-26.518,-14.113,32.150,-55.417,0.616,7.052,43.062,0.061,14AI_42AI_44AM_46AF_68AM_72AT_119AL_220AL_224A...
1,SCORE:,-722.231,0.952,125.0,0.0,-1363.764,284.696,-404.692,2.533,46.510,...,-51.817,-26.104,-13.437,31.450,-55.262,0.624,8.135,42.267,0.062,14AI_42AI_44AI_46AF_68AM_72AS_119AL_220AL_224A...
2,SCORE:,-728.819,1.403,123.0,0.0,-1356.934,284.763,-401.799,2.547,46.376,...,-52.010,-26.071,-13.276,31.292,-56.685,0.685,3.380,45.848,0.060,14AI_42AI_44AI_46AF_68AA_72AV_119AL_220AV_224A...
3,SCORE:,-716.175,1.670,127.0,0.0,-1358.593,290.850,-402.468,2.570,46.670,...,-52.086,-27.141,-14.186,31.699,-56.158,0.710,4.857,42.271,0.058,14AI_42AL_44AI_46AF_68AL_72AS_119AL_220AI_224A...
4,SCORE:,-719.694,0.771,123.0,0.0,-1359.891,284.000,-405.930,2.522,46.294,...,-52.207,-26.385,-13.832,31.156,-52.760,0.602,8.960,46.493,0.062,14AI_42AL_44AL_46AF_68AV_72AC_119AL_220AV_224A...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9569,SCORE:,-719.213,1.250,124.0,0.0,-1364.257,286.735,-399.173,2.540,46.475,...,-45.717,-27.141,-14.715,30.258,-53.003,0.701,8.736,46.762,0.061,69AH_112AI_163AV_185AV_refined
9570,SCORE:,-709.020,0.748,123.0,0.0,-1350.439,279.293,-403.212,2.521,46.280,...,-46.680,-26.273,-14.617,32.067,-52.507,0.613,9.437,43.765,0.062,69AD_112AT_163AV_185AV_refined
9571,SCORE:,-716.042,0.861,123.0,0.0,-1349.806,278.189,-402.136,2.512,46.002,...,-46.559,-26.397,-13.867,31.836,-52.220,0.613,9.818,47.408,0.062,69AA_112AV_163AV_185AA_refined
9572,SCORE:,-713.561,1.012,122.0,0.0,-1355.586,283.102,-398.640,2.518,46.296,...,-45.712,-26.477,-15.578,31.215,-52.470,0.619,9.237,43.186,0.062,69AE_112AT_163AV_185AV_refined


### prepare additional columns

In [106]:
# description tokens such as "42AI" become {"42A": "I"}: everything but the last character is the
# key, the last character is the value
score_df["mut_dict"] = (
    score_df["description"]
    .str.split("_")
    .map(lambda tokens: {token[:-1]: token[-1] for token in tokens})
)
poss = []
for p in tqdm(ALL_POSS_CHAIN):
    resnum = p[:-1]
    # the first identity listed for the position in the chosen resfile is used as the reference
    wt_aa = chosen_rf_df.loc[chosen_rf_df["pos"] == int(resnum), "AAs"].values[0][0]
    column = f"{wt_aa}{resnum}"
    poss.append(column)
    # identity of each design at this position; designs that do not list it get the reference
    score_df[column] = score_df["mut_dict"].map(lambda muts: muts.get(p, wt_aa))

# num_muts = number of positions whose identity differs from the first letter of the column name
score_df["num_muts"] = score_df.apply(
    lambda row: sum(row[col] != col[0] for col in poss), axis=1
)
poss = sorted(poss, key=lambda col: int(col[1:]))
score_df

  0%|          | 0/26 [00:00<?, ?it/s]

Unnamed: 0,SCORE:,total_score,coordinate_constraint,designable,dslf_fa13,fa_atr,fa_dun,fa_elec,fa_intra_rep,fa_intra_sol_xover4,...,V224,I14,V16,L18,S72,T108,L119,I123,N185,num_muts
0,SCORE:,-723.948,1.034,126.0,0.0,-1368.315,288.487,-399.611,2.539,46.396,...,V,I,V,L,T,T,L,I,N,4
1,SCORE:,-722.231,0.952,125.0,0.0,-1363.764,284.696,-404.692,2.533,46.510,...,V,I,V,L,S,T,L,I,N,3
2,SCORE:,-728.819,1.403,123.0,0.0,-1356.934,284.763,-401.799,2.547,46.376,...,V,I,V,L,V,T,L,I,N,5
3,SCORE:,-716.175,1.670,127.0,0.0,-1358.593,290.850,-402.468,2.570,46.670,...,V,I,V,L,S,T,L,I,N,3
4,SCORE:,-719.694,0.771,123.0,0.0,-1359.891,284.000,-405.930,2.522,46.294,...,V,I,V,L,C,T,L,I,N,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9569,SCORE:,-719.213,1.250,124.0,0.0,-1364.257,286.735,-399.173,2.540,46.475,...,V,I,V,L,S,T,L,I,V,3
9570,SCORE:,-709.020,0.748,123.0,0.0,-1350.439,279.293,-403.212,2.521,46.280,...,V,I,V,L,S,T,L,I,V,3
9571,SCORE:,-716.042,0.861,123.0,0.0,-1349.806,278.189,-402.136,2.512,46.002,...,V,I,V,L,S,T,L,I,A,2
9572,SCORE:,-713.561,1.012,122.0,0.0,-1355.586,283.102,-398.640,2.518,46.296,...,V,I,V,L,S,T,L,I,V,3


In [107]:
# show the identity at every position plus the mutation count; display.max_rows is set to 5
display_all(score_df[poss + ["num_muts"]], 5)

Unnamed: 0,I14,V16,L18,L42,L44,F46,V61,L64,V68,Q69,S72,T108,A110,V112,L119,I123,Y145,V150,V163,F165,T167,H181,N185,L201,L220,V224,num_muts
0,I,V,L,I,M,F,V,L,M,Q,T,T,A,V,L,I,Y,V,V,F,T,H,N,L,L,V,4
1,I,V,L,I,I,F,V,L,M,Q,S,T,A,V,L,I,Y,V,V,F,T,H,N,L,L,V,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9572,I,V,L,L,L,F,V,L,V,E,S,T,A,T,L,I,Y,V,V,F,T,H,V,L,L,V,3
9573,I,V,L,L,L,F,V,L,V,Q,S,T,A,I,L,I,Y,V,V,F,T,H,A,L,L,V,2


In [111]:
# save the annotated score table, without the index column, inside DESIGN_PATH
score_df.to_csv(Path(DESIGN_PATH) / "score_df.csv", index=False)