In [None]:
# start coding here
import pandas as pd

# Both input tables are CSV files whose paths are supplied by the Snakemake rule.
df = pd.read_csv(snakemake.input["syn_df"])

# The first CSV column is read as the index and then discarded in favour of a
# fresh 0..n-1 index.
abag_df = pd.read_csv(snakemake.input["abag_df"], index_col=0)
abag_df = abag_df.reset_index(drop=True)

In [None]:
# Start from the unmutated PDBs: the first four characters of `pdb` act as the trimmed ID.
abag_df["pdb_trimmed"] = abag_df["pdb"].apply(lambda full_id: full_id[:4])

# Unique IDs of each table, then the IDs present in both.
syn_ids = pd.Index(df["pdb"].drop_duplicates())
abag_ids = pd.Index(abag_df["pdb_trimmed"].drop_duplicates())
overlapping_ids = syn_ids.intersection(abag_ids)
len(overlapping_ids)

In [None]:
# Rows of abag_df whose trimmed PDB ID also occurs in df, indexed by that trimmed ID.
originals = abag_df.set_index("pdb_trimmed").loc[overlapping_ids]
# Exclude duplicates (slightly sloppily): keep only entries whose `pdb` ends in "_1".
# The explicit .copy() makes `originals` an independent frame, so adding columns to it
# later does not raise pandas' SettingWithCopyWarning.
originals = originals.loc[originals["pdb"].str.endswith("_1")].copy()
len(originals)

In [None]:
# Convert ΔΔG labels to Δ-log(Kd) labels: divide by R*T, then rescale from natural
# log to base-10 log.
# Negative labels indicate worse binding (checked in the publication)
GAS_CONSTANT = 0.001987  # R in kcal/mol/K
TEMPERATURE_K = 293.15  # 20 °C expressed in kelvin
LOG10_E = 0.434  # log10(e) = 1/ln(10) (NOT log(10)); rounded value kept so outputs are unchanged
df["delta_logkd"] = (df["labels"] / (TEMPERATURE_K * GAS_CONSTANT)) * LOG10_E
# Rows in `originals` are the reference complexes, so their shift is zero.
originals["delta_logkd"] = 0

In [None]:
# Stack the `originals` rows on top of the `df` rows and renumber the index from 0.
merged = pd.concat((originals, df), ignore_index=True)

In [None]:
# Sanity check: display the first row of `originals`.
originals.iloc[0]

In [None]:
# Spot check: the -log(Kd) stored under index label "6fe4" (yields 8 if that label is absent).
originals["-log(Kd)"].get("6fe4", 8)

In [None]:
# Compute the absolute -log(Kd) of every row: the -log(Kd) of its entry in `originals`
# plus the row's delta_logkd.
# Both lookups below are loop-invariant, so they are evaluated once instead of per row.
wildtype_neglogkd = originals["-log(Kd)"]
# Fallback used for rows whose trimmed PDB ID has no entry in `originals`.
default_neglogkd = snakemake.params.offset_neglogkd
# NOTE(review): Series.get returns a Series (not a scalar) when a label occurs more than
# once, so this assumes the index of `originals` is unique — confirm.
merged["orig_neglogkd"] = merged["pdb"].apply(
    lambda pdb: wildtype_neglogkd.get(pdb[:4], default_neglogkd)
)
merged["-log(Kd)"] = merged["orig_neglogkd"] + merged["delta_logkd"]

In [None]:
# Summary statistics of the per-row shift.
merged["delta_logkd"].describe()

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
# Distribution of the shifts, with the x-axis limited to [-5, 3].
sns.displot(merged["delta_logkd"])
plt.xlim(-5, 3)

In [None]:
# Preserve the untrimmed identifier, then reduce `pdb` to its first four characters.
# Re-running this cell overwrites `pdb_full` with the already-trimmed value.
merged["pdb_full"] = merged["pdb"]
merged["pdb"] = merged["pdb_full"].apply(lambda full_id: full_id[:4])

# Index rows by `complex`, falling back to the trimmed PDB ID where `complex` is NaN.
row_labels = merged["complex"].fillna(merged["pdb"])
merged.index = row_labels


In [None]:
# Display the merged table for visual inspection.
merged

In [None]:
def _mutation_code(complex_name):
    """Return the second "_"-separated token of ``complex_name``, or "" if its string form has no underscore."""
    if "_" not in str(complex_name):
        return ""
    return complex_name.split("_")[1]


def _structure_filename(row):
    """Return the row's filename.

    An existing filename containing "_1" has every "_1" occurrence removed;
    otherwise the name is assembled from pdb, ab_chain, ag_chain and mutation.
    """
    existing = row["filename"]
    if "_1" in str(existing):
        return existing.replace("_1", "")
    return f"{row.pdb}_{row.ab_chain}_{row.ag_chain}_{row.mutation}.pdb"


# `mutation` must be assigned first because the filename helper reads it.
merged["mutation"] = merged["complex"].apply(_mutation_code)
merged["filename"] = merged.apply(_structure_filename, axis=1)
# df["mutation_code"] = df.apply(lambda L: L.complex.split('_')[-1], axis=1)

## Now split into absolute and relative parts

In [None]:
# Nothing is a test row by default.
merged["test"] = False

# Rows whose trimmed PDB ID appears in the index of `originals` go to `absolute`;
# all remaining rows go to `relative`.
in_originals = merged["pdb"].isin(originals.index)
absolute = merged.loc[in_originals].copy()
relative = merged.loc[~in_originals].copy()


In [None]:
# Row counts of the two parts.
absolute.shape[0], relative.shape[0]

In [None]:
# Split relative: every PDB ID is assigned to one of `num_rel_splits` folds in
# round-robin order of first appearance, so all rows of a PDB share a fold.

num_rel_splits = 15
all_pdbs = relative["pdb"].drop_duplicates().tolist()
# Precompute each PDB's position once; calling list.index() per row was O(n) per lookup.
pdb_rank = {pdb: rank for rank, pdb in enumerate(all_pdbs)}
relative["validation"] = relative["pdb"].map(pdb_rank) % num_rel_splits

In [None]:
# Split absolute: every PDB ID is assigned to one of `num_abs_splits` folds in
# round-robin order of first appearance, so all rows of a PDB share a fold.
num_abs_splits = 5
all_pdbs = absolute["pdb"].drop_duplicates().tolist()

# Precompute each PDB's position once; calling list.index() per row was O(n) per lookup.
pdb_rank = {pdb: rank for rank, pdb in enumerate(all_pdbs)}
absolute["validation"] = absolute["pdb"].map(pdb_rank) % num_abs_splits
# Fold 0 doubles as the test set.
absolute["test"] = absolute["validation"] == 0

In [None]:
# Sanity check: display the last row of `merged`.
merged.iloc[-1]

In [None]:
# Last row again, restricted to the columns that contain no missing value at all.
merged.dropna(axis="columns", how="any").iloc[-1]

In [None]:
# Write each part to its Snakemake output path, dropping every column that
# contains at least one missing value in that part.
for output_key, part in (("relative", relative), ("absolute", absolute)):
    part.dropna(axis=1, how="any").to_csv(snakemake.output[output_key])
