# AbAg_Affinity dataset

This notebook generates a dataframe containing information about data first version of the Antibody-Antigen_complexes dataset:

- Overlap of AbDb data (filtered and enriched abag data) with SAbDab data (affinity values for conformtions) 

In [5]:
import os
import pandas as pd
from abag_affinity.utils.config import read_config, get_data_paths
import matplotlib.pyplot as plt
import seaborn as sns

config = read_config("../../config.yaml")
summary_path = os.path.join(config["DATASETS"]["path"], config["DATASETS"]["abag_affinity"]["folder_path"], config["DATASETS"]["abag_affinity"]["summary"])

In [7]:
summary_df = pd.read_csv(summary_path, index_col=0)
print(summary_df)
summary_df["validation"] = summary_df["validation"].astype(str).apply(lambda x: x.split(".")[0] if x != "nan" else "0")
summary_df.head()

                pdb antibody_chains antigen_chains  in_dataset  antigen_length
abdb_filename                                                                 
5C0S_1.pdb     5c0s      ['L', 'H']          ['A']        True             242
4AM0_2.pdb     4am0      ['L', 'H']          ['S']        True              92
4M1D_1.pdb     4m1d      ['L', 'H']          ['P']        True              14
4YPG_1.pdb     4ypg      ['L', 'H']          ['D']        True             161
5CJQ_1.pdb     5cjq      ['L', 'H']     ['A', 'B']        True             206
...             ...             ...            ...         ...             ...
4RQS_1.pdb     4rqs      ['L', 'H']          ['G']       False             295
3ZE0_1.pdb     3ze0      ['L', 'H']          ['A']       False             455
5XCQ_1.pdb     5xcq      ['L', 'H']          ['C']       False               6
6N8D_1.pdb     6n8d      ['L', 'H']          ['C']       False             306
5MUB_3.pdb     5mub      ['L', 'H']          ['T']  

KeyError: 'validation'

In [None]:
print(summary_df)

In [3]:
num_samples = len(summary_df["pdb"].unique())
print(f"Total lenght of AbAg-Affinity dataset = {num_samples}")

Total lenght of AbAg-Affinity dataset = 385


In [None]:
summary_df.groupby("validation")["pdb"].count()

## Affinity distribution

### Delta G analysis

In [None]:
summary_df["delta_g"].describe()

In [None]:
sns.histplot(summary_df, x="delta_g", kde=True)

In [None]:
#summary_df.loc[summary_df["validation"] == 3, "validation"] = 2 # only compare to validation set 1 for now
sns.histplot(summary_df, x="delta_g", hue="validation", kde=True)

## -log(Kg) Analyis

In [None]:
summary_df["-log(Kd)"].describe()

In [None]:
sns.histplot(summary_df, x="-log(Kd)", stat="percent", kde=True)
plt.savefig("./plots/abag_affinity_-log(Kd)_distribution.png")

In [None]:
sns.histplot(summary_df, x="-log(Kd)", hue="validation", stat="percent", kde=True)
plt.savefig("./plots/abag_affinity_-log(Kd)_distribution_by_val_set.png")

In [None]:
sns.histplot(summary_df, x="-log(Kd)", hue="test", stat="percent",kde=True)
plt.savefig("./plots/abag_affinity_-log(Kd)_distribution_train_test.png")