# Method Comparison
Comparison of MD and cheminformatics methods.

In [2]:
# from src.utils import json_load
import src.utils
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

# Global plotting configuration applied to every figure created below.
# Matplotlib 3.6 renamed its bundled seaborn style sheets to
# "seaborn-v0_8-*" and later releases dropped the old names, so use the new
# name whenever the installed Matplotlib provides it.
_paper_style = (
    "seaborn-v0_8-paper"
    if "seaborn-v0_8-paper" in plt.style.available
    else "seaborn-paper"
)
plt.style.use(_paper_style)
plt.rcParams["svg.fonttype"] = "none"
sns.set_context("paper")

# set matplotlib font sizes
SMALL_SIZE = 13
MEDIUM_SIZE = 14
BIGGER_SIZE = 16

plt.rc("font", size=MEDIUM_SIZE)  # controls default text sizes
plt.rc("axes", titlesize=BIGGER_SIZE)  # fontsize of the axes title
plt.rc("axes", labelsize=MEDIUM_SIZE)  # fontsize of the x and y labels
plt.rc("xtick", labelsize=SMALL_SIZE)  # fontsize of the tick labels
plt.rc("ytick", labelsize=SMALL_SIZE)  # fontsize of the tick labels
plt.rc("legend", fontsize=MEDIUM_SIZE)  # legend fontsize
plt.rc("figure", titlesize=BIGGER_SIZE)  # fontsize of the figure title

# NOTE(review): the savefig calls visible in this file pass dpi=300
# explicitly rather than this constant.
DPI = 600

In [3]:
# The wildcards are dash-separated token lists in which two consecutive
# tokens describe one entry, so the entry count is half the token count.
confgen_tokens = snakemake.wildcards.conf_gens.split("-")
method_len = len(snakemake.wildcards.methods.split("-")) // 2
confgen_len = len(confgen_tokens) // 2

# NOTE(review): `results` is not read again in the cells visible here, and
# its stride-2 slice differs from the contiguous slice used for
# `file_names` -- confirm which indexing matches the rule's input layout.
results = [
    src.utils.json_load(path)
    for path in snakemake.input[0 : 2 * method_len : 2]
]
file_names = snakemake.input[0:method_len]

# Conformer-generator results are only loaded when the rule received at
# least one input per method plus one per conformer generator.
add_confgens = len(snakemake.input) >= (method_len + confgen_len)
if add_confgens:
    confgen = [
        src.utils.json_load(path) for path in snakemake.input[method_len:]
    ]
    # Re-join each token pair into a single "<first>-<second>" name.
    confgen_names = [
        "-".join(pair)
        for pair in zip(confgen_tokens[0::2], confgen_tokens[1::2])
    ]

In [4]:
df = pd.DataFrame()
# One table per metric; rows are compounds, columns are added per method.
dfs = {
    "RMSD": pd.DataFrame(),
    "MAE": pd.DataFrame(),
    "percentage_fulfilled": pd.DataFrame(),
}


def _compound_id(raw_key):
    """Return the second dash-separated token of *raw_key*.

    A trailing ``_single`` component is removed from that token.
    """
    compound = raw_key.split("-")[1]
    parts = compound.split("_")
    if parts[-1] == "single":
        compound = "_".join(parts[:-1])
    return compound


# Load methods
for a in file_names:
    method_dict = src.utils.json_load(a)
    # Column suffix taken from the sixth path component, up to "NOE".
    method_tag = a.split("/")[5].split("NOE")[0]
    for key_o, value_o in method_dict.items():
        # {"<inner key>-<method tag>": {compound: value}} for this metric.
        per_method = {
            f"{key}-{method_tag}": {
                _compound_id(k): v for k, v in value.items()
            }
            for key, value in value_o.items()
        }
        dfs[key_o] = pd.concat(
            [dfs[key_o], pd.DataFrame.from_dict(per_method)], axis=1
        )

In [5]:
# Maps the metric names used in `dfs` to the keys used in the
# conformer-generator result dictionaries.
confgen_keys = {"RMSD": "rmsd", "MAE": "mae", "percentage_fulfilled": "fulfil"}
# Selection schemes, in the order their columns are appended.
selection_schemes = ["licuv", "low_energy", "random", "namfis"]
# Every scheme stores one value per size; only the size-10 column is kept.
ensemble_sizes = [1, 3, 5, 10, 30]
kept_size = 10

if add_confgens:
    for c, n in zip(confgen, confgen_names):
        for key in dfs:
            metric = c[confgen_keys[key]]
            frames = [
                pd.DataFrame.from_dict(
                    metric["best"],
                    orient="index",
                    columns=[f"{n} best"],
                    dtype=float,
                )
            ]
            for scheme in selection_schemes:
                # Cut each key at "_single"; the normalised dict is written
                # back into the loaded results, as before.
                metric[scheme] = {
                    k.split("_single")[0]: v
                    for k, v in metric[scheme].items()
                }
                frame = pd.DataFrame.from_dict(
                    metric[scheme],
                    orient="index",
                    columns=[f"{n}-{scheme}-{size}" for size in ensemble_sizes],
                    dtype=float,
                )
                frames.append(
                    frame.drop(
                        columns=[
                            f"{n}-{scheme}-{size}"
                            for size in ensemble_sizes
                            if size != kept_size
                        ]
                    )
                )
            dfs[key] = pd.concat([dfs[key], *frames], axis=1)

In [6]:
# Ordered substring substitutions that turn raw column names into plot
# labels. Order matters: later entries act on the output of earlier ones.
_COLUMN_SUBSTITUTIONS = (
    ("value-", "ensemble "),
    ("most-populated-1-", "cluster "),
    ("-H2O-2000-nan-", ""),
    ("-native-2000-nan-", ""),
    ("-native-2000-3-", ""),
    ("omega-basic", "Omega"),
    ("rdkit-ETKDGv3mmff", "RDKit "),
    ("-", " "),
    ("  ", " "),
    ("namfis", "NAMFIS"),
    ("licuv", "LICUV"),
)


def _pretty_column(name):
    """Apply every substitution in `_COLUMN_SUBSTITUTIONS` to *name*, in order."""
    for old, new in _COLUMN_SUBSTITUTIONS:
        name = name.replace(old, new)
    return name


# Only these two tables are relabelled and sorted; "MAE" is left as loaded.
for metric in ("percentage_fulfilled", "RMSD"):
    table = dfs[metric]
    table.rename(columns=_pretty_column, inplace=True)
    # Shorten the isomer suffixes in the row labels.
    shortened = table.index.str.replace("_cis", "c")
    table.index = shortened.str.replace("_trans", "t")
    # Sort by index
    dfs[metric] = table.sort_index()

In [7]:
def _paired_pvalues(frame, index_of):
    """Return paired-test p-value matrices for all column pairs of *frame*.

    Parameters
    ----------
    frame : DataFrame whose columns are the methods to compare.
    index_of : mapping from column name to its row/column position.

    Returns
    -------
    (t_test, wilcoxon) : two square arrays. The diagonal is set to 1; every
    other entry is the p-value rounded to three decimals. Both tests are
    called with ``nan_policy="omit"`` so they treat NaNs the same way.
    """
    size = len(index_of)
    t_p = np.zeros(shape=(size, size))
    w_p = np.zeros(shape=(size, size))
    for a in frame:
        for b in frame:
            i, j = index_of[a], index_of[b]
            if a == b:
                t_p[i, j] = 1
                w_p[i, j] = 1
                continue
            t_p[i, j] = round(
                stats.ttest_rel(frame[a], frame[b], nan_policy="omit").pvalue,
                3,
            )
            w_p[i, j] = round(
                stats.wilcoxon(frame[a], frame[b], nan_policy="omit").pvalue,
                3,
            )
    return t_p, w_p


# Overview figure: scatter plots on the left, significance heatmaps for
# the fraction of fulfilled NOEs on the right. The figure is not saved.
fig, axs = plt.subplots(
    2, 2, figsize=(12, 6), gridspec_kw={"width_ratios": [3, 1]}
)
excluded = ["cluster GaMD", "cluster cMD"]
df = dfs["percentage_fulfilled"].drop(columns=excluded)

methods_label = {a: idx for idx, a in enumerate(df)}
# Heatmap tick labels; stored as dict keys, so identical labels collapse.
methods_short = {
    " ".join(a.split("-")[0:2])
    .replace("value", "")
    .replace("basic", "Macrocycle"): idx
    for idx, a in enumerate(df)
}
# Scatter plot % fulfilled
for column in df:
    axs[0, 0].scatter(
        df.index.tolist(),
        df[column].tolist(),
        label=" ".join(column.split("-")[0:4]),
    )
axs[0, 0].set_ylabel("% NOE fulfilled")
axs[0, 0].set_xlabel("Compounds")
axs[0, 0].set_ylim([0, 1])

fig.autofmt_xdate(rotation=45, ha="center")

# Significance heatmap
t_test_results, wilkoxon_results = _paired_pvalues(df, methods_label)
colorpalette = sns.color_palette("vlag", as_cmap=True)
for matrix, axis, title in (
    (t_test_results, axs[0, 1], "Paired t-test"),
    (wilkoxon_results, axs[1, 1], "Wilcoxon signed rank test"),
):
    sns.heatmap(
        matrix,
        vmin=0,
        vmax=0.1,
        xticklabels=methods_short.keys(),
        yticklabels=methods_short.keys(),
        ax=axis,
        annot=True,
        cmap=colorpalette,
    )
    axis.set_title(title)

# Scatter plot RMSD
df = dfs["RMSD"].drop(columns=excluded)
for column in df:
    axs[1, 0].scatter(
        df.index.tolist(),
        df[column].tolist(),
        label=" ".join(column.split("-")[0:4]),
    )
handles, labels = axs[1, 0].get_legend_handles_labels()
fig.legend(handles, labels, loc="lower center")
# Raw string: "\A" is not a valid escape sequence in a normal literal.
axs[1, 0].set_ylabel(r"RMSD [$\AA$]")
axs[1, 0].set_xlabel("Compounds")

# Push every second tick label down.
for tick in axs[1, 0].xaxis.get_major_ticks()[1::2]:
    tick.set_pad(25)

fig.autofmt_xdate(rotation=45, ha="center")
fig.tight_layout()
# fig.savefig(snakemake.output.plot, dpi=300)

In [8]:
def _paired_pvalues(frame, index_of):
    """Return (t-test, Wilcoxon) p-value matrices for all column pairs.

    *index_of* maps a column name to its matrix position. The diagonal is
    set to 1; other entries are p-values rounded to three decimals, with
    NaNs omitted by both tests.
    """
    size = len(index_of)
    t_p = np.zeros(shape=(size, size))
    w_p = np.zeros(shape=(size, size))
    for a in frame:
        for b in frame:
            i, j = index_of[a], index_of[b]
            if a == b:
                t_p[i, j] = 1
                w_p[i, j] = 1
                continue
            t_p[i, j] = round(
                stats.ttest_rel(frame[a], frame[b], nan_policy="omit").pvalue,
                3,
            )
            w_p[i, j] = round(
                stats.wilcoxon(frame[a], frame[b], nan_policy="omit").pvalue,
                3,
            )
    return t_p, w_p


def _draw_significance(t_p, w_p, tick_labels, ax_t, ax_w, title_t, title_w):
    """Draw the t-test heatmap on *ax_t* (no colorbar) and the Wilcoxon one on *ax_w*."""
    cmap = sns.color_palette("vlag", as_cmap=True)
    shared = dict(
        vmin=0,
        vmax=0.1,
        xticklabels=tick_labels,
        yticklabels=tick_labels,
        annot=True,
        cmap=cmap,
    )
    sns.heatmap(t_p, ax=ax_t, cbar=False, **shared)
    ax_t.set_title(title_t)
    sns.heatmap(w_p, ax=ax_w, **shared)
    ax_w.set_title(title_w)


# Figure pair 1: the cluster columns and every conformer-generator column
# listed here are removed from both metrics before plotting.
excluded = [
    "cluster GaMD",
    "cluster cMD",
    "Omega best",
    "RDKit best",
    "Omega LICUV 10",
    "Omega low_energy 10",
    "Omega random 10",
    "Omega NAMFIS 10",
    "RDKit LICUV 10",
    "RDKit low_energy 10",
    "RDKit random 10",
    "RDKit NAMFIS 10",
]

# fig1: scatter + distributions; fig2: significance heatmaps.
fig1, axs1 = plt.subplots(
    2,
    2,
    figsize=(10, 6),
    gridspec_kw={"width_ratios": [3, 1]},
    sharey="row",
    sharex="col",
)
fig2, axs2 = plt.subplots(
    2,
    2,
    figsize=(10, 10),
    gridspec_kw={"width_ratios": [1, 1]},
    sharex=True,
    sharey=True,
)
df = dfs["percentage_fulfilled"].drop(columns=excluded)

methods_label = {a: idx for idx, a in enumerate(df)}
# Heatmap tick labels; stored as dict keys, so identical labels collapse.
methods_short = {
    " ".join(a.split("-")[0:2])
    .replace("value", "")
    .replace("basic", "Macrocycle"): idx
    for idx, a in enumerate(df)
}
# Scatter plot % fulfilled
for column in df:
    axs1[0, 0].scatter(
        df.index.tolist(),
        df[column].tolist(),
        label=" ".join(column.split("-")[0:4]),
    )
axs1[0, 0].set_ylabel("% NOE fulfilled [1/100 %]")
axs1[0, 0].set_xlabel("Compounds")
axs1[0, 0].set_ylim([0, 1])
fig1.autofmt_xdate(rotation=45, ha="center")

# Distributions
sns.boxplot(data=df, ax=axs1[0, 1], showmeans=True)
axs1[0, 1].set_title("Distributions")

# Significance heatmap
t_test_results, wilkoxon_results = _paired_pvalues(df, methods_label)
_draw_significance(
    t_test_results,
    wilkoxon_results,
    methods_short.keys(),
    axs2[0, 0],
    axs2[0, 1],
    "Paired t-test (% NOE fulfilled)",
    "Wilcoxon signed rank test \n(% NOE fulfilled)",
)

# Scatter plot RMSD
df = dfs["RMSD"].drop(columns=excluded)
for column in df:
    axs1[1, 0].scatter(
        df.index.tolist(),
        df[column].tolist(),
        label=" ".join(column.split("-")[0:4]),
    )
handles, labels = axs1[1, 0].get_legend_handles_labels()
fig1.legend(handles, labels, loc="lower center")
# Raw string: "\A" is not a valid escape sequence in a normal literal.
axs1[1, 0].set_ylabel(r"RMSD [$\AA$]")
axs1[1, 0].set_xlabel("Compounds")

# Push every second tick label down.
for tick in axs1[1, 0].xaxis.get_major_ticks()[1::2]:
    tick.set_pad(25)

# Distributions
sns.boxplot(data=df, ax=axs1[1, 1], showmeans=True)

# Significance heatmap (matrix positions reuse `methods_label` from above)
t_test_results, wilkoxon_results = _paired_pvalues(df, methods_label)
_draw_significance(
    t_test_results,
    wilkoxon_results,
    methods_short.keys(),
    axs2[1, 0],
    axs2[1, 1],
    "Paired t-test (RMSD)",
    "Wilcoxon signed rank test (RMSD)",
)

fig1.autofmt_xdate(rotation=45, ha="center")
fig2.autofmt_xdate(rotation=0, ha="center")
fig1.tight_layout()
fig1.savefig(snakemake.output.plot1, dpi=300)
fig2.tight_layout()
fig2.savefig(snakemake.output.plot1_sig, dpi=300)

In [9]:
def _paired_pvalues(frame, index_of):
    """Return (t-test, Wilcoxon) p-value matrices for all column pairs.

    *index_of* maps a column name to its matrix position. The diagonal is
    set to 1; other entries are p-values rounded to three decimals, with
    NaNs omitted by both tests.
    """
    size = len(index_of)
    t_p = np.zeros(shape=(size, size))
    w_p = np.zeros(shape=(size, size))
    for a in frame:
        for b in frame:
            i, j = index_of[a], index_of[b]
            if a == b:
                t_p[i, j] = 1
                w_p[i, j] = 1
                continue
            t_p[i, j] = round(
                stats.ttest_rel(frame[a], frame[b], nan_policy="omit").pvalue,
                3,
            )
            w_p[i, j] = round(
                stats.wilcoxon(frame[a], frame[b], nan_policy="omit").pvalue,
                3,
            )
    return t_p, w_p


def _draw_significance(t_p, w_p, tick_labels, ax_t, ax_w, title_t, title_w):
    """Draw the t-test heatmap on *ax_t* (no colorbar) and the Wilcoxon one on *ax_w*."""
    cmap = sns.color_palette("vlag", as_cmap=True)
    shared = dict(
        vmin=0,
        vmax=0.1,
        xticklabels=tick_labels,
        yticklabels=tick_labels,
        annot=True,
        cmap=cmap,
    )
    sns.heatmap(t_p, ax=ax_t, cbar=False, **shared)
    ax_t.set_title(title_t)
    sns.heatmap(w_p, ax=ax_w, **shared)
    ax_w.set_title(title_w)


# Figure pair 2: the cluster columns and the size-10 selection columns of
# both conformer generators are removed; the "best" columns are kept.
excluded = [
    "cluster GaMD",
    "cluster cMD",
    "Omega LICUV 10",
    "Omega low_energy 10",
    "Omega random 10",
    "Omega NAMFIS 10",
    "RDKit LICUV 10",
    "RDKit low_energy 10",
    "RDKit random 10",
    "RDKit NAMFIS 10",
]

# fig1: scatter + distributions; fig2: significance heatmaps.
fig1, axs1 = plt.subplots(
    2,
    2,
    figsize=(10, 6),
    gridspec_kw={"width_ratios": [3, 1]},
    sharey="row",
    sharex="col",
)
fig2, axs2 = plt.subplots(
    2,
    2,
    figsize=(10, 10),
    gridspec_kw={"width_ratios": [1, 1]},
    sharex=True,
    sharey=True,
)
df = dfs["percentage_fulfilled"].drop(columns=excluded)

methods_label = {a: idx for idx, a in enumerate(df)}
# Heatmap tick labels; stored as dict keys, so identical labels collapse.
methods_short = {
    " ".join(a.split("-")[0:2])
    .replace("value", "")
    .replace("basic", "Macrocycle"): idx
    for idx, a in enumerate(df)
}
# Scatter plot % fulfilled
for column in df:
    axs1[0, 0].scatter(
        df.index.tolist(),
        df[column].tolist(),
        label=" ".join(column.split("-")[0:4]),
    )
axs1[0, 0].set_ylabel("% NOE fulfilled [1/100 %]")
axs1[0, 0].set_xlabel("Compounds")
axs1[0, 0].set_ylim([0, 1])
fig1.autofmt_xdate(rotation=45, ha="center")

# Push every second tick label of the lower-left axes down.
for tick in axs1[1, 0].xaxis.get_major_ticks()[1::2]:
    tick.set_pad(25)

# Distributions
sns.boxplot(data=df, ax=axs1[0, 1], showmeans=True)
axs1[0, 1].set_title("Distributions")

# Significance heatmap
t_test_results, wilkoxon_results = _paired_pvalues(df, methods_label)
_draw_significance(
    t_test_results,
    wilkoxon_results,
    methods_short.keys(),
    axs2[0, 0],
    axs2[0, 1],
    "Paired t-test (% NOE fulfilled)",
    "Wilcoxon signed rank test \n(% NOE fulfilled)",
)

# Scatter plot RMSD
df = dfs["RMSD"].drop(columns=excluded)
for column in df:
    axs1[1, 0].scatter(
        df.index.tolist(),
        df[column].tolist(),
        label=" ".join(column.split("-")[0:4]),
    )
handles, labels = axs1[1, 0].get_legend_handles_labels()
fig1.legend(handles, labels, loc="lower center")
# Raw string: "\A" is not a valid escape sequence in a normal literal.
axs1[1, 0].set_ylabel(r"RMSD [$\AA$]")
axs1[1, 0].set_xlabel("Compounds")

for tick in axs1[1, 0].xaxis.get_major_ticks()[1::2]:
    tick.set_pad(25)

# Distributions
sns.boxplot(data=df, ax=axs1[1, 1], showmeans=True)

# Significance heatmap (matrix positions reuse `methods_label` from above)
t_test_results, wilkoxon_results = _paired_pvalues(df, methods_label)
_draw_significance(
    t_test_results,
    wilkoxon_results,
    methods_short.keys(),
    axs2[1, 0],
    axs2[1, 1],
    "Paired t-test (RMSD)",
    "Wilcoxon signed rank test (RMSD)",
)

fig1.autofmt_xdate(rotation=45, ha="center")
fig1.tight_layout()
fig1.savefig(snakemake.output.plot2, dpi=300)
fig2.tight_layout()
fig2.savefig(snakemake.output.plot2_sig, dpi=300)

In [10]:
def _paired_pvalues(frame, index_of):
    """Return (t-test, Wilcoxon) p-value matrices for all column pairs.

    *index_of* maps a column name to its matrix position. The diagonal is
    set to 1; other entries are p-values rounded to three decimals, with
    NaNs omitted by both tests.
    """
    size = len(index_of)
    t_p = np.zeros(shape=(size, size))
    w_p = np.zeros(shape=(size, size))
    for a in frame:
        for b in frame:
            i, j = index_of[a], index_of[b]
            if a == b:
                t_p[i, j] = 1
                w_p[i, j] = 1
                continue
            t_p[i, j] = round(
                stats.ttest_rel(frame[a], frame[b], nan_policy="omit").pvalue,
                3,
            )
            w_p[i, j] = round(
                stats.wilcoxon(frame[a], frame[b], nan_policy="omit").pvalue,
                3,
            )
    return t_p, w_p


def _draw_significance(t_p, w_p, tick_labels, ax_t, ax_w, title_t, title_w):
    """Draw the t-test heatmap on *ax_t* (no colorbar) and the Wilcoxon one on *ax_w*."""
    cmap = sns.color_palette("vlag", as_cmap=True)
    shared = dict(
        vmin=0,
        vmax=0.1,
        xticklabels=tick_labels,
        yticklabels=tick_labels,
        annot=True,
        cmap=cmap,
    )
    sns.heatmap(t_p, ax=ax_t, cbar=False, **shared)
    ax_t.set_title(title_t)
    sns.heatmap(w_p, ax=ax_w, **shared)
    ax_w.set_title(title_w)


# Figure pair 3: the ensemble columns (instead of the cluster ones) and the
# size-10 selection columns of both conformer generators are removed.
excluded = [
    "ensemble GaMD",
    "ensemble cMD",
    "Omega LICUV 10",
    "Omega low_energy 10",
    "Omega random 10",
    "Omega NAMFIS 10",
    "RDKit LICUV 10",
    "RDKit low_energy 10",
    "RDKit random 10",
    "RDKit NAMFIS 10",
]

# fig1: scatter + distributions; fig2: significance heatmaps.
fig1, axs1 = plt.subplots(
    2,
    2,
    figsize=(10, 6),
    gridspec_kw={"width_ratios": [3, 1]},
    sharey="row",
    sharex="col",
)
fig2, axs2 = plt.subplots(
    2,
    2,
    figsize=(10, 10),
    gridspec_kw={"width_ratios": [1, 1]},
    sharex=True,
    sharey=True,
)
df = dfs["percentage_fulfilled"].drop(columns=excluded)

methods_label = {a: idx for idx, a in enumerate(df)}
# Heatmap tick labels; stored as dict keys, so identical labels collapse.
methods_short = {
    " ".join(a.split("-")[0:2])
    .replace("value", "")
    .replace("basic", "Macrocycle"): idx
    for idx, a in enumerate(df)
}
# Scatter plot % fulfilled
for column in df:
    axs1[0, 0].scatter(
        df.index.tolist(),
        df[column].tolist(),
        label=" ".join(column.split("-")[0:4]),
    )
axs1[0, 0].set_ylabel("% NOE fulfilled [1/100 %]")
axs1[0, 0].set_xlabel("Compounds")
axs1[0, 0].set_ylim([0, 1])
fig1.autofmt_xdate(rotation=45, ha="center")

# Distributions
sns.boxplot(data=df, ax=axs1[0, 1], showmeans=True)
axs1[0, 1].set_title("Distributions")

# Push every second tick label of the lower-left axes down.
for tick in axs1[1, 0].xaxis.get_major_ticks()[1::2]:
    tick.set_pad(25)

# Significance heatmap
t_test_results, wilkoxon_results = _paired_pvalues(df, methods_label)
_draw_significance(
    t_test_results,
    wilkoxon_results,
    methods_short.keys(),
    axs2[0, 0],
    axs2[0, 1],
    "Paired t-test (% NOE fulfilled)",
    "Wilcoxon signed rank test \n(% NOE fulfilled)",
)

# Scatter plot RMSD
df = dfs["RMSD"].drop(columns=excluded)
for column in df:
    axs1[1, 0].scatter(
        df.index.tolist(),
        df[column].tolist(),
        label=" ".join(column.split("-")[0:4]),
    )
handles, labels = axs1[1, 0].get_legend_handles_labels()
fig1.legend(handles, labels, loc="lower center")
# Raw string: "\A" is not a valid escape sequence in a normal literal.
axs1[1, 0].set_ylabel(r"RMSD [$\AA$]")
axs1[1, 0].set_xlabel("Compounds")

# Distributions
sns.boxplot(data=df, ax=axs1[1, 1], showmeans=True)

# Significance heatmap (matrix positions reuse `methods_label` from above)
t_test_results, wilkoxon_results = _paired_pvalues(df, methods_label)
_draw_significance(
    t_test_results,
    wilkoxon_results,
    methods_short.keys(),
    axs2[1, 0],
    axs2[1, 1],
    "Paired t-test (RMSD)",
    "Wilcoxon signed rank test (RMSD)",
)

fig1.autofmt_xdate(rotation=45, ha="center")
fig1.tight_layout()
fig1.savefig(snakemake.output.plot3, dpi=300)
fig2.tight_layout()
fig2.savefig(snakemake.output.plot3_sig, dpi=300)

In [11]:
def _paired_pvalues(frame, index_of):
    """Return (t-test, Wilcoxon) p-value matrices for all column pairs.

    *index_of* maps a column name to its matrix position. The diagonal is
    set to 1; other entries are p-values rounded to three decimals, with
    NaNs omitted by both tests.
    """
    size = len(index_of)
    t_p = np.zeros(shape=(size, size))
    w_p = np.zeros(shape=(size, size))
    for a in frame:
        for b in frame:
            i, j = index_of[a], index_of[b]
            if a == b:
                t_p[i, j] = 1
                w_p[i, j] = 1
                continue
            t_p[i, j] = round(
                stats.ttest_rel(frame[a], frame[b], nan_policy="omit").pvalue,
                3,
            )
            w_p[i, j] = round(
                stats.wilcoxon(frame[a], frame[b], nan_policy="omit").pvalue,
                3,
            )
    return t_p, w_p


def _draw_significance(t_p, w_p, tick_labels, ax_t, ax_w, title_t, title_w):
    """Draw the t-test heatmap on *ax_t* (no colorbar) and the Wilcoxon one on *ax_w*."""
    cmap = sns.color_palette("vlag", as_cmap=True)
    shared = dict(
        vmin=0,
        vmax=0.1,
        xticklabels=tick_labels,
        yticklabels=tick_labels,
        annot=True,
        cmap=cmap,
    )
    sns.heatmap(t_p, ax=ax_t, cbar=False, **shared)
    ax_t.set_title(title_t)
    sns.heatmap(w_p, ax=ax_w, **shared)
    ax_w.set_title(title_w)


# Figure pair 4: the cluster columns and all RDKit columns are removed.
excluded = [
    "cluster GaMD",
    "cluster cMD",
    "RDKit best",
    "RDKit LICUV 10",
    "RDKit low_energy 10",
    "RDKit random 10",
    "RDKit NAMFIS 10",
]

# fig1: scatter + distributions; fig2: significance heatmaps.
fig1, axs1 = plt.subplots(
    2,
    2,
    figsize=(10, 6),
    gridspec_kw={"width_ratios": [3, 1]},
    sharey="row",
    sharex="col",
)
fig2, axs2 = plt.subplots(
    2,
    2,
    figsize=(10, 10),
    gridspec_kw={"width_ratios": [1, 1]},
    sharex=True,
    sharey=True,
)
df = dfs["percentage_fulfilled"].drop(columns=excluded)

methods_label = {a: idx for idx, a in enumerate(df)}
# Heatmap tick labels; stored as dict keys, so identical labels collapse.
methods_short = {
    " ".join(a.split("-")[0:2])
    .replace("value", "")
    .replace("basic", "Macrocycle"): idx
    for idx, a in enumerate(df)
}
# Scatter plot % fulfilled
for column in df:
    axs1[0, 0].scatter(
        df.index.tolist(),
        df[column].tolist(),
        label=" ".join(column.split("-")[0:4]),
    )
axs1[0, 0].set_ylabel("% NOE fulfilled [1/100 %]")
axs1[0, 0].set_xlabel("Compounds")
axs1[0, 0].set_ylim([0, 1])
fig1.autofmt_xdate(rotation=45, ha="center")

# Push every second tick label of the lower-left axes down.
for tick in axs1[1, 0].xaxis.get_major_ticks()[1::2]:
    tick.set_pad(25)

# Distributions
sns.boxplot(data=df, ax=axs1[0, 1], showmeans=True)
axs1[0, 1].set_title("Distributions")

# Significance heatmap
t_test_results, wilkoxon_results = _paired_pvalues(df, methods_label)
_draw_significance(
    t_test_results,
    wilkoxon_results,
    methods_short.keys(),
    axs2[0, 0],
    axs2[0, 1],
    "Paired t-test (% NOE fulfilled)",
    "Wilcoxon signed rank test \n(% NOE fulfilled)",
)

# Scatter plot RMSD
df = dfs["RMSD"].drop(columns=excluded)
for column in df:
    axs1[1, 0].scatter(
        df.index.tolist(),
        df[column].tolist(),
        label=" ".join(column.split("-")[0:4]),
    )
handles, labels = axs1[1, 0].get_legend_handles_labels()
fig1.legend(handles, labels, loc="lower center")
# Raw string: "\A" is not a valid escape sequence in a normal literal.
axs1[1, 0].set_ylabel(r"RMSD [$\AA$]")
axs1[1, 0].set_xlabel("Compounds")

# Distributions
sns.boxplot(data=df, ax=axs1[1, 1], showmeans=True)

# Significance heatmap (matrix positions reuse `methods_label` from above)
t_test_results, wilkoxon_results = _paired_pvalues(df, methods_label)
_draw_significance(
    t_test_results,
    wilkoxon_results,
    methods_short.keys(),
    axs2[1, 0],
    axs2[1, 1],
    "Paired t-test (RMSD)",
    "Wilcoxon signed rank test (RMSD)",
)

fig1.autofmt_xdate(rotation=45, ha="center")
fig1.tight_layout()
fig1.savefig(snakemake.output.plot4, dpi=300)
fig2.tight_layout()
fig2.savefig(snakemake.output.plot4_sig, dpi=300)

In [12]:
def _paired_pvalues(frame, index_of):
    """Return (t-test, Wilcoxon) p-value matrices for all column pairs.

    *index_of* maps a column name to its matrix position. The diagonal is
    set to 1; other entries are p-values rounded to three decimals, with
    NaNs omitted by both tests.
    """
    size = len(index_of)
    t_p = np.zeros(shape=(size, size))
    w_p = np.zeros(shape=(size, size))
    for a in frame:
        for b in frame:
            i, j = index_of[a], index_of[b]
            if a == b:
                t_p[i, j] = 1
                w_p[i, j] = 1
                continue
            t_p[i, j] = round(
                stats.ttest_rel(frame[a], frame[b], nan_policy="omit").pvalue,
                3,
            )
            w_p[i, j] = round(
                stats.wilcoxon(frame[a], frame[b], nan_policy="omit").pvalue,
                3,
            )
    return t_p, w_p


def _draw_significance(t_p, w_p, tick_labels, ax_t, ax_w, title_t, title_w):
    """Draw the t-test heatmap on *ax_t* (no colorbar) and the Wilcoxon one on *ax_w*."""
    cmap = sns.color_palette("vlag", as_cmap=True)
    shared = dict(
        vmin=0,
        vmax=0.1,
        xticklabels=tick_labels,
        yticklabels=tick_labels,
        annot=True,
        cmap=cmap,
    )
    sns.heatmap(t_p, ax=ax_t, cbar=False, **shared)
    ax_t.set_title(title_t)
    sns.heatmap(w_p, ax=ax_w, **shared)
    ax_w.set_title(title_w)


# Figure pair 5: the cluster columns and all Omega columns are removed.
excluded = [
    "cluster GaMD",
    "cluster cMD",
    "Omega best",
    "Omega LICUV 10",
    "Omega low_energy 10",
    "Omega random 10",
    "Omega NAMFIS 10",
]

# fig1: scatter + distributions; fig2: significance heatmaps.
fig1, axs1 = plt.subplots(
    2,
    2,
    figsize=(10, 6),
    gridspec_kw={"width_ratios": [3, 1]},
    sharey="row",
    sharex="col",
)
fig2, axs2 = plt.subplots(
    2,
    2,
    figsize=(10, 10),
    gridspec_kw={"width_ratios": [1, 1]},
    sharex=True,
    sharey=True,
)
df = dfs["percentage_fulfilled"].drop(columns=excluded)
methods_label = {a: idx for idx, a in enumerate(df)}
# Heatmap tick labels; stored as dict keys, so identical labels collapse.
methods_short = {
    " ".join(a.split("-")[0:2])
    .replace("value", "")
    .replace("basic", "Macrocycle"): idx
    for idx, a in enumerate(df)
}
# Scatter plot % fulfilled
for column in df:
    axs1[0, 0].scatter(
        df.index.tolist(),
        df[column].tolist(),
        label=" ".join(column.split("-")[0:4]),
    )
axs1[0, 0].set_ylabel("% NOE fulfilled [1/100 %]")
axs1[0, 0].set_xlabel("Compounds")
axs1[0, 0].set_ylim([0, 1])
fig1.autofmt_xdate(rotation=45, ha="center")

# Distributions
sns.boxplot(data=df, ax=axs1[0, 1], showmeans=True)
axs1[0, 1].set_title("Distributions")

# Push every second tick label of the lower-left axes down.
for tick in axs1[1, 0].xaxis.get_major_ticks()[1::2]:
    tick.set_pad(25)

# Significance heatmap
t_test_results, wilkoxon_results = _paired_pvalues(df, methods_label)
_draw_significance(
    t_test_results,
    wilkoxon_results,
    methods_short.keys(),
    axs2[0, 0],
    axs2[0, 1],
    "Paired t-test (% NOE fulfilled)",
    "Wilcoxon signed rank test \n(% NOE fulfilled)",
)

# Scatter plot RMSD
df = dfs["RMSD"].drop(columns=excluded)
for column in df:
    axs1[1, 0].scatter(
        df.index.tolist(),
        df[column].tolist(),
        label=" ".join(column.split("-")[0:4]),
    )
handles, labels = axs1[1, 0].get_legend_handles_labels()
fig1.legend(handles, labels, loc="lower center")
# Raw string: "\A" is not a valid escape sequence in a normal literal.
axs1[1, 0].set_ylabel(r"RMSD [$\AA$]")
axs1[1, 0].set_xlabel("Compounds")

# Distributions
sns.boxplot(data=df, ax=axs1[1, 1], showmeans=True)

# Significance heatmap (matrix positions reuse `methods_label` from above)
t_test_results, wilkoxon_results = _paired_pvalues(df, methods_label)
_draw_significance(
    t_test_results,
    wilkoxon_results,
    methods_short.keys(),
    axs2[1, 0],
    axs2[1, 1],
    "Paired t-test (RMSD)",
    "Wilcoxon signed rank test (RMSD)",
)

fig1.autofmt_xdate(rotation=45, ha="center")
fig1.tight_layout()
fig1.savefig(snakemake.output.plot5, dpi=300)
fig2.tight_layout()
fig2.savefig(snakemake.output.plot5_sig, dpi=300)

In [13]:
# Display per-column summary statistics of the "percentage_fulfilled" table.
dfs["percentage_fulfilled"].describe()

In [14]:
# Display per-column summary statistics of the "RMSD" table.
dfs["RMSD"].describe()

In [15]:
# stats.ttest_rel(df['GaMD-H2O-2000-nan-'], df[f"{'-'.join(snakemake.wildcards.methods.split('-')[-2:])} best"], nan_policy='omit')
# stats.wilcoxon(df['GaMD-H2O-2000-nan-'], df[f"{'-'.join(snakemake.wildcards.methods.split('-')[-2:])} best"])
# stats.ttest_rel(df['GaMD-H2O-2000-nan-'], df['cMD-H2O-2000-nan-'], nan_policy='omit')
# stats.wilcoxon(df['GaMD-H2O-2000-nan-'], df['cMD-H2O-2000-nan-'], alternative='two-sided')
# stats.ttest_rel(df['cMD-H2O-2000-nan-'], df[f"{'-'.join(snakemake.wildcards.methods.split('-')[-2:])} best"], nan_policy='omit')
# stats.wilcoxon(df['cMD-H2O-2000-nan-'], df[f"{'-'.join(snakemake.wildcards.methods.split('-')[-2:])} best"], alternative='two-sided')