# Plotting Few-Shot Model Evaluation Results

Assembling plots from summary files.

In [None]:
import os
import sys
import pandas as pd

# Local configuration -- adjust these paths for your machine.
# Both locations are resolved below the current user's home directory.
_home_dir = os.environ['HOME']

# Checkout of the FS-Mol repository:
FS_MOL_CHECKOUT_PATH = os.path.join(_home_dir, "fsl/git_repos/few_shot_drug")
# Location of the dataset:
FS_MOL_DATASET_PATH = os.path.join(_home_dir, "fsl/git_repos/datasets/few_shot_drug")

# Make the `fs_mol` package importable and work from inside the checkout.
sys.path.insert(0, FS_MOL_CHECKOUT_PATH)
os.chdir(FS_MOL_CHECKOUT_PATH)

# Relative path: because the working directory is now the checkout, this
# resolves relative to FS_MOL_CHECKOUT_PATH.
PAPER_FIGDIR = "../paper/Paper/fig"
from fs_mol.plotting.utils import (
    highlight_max_all, 
    #plot_all_assays, 
    load_data,
    expand_values,
    #plot_task_performances_by_id,
    #box_plot,
    #plot_by_size,
    get_aggregates_across_sizes
)

## Making summary files

Summary files are obtained by running `fs_mol/plotting/collect_eval_runs.py` on the outputs of evaluation runs. If an evaluation output directory is "evaluation_output_directory", pass that directory to the script to create the summary files.

The option `--plot` results in a plot across support set sizes for each few-shot testing task. Final summarized results will be found in "evaluation_output_directory/summary/{model_name}_summary.csv"

## Loading the collated evaluation data

Create a dictionary of all model summary CSV files to be compared. These CSVs are the final summaries produced by `collect_eval_runs.py`.

In [None]:
# Configure this cell to list every model you want to compare.
# Directory holding the summary CSVs produced by collect_eval_runs.py:
results_path = "/home/philippe/fsl/git_repos/baselines"

# Human-readable model name -> file name of its summary CSV inside results_path.
# Add new entries here as desired.
_summary_file_names = {
    "ADKF-IFT": "ADKF-IFT_classification_summary.csv",
    "Q-probe": "qprobe_summary.csv",
    "L-probe": "lp_summary.csv",
    "PN": "ProtoNet-gnn+ecfp+fc-Support64_summary.csv",
    "mhnfs": "mhnfs_summary.csv",
    "clamp": "clamp_summary.csv",
    "CNP": "CNP_classification_summary.csv",
    "GNN-MAML": "MAML-Support16_summary.csv",
    "SimSearch": "SimSearch_summary.csv",
    "PAR": "PAR_classification_summary.csv",
    "GNN-MT": "GNN-Multitask_summary.csv",
    "MAT": "MAT_summary.csv",
    "GNN-ST": "GNN-ST_summary.csv",
    "RF": "random_forest_summary.csv",
    "kNN": "kNN_summary.csv",
}

# Dictionary summarising all models to be compared: name -> full path of the
# summary CSV (same key order as the table above).
model_summaries = {
    name: os.path.join(results_path, file_name)
    for name, file_name in _summary_file_names.items()
}

# Directories where generated plots are stored; both are created if missing.
plot_output_dir = os.path.join(results_path, "plots")
all_tasks_output_dir = os.path.join(results_path, "plots/all_tasks")
for _out_dir in (plot_output_dir, all_tasks_output_dir):
    os.makedirs(_out_dir, exist_ok=True)

# Load every summary listed above and show the resulting table.
data = load_data(model_summaries)
data

## Highlight the best result for each task

In [None]:
# Expand each "value +/- error" entry into separate columns and calculate the
# delta AUPRC (description carried over from the original note; expand_values
# is imported from fs_mol.plotting.utils).
data = expand_values(data, model_summaries)

### Incorporate protein information

Our test tasks have associated target protein information available. We can merge this data to allow plotting with specific EC number classes highlighted.

In [None]:
# Load the target-protein annotations of the test tasks and attach them to the
# results table.
protein_path = os.path.join(FS_MOL_CHECKOUT_PATH, "datasets/targets", "test_proteins.csv")
ecs = pd.read_csv(protein_path)

# Normalise the identifier columns to strings; target_id is cast to int first.
ecs["target_id"] = ecs["target_id"].astype(int).astype(str)
ecs["chembl_id"] = ecs["chembl_id"].astype(str)

# TASK_ID is chembl_id without its first six characters (presumably the
# "CHEMBL" prefix -- confirm against test_proteins.csv). The vectorised string
# accessor replaces a row-wise apply().
ecs["TASK_ID"] = ecs["chembl_id"].str[6:]

# merge() joins on TASK_ID with its default join type, so only tasks present
# in both tables are kept.
data = ecs.merge(data, on="TASK_ID")

## Aggregate as a function of the number of training points, across all categories

Here the results are aggregated according to EC class, and across all classes. This is used to plot the variation of performance with support set size, comparing all models in the model_summaries dictionary.

In [None]:
# Aggregate the results for every model in model_summaries. The code further
# below reads the result as: index = EC category, one column per
# (support set size, model) -- TODO confirm against get_aggregates_across_sizes.
aggregate_df = get_aggregates_across_sizes(data, model_summaries)

In [None]:
# Display the aggregated results table. Per the original note, the paper's
# Table 2 corresponds to the entries for support set size 16.
aggregate_df

In [None]:
# this function has the option to plot all classes separately.
#autoreload magic line
%load_ext autoreload
%autoreload 2

import matplotlib.pyplot as plt
from fs_mol.plotting.utils import plot_by_size
# Plot the aggregated results for every model in model_summaries and keep the
# returned object in `fig`; plot_all_classes=False leaves the per-class option off.
fig=plot_by_size(aggregate_df, model_summaries, plot_all_classes = False)

In [None]:

# Reshape the wide aggregate table into long form: one row per
# (model, support set size, EC category).
# Column names are parsed as "<support size>_...(<model name>)": the size is the
# text before the first "_" and the model name sits between the first "(" and the
# final character. Columns whose name contains "std" are skipped.
#
# The rows are collected in a list and the frame is built once. The previous
# row-by-row `df.loc[df.size] = ...` used the element count (rows x columns) as
# the row label, giving labels 0, 4, 8, ..., and re-allocated the frame on every
# insert. The resulting frame now has a plain 0..n-1 index.
records = []
for col in aggregate_df.columns:
    if "std" in col:
        continue
    supp_size = int(col.split("_")[0])
    model_name = col.split("(")[1][:-1]
    for index, value in aggregate_df[col].items():
        records.append(
            {
                "model": model_name,
                "support_set_size": supp_size,
                "delta-auprc": value,
                "EC_category": index,
            }
        )

df = pd.DataFrame(records, columns=["model", "support_set_size", "delta-auprc", "EC_category"])



# Ranking

Here we use [autorank](https://pypi.org/project/autorank/) for an appropriate comparison between all methods when evaluated on multiple tasks.


In [None]:
from autorank import autorank
import matplotlib.pyplot as plt
import seaborn as sns

# Rank all models with autorank, separately for each support set size.
# For every size, the columns of `data` whose name starts with the size and
# contains both "val" and "delta-auprc" are handed to autorank, and the
# `meanrank` column of its rank table is kept.
# The per-size tables are collected and concatenated once at the end: seeding
# pd.concat with an empty DataFrame() relies on deprecated pandas behaviour and
# copies the growing frame on every iteration.
per_size_results = []
for size in [16, 32, 64, 128]:
    df = data[[x for x in data.columns if x.startswith(f"{size}") and "val" in x and "delta-auprc" in x]]
    ranked_df = autorank(df, alpha=0.1, verbose=False).rankdf
    result = pd.DataFrame(ranked_df.meanrank).reset_index(names=["model"])
    # Keep only the model name, i.e. the text between the first "(" and the
    # following ")", with spaces removed.
    result["model"] = result["model"].apply(lambda x: x.split("(")[1].split(")")[0].replace(" ", ""))
    result["size"] = size
    per_size_results.append(result)

# Long table with columns model / meanrank / size; the per-size row labels are
# kept as they are (no index reset), matching the previous behaviour.
df_result = pd.concat(per_size_results)


In [None]:
# One cubehelix colormap per family of methods: fine-tuning (ft), meta-learning
# (meta) and baselines. Options shared by all three are kept in one place.
_cmap_options = dict(light=.8, as_cmap=True, reverse=True)

ft_palette = sns.cubehelix_palette(start=0., rot=0.6, dark=0.45, hue=1., **_cmap_options)
meta_palette = sns.cubehelix_palette(start=1.7, rot=0.6, dark=0.45, hue=1., **_cmap_options)
# The baseline map uses a smaller rotation, a lighter dark end and the default hue.
baseline_palette = sns.cubehelix_palette(start=1.3, rot=0.2, dark=0.6, **_cmap_options)

# Last expression of the cell: shows the fine-tuning colormap.
ft_palette

In [None]:
# Show the meta-learning colormap as the cell output.
meta_palette

In [None]:
# Show the baseline colormap as the cell output.
baseline_palette

In [None]:
%matplotlib inline
# Ranking figure: mean autorank rank of every model (y) against the support set
# size (x). Models are drawn in three families (baselines, meta-learning,
# fine-tuning), each with its own colormap; within a family the colour encodes
# the model's rank averaged over all sizes. The figure is saved to
# PAPER_FIGDIR/ranking.pdf.
from matplotlib.lines import Line2D

fig, ax = plt.subplots(figsize=(6, 7))
# NOTE(review): the style is set after the axes already exist, so it presumably
# does not restyle `ax` on a fresh kernel -- confirm whether this call should
# come before plt.subplots.
sns.set_style("white")

fine_tuning_methods = ["GNN-MT", "clamp", "L-probe", "Q-probe", "MAT"]
meta_methods = ["PAR", "GNN-MAML", "PN", "ADKF-IFT", "CNP", "mhnfs"]
bsl_methods = ["SimSearch", "RF"]


def _method_family(model):
    """Return the family label of *model*; anything not listed above is a baseline."""
    if model in fine_tuning_methods:
        return "fine-tuning"
    if model in meta_methods:
        return "meta-learning"
    return "baselines"


df_result["type"] = df_result["model"].apply(_method_family)

DOT_SIZE = 300
LINE_WIDTH = 3
alpha = 0.7

# Same order as the method lists zipped with it in the plotting loop below.
method_palette = [baseline_palette, meta_palette, ft_palette]
family_palettes = {
    "baselines": baseline_palette,
    "meta-learning": meta_palette,
    "fine-tuning": ft_palette,
}

# Marker used for each plotted model (scatter points and legend entries).
markers_all_methods = {
    "GNN-MT": "d",
    "clamp": "8",
    "L-probe": "X",
    "Q-probe": "*",
    "MAT": "<",
    "PAR": ">",
    "GNN-MAML": "D",
    "PN": "P",
    "ADKF-IFT": "h",
    "mhnfs": "H",
    "CNP": "s",
    "SimSearch": "p",
    "RF": "o",
}

for methods, palette in zip([bsl_methods, meta_methods, fine_tuning_methods], method_palette):
    # One line per model with x=size and y=meanrank; the hue is the model's
    # meanrank averaged over all sizes.
    df = df_result[df_result.model.isin(methods)].sort_values("meanrank")
    average_meanrank = df.groupby("model").meanrank.mean().rename("mean_meanrank")
    df = df.join(average_meanrank, on="model").sort_values("mean_meanrank")

    sns.lineplot(
        x="size", y="meanrank", hue="mean_meanrank", data=df, alpha=alpha,
        legend=False, ax=ax, palette=palette, linewidth=LINE_WIDTH,
        estimator=None, units="model",
    )
    sns.scatterplot(
        x="size", y="meanrank", style="model", data=df, ax=ax, s=DOT_SIZE,
        hue="mean_meanrank", palette=palette, markers=markers_all_methods,
        legend=False,
    )

# Custom legend: one entry per model, drawn with the model's marker and the
# first colour of its family's colormap. (No variable named `type` is used
# here, so the builtin stays intact for later cells.)
legend_elements = [
    Line2D(
        [0], [0], marker=marker, color=family_palettes[_method_family(model)](0),
        label=model, markersize=15, linestyle='None',
    )
    for model, marker in markers_all_methods.items()
]
# Legend is placed outside the axes, to the right.
ax.legend(handles=legend_elements, bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0., prop={'size': 15})

# Raw string: "\m" is not a valid Python escape sequence.
ax.set_xlabel(r"$|\mathcal{S}|$")
ax.set_ylabel("Average rank")

ax.set_xticks([16, 32, 64, 128])
# Flip the y axis so that smaller rank values are drawn at the top.
ax.invert_yaxis()
ax.grid(True)
fig.savefig(os.path.join(PAPER_FIGDIR, "ranking.pdf"), bbox_inches='tight')
plt.show()

In [None]:
# Wide table of mean ranks: one row per model and one "meanrank_<size>" column
# per support set size. autorank is run with its default alpha here.
df_result = pd.DataFrame()
for size in (16, 32, 64, 128):
    prefix = f"{size}"
    df = data[[x for x in data.columns if x.startswith(prefix) and "val" in x and "delta-auprc" in x]]

    meanrank = autorank(df, verbose=False).rankdf.meanrank
    result = pd.DataFrame(meanrank).reset_index(names=["model"])
    # Reduce each label to the model name: the text between the first "(" and
    # the following ")", with spaces removed.
    result["model"] = result["model"].map(lambda x: x.split("(")[1].split(")")[0].replace(" ", ""))
    result = result.rename(columns={"meanrank": f"meanrank_{size}"})

    # The first table seeds the result; later ones are outer-joined on the model name.
    df_result = (
        result
        if len(df_result) == 0
        else df_result.join(result.set_index("model"), on="model", how="outer")
    )

In [None]:
# Show the table of mean ranks (one meanrank_<size> column per support set size).
df_result

In [None]:
# Styled DataFrame for the LaTeX paper, with precision 3

In [None]:
# Display the aggregated results table.
aggregate_df