In [1]:
from pathlib import Path

import numpy as np
import pandas as pd
from yaml import safe_load

In [2]:
def get_scores(df_cols):
    """Return the column labels of ``df_cols`` that contain ``"test_"``.

    The labels are returned as a list, in the order they appear in
    ``df_cols.columns``. The match is a substring test, so a label does not
    need to start with ``"test_"`` to be selected.
    """
    return [label for label in df_cols.columns if "test_" in label]


def load_yaml_file(model_config):
    """Open the file at path ``model_config`` and return its ``safe_load`` result."""
    with open(model_config) as yaml_stream:
        return safe_load(yaml_stream)


def to_long_form(df_report, score_list, fold_num=5, score="score", derived=False):
    """Expand per-fold score strings into one row per (report row, metric, fold).

    Every cell ``df_report[metric]`` for ``metric`` in ``score_list`` must be a
    comma-separated string of numbers (e.g. ``"0.8,0.9,0.7"``); each piece
    becomes its own output row.

    Parameters
    ----------
    df_report : pandas.DataFrame
        Wide report. Must contain ``task_name``, ``task_group`` and
        ``model_name`` (plus ``sub_task`` when ``derived`` is true) as well as
        every column named in ``score_list``.
    score_list : list
        Names of the score columns to unpack.
    fold_num : int, default 5
        Accepted only so existing callers keep working. It is no longer used:
        the number of output rows is taken from the data, so reports with more
        or fewer values per cell than ``fold_num`` are handled correctly.
    score : str, default "score"
        Name of the output column that holds the numeric value.
    derived : bool, default False
        When true, the ``sub_task`` column is carried over as well.

    Returns
    -------
    pandas.DataFrame
        Columns are the carried-over identifier columns followed by ``fold``
        (0-based position of the value inside its comma-separated string),
        ``metric`` (the source column name) and ``score``. All columns have
        object dtype and the index is a fresh 0..n-1 range.

    Raises
    ------
    AttributeError
        If a score cell is not a string (it has no ``split``).
    ValueError
        If a piece of a score cell cannot be converted with ``float``.
    """
    repeated_columns = ["task_name", "task_group", "model_name"]
    if derived:
        repeated_columns.append("sub_task")
    cols = repeated_columns + ["fold", "metric", score]

    # Collect rows first and build the frame once. Pre-allocating
    # ``rows * fold_num * metrics`` slots raised IndexError when a cell held
    # more than ``fold_num`` values and left all-NaN rows when it held fewer.
    records = []
    for _, df_row in df_report.iterrows():
        shared_values = list(df_row[repeated_columns])
        for curr_score in score_list:
            for fold, value in enumerate(df_row[curr_score].split(",")):
                records.append(shared_values + [fold, curr_score, float(value)])

    # dtype=object mirrors the frame the pre-allocating version produced, so
    # downstream dtype handling (e.g. an explicit ``astype(float)``) is unchanged.
    return pd.DataFrame(records, columns=cols, dtype=object)


def get_log_from_df(file_name, family_dict, task_type, derived=False):
    """Read a results CSV and return it in long format with group and type tags.

    Steps, in order:

    1. Read ``file_name`` with ``pd.read_csv``.
    2. Add a ``task_group`` column by looking each ``task_name`` up in
       ``family_dict``; names that are not keys of the mapping get ``None``.
    3. Unpack the columns selected by ``get_scores`` via ``to_long_form``
       (``derived`` is forwarded).
    4. Add a constant ``task_type`` column, cast ``score`` to float, and strip
       the substrings ``"test_"``, ``"_weighted"`` and ``"_ovr"`` from the
       ``metric`` values.
    """

    def lookup_group(task):
        # Unknown tasks map to None instead of raising KeyError.
        return family_dict[task] if task in family_dict else None

    def strip_metric_affixes(metric):
        # Same three removals as before, applied in the same order.
        return metric.replace("test_", "").replace("_weighted", "").replace("_ovr", "")

    wide_df = pd.read_csv(file_name)
    wide_df["task_group"] = wide_df["task_name"].apply(lookup_group)

    long_df = to_long_form(wide_df, get_scores(wide_df), derived=derived)
    long_df["task_type"] = task_type
    long_df["score"] = long_df["score"].astype(float)
    long_df["metric"] = long_df["metric"].apply(strip_metric_affixes)
    return long_df


def mean_std_string(vals):
    """Format the mean and standard deviation of ``vals`` as ``"mean (std)"``.

    Parameters
    ----------
    vals : array-like of numbers
        Values to summarise; passed to ``np.mean`` and ``np.std`` with their
        default settings.

    Returns
    -------
    str
        ``"-"`` when the mean is NaN. Otherwise ``"mean (std)"``, formatted
        with no decimals when the mean is greater than 1 and with two decimals
        when it is not.
    """
    mean_val = np.mean(vals)
    if np.isnan(mean_val):
        return "-"
    std_val = np.std(vals)
    if mean_val > 1:
        # This branch used to build the string without returning it, so any
        # mean above 1 produced None instead of text.
        return f"{mean_val:.0f} ({std_val:.0f})"
    return f"{mean_val:.2f} ({std_val:.2f})"


# Maps each raw ``model_name`` identifier used in the performance reports to
# the family label written to the "Model family" column. Lookups use plain
# indexing, so an identifier missing from this table raises KeyError.
model_type = {
    "mpnet": "LLM",
    "bag_of_words": "Classical ML",
    "GEARS": "ScRNA-seq",
    "ScGPT": "ScRNA-seq",
    "top_mteb": "LLM",
    "geneformer": "ScRNA-seq",
    "cellPLM": "ScRNA-seq",
    "cellPT": "ScRNA-seq",
    "gene2vec": "Classical ML",
    "mistral": "LLM",
}
# Maps each raw ``model_name`` identifier used in the performance reports to
# the display name written to the "Model" column. Keys mirror ``model_type``.
model_name = {
    "mpnet": "MPNet",
    "bag_of_words": "Bag of Words",
    "GEARS": "GEARS",
    "ScGPT": "ScGPT",
    "top_mteb": "MTEB-1B",
    "geneformer": "Geneformer",
    "cellPLM": "cellPLM",
    # NOTE(review): "cellPT" is displayed as "cellPLM", the same label as the
    # "cellPLM" key above — confirm this alias is intended and not a typo.
    "cellPT": "cellPLM",
    "gene2vec": "Gene2vec",
    "mistral": "MTEB-7B",
}

In [3]:
# Locations of the per-task-type performance reports and the task -> family map.
base_folder = Path("/dccstor/bmfm-targets/text_models/manuscript")
binary_report = base_folder / "performance/binary_tasks.csv"
multi_label_report = base_folder / "performance/multi_label_tasks.csv"
regression_report = base_folder / "performance/regression_tasks.csv"
category_report = base_folder / "performance/categorical_tasks.csv"
derived_report = base_folder / "performance/binary_tasks_from_multi_label.csv"
task_family = base_folder / "task_family_dict.yaml"

# Load every report into long format (one row per task / model / metric / fold).
task_group = load_yaml_file(task_family)
binary_long = get_log_from_df(binary_report, task_group, "binary")
# NOTE(review): regression_long is not part of either CSV written below; it is
# kept because later cells may rely on it — confirm before removing.
regression_long = get_log_from_df(regression_report, task_group, "regression")
category_long = get_log_from_df(category_report, task_group, "category")
multi_long = get_log_from_df(multi_label_report, task_group, "multi label")
derived_long = get_log_from_df(derived_report, task_group, "derived bin", derived=True)

# Rename the multi-label metrics "auc" and "hamming_loss"; any other metric
# name is passed through unchanged.
multi_label_metric_aliases = {"auc": "roc_auc", "hamming_loss": "hamming"}
multi_long["metric"] = multi_long["metric"].apply(
    lambda x: multi_label_metric_aliases.get(x, x)
)
bin_cat_df = pd.concat([binary_long, category_long, multi_long])

# Add the display columns to both output tables. Plain indexing is kept on
# purpose: a model identifier missing from the lookup tables raises KeyError
# instead of silently producing a blank label.
for long_df in (bin_cat_df, derived_long):
    long_df["Model family"] = long_df["model_name"].apply(lambda x: model_type[x])
    long_df["Model"] = long_df["model_name"].apply(lambda x: model_name[x])

# Output paths are derived from base_folder (they resolve to the same files as
# before) so inputs and outputs move together if the base directory changes.
bin_cat_df.to_csv(
    base_folder / "performance/results_binary_categorical_multi_long_format.csv",
    index=False,
)
derived_long.to_csv(
    base_folder / "performance/results_derived_binary_long_format.csv",
    index=False,
)