In [2]:
import os
import sys
import seaborn as sns
import matplotlib.pyplot as plt
from sharp import ShaRP

sys.path.append("..")

from itertools import product
from lightgbm import LGBMRanker
from sklearn.utils import check_random_state
from xai_ranking.benchmarks import (
    human_in_the_loop_experiment,
    human_in_the_loop_batch_experiment,
    hierarchical_ranking_explanation,
    hierarchical_ranking_batch_explanation,
    lime_experiment,
    lime_batch_experiment,
    shap_experiment,
    shap_batch_experiment,
    sharp_experiment,
    sharp_batch_experiment,
    # participation_experiment,
)
from xai_ranking.preprocessing import (
    preprocess_atp_data,
    preprocess_csrank_data,
    preprocess_higher_education_data,
    preprocess_movers_data,
    preprocess_synthetic_data,
)
from xai_ranking.datasets import (
    fetch_atp_data,
    fetch_csrank_data,
    fetch_higher_education_data,
    fetch_movers_data,
    fetch_synthetic_data,
)
from xai_ranking.scorers import (
    atp_score,
    csrank_score,
    higher_education_score,
    synthetic_equal_score_3ftrs,
)
from xai_ranking.metrics._base import row_wise_kendall
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
from mlresearch.utils import check_random_states, set_matplotlib_style
from xai_ranking.utils import scores_to_ordering

RNG_SEED = 42

In [3]:
# Benchmark datasets. Every entry holds a display name, the preprocessed data
# returned by the matching ``preprocess_*`` helper, and the scoring function
# used for that dataset.
datasets = [
    dict(
        name="ATP",
        data=preprocess_atp_data(fetch_atp_data()),
        scorer=atp_score,
    ),
    dict(
        name="CSRank",
        data=preprocess_csrank_data(fetch_csrank_data()),
        scorer=csrank_score,
    ),
    dict(
        name="Higher Education",
        data=preprocess_higher_education_data(
            fetch_higher_education_data(year=2020)
        ),
        scorer=higher_education_score,
    ),
    # Disabled entry: its scorer is `model.predict`, and no `model` is defined
    # in the cells shown here.
    # {
    #     "name": "Moving Company",
    #     "data": preprocess_movers_data(fetch_movers_data(test=True)),
    #     "scorer": model.predict,
    # },
    # Three synthetic variants that differ only in `synth_dt_version`.
    *(
        dict(
            name=f"Synthetic_{version}",
            data=preprocess_synthetic_data(
                fetch_synthetic_data(synth_dt_version=version, item_num=2000)
            ),
            scorer=synthetic_equal_score_3ftrs,
        )
        for version in range(3)
    ),
]
# Explanation methods under comparison. Both entries run `sharp_experiment`
# with identical settings and differ only in the quantity of interest (`qoi`).
xai_methods = [
    {
        "iterations": 1,
        "name": method_name,
        "experiment": sharp_experiment,
        "kwargs": {
            "qoi": qoi,
            "verbose": True,
            "sample_size": None,
            "measure": "shapley",
            "n_jobs": -1,
            "replace": False,
        },
    }
    for method_name, qoi in (
        ("ShaRP_RANK", "rank"),
        ("ShaRP_SCORE", "rank_score"),
    )
]

In [13]:
def read_results_from_files(
    dataset_specs=None,
    method_specs=None,
    results_dir="results/contributions",
):
    """Load the saved contribution tables for every dataset/method/iteration.

    Files are expected at
    ``<results_dir>/_contributions_<dataset>_<method>_<iteration>.csv`` and are
    read with their first column as the index.

    Parameters
    ----------
    dataset_specs : iterable of dict, optional
        Dataset descriptors; only the ``"name"`` key is read. Defaults to the
        notebook-level ``datasets`` list.
    method_specs : iterable of dict, optional
        Method descriptors; the ``"name"`` and ``"iterations"`` keys are read.
        Defaults to the notebook-level ``xai_methods`` list.
    results_dir : str, optional
        Directory containing the CSV files.

    Returns
    -------
    dict
        ``{dataset_name: {method_name: [DataFrame, ...]}}`` with one frame per
        iteration file that exists. The list is empty when no file was found.

    Warns
    -----
    UserWarning
        For every expected file that is missing, so that an empty list does not
        surface later as an unexplained ``IndexError``.
    """
    import warnings  # local import: keeps the cell runnable on its own

    # Fall back to the notebook globals so existing zero-argument calls keep
    # working unchanged.
    if dataset_specs is None:
        dataset_specs = datasets
    if method_specs is None:
        method_specs = xai_methods

    loaded = {}
    for dataset_spec in dataset_specs:
        per_method = loaded[dataset_spec["name"]] = {}
        for method_spec in method_specs:
            frames = per_method[method_spec["name"]] = []
            for iteration in range(method_spec["iterations"]):
                fname = os.path.join(
                    results_dir,
                    f"_contributions_{dataset_spec['name']}_"
                    f"{method_spec['name']}_{iteration}.csv",
                )
                if os.path.isfile(fname):
                    frames.append(pd.read_csv(fname, index_col=0))
                else:
                    warnings.warn(
                        f"Contribution file not found, skipping: {fname}",
                        stacklevel=2,
                    )
    return loaded

In [14]:
# Load every contribution table found on disk into a nested dict keyed by
# dataset name and then by XAI method name; each leaf is a list of DataFrames,
# one per iteration file that exists.
results = read_results_from_files()

In [15]:
# results

In [22]:
methods = [method for method in results["ATP"].keys() if not method.startswith("BATCH")]

# Waterfall figures are written here. Create the folder up front so that
# `plt.savefig` cannot fail on a fresh checkout.
QOI_FIGURE_DIR = "results/QoI"
os.makedirs(QOI_FIGURE_DIR, exist_ok=True)


def _save_waterfall(explainer, contributions, feature_values, mean_target_value, path):
    """Draw one waterfall plot with ``explainer`` and save it to ``path`` as a PDF.

    The current matplotlib figure is closed afterwards so that figures do not
    accumulate while looping over many items.
    """
    explainer.plot.waterfall(
        contributions,
        feature_values=feature_values,
        mean_target_value=mean_target_value,
    )
    plt.savefig(path, format="pdf", bbox_inches="tight", transparent=True)
    plt.close()


# For every dataset, compare the saved rank-QoI and score-QoI contributions
# item by item, report the items whose Kendall value differs from 1.0, and save
# waterfall plots for the strongly disagreeing ones (Kendall < 0.7).
for dataset in datasets:
    name = dataset["name"]
    print(name)

    rank_runs = results[name]["ShaRP_RANK"]
    score_runs = results[name]["ShaRP_SCORE"]
    if not rank_runs or not score_runs:
        # No result file was loaded for this dataset; indexing [0] would raise
        # an uninformative IndexError.
        print("No saved contributions found; skipping.")
        print("\n\n")
        continue
    df_rank = rank_runs[0]
    df_score = score_runs[0]

    # The first element of the preprocessed data is what the scorer and ShaRP
    # consume below.
    X = dataset["data"][0]
    scores = dataset["scorer"](X)
    # Reuse `scores` instead of calling the scorer a second time.
    rank = scores_to_ordering(scores)
    # print(type(rank))

    # Both explainers share every setting except the quantity of interest.
    sharp_settings = dict(
        target_function=dataset["scorer"],
        measure="shapley",
        sample_size=None,
        replace=False,
        random_state=RNG_SEED,
    )
    xai = ShaRP(qoi="rank", **sharp_settings)
    xai.fit(X)
    xai_score = ShaRP(qoi="rank_score", **sharp_settings)
    xai_score.fit(X)

    mismatch = 0
    for index, row in df_rank.iterrows():
        rank_row = row.to_numpy()
        score_row = df_score.loc[[index]].to_numpy()[0]
        # NOTE(review): assumes both tables list the features in the same
        # column order - confirm against the code that wrote the CSV files.
        kendall = row_wise_kendall(rank_row, score_row)
        if kendall != 1.0:
            print(index, " ", kendall)
            mismatch += 1
        if kendall < 0.7:
            feature_values = X.loc[[index]].to_numpy()[0]

            # WATERFALL RANK
            print(rank.mean())
            # Paths are built from `name` rather than dataset["name"]: reusing
            # double quotes inside an f-string field only parses on
            # Python 3.12 and later.
            _save_waterfall(
                xai,
                rank_row,
                feature_values,
                rank.mean(),
                f"{QOI_FIGURE_DIR}/waterfall-{name}-rank-{index}.pdf",
            )

            # WATERFALL SCORE
            _save_waterfall(
                xai_score,
                score_row,
                feature_values,
                scores.mean(),
                f"{QOI_FIGURE_DIR}/waterfall-{name}-score-{index}.pdf",
            )

    n_rows = df_rank.shape[0]
    # Guard the percentage against an empty results table.
    mismatch_pct = float(mismatch) / n_rows * 100 if n_rows else 0.0

    print("\n")
    print(
        "Mismatch:",
        mismatch,
        "/",
        n_rows,
        " (",
        mismatch_pct,
        ")",
    )

    print("\n\n")

ATP
Nick Kyrgios   0.9333333333333333
Casper Ruud   0.9333333333333333
Taylor Fritz   0.9333333333333333
Brandon Nakashima   0.8666666666666667
Arthur Rinderknech   0.9333333333333333
Thiago Monteiro   0.8
Pablo Carreno Busta   0.9333333333333333
Rafael Nadal   0.9333333333333333
Borna Coric   0.9333333333333333
Roberto Bautista Agut   0.9333333333333333
Grigor Dimitrov   0.8666666666666667
Holger Rune   0.9333333333333333
Frances Tiafoe   0.8666666666666667
Tallon Griekspoor   0.8
Miomir Kecmanovic   0.9333333333333333
Alexander Bublik   0.8666666666666667
Sebastian Korda   0.8
Andy Murray   0.9333333333333333
Daniel Evans   0.9333333333333333
Tommy Paul   0.8666666666666667
Dominic Thiem   0.9333333333333333
Mackenzie McDonald   0.9333333333333333
Ilya Ivashka   0.9333333333333333
Filip Krajinovic   0.9333333333333333
Botic van de Zandschulp   0.9333333333333333
Marcos Giron   0.9333333333333333
Marton Fucsovics   0.9333333333333333
Ugo Humbert   0.9333333333333333
Richard Gasquet   

In [None]:
# NOTE(review): `df` is not defined in any cell shown here, and this cell has
# no execution count - confirm it is still needed or remove it.
df