In [1]:
import sys

sys.path.append("..")

import numpy as np
import pandas as pd
from xai_ranking.benchmarks import (
    human_in_the_loop,
    hierarchical_ranking_explanation,
    lime_experiment,
    shap_experiment,
    sharp_experiment,
    participation_experiment,
)
from xai_ranking.datasets import (
    fetch_atp_data,
    fetch_csrank_data,
    fetch_higher_education_data,
    fetch_movers_data,
)
from xai_ranking.datasets.scorers import atp_score

RNG_SEED = 42

In [2]:
df = fetch_atp_data().head(20)
ranks = df.serve__standing_player
scores = df.serve__rating
X = df.drop(columns=["serve__standing_player", "serve__rating"])
df.head(5)

Unnamed: 0_level_0,serve__standing_player,serve__rating,serve__pct_1st_serve,serve__pct_1st_serve_points_won,serve__pct_2nd_serve_points_won,serve__pct_service_games_won,serve__avg_aces_match,serve__avg_double_faultsmatch
player_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
John Isner,1,314.0,0.684,0.807,0.535,0.917,22.4,2.7
Nick Kyrgios,2,308.1,0.68,0.788,0.559,0.929,15.9,3.4
Reilly Opelka,3,307.1,0.652,0.795,0.578,0.908,15.9,2.1
Hubert Hurkacz,4,299.1,0.634,0.782,0.553,0.905,13.3,1.6
Matteo Berrettini,5,292.0,0.64,0.783,0.516,0.878,12.4,2.1


In [3]:
(X.values[0] * np.array([100, 100, 100, 100, 1, -1])).sum() == scores.values[0]

True

In [4]:
atp_score(X)

player_name
John Isner               314.0
Nick Kyrgios             308.1
Reilly Opelka            307.1
Hubert Hurkacz           299.1
Matteo Berrettini        292.0
Nicolas Jarry            291.8
Novak Djokovic           290.7
Maxime Cressy            289.3
Ben Shelton              286.6
Stefanos Tsitsipas       286.6
Felix Auger-Aliassime    286.1
Casper Ruud              284.1
Daniil Medvedev          283.1
Taylor Fritz             283.1
Brandon Nakashima        283.0
Marin Cilic              282.5
Arthur Rinderknech       281.2
Thiago Monteiro          281.0
Alexander Zverev         280.5
Andrey Rublev            280.1
dtype: float64

In [5]:
sharp_results = sharp_experiment(X, atp_score, random_state=RNG_SEED)
pd.DataFrame(sharp_results, columns=X.columns, index=X.index)

Unnamed: 0_level_0,serve__pct_1st_serve,serve__pct_1st_serve_points_won,serve__pct_2nd_serve_points_won,serve__pct_service_games_won,serve__avg_aces_match,serve__avg_double_faultsmatch
player_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
John Isner,1.65,1.645,0.304167,2.098333,3.773333,-0.020833
Nick Kyrgios,1.569167,1.008333,1.411667,2.526667,2.261667,-0.3275
Reilly Opelka,0.385,1.115,1.928333,1.6825,2.0875,0.251667
Hubert Hurkacz,-0.265,0.835,1.428333,2.125833,1.715,0.610833
Matteo Berrettini,0.135833,1.360833,-0.349167,1.553333,2.175,0.574167
Nicolas Jarry,0.450833,3.3975,-2.364167,0.700833,1.9975,0.2675
Novak Djokovic,1.285,0.3675,2.821667,1.141667,-2.9475,0.781667
Maxime Cressy,-1.699167,1.095833,1.420833,1.880833,3.69,-3.938333
Ben Shelton,-4.3475,1.9575,1.489167,2.870833,0.1225,-1.6425
Stefanos Tsitsipas,-0.506667,-0.103333,2.7875,0.263333,-1.819167,0.828333


In [6]:
shap_results = shap_experiment(X, atp_score)
pd.DataFrame(shap_results, columns=X.columns, index=X.index)

Unnamed: 0_level_0,serve__pct_1st_serve,serve__pct_1st_serve_points_won,serve__pct_2nd_serve_points_won,serve__pct_service_games_won,serve__avg_aces_match,serve__avg_double_faultsmatch
player_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
John Isner,4.13,3.485,0.53,4.66,11.545,0.15
Nick Kyrgios,3.73,1.585,2.93,5.86,5.045,-0.55
Reilly Opelka,0.93,2.285,4.83,3.76,5.045,0.75
Hubert Hurkacz,-0.87,0.985,2.33,3.46,2.445,1.25
Matteo Berrettini,-0.27,1.085,-1.37,0.76,1.545,0.75
Nicolas Jarry,0.23,3.985,-3.97,-0.04,1.545,0.55
Novak Djokovic,1.33,-0.015,3.23,0.66,-4.955,0.95
Maxime Cressy,-2.67,1.085,1.43,1.76,3.645,-5.45
Ben Shelton,-5.67,1.785,1.33,2.46,-0.855,-1.95
Stefanos Tsitsipas,-1.17,-0.715,2.23,-0.74,-3.255,0.75


In [9]:
hre_results = hierarchical_ranking_explanation(X, atp_score, model_type="OLS", s=int(X.shape[0]**0.25))
hre_results

Unnamed: 0_level_0,serve__pct_1st_serve,serve__pct_1st_serve_points_won,serve__pct_2nd_serve_points_won,serve__pct_service_games_won,serve__avg_aces_match,serve__avg_double_faultsmatch
player_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
John Isner,75.412016,103.572058,93.253192,117.922965,1.030172,-0.839311
Nick Kyrgios,73.417122,102.96984,100.601348,113.91784,1.075709,-0.627492
Reilly Opelka,76.793557,112.482442,97.781151,105.634787,1.027775,-0.556409
Hubert Hurkacz,109.613007,94.412972,92.015407,102.036759,1.11018,-1.386733
Matteo Berrettini,79.284704,102.607108,102.281254,110.346985,0.932986,-0.149706
Nicolas Jarry,90.98717,95.033613,86.708423,120.318177,0.888099,-0.969731
Novak Djokovic,86.628023,97.985727,89.230108,119.403573,0.891216,-0.997574
Maxime Cressy,100.902848,89.252386,96.598468,110.7998,1.04564,-1.065343
Ben Shelton,99.707071,92.041871,100.163788,106.966666,1.034521,-1.043903
Stefanos Tsitsipas,99.89061,92.152434,99.928127,106.931688,1.026913,-1.035476


In [12]:
hilw = human_in_the_loop(X, atp_score)
hilw

Index(['serve__pct_1st_serve', 'serve__pct_1st_serve_points_won',
       'serve__pct_2nd_serve_points_won', 'serve__pct_service_games_won',
       'serve__avg_aces_match', 'serve__avg_double_faultsmatch'],
      dtype='object')


Unnamed: 0,serve__pct_1st_serve_contri,serve__pct_1st_serve_points_won_contri,serve__pct_2nd_serve_points_won_contri,serve__pct_service_games_won_contri,serve__avg_aces_match_contri,serve__avg_double_faultsmatch_contri
0,0.0413,0.03485,0.0053,0.0466,11.545,0.15
1,0.0373,0.01585,0.0293,0.0586,5.045,0.55
2,0.0093,0.02285,0.0483,0.0376,5.045,0.75
3,0.0087,0.00985,0.0233,0.0346,2.445,1.25
4,0.0027,0.01085,0.0137,0.0076,1.545,0.75
5,0.0023,0.03985,0.0397,0.0004,1.545,0.55
6,0.0133,0.00015,0.0323,0.0066,4.955,0.95
7,0.0267,0.01085,0.0143,0.0176,3.645,5.45
8,0.0567,0.01785,0.0133,0.0246,0.855,1.95
9,0.0117,0.00715,0.0223,0.0074,3.255,0.75


In [18]:
part_results = participation_experiment(X, atp_score, top_k=5)
part_results

serve__pct_1st_serve               0.031744
serve__pct_1st_serve_points_won    0.038275
serve__pct_2nd_serve_points_won    0.026543
serve__pct_service_games_won       0.043887
serve__avg_aces_match              0.747419
serve__avg_double_faultsmatch      0.112131
dtype: float64

# Test using DuckDB

Ignore this section. This is just a test to see if DuckDB is worth exploring in the future. Based on some talks I saw at SIGMOD.

In [7]:
X.reset_index(inplace=True)
scores = scores.to_frame()
scores.reset_index(inplace=True)

In [8]:
import duckdb as db

weights = {k: v for k, v in zip(X.columns.drop("player_name"), [100, 100, 100, 100, 1, -1])}
rename_cols = ", ".join([f"{col} as {col.replace("serve__", "")}" for col in X.columns.drop("player_name")])
db.sql(
f"""
SELECT 
    X.player_name,
    {rename_cols},
    {"+".join(f"{k}*{v}" for k, v in weights.items())} as score,
    ROW_NUMBER() OVER (ORDER BY score DESC) as rank
FROM X 
ORDER BY score DESC
"""
).df()

Unnamed: 0,player_name,pct_1st_serve,pct_1st_serve_points_won,pct_2nd_serve_points_won,pct_service_games_won,avg_aces_match,avg_double_faultsmatch,score,rank
0,John Isner,0.684,0.807,0.535,0.917,22.4,2.7,314.0,1
1,Nick Kyrgios,0.68,0.788,0.559,0.929,15.9,3.4,308.1,2
2,Reilly Opelka,0.652,0.795,0.578,0.908,15.9,2.1,307.1,3
3,Hubert Hurkacz,0.634,0.782,0.553,0.905,13.3,1.6,299.1,4
4,Matteo Berrettini,0.64,0.783,0.516,0.878,12.4,2.1,292.0,5
5,Nicolas Jarry,0.645,0.812,0.49,0.87,12.4,2.3,291.8,6
6,Novak Djokovic,0.656,0.772,0.562,0.877,5.9,1.9,290.7,7
7,Maxime Cressy,0.616,0.783,0.544,0.888,14.5,8.3,289.3,8
8,Stefanos Tsitsipas,0.631,0.765,0.552,0.863,7.6,2.1,286.6,9
9,Ben Shelton,0.586,0.79,0.543,0.895,10.0,4.8,286.6,10


In [9]:
def test_ddb():
    rename_cols = ", ".join([f"{col} as {col.replace("serve__", "")}" for col in X.columns])
    return db.sql(
    f"""
    SELECT 
        {rename_cols},
        {"+".join(f"{k}*{v}" for k, v in weights.items())} as score,
        ROW_NUMBER() OVER (ORDER BY score DESC) as rank
    FROM X 
    ORDER BY score DESC
    """
    )

In [10]:
from sharp.utils import scores_to_ordering

def test_pd():
    X_ = X.copy()
    X_.rename(columns={col: col.replace("serve__", "") for col in X_.columns}, inplace=True)
    X_["score"] = atp_score(X_.drop(columns=["player_name"]))
    X_.sort_values("score", ascending=False, inplace=True)
    X_["rank"] = scores_to_ordering(X_["score"])
    return X_

In [11]:
%timeit test_ddb()
%timeit test_pd()

2.63 ms ± 509 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
3.02 ms ± 251 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
