In [1]:
import sys

sys.path.append("..")

import numpy as np
import pandas as pd
from xai_ranking.benchmarks import (
    human_in_the_loop,
    hierarchical_ranking_explanation,
    lime_experiment,
    shap_experiment,
    sharp_experiment,
    participation_experiment,
)
from xai_ranking.preprocessing import (
    preprocess_atp_data,
)
from xai_ranking.datasets import (
    fetch_atp_data,
)
from xai_ranking.scorers import atp_score

RNG_SEED = 42

In [2]:
df = fetch_atp_data().head(20)
X, ranks, scores = preprocess_atp_data(df)
df.head(5)

Unnamed: 0_level_0,serve__standing_player,serve__rating,serve__pct_1st_serve,serve__pct_1st_serve_points_won,serve__pct_2nd_serve_points_won,serve__pct_service_games_won,serve__avg_aces_match,serve__avg_double_faultsmatch
player_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
John Isner,1,314.0,0.684,0.807,0.535,0.917,22.4,2.7
Nick Kyrgios,2,308.1,0.68,0.788,0.559,0.929,15.9,3.4
Reilly Opelka,3,307.1,0.652,0.795,0.578,0.908,15.9,2.1
Hubert Hurkacz,4,299.1,0.634,0.782,0.553,0.905,13.3,1.6
Matteo Berrettini,5,292.0,0.64,0.783,0.516,0.878,12.4,2.1


In [3]:
sharp_results = sharp_experiment(X, atp_score, random_state=RNG_SEED)
pd.DataFrame(sharp_results, columns=X.columns, index=X.index)

Unnamed: 0_level_0,serve__pct_1st_serve,serve__pct_1st_serve_points_won,serve__pct_2nd_serve_points_won,serve__pct_service_games_won,serve__avg_aces_match,serve__avg_double_faultsmatch
player_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
John Isner,1.65,1.645,0.304167,2.098333,3.773333,-0.020833
Nick Kyrgios,1.569167,1.008333,1.411667,2.526667,2.261667,-0.3275
Reilly Opelka,0.385,1.115,1.928333,1.6825,2.0875,0.251667
Hubert Hurkacz,-0.265,0.835,1.428333,2.125833,1.715,0.610833
Matteo Berrettini,0.135833,1.360833,-0.349167,1.553333,2.175,0.574167
Nicolas Jarry,0.450833,3.3975,-2.364167,0.700833,1.9975,0.2675
Novak Djokovic,1.285,0.3675,2.821667,1.141667,-2.9475,0.781667
Maxime Cressy,-1.699167,1.095833,1.420833,1.880833,3.69,-3.938333
Ben Shelton,-4.3475,1.9575,1.489167,2.870833,0.1225,-1.6425
Stefanos Tsitsipas,-0.506667,-0.103333,2.7875,0.263333,-1.819167,0.828333


In [4]:
shap_results = shap_experiment(X, atp_score)
pd.DataFrame(shap_results, columns=X.columns, index=X.index)

Unnamed: 0_level_0,serve__pct_1st_serve,serve__pct_1st_serve_points_won,serve__pct_2nd_serve_points_won,serve__pct_service_games_won,serve__avg_aces_match,serve__avg_double_faultsmatch
player_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
John Isner,4.13,3.485,0.53,4.66,11.545,0.15
Nick Kyrgios,3.73,1.585,2.93,5.86,5.045,-0.55
Reilly Opelka,0.93,2.285,4.83,3.76,5.045,0.75
Hubert Hurkacz,-0.87,0.985,2.33,3.46,2.445,1.25
Matteo Berrettini,-0.27,1.085,-1.37,0.76,1.545,0.75
Nicolas Jarry,0.23,3.985,-3.97,-0.04,1.545,0.55
Novak Djokovic,1.33,-0.015,3.23,0.66,-4.955,0.95
Maxime Cressy,-2.67,1.085,1.43,1.76,3.645,-5.45
Ben Shelton,-5.67,1.785,1.33,2.46,-0.855,-1.95
Stefanos Tsitsipas,-1.17,-0.715,2.23,-0.74,-3.255,0.75


In [5]:
hre_results = hierarchical_ranking_explanation(
    X, atp_score, model_type="OLS", s=int(X.shape[0] ** 0.25)
)
pd.DataFrame(hre_results, columns=X.columns, index=X.index)

Unnamed: 0_level_0,serve__pct_1st_serve,serve__pct_1st_serve_points_won,serve__pct_2nd_serve_points_won,serve__pct_service_games_won,serve__avg_aces_match,serve__avg_double_faultsmatch
player_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
John Isner,65.983374,144.964489,83.189764,95.5613,90.547516,-19.920372
Nick Kyrgios,70.81964,139.888205,90.498041,91.147663,94.119316,-15.259279
Reilly Opelka,58.079907,122.54827,95.991874,110.178674,105.017299,-19.869697
Hubert Hurkacz,74.609,114.757114,121.089842,94.620272,70.897949,2.14839
Matteo Berrettini,74.25891,103.239626,102.834715,112.857295,91.672754,5.658624
Nicolas Jarry,77.005905,87.329443,66.089699,151.837002,71.450999,-92.277594
Novak Djokovic,69.889292,95.464307,75.748608,143.692514,75.504188,-99.453727
Maxime Cressy,97.464367,130.18448,109.553141,69.668957,87.182034,-81.648617
Ben Shelton,101.375291,137.363079,99.231022,67.291797,83.792531,-79.387701
Stefanos Tsitsipas,100.434406,131.163975,100.28542,72.473127,89.312324,-85.911784


In [6]:
hilw_results = human_in_the_loop(X, atp_score)
pd.DataFrame(hilw_results, columns=X.columns, index=X.index)

Unnamed: 0_level_0,serve__pct_1st_serve,serve__pct_1st_serve_points_won,serve__pct_2nd_serve_points_won,serve__pct_service_games_won,serve__avg_aces_match,serve__avg_double_faultsmatch
player_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
John Isner,0.0413,0.03485,0.0053,0.0466,0.11545,0.0015
Nick Kyrgios,0.0373,0.01585,0.0293,0.0586,0.05045,0.0055
Reilly Opelka,0.0093,0.02285,0.0483,0.0376,0.05045,0.0075
Hubert Hurkacz,0.0087,0.00985,0.0233,0.0346,0.02445,0.0125
Matteo Berrettini,0.0027,0.01085,0.0137,0.0076,0.01545,0.0075
Nicolas Jarry,0.0023,0.03985,0.0397,0.0004,0.01545,0.0055
Novak Djokovic,0.0133,0.00015,0.0323,0.0066,0.04955,0.0095
Maxime Cressy,0.0267,0.01085,0.0143,0.0176,0.03645,0.0545
Ben Shelton,0.0567,0.01785,0.0133,0.0246,0.00855,0.0195
Stefanos Tsitsipas,0.0117,0.00715,0.0223,0.0074,0.03255,0.0075


In [7]:
lime_results = lime_experiment(X, atp_score, mode="regression")
pd.DataFrame(lime_results, columns=X.columns, index=X.index)

Unnamed: 0_level_0,serve__pct_1st_serve,serve__pct_1st_serve_points_won,serve__pct_2nd_serve_points_won,serve__pct_service_games_won,serve__avg_aces_match,serve__avg_double_faultsmatch
player_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
John Isner,4.699627,3.699046,1.654743,5.731958,7.794807,-0.003723
Nick Kyrgios,4.647442,3.628865,4.295213,5.350094,7.802488,0.014947
Reilly Opelka,1.042166,3.622874,4.503694,5.204197,7.892536,0.891626
Hubert Hurkacz,-0.950179,1.24206,4.401785,5.108478,8.128211,1.054785
Matteo Berrettini,-0.734428,0.765472,-1.08911,0.996368,0.455849,1.024456
Nicolas Jarry,-0.656355,3.384196,-4.371337,0.689226,0.913979,0.17225
Novak Djokovic,0.875427,1.128502,4.361078,0.706667,-5.637457,1.394426
Maxime Cressy,-5.127551,0.854004,1.613075,0.658774,7.669272,-3.361393
Ben Shelton,-5.184112,3.638113,1.541226,5.195636,-2.656123,-3.325785
Stefanos Tsitsipas,-0.667831,-0.84827,4.279559,-1.758309,-3.1144,1.168301


In [8]:
part_results = participation_experiment(X, atp_score, top_k=5)
part_results

serve__pct_1st_serve               0.213067
serve__pct_1st_serve_points_won    0.256262
serve__pct_2nd_serve_points_won    0.177566
serve__pct_service_games_won       0.293918
serve__avg_aces_match              0.051512
serve__avg_double_faultsmatch      0.007676
dtype: float64

# Test using DuckDB

Ignore this section. This is just a test to see if DuckDB is worth exploring in the future. Based on some talks I saw at SIGMOD.

In [9]:
X.reset_index(inplace=True)
scores = scores.to_frame()
scores.reset_index(inplace=True)

In [10]:
import duckdb as db

weights = {
    k: v for k, v in zip(X.columns.drop("player_name"), [100, 100, 100, 100, 1, -1])
}
rename_cols = ", ".join(
    [f"{col} as" + col.replace("serve__", "") for col in X.columns.drop("player_name")]
)
db.sql(
    f"""
SELECT 
    X.player_name,
    {rename_cols},
    {"+".join(f"{k}*{v}" for k, v in weights.items())} as score,
    ROW_NUMBER() OVER (ORDER BY score DESC) as rank
FROM X 
ORDER BY score DESC
"""
).df()

Unnamed: 0,player_name,aspct_1st_serve,aspct_1st_serve_points_won,aspct_2nd_serve_points_won,aspct_service_games_won,asavg_aces_match,asavg_double_faultsmatch,score,rank
0,Nick Kyrgios,0.68,0.788,0.559,0.929,0.159,0.034,295.725,1
1,John Isner,0.684,0.807,0.535,0.917,0.224,0.027,294.497,2
2,Reilly Opelka,0.652,0.795,0.578,0.908,0.159,0.021,293.438,3
3,Hubert Hurkacz,0.634,0.782,0.553,0.905,0.133,0.016,287.517,4
4,Novak Djokovic,0.656,0.772,0.562,0.877,0.059,0.019,286.74,5
5,Maxime Cressy,0.616,0.783,0.544,0.888,0.145,0.083,283.162,6
6,Matteo Berrettini,0.64,0.783,0.516,0.878,0.124,0.021,281.803,7
7,Nicolas Jarry,0.645,0.812,0.49,0.87,0.124,0.023,281.801,8
8,Ben Shelton,0.586,0.79,0.543,0.895,0.1,0.048,281.452,9
9,Stefanos Tsitsipas,0.631,0.765,0.552,0.863,0.076,0.021,281.155,10


In [11]:
def test_ddb():
    rename_cols = ", ".join(
        [f"{col} as" + col.replace("serve__", "") for col in X.columns]
    )
    return db.sql(
        f"""
    SELECT 
        {rename_cols},
        {"+".join(f"{k}*{v}" for k, v in weights.items())} as score,
        ROW_NUMBER() OVER (ORDER BY score DESC) as rank
    FROM X 
    ORDER BY score DESC
    """
    )

In [12]:
from sharp.utils import scores_to_ordering


def test_pd():
    X_ = X.copy()
    X_.rename(
        columns={col: col.replace("serve__", "") for col in X_.columns}, inplace=True
    )
    X_["score"] = atp_score(X_.drop(columns=["player_name"]))
    X_.sort_values("score", ascending=False, inplace=True)
    X_["rank"] = scores_to_ordering(X_["score"])
    return X_

In [13]:
%timeit test_ddb()
%timeit test_pd()

1.43 ms ± 76.8 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)
1.83 ms ± 28.7 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)
