In [6]:
import faker as f
import numpy as np

from numpy.random import default_rng
from pandas import DataFrame
from scipy.stats import truncnorm
from sklearn.metrics import average_precision_score

In [30]:
# Seed the Faker and NumPy generators so the names, emails and labels drawn
# from them are repeatable across kernel restarts.
SEED = 42
f.Faker.seed(SEED)
fake_bad_actor_generator = f.Faker()
rng = default_rng(SEED)


def generate_potential_bad_actor(position):
    """
    Generate one random 'potential bad actor' record.

    The details of this function don't really matter. It just generates a
    fake name and email and labels the record with whether it was actually
    determined to be fraudulent, and how likely the model thought it was to
    be fraudulent.

    Parameters
    ----------
    position : int
        Rank of the record. The fraud probability and the upper bound of the
        model score are both ``(1000 - position) / 1000``, so this must stay
        below 1000 (at 1000 the truncation interval collapses to a point).

    Returns
    -------
    dict
        Keys ``name``, ``email``, ``is_fraudulent`` (0 or 1) and
        ``probability_of_fraudulent`` (rounded to two decimals).
    """
    fraud_rate = (1000.0 - position) / 1000.0
    return {
        "name": fake_bad_actor_generator.name(),
        "email": fake_bad_actor_generator.ascii_email(),
        "is_fraudulent": rng.binomial(1, fraud_rate),
        # Draw from the notebook's own generator rather than SciPy's global
        # random state, so the scores follow whatever seed `rng` was given.
        "probability_of_fraudulent": round(
            truncnorm.rvs(a=0.0, b=fraud_rate, random_state=rng), 2
        ),
    }


# One record per rank position 0..999, most-likely-fraudulent first.
ordered_fraud_list = [generate_potential_bad_actor(r) for r in np.arange(0, 1000)]
# Shuffle them so it's more realistic. The permutation is drawn from `rng`
# (the same generator that draws the labels) instead of pandas' global random
# state; the original positions are kept as the index.
ordered_fraud_df = DataFrame(ordered_fraud_list).iloc[
    rng.permutation(len(ordered_fraud_list))
]

In [31]:
# Preview the shuffled records; the index is each row's original position.
ordered_fraud_df

Unnamed: 0,name,email,is_fraudulent,probability_of_fraudulent
385,Christopher Cline,dwilliams@terry.com,0,0.19
690,Rachel Chan,katieshaw@hansen-jones.com,0,0.24
857,Melissa Bennett,john63@hotmail.com,0,0.12
683,Joshua Nelson,rdavies@mendez-scott.com,0,0.05
653,Margaret Wilson PhD,jacob83@weaver.com,1,0.10
...,...,...,...,...
372,Henry Huang,amy27@chase.com,1,0.05
673,Jennifer Leach,eric04@russell.com,0,0.33
451,Kendra Bradford,gomezjustin@hotmail.com,1,0.08
187,Jennifer Clark,kevin49@stewart.net,1,0.77


In [33]:
# Score the model's fraud probabilities against the true labels with
# scikit-learn's average precision.
average_precision_score(
    y_true=ordered_fraud_df["is_fraudulent"],
    y_score=ordered_fraud_df["probability_of_fraudulent"],
)

0.6919819953431889

In [118]:
def precision_at_k(y_true, y_score, k, pos_label=1):
    """
    Compute the precision among the ``k`` highest-scored samples.

    Parameters
    ----------
    y_true : array-like of shape (n_samples,)
        Ground-truth labels; must be binary.
    y_score : array-like of shape (n_samples,)
        Scores used to rank the samples; higher scores rank first.
    k : int
        Number of top-ranked samples to inspect; must be at least 1. If ``k``
        exceeds the number of samples, every sample is inspected but the
        count of positives is still divided by ``k``.
    pos_label : default=1
        The value in ``y_true`` that counts as a positive.

    Returns
    -------
    float
        Number of positives among the top ``k`` samples, divided by ``k``.

    Raises
    ------
    ValueError
        If ``y_true`` is not binary, if ``k`` is smaller than 1, or if
        ``y_true`` and ``y_score`` have different lengths.
    """
    from sklearn.utils import column_or_1d
    from sklearn.utils.multiclass import type_of_target

    if type_of_target(y_true) != "binary":
        raise ValueError("y_true must be a binary column.")
    if k < 1:
        # k == 0 would divide by zero; a negative k would slice from the end.
        raise ValueError(f"k must be a positive integer, got {k!r}.")

    # Makes this compatible with various array types
    y_true_arr = column_or_1d(y_true)
    y_score_arr = column_or_1d(y_score)
    if y_true_arr.shape[0] != y_score_arr.shape[0]:
        raise ValueError(
            "y_true and y_score must have the same length, got "
            f"{y_true_arr.shape[0]} and {y_score_arr.shape[0]}."
        )

    is_positive = y_true_arr == pos_label

    # A stable sort makes the ranking of tied scores depend only on the input
    # order, so repeated calls on the same data give the same answer.
    desc_sort_order = np.argsort(y_score_arr, kind="stable")[::-1]
    true_positives = is_positive[desc_sort_order][:k].sum()

    return true_positives / k

In [125]:
# Fraction of the 40 highest-scored records that are actually fraudulent.
precision_at_k(
    y_true=ordered_fraud_df["is_fraudulent"],
    y_score=ordered_fraud_df["probability_of_fraudulent"],
    k=40
)

0.875