In [None]:
import pandas_gbq
import pandas as pd
from sklearn.metrics import auc, confusion_matrix, precision_recall_curve, roc_auc_score

In [None]:
q='''
# 1-select users and respective segments for "2023-09-01"
# 2-mark which users placed an order on "2023-09-01"
with recency AS
(
SELECT      global_entity_id,
            analytical_customer_id,
            DATE("2023-09-01") AS scoring_date,
            DATE_DIFF(DATE("2023-09-01"), MAX(placed_at_local), DAY) AS last_order_recency
FROM        `fulfillment-dwh-production.curated_data_shared_central_dwh.orders`
WHERE       is_sent IS TRUE
            AND analytical_customer_id IS NOT NULL
            AND global_entity_id ="YS_TR"
            AND placed_at_local BETWEEN DATE_SUB(DATE("2023-09-01"), INTERVAL 30 DAY) AND "2023-09-01"
GROUP BY    global_entity_id,
            analytical_customer_id
),


segments AS (SELECT  global_entity_id,
        analytical_customer_id,
        CASE
            WHEN last_order_recency >= 25 THEN "1_er"
            WHEN last_order_recency >= 10 AND last_order_recency <25  THEN "low frequent"
            WHEN last_order_recency >= 7 AND last_order_recency <10  THEN "mid frequent"
            WHEN last_order_recency <= 7  THEN "frequent"
        END AS lifecycle_segment
FROM    recency
WHERE   last_order_recency <= 30
),

  cust_orders AS (
  SELECT analytical_customer_id, lifecycle_segment, global_entity_id, reordered
  FROM segments
  LEFT JOIN (SELECT DISTINCT global_entity_id, analytical_customer_id, 1 AS reordered
             FROM  `fulfillment-dwh-production.curated_data_shared_coredata_business.orders`
                WHERE partition_date_local = "2023-09-01" AND global_entity_id="YS_TR" AND is_successful AND analytical_customer_id IS NOT NULL )
  USING(analytical_customer_id, global_entity_id)
  WHERE snapshot_date = "2023-09-01" and global_entity_id="YS_TR" ),

# select reorder score predicted for users on "2023-09-01"
  pred_scores AS (
  SELECT
    "general_reorder" AS model_type, global_entity_id, analytical_customer_id, 1 - concated_survival_scores[ORDINAL(1)] AS reorder_score, scoring_date
  FROM `mkt-reorder-prod.mkt_reorder_prod.predictions_rsf_mature_targeted_ALL`
    WHERE scoring_date = "2023-09-01" AND global_entity_id="YS_TR"),

# merge previous two tables
# (when left joining pred_scores on cust_orders we lose the customers without segment. Otherwise the results are the same)
  merged_table AS (
  SELECT p.global_entity_id, p.analytical_customer_id, COALESCE(c.reordered, 0) AS reordered, p.reorder_score*100 AS reorder_score, p.model_type, c.lifecycle_segment
  FROM pred_scores AS p
  LEFT JOIN ( SELECT * FROM cust_orders ) AS c
  ON p.global_entity_id = c.global_entity_id AND p.analytical_customer_id = c.analytical_customer_id),

# calculate min/max reorder score to scale reorder probability
  min_max_reorder AS (
  SELECT global_entity_id, model_type, MAX(reorder_score) AS max_reorder_score, MIN(reorder_score) AS min_reorder_score
  FROM merged_table GROUP BY  model_type, global_entity_id)

# final results with scaled reorder probability, lifecycle segment and reorder score
SELECT
  r.global_entity_id,
  r.analytical_customer_id,
  r.reordered,
  r.reorder_score,
  r.model_type,
  (r.reorder_score - m.min_reorder_score)/(m.max_reorder_score - m.min_reorder_score) AS reorder_score_scaled,
  r.lifecycle_segment
FROM merged_table AS r
JOIN min_max_reorder AS m ON m.model_type = r.model_type AND m.global_entity_id = r.global_entity_id
ORDER BY r.global_entity_id DESC, r.analytical_customer_id DESC
'''

In [None]:
# Run the query with pandas_gbq and keep the result as `df` for the cells below.
df = pandas_gbq.read_gbq(q)

In [None]:
# Preview the first rows of the query result.
df.head()

In [None]:
# Customers with a prediction but no match in the segment table come back from
# the LEFT JOIN with a NULL lifecycle_segment; label them explicitly.
# Assign the result back instead of using `inplace=True` on the selected
# column: the chained in-place form operates on a temporary Series and does
# not update `df` under pandas Copy-on-Write.
df['lifecycle_segment'] = df['lifecycle_segment'].fillna("no_segment")

The functions below, which compute the model performance metrics, can also be found at https://github.com/deliveryhero/datahub-airflow/blob/main/dags/mkt/mkt_reorder_performance_pipeline.py

In [None]:
def model_evaluation_metrices(y_true, y_pred_binary, ypred_score):
    """Compute binary-classification metrics for one set of predictions.

    Args:
        y_true: Ground-truth binary labels.
        y_pred_binary: Thresholded (0/1) predictions; used for the
            confusion-matrix based metrics.
        ypred_score: Continuous prediction scores; used for the
            threshold-independent ROC AUC and precision-recall AUC.

    Returns:
        Tuple ``(accuracy, recall, specificity, f1_score, precision, roc_auc,
        precision_recall_auc)``, each rounded to two decimals.
    """
    # The 4-way unpacking requires a 2x2 matrix, i.e. two classes across
    # y_true / y_pred_binary.
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred_binary).ravel()

    recall = tp / (tp + fn)
    specificity = tn / (tn + fp)
    precision = tp / (tp + fp)
    accuracy = (tp + tn) / (tn + fp + fn + tp)
    f1_score = (2 * tp) / (2 * tp + fp + fn)

    # The precision-recall curve must be built from the continuous scores.
    # Feeding it the thresholded 0/1 predictions yields a curve with a single
    # operating point, so the area under it is not the PR AUC (and is
    # inconsistent with the ROC AUC below, which already uses the scores).
    # Arguments are passed positionally because the keyword name of the score
    # argument differs between scikit-learn versions.
    precision_for_auc, recall_for_auc, _ = precision_recall_curve(
        y_true, ypred_score
    )
    precision_recall_auc = auc(recall_for_auc, precision_for_auc)
    roc_auc = roc_auc_score(y_true=y_true, y_score=ypred_score)

    return (
        round(accuracy, 2),
        round(recall, 2),
        round(specificity, 2),
        round(f1_score, 2),
        round(precision, 2),
        round(roc_auc, 2),
        round(precision_recall_auc, 2),
    )

def make_results(df, filter_col, thresholds=(0.3, 0.5, 0.7)):
    """Evaluate the reorder model per segment and per decision threshold.

    Args:
        df: DataFrame with the columns ``reordered`` (binary label),
            ``reorder_score_scaled`` (score compared against each threshold)
            and ``filter_col``.
        filter_col: Name of the column whose distinct values define the
            segments that are evaluated separately.
        thresholds: Cut-offs applied to ``reorder_score_scaled``; a score
            strictly greater than the cut-off is predicted as a reorder.

    Returns:
        DataFrame with one row per evaluated (segment, threshold) pair and the
        columns ``threshold``, ``lifecycle_segment`` (the ``filter_col``
        value), ``accuracy``, ``recall``, ``specificity``, ``f1_score``,
        ``precision``, ``roc_auc`` and ``precision_recall_auc``. Pairs are
        skipped when the labels or the thresholded predictions contain fewer
        than two classes, because the metrics are undefined there. Rows with a
        missing ``filter_col`` value are not evaluated.
    """
    result_columns = [
        "threshold",
        "lifecycle_segment",
        "accuracy",
        "recall",
        "specificity",
        "f1_score",
        "precision",
        "roc_auc",
        "precision_recall_auc",
    ]

    # Collect plain records and build the frame once at the end; growing a
    # DataFrame with pd.concat inside the loop is quadratic and concatenating
    # onto an empty frame is deprecated behaviour in recent pandas.
    records = []

    # sort=False keeps segments in order of first appearance.
    for segment, df_filtered in df.groupby(filter_col, sort=False):
        labels = df_filtered["reordered"]
        # The label check does not depend on the threshold, so do it once.
        if labels.nunique() < 2:
            continue

        scores = df_filtered["reorder_score_scaled"]
        for mythres in thresholds:
            # Vectorised thresholding; missing scores compare False -> 0.
            binary_pred = (scores > mythres).astype(int)
            if binary_pred.nunique() < 2:
                continue

            metrics = model_evaluation_metrices(
                y_true=labels.to_list(),
                y_pred_binary=binary_pred,
                ypred_score=scores,
            )
            records.append((mythres, segment, *metrics))

    return pd.DataFrame(records, columns=result_columns)

In [None]:
# Compute the metrics for every lifecycle segment and threshold, then show the
# table as the cell's last expression (rich display, like the other cells)
# instead of printing its plain-text repr.
results = make_results(df, filter_col='lifecycle_segment')
results

In [None]:
# List the distinct lifecycle segment values present in the data.
df['lifecycle_segment'].unique()