In [152]:
import pandas_gbq
import pandas as pd
import numpy as np

In [153]:
from sklearn.metrics import auc, confusion_matrix, precision_recall_curve, roc_auc_score

In [172]:
# BigQuery SQL that builds the evaluation set for entity FP_TH on 2023-09-01.
#   cust_orders     - lifecycle-segment rows for that computation date, left-joined
#                     with the customers that placed a successful order on that day
#                     (reordered = 1 for those, NULL otherwise).
#   pred_scores     - model output; reorder_score = 1 - concated_survival_scores[ORDINAL(30)]
#                     (presumably the 30-day survival probability - TODO confirm).
#   merged_table    - predictions LEFT JOIN cust_orders: every scored customer is kept,
#                     reordered defaults to 0 and the score is rescaled to 0-100.
#   binned_table    - adds a 10-point-wide reorder_bin label; a NULL score falls through
#                     to "invalid_value", and a score of exactly 100 is labelled "[90,100)".
#   min_max_reorder - min/max score per (model_type, global_entity_id); the final SELECT
#                     uses them to min-max scale the score into reorder_score_scaled.
q='''
WITH
  cust_orders AS (
  SELECT
    analytical_customer_id,
    lifecycle_segment,
    global_entity_id,
    reordered
  FROM
    `fulfillment-dwh-production.cl_mkt._reorder_lifecycle_segmentation_history`
  LEFT JOIN (
    SELECT
      DISTINCT global_entity_id,
      analytical_customer_id,
      1 AS reordered,
    FROM
      `fulfillment-dwh-production.curated_data_shared_coredata_business.orders`
    WHERE
      partition_date_local = "2023-09-01"
      AND global_entity_id="FP_TH"
      AND is_successful
      AND analytical_customer_id IS NOT NULL )
  USING
    (analytical_customer_id,
      global_entity_id)
  WHERE computation_date = "2023-09-01" and global_entity_id="FP_TH" ),
  pred_scores AS (
  SELECT
    "general_reorder" AS model_type,
    global_entity_id,
    analytical_customer_id,
    1 - concated_survival_scores[ORDINAL(30)] AS reorder_score,
    scoring_date
  FROM
    `mkt-reorder-prod.mkt_reorder_prod.predictions_rsf_mature_targeted_ALL`
  WHERE
    scoring_date = "2023-09-01"
    AND global_entity_id="FP_TH" ),
  merged_table AS (
  SELECT
    p.global_entity_id,
    p.analytical_customer_id,
    COALESCE(c.reordered, 0) AS reordered,
    p.reorder_score*100 AS reorder_score,
    p.model_type,
    c.lifecycle_segment
  FROM
    pred_scores AS p
  LEFT JOIN (
    SELECT
      *
    FROM
      cust_orders ) AS c
  ON
    p.global_entity_id = c.global_entity_id
    AND p.analytical_customer_id = c.analytical_customer_id ),
  binned_table AS (
  SELECT
    *,
    CASE
      WHEN reorder_score < 10 THEN "[0,10)"
      WHEN reorder_score >= 10
    AND reorder_score < 20 THEN "[10,20)"
      WHEN reorder_score >= 20 AND reorder_score < 30 THEN "[20,30)"
      WHEN reorder_score >= 30
    AND reorder_score < 40 THEN "[30,40)"
      WHEN reorder_score >= 40 AND reorder_score < 50 THEN "[40,50)"
      WHEN reorder_score >= 50
    AND reorder_score < 60 THEN "[50,60)"
      WHEN reorder_score >= 60 AND reorder_score < 70 THEN "[60,70)"
      WHEN reorder_score >= 70
    AND reorder_score < 80 THEN "[70,80)"
      WHEN reorder_score >= 80 AND reorder_score < 90 THEN "[80,90)"
      WHEN reorder_score >= 90 THEN "[90,100)"
    ELSE
    "invalid_value"
  END
    AS reorder_bin,
  FROM
    merged_table ),
  min_max_reorder AS (
  SELECT
    global_entity_id,
    model_type,
    -- lifecycle_segment,
    MAX(reorder_score) AS max_reorder_score,
    MIN(reorder_score) AS min_reorder_score
  FROM
    binned_table
  GROUP BY
    model_type,
    global_entity_id--, lifecycle_segment
    )
SELECT
  r.global_entity_id,
  r.analytical_customer_id,
  r.reordered,
  r.reorder_score,
  r.model_type,
  r.reorder_bin,
  (r.reorder_score - m.min_reorder_score)/(m.max_reorder_score - m.min_reorder_score) AS reorder_score_scaled,
  r.lifecycle_segment
FROM
  binned_table AS r
JOIN
  min_max_reorder AS m
ON
  m.model_type = r.model_type
  AND m.global_entity_id = r.global_entity_id
  --AND m.lifecycle_segment = r.lifecycle_segment
ORDER BY
  r.global_entity_id DESC,
  r.analytical_customer_id DESC
'''

In [173]:
# Run the query `q` on BigQuery and load the full result set into a pandas DataFrame.
df = pandas_gbq.read_gbq(q)

Downloading: 100%|[32m██████████[0m|


In [174]:
# Show the last rows as a quick sanity check of the downloaded columns and values.
df.tail()

Unnamed: 0,global_entity_id,analytical_customer_id,reordered,reorder_score,model_type,reorder_bin,reorder_score_scaled,lifecycle_segment
2135464,FP_TH,--0--h2aUA6-Z2rRG0q7Mw,0,48.0,general_reorder,"[40,50)",0.430233,dormant_early_customer
2135465,FP_TH,---W4mtDUEmZ2ZqJ4cN_1Q,1,36.0,general_reorder,"[30,40)",0.290698,infrequent_mature_customer
2135466,FP_TH,---VrpoIXLW82Kim2fVx7w,0,13.0,general_reorder,"[10,20)",0.023256,dormant_early_customer
2135467,FP_TH,---1oqCLVmicpzN-udXAAQ,0,18.0,general_reorder,"[10,20)",0.081395,dormant_mature_customer
2135468,FP_TH,----TV1zUBW00O6T2942TA,0,18.0,general_reorder,"[10,20)",0.081395,stale_early_customer


In [176]:
# Scored customers without a segmentation row come back with a NULL lifecycle_segment;
# give them an explicit "no_segment" label so they are evaluated as their own group.
# Assign the result back instead of calling fillna(inplace=True) on the selected column:
# the chained in-place form works on an intermediate Series, is deprecated, and leaves
# `df` unchanged when pandas Copy-on-Write is enabled.
df['lifecycle_segment'] = df['lifecycle_segment'].fillna("no_segment")

In [177]:
def model_evaluation_metrices(y_true, y_pred_binary, ypred_score):
    """
    Compute evaluation metrics for a binary classifier.

    The confusion-matrix metrics (accuracy, recall, specificity, f1_score,
    precision) describe the thresholded predictions in ``y_pred_binary``.
    The two AUC metrics are threshold-free and are computed from the
    continuous scores in ``ypred_score``.

        Parameters:
            y_true(array/list/series): true target labels (0/1)
            y_pred_binary(array/list/series): predicted labels (0/1) at a fixed threshold
            ypred_score(array/list/series): prediction scores, higher = more likely positive

        Returns:
            Tuple of seven values, each rounded to 2 decimals, in this order:
            accuracy(float): accuracy of the thresholded predictions
            recall(float): recall (true positive rate)
            specificity(float): specificity (true negative rate)
            f1_score(float): F1 score
            precision(float): precision
            roc_auc(float): area under the ROC curve of the scores
            precision_recall_auc(float): area under the precision-recall curve of the scores

            A ratio whose denominator is zero (e.g. precision when nothing is
            predicted positive) is returned as NaN.

        Note:
            ROC AUC is not defined when ``y_true`` contains a single class;
            callers are expected to skip such inputs (``make_results`` does).
    """

    def _ratio(numerator, denominator):
        # NaN for an undefined ratio, without a NumPy divide-by-zero warning.
        return numerator / denominator if denominator else float("nan")

    # labels=[0, 1] pins the matrix to 2x2 even when one class is missing from
    # both inputs, so the four-way unpacking below cannot fail.
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred_binary, labels=[0, 1]).ravel()

    recall = _ratio(tp, tp + fn)
    specificity = _ratio(tn, tn + fp)
    precision = _ratio(tp, tp + fp)
    accuracy = _ratio(tp + tn, tn + fp + fn + tp)
    f1_score = _ratio(2 * tp, 2 * tp + fp + fn)

    # The precision-recall curve must be traced over the continuous scores.
    # Feeding it the thresholded 0/1 labels collapses the curve to a single
    # operating point, so the "AUC" would only reflect that one threshold.
    precision_for_auc, recall_for_auc, _ = precision_recall_curve(y_true, ypred_score)
    precision_recall_auc = auc(recall_for_auc, precision_for_auc)
    roc_auc = roc_auc_score(y_true=y_true, y_score=ypred_score)

    return (
        round(accuracy, 2),
        round(recall, 2),
        round(specificity, 2),
        round(f1_score, 2),
        round(precision, 2),
        round(roc_auc, 2),
        round(precision_recall_auc, 2),
    )

In [183]:
def make_results(df, order_col):
    """
    Evaluate the scaled reorder score per group of ``order_col`` at fixed thresholds.

    For every distinct value of ``df[order_col]`` and every threshold in
    (0.3, 0.5, 0.7), ``reorder_score_scaled`` is binarised with
    ``score > threshold`` and passed to ``model_evaluation_metrices`` together
    with the ``reordered`` labels.

        Parameters:
            df(DataFrame): must contain ``order_col``, ``reordered`` (0/1 labels)
                and ``reorder_score_scaled`` (continuous score)
            order_col(str): name of the column to group by

        Returns:
            DataFrame with one row per evaluated (group, threshold) pair and the
            columns threshold, lifecycle_segment, accuracy, recall, specificity,
            f1_score, precision, roc_auc, precision_recall_auc. The group value
            is stored under "lifecycle_segment" whatever ``order_col`` is named.
            Groups with a single ``reordered`` class, and thresholds at which
            every row gets the same predicted label, are skipped. The frame is
            empty (columns only) when nothing could be evaluated.
    """
    result_columns = [
        "threshold",
        "lifecycle_segment",
        "accuracy",
        "recall",
        "specificity",
        "f1_score",
        "precision",
        "roc_auc",
        "precision_recall_auc",
    ]
    thresholds = (0.3, 0.5, 0.7)

    # Collect plain tuples and build the frame once at the end; growing a
    # DataFrame with pd.concat inside the loop is quadratic and loses dtypes.
    records = []
    for segment in df[order_col].unique():
        df_segment = df[df[order_col] == segment]
        y_true = df_segment["reordered"]
        # ROC AUC needs both classes in the ground truth; this does not depend
        # on the threshold, so check it once per segment.
        if y_true.nunique() < 2:
            continue

        scores = df_segment["reorder_score_scaled"]
        for threshold in thresholds:
            # Vectorised equivalent of apply(lambda x: 1 if x > threshold else 0).
            binary_pred = (scores > threshold).astype(int)
            # Skip thresholds at which every row gets the same predicted label.
            if binary_pred.nunique() < 2:
                continue

            metrics = model_evaluation_metrices(
                y_true=y_true.to_list(),
                y_pred_binary=binary_pred,
                ypred_score=scores,
            )
            records.append((threshold, segment, *metrics))

    return pd.DataFrame(records, columns=result_columns)

In [189]:
# Per-lifecycle-segment metrics for the "predictions LEFT JOIN orders" data set.
# The frame is left as the cell's last expression so the notebook renders it as a
# table; print() wraps the wide frame across several text blocks.
results = make_results(df, order_col='lifecycle_segment')
results

    threshold           lifecycle_segment  accuracy  recall  specificity  \
0         0.3  infrequent_mature_customer      0.49    0.84         0.48   
1         0.5  infrequent_mature_customer      0.76    0.61         0.76   
2         0.7  infrequent_mature_customer      0.90    0.33         0.92   
3         0.3    frequent_mature_customer      0.31    0.98         0.23   
4         0.5    frequent_mature_customer      0.48    0.95         0.42   
5         0.7    frequent_mature_customer      0.63    0.87         0.60   
6         0.3     dormant_mature_customer      0.96    0.03         0.97   
7         0.3        stale_early_customer      0.89    0.50         0.90   
8         0.5        stale_early_customer      0.96    0.24         0.97   
9         0.7        stale_early_customer      0.99    0.01         1.00   
10        0.3       recent_early_customer      0.49    0.86         0.47   
11        0.5       recent_early_customer      0.85    0.44         0.87   
12        0.

Next, try the join in the opposite direction: start from the customer table (`cust_orders`) and left-join the predictions onto it:

In [185]:
# Variant of the evaluation query with the join direction reversed: merged_table starts
# from cust_orders and LEFT JOINs the predictions onto it.
#
# NOTE(review): global_entity_id, analytical_customer_id and model_type are taken from
# the prediction side (p), so they are NULL for customers without a prediction. The final
# inner JOIN on m.model_type = r.model_type AND m.global_entity_id = r.global_entity_id
# never matches NULL keys, so those customers are dropped again and only customers that
# have both a segment row and a prediction reach the result.
#
# Fix: the score used to be multiplied by 100 twice (once in the joined subquery and once
# in the outer SELECT), which put reorder_score on a 0-10000 scale and pushed almost every
# row into the top bin. It is now scaled once. reorder_score_scaled is unaffected because
# min-max scaling is scale-invariant.
q1 = '''
WITH
  cust_orders AS (
  SELECT
    analytical_customer_id,
    lifecycle_segment,
    global_entity_id,
    reordered
  FROM
    `fulfillment-dwh-production.cl_mkt._reorder_lifecycle_segmentation_history`
  LEFT JOIN (
    SELECT
      DISTINCT global_entity_id,
      analytical_customer_id,
      1 AS reordered,
    FROM
      `fulfillment-dwh-production.curated_data_shared_coredata_business.orders`
    WHERE
      partition_date_local = "2023-09-01"
      AND global_entity_id="FP_TH"
      AND is_successful
      AND analytical_customer_id IS NOT NULL )
  USING
    (analytical_customer_id,
      global_entity_id)
  WHERE computation_date = "2023-09-01" and global_entity_id="FP_TH" ),
  pred_scores AS (
  SELECT
    "general_reorder" AS model_type,
    global_entity_id,
    analytical_customer_id,
    1 - concated_survival_scores[ORDINAL(30)] AS reorder_score,
    scoring_date
  FROM
    `mkt-reorder-prod.mkt_reorder_prod.predictions_rsf_mature_targeted_ALL`
  WHERE
    scoring_date = "2023-09-01"
    AND global_entity_id="FP_TH" ),
  merged_table AS (
  SELECT
    p.global_entity_id,
    p.analytical_customer_id,
    COALESCE(c.reordered, 0) AS reordered,
    -- rescale to 0-100 exactly once; the bins below assume that range
    p.reorder_score*100 AS reorder_score,
    p.model_type,
    c.lifecycle_segment
  FROM
    cust_orders AS c
  LEFT JOIN
    pred_scores AS p
  ON
    p.global_entity_id = c.global_entity_id
    AND p.analytical_customer_id = c.analytical_customer_id ),
  binned_table AS (
  SELECT
    *,
    CASE
      WHEN reorder_score < 10 THEN "[0,10)"
      WHEN reorder_score >= 10
    AND reorder_score < 20 THEN "[10,20)"
      WHEN reorder_score >= 20 AND reorder_score < 30 THEN "[20,30)"
      WHEN reorder_score >= 30
    AND reorder_score < 40 THEN "[30,40)"
      WHEN reorder_score >= 40 AND reorder_score < 50 THEN "[40,50)"
      WHEN reorder_score >= 50
    AND reorder_score < 60 THEN "[50,60)"
      WHEN reorder_score >= 60 AND reorder_score < 70 THEN "[60,70)"
      WHEN reorder_score >= 70
    AND reorder_score < 80 THEN "[70,80)"
      WHEN reorder_score >= 80 AND reorder_score < 90 THEN "[80,90)"
      WHEN reorder_score >= 90 THEN "[90,100)"
    ELSE
    "invalid_value"
  END
    AS reorder_bin,
  FROM
    merged_table ),
  min_max_reorder AS (
  SELECT
    global_entity_id,
    model_type,
    -- lifecycle_segment,
    MAX(reorder_score) AS max_reorder_score,
    MIN(reorder_score) AS min_reorder_score
  FROM
    binned_table
  GROUP BY
    model_type,
    global_entity_id--, lifecycle_segment
    )
SELECT
  r.global_entity_id,
  r.analytical_customer_id,
  r.reordered,
  r.reorder_score,
  r.model_type,
  r.reorder_bin,
  (r.reorder_score - m.min_reorder_score)/(m.max_reorder_score - m.min_reorder_score) AS reorder_score_scaled,
  r.lifecycle_segment
FROM
  binned_table AS r
JOIN
  min_max_reorder AS m
ON
  m.model_type = r.model_type
  AND m.global_entity_id = r.global_entity_id
  --AND m.lifecycle_segment = r.lifecycle_segment
ORDER BY
  r.global_entity_id DESC,
  r.analytical_customer_id DESC
'''

In [186]:
# Run the reversed-join query `q1` on BigQuery and load the result into a DataFrame.
df1 = pandas_gbq.read_gbq(q1)

Downloading: 100%|[32m██████████[0m|


In [187]:
# Label rows without a lifecycle segment explicitly so they form their own group.
# Assign the result back instead of calling fillna(inplace=True) on the selected column:
# the chained in-place form works on an intermediate Series, is deprecated, and leaves
# `df1` unchanged when pandas Copy-on-Write is enabled.
df1['lifecycle_segment'] = df1['lifecycle_segment'].fillna("no_segment")

In [190]:
# Per-lifecycle-segment metrics for the "orders LEFT JOIN predictions" data set.
# The frame is left as the cell's last expression so the notebook renders it as a
# table; print() wraps the wide frame across several text blocks.
results1 = make_results(df1, order_col='lifecycle_segment')
results1

    threshold           lifecycle_segment  accuracy  recall  specificity  \
0         0.3  infrequent_mature_customer      0.49    0.84         0.48   
1         0.5  infrequent_mature_customer      0.76    0.61         0.76   
2         0.7  infrequent_mature_customer      0.90    0.33         0.92   
3         0.3    frequent_mature_customer      0.31    0.98         0.23   
4         0.5    frequent_mature_customer      0.48    0.95         0.42   
5         0.7    frequent_mature_customer      0.63    0.87         0.60   
6         0.3     dormant_mature_customer      0.96    0.03         0.97   
7         0.3        stale_early_customer      0.89    0.50         0.90   
8         0.5        stale_early_customer      0.96    0.24         0.97   
9         0.7        stale_early_customer      0.99    0.01         1.00   
10        0.3       recent_early_customer      0.49    0.86         0.47   
11        0.5       recent_early_customer      0.85    0.44         0.87   
12        0.

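The results from the two DataFrames (`df` and `df1`) are the same. This is expected: in `q1` the join keys and `model_type` are taken from the prediction side, so customers without a prediction get NULL keys and are dropped by the final inner `JOIN` with `min_max_reorder`. The left join therefore behaves like an inner join.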