In [132]:
import pandas_gbq
import pandas as pd
import numpy as np

In [133]:
from sklearn.metrics import auc, confusion_matrix, precision_recall_curve, roc_auc_score

In [134]:
q='''
WITH cust_orders AS (
    SELECT DISTINCT
      global_entity_id,
      analytical_customer_id,
      1 AS reordered,
      orders_count,
      lifecycle_segment
    FROM `fulfillment-dwh-production.curated_data_shared_coredata_business.orders`
    LEFT JOIN (SELECT analytical_customer_id, COUNT(analytical_customer_id) as orders_count FROM `fulfillment-dwh-production.curated_data_shared_coredata_business.orders`
    WHERE partition_date_local between "2023-08-24" and "2023-08-31" and global_entity_id="FP_TH" GROUP BY analytical_customer_id)USING(analytical_customer_id)
    LEFT JOIN (SELECT analytical_customer_id, lifecycle_segment FROM `fulfillment-dwh-production.cl_mkt._reorder_lifecycle_segmentation_history` WHERE computation_date = "2023-09-01" and global_entity_id="FP_TH")USING(analytical_customer_id)
    WHERE partition_date_local = "2023-09-01" and global_entity_id="FP_TH"
      AND is_successful
      AND analytical_customer_id IS NOT NULL
      order by analytical_customer_id
  ),


  pred_scores AS (
    SELECT
      "general_reorder" AS model_type,
      global_entity_id,
      analytical_customer_id,
      1 - concated_survival_scores[ORDINAL(30)] AS reorder_score,
      scoring_date
    FROM `mkt-reorder-prod.mkt_reorder_prod.predictions_rsf_mature_targeted_ALL`
    WHERE scoring_date = "2023-09-01" and global_entity_id="FP_TH"
  ),


  merged_table AS (
  SELECT
    p.global_entity_id,
    p.analytical_customer_id,
    COALESCE(c.reordered, 0) AS reordered,
    p.reorder_score*100 AS reorder_score,
    p.model_type,
    c.orders_count,
    c.lifecycle_segment
  FROM pred_scores AS p
  LEFT JOIN (
    SELECT
      *
    FROM cust_orders
  ) AS c
    ON p.global_entity_id = c.global_entity_id
      AND p.analytical_customer_id = c.analytical_customer_id
  ),

  binned_table AS (
    SELECT
      *,
      CASE WHEN reorder_score < 10 THEN "[0,10)"
          WHEN reorder_score >= 10 AND reorder_score < 20 THEN "[10,20)"
          WHEN reorder_score >= 20 AND reorder_score < 30 THEN "[20,30)"
          WHEN reorder_score >= 30 AND reorder_score < 40 THEN "[30,40)"
          WHEN reorder_score >= 40 AND reorder_score < 50 THEN "[40,50)"
          WHEN reorder_score >= 50 AND reorder_score < 60 THEN "[50,60)"
          WHEN reorder_score >= 60 AND reorder_score < 70 THEN "[60,70)"
          WHEN reorder_score >= 70 AND reorder_score < 80 THEN "[70,80)"
          WHEN reorder_score >= 80 AND reorder_score < 90 THEN "[80,90)"
          WHEN reorder_score >= 90 THEN "[90,100)"
        ELSE "invalid_value" END AS reorder_bin,
    FROM merged_table
  ),

  min_max_reorder AS (
    SELECT
      global_entity_id,
      model_type,
      max(reorder_score) AS max_reorder_score,
      min(reorder_score) AS min_reorder_score
    FROM binned_table
    GROUP BY model_type, global_entity_id
  )

  SELECT
    r.global_entity_id,
    r.analytical_customer_id,
    r.reordered,
    r.reorder_score,
    r.model_type,
    r.reorder_bin,
    SAFE_DIVIDE((r.reorder_score - m.min_reorder_score),(m.max_reorder_score - m.min_reorder_score)) AS reorder_score_scaled,
    r.orders_count,
    r.lifecycle_segment
  FROM binned_table AS r
  JOIN min_max_reorder AS m
    ON m.model_type = r.model_type
    AND m.global_entity_id = r.global_entity_id
  ORDER BY r.global_entity_id DESC, r.analytical_customer_id DESC
'''

In [135]:
df = pandas_gbq.read_gbq(q)

Downloading: 100%|[32m██████████[0m|


In [136]:
df.tail()

Unnamed: 0,global_entity_id,analytical_customer_id,reordered,reorder_score,model_type,reorder_bin,reorder_score_scaled,orders_count,lifecycle_segment
2135464,FP_TH,--0--h2aUA6-Z2rRG0q7Mw,0,48.0,general_reorder,"[40,50)",0.430233,,
2135465,FP_TH,---W4mtDUEmZ2ZqJ4cN_1Q,1,36.0,general_reorder,"[30,40)",0.290698,,infrequent_mature_customer
2135466,FP_TH,---VrpoIXLW82Kim2fVx7w,0,13.0,general_reorder,"[10,20)",0.023256,,
2135467,FP_TH,---1oqCLVmicpzN-udXAAQ,0,18.0,general_reorder,"[10,20)",0.081395,,
2135468,FP_TH,----TV1zUBW00O6T2942TA,0,18.0,general_reorder,"[10,20)",0.081395,,


In [137]:
df['orders_count'].fillna(0, inplace=True)
df['lifecycle_segment'].fillna("no_segment", inplace=True)

In [138]:
def model_evaluation_metrices(y_true, y_pred_binary, ypred_score):
    """
    This function gives model evaluation metrices viz. cm,accuracy,recall,specificity,auc,f1_score

        Parameters:
            y_true(array/list/series): true target labels
            y_pred_binary(array/list/series): prediction target label in binary labels(0/1)
            ypred_score(array/list/series): prediction target scores

        Returns:
            accuracy(float): accuracy of the model
            recall(float): recall of the model
            specificity(float): specificity of the model
            f1_score(float): f1_score of the model
            precision(float): precision of the model
            roc_auc(float): roc_auc of the model
            precision_recall_auc(float): precision_recall_auc of the model
    """
    cm = confusion_matrix(y_true, y_pred_binary)
    tn, fp, fn, tp = cm.ravel()
    # auc = roc_auc_score(y_true,y_pred)
    recall = tp / (tp + fn)
    specificity = tn / (tn + fp)
    precision = tp / (tp + fp)
    accuracy = (tp + tn) / (tn + fp + fn + tp)
    f1_score = (2 * tp) / (2 * tp + fp + fn)
    # calculate precision-recall curve
    precision_for_auc, recall_for_auc, thresholds_vals = precision_recall_curve(
        y_true, y_pred_binary
    )
    # calculate precision-recall AUC
    precision_recall_auc = auc(recall_for_auc, precision_for_auc)
    roc_auc = roc_auc_score(y_true=y_true, y_score=ypred_score)

    return (
        round(accuracy, 2),
        round(recall, 2),
        round(specificity, 2),
        round(f1_score, 2),
        round(precision, 2),
        round(roc_auc, 2),
        round(precision_recall_auc, 2),
    )

In [139]:
def make_results(df, order_col):
    df_store_final = pd.DataFrame(
            columns=[
                "threshold",
                "orders_count",
                "accuracy",
                "recall",
                "specificity",
                "f1_score",
                "precision",
                "roc_auc",
                "precision_recall_auc",
            ]
        )

    for orders in df[order_col].unique():
        df_orders = df[df[order_col]==orders]
        for mythres in [0.3, 0.5, 0.7]:
            binary_pred = df_orders["reorder_score_scaled"].apply(
                lambda x: 1 if x > mythres else 0
            )
            if (df_orders["reordered"].nunique() < 2) | (
                binary_pred.nunique() < 2 #understand condition better
            ):
                continue

            (
                accuracy,
                recall,
                specificity,
                f1_score,
                precision,
                roc_auc,
                precision_recall_auc,
            ) = model_evaluation_metrices(
                y_true=df_orders["reordered"].to_list(),
                y_pred_binary=binary_pred,
                ypred_score=df_orders["reorder_score_scaled"],
        )

            df_store = pd.DataFrame(
                                index=[0],
                                columns=[
                                    "threshold",
                                    "orders_count",
                                    "accuracy",
                                    "recall",
                                    "specificity",
                                    "f1_score",
                                    "precision",
                                    "roc_auc",
                                    "precision_recall_auc",
                                ],
                            )
            df_store["threshold"] = mythres
            df_store["orders_count"] = orders
            df_store["accuracy"] = accuracy
            df_store["recall"] = recall
            df_store["specificity"] = specificity
            df_store["f1_score"] = f1_score
            df_store["precision"] = precision
            df_store["roc_auc"] = roc_auc
            df_store["precision_recall_auc"] = precision_recall_auc
            df_store_final = pd.concat(
                [df_store_final, df_store], axis=0, ignore_index=True
            )
    return df_store_final

In [140]:
make_results(df, order_col='orders_count')

Unnamed: 0,threshold,orders_count,accuracy,recall,specificity,f1_score,precision,roc_auc,precision_recall_auc
0,0.3,0,0.61,0.71,0.61,0.04,0.02,0.69,0.37
1,0.5,0,0.76,0.47,0.76,0.05,0.02,0.69,0.25
2,0.7,0,0.85,0.22,0.86,0.04,0.02,0.69,0.12


For people that ordered at least once in the past week the model performs perfectly, while for those that didn't order it does not.

In [141]:
df['order_bins'] = np.where(df['orders_count']<1, "less than 1", "more than 1")

In [142]:
make_results(df, order_col='order_bins')

Unnamed: 0,threshold,orders_count,accuracy,recall,specificity,f1_score,precision,roc_auc,precision_recall_auc
0,0.3,less than 1,0.61,0.71,0.61,0.04,0.02,0.69,0.37
1,0.5,less than 1,0.76,0.47,0.76,0.05,0.02,0.69,0.25
2,0.7,less than 1,0.85,0.22,0.86,0.04,0.02,0.69,0.12


In [143]:
make_results(df, order_col='lifecycle_segment')

Unnamed: 0,threshold,orders_count,accuracy,recall,specificity,f1_score,precision,roc_auc,precision_recall_auc
0,0.3,no_segment,0.61,0.62,0.61,0.0,0.0,0.66,0.31
1,0.5,no_segment,0.76,0.38,0.76,0.0,0.0,0.66,0.19
2,0.7,no_segment,0.86,0.07,0.86,0.0,0.0,0.66,0.04


In [144]:
df['lifecycle_segment'].unique()

array(['no_segment', 'frequent_mature_customer', 'stale_early_customer',
       'infrequent_mature_customer', 'dormant_early_customer',
       'recent_early_customer', 'dormant_mature_customer'], dtype=object)

In [145]:
df[(df['lifecycle_segment']=='infrequent_mature_customer') & (df['reordered']==0)]

Unnamed: 0,global_entity_id,analytical_customer_id,reordered,reorder_score,model_type,reorder_bin,reorder_score_scaled,orders_count,lifecycle_segment,order_bins


In [150]:
def make_results_classic(df_orders):
    df_store_final = pd.DataFrame(
            columns=[
                "threshold",
                "accuracy",
                "recall",
                "specificity",
                "f1_score",
                "precision",
                "roc_auc",
                "precision_recall_auc",
            ]
        )

    for mythres in [0.3, 0.5, 0.7]:
        binary_pred = df_orders["reorder_score_scaled"].apply(
            lambda x: 1 if x > mythres else 0
        )
        if (df_orders["reordered"].nunique() < 2) | (
            binary_pred.nunique() < 2 #understand condition better
        ):
            continue

        (
            accuracy,
            recall,
            specificity,
            f1_score,
            precision,
            roc_auc,
            precision_recall_auc,
        ) = model_evaluation_metrices(
            y_true=df_orders["reordered"].to_list(),
            y_pred_binary=binary_pred,
            ypred_score=df_orders["reorder_score_scaled"],
    )

        df_store = pd.DataFrame(
                            index=[0],
                            columns=[
                                "threshold",
                                "accuracy",
                                "recall",
                                "specificity",
                                "f1_score",
                                "precision",
                                "roc_auc",
                                "precision_recall_auc",
                            ],
                        )
        df_store["threshold"] = mythres
        df_store["accuracy"] = accuracy
        df_store["recall"] = recall
        df_store["specificity"] = specificity
        df_store["f1_score"] = f1_score
        df_store["precision"] = precision
        df_store["roc_auc"] = roc_auc
        df_store["precision_recall_auc"] = precision_recall_auc
        df_store_final = pd.concat(
            [df_store_final, df_store], axis=0, ignore_index=True
        )
    return df_store_final

In [151]:
make_results_classic(df)

Unnamed: 0,threshold,accuracy,recall,specificity,f1_score,precision,roc_auc,precision_recall_auc
0,0.3,0.63,0.93,0.61,0.19,0.11,0.89,0.52
1,0.5,0.77,0.85,0.76,0.26,0.15,0.89,0.51
2,0.7,0.85,0.74,0.86,0.33,0.21,0.89,0.48
