# Read Data, Settings and Parameters

In [20]:
import pandas as pd
from bson import ObjectId
from pymongo import MongoClient
from datetime import datetime, timedelta, timezone
import numpy as np

In [21]:
# Query window and module filter for the MongoDB `orders` collection.
# Both bounds are naive datetimes expressed in UTC (PyMongo treats naive
# datetimes as UTC). 2024-09-21 20:30 UTC corresponds to 2024-09-22 00:00 at
# UTC+03:30, the same offset that is added to `created_at` when order dates
# are derived.
beginning_date = datetime(2024, 9, 21, 20, 30)
# Take "now" in UTC instead of local wall-clock time so the upper bound does
# not shift with the timezone of the machine running the notebook. The value
# stays naive, like `beginning_date`.
finishing_date = datetime.now(timezone.utc).replace(tzinfo=None)
module_name = "OnlineShopping"

In [22]:
# Set Parameters Before Run
# Orders dated before this day are filtered out of df_orders.
start_train_date_filter = pd.to_datetime('2024-09-22')
# Number of consecutive Shamsi months that make up one rolling window (period).
window_table_in_months = 3

In [23]:
# pipeline = [
#     {
#         "$match": {
#             "$and": [
#                 {"created_at": {"$gte": beginning_date, '$lt': finishing_date}},
#                 {"status": "finished"},
#                 {
#                     "$or": [
#                         {"deleted_at": {"$exists": False}},
#                         {"deleted_at": {"$exists": True, "$eq": None}},
#                     ]
#                 },
#                 {"module": module_name},
#             ]
#         }
#     },
#     {
#         "$project": {
#             "_id": 1,
#             "user_id": 1,
#             "module": 1,
#             "type": 1,
#             "product_name": {
#                 "$cond": {
#                     "if": {"$eq": ["$module", "Giftcard"]},
#                     "then": {
#                         "$concat": [
#                             {"$ifNull": ["$product.name", ""]},
#                             " ",
#                             {"$toString": {"$ifNull": ["$product.price", ""]}},
#                             " ",
#                             {"$ifNull": ["$product.country_name", ""]},
#                         ]
#                     },
#                     "else": "$product_name",
#                 }
#             },
#             "total_payment_price": "$initial_total",
#             "product_price": 1,
#             "qty": 1,
#             "date": {
#                 "$dateToString": {
#                     "format": "%Y-%m-%d",
#                     "date": {"$add": ["$created_at", 3.5 * 60 * 60 * 1000]},
#                 }
#             },
#             "client_type": 1,
#             "module_unit_price": 1,
#         }
#     },
# ]
#
# result = db.orders.aggregate(pipeline)
#
# df_orders = pd.DataFrame(result)

In [24]:
# pipeline = [
#     {
#         "$match": {
#             "$and": [
#                 {"status": "finished"},
#                 {
#                     "$or": [
#                         {"deleted_at": {"$exists": False}},
#                         {"deleted_at": {"$exists": True, "$eq": None}},
#                     ]
#                 },
#                 {"module": module_name}
#             ]
#         },
#
#     },
#     {
#         "$project": {
#             "_id": 0,
#             "user_id": 1,
#             "created_at": 1
#         }
#     },
# ]
#
# result = db.orders.aggregate(pipeline)
# df_min_order_date = pd.DataFrame(result)
#
# df_min_order_date = \
#     (df_min_order_date
#      .assign(date=lambda _df: _df["created_at"] + pd.Timedelta(hours=3, minutes=30))
#      .groupby(["user_id"]).agg(
#         firstdate=("date", "min")).reset_index()
#      .assign(firstdate=lambda _df: _df['firstdate'].dt.normalize())
#      .assign(user_id=lambda _df: _df['user_id'].astype('str'))
#      )

In [25]:
# Load the pre-extracted inputs from files in the working directory.
# The third parquet file is the date dimension used to attach Shamsi dates.
_PARQUET_INPUTS = ('df_orders.parquet', 'df_min_order_date.parquet', "dimdate5.parquet")
df_orders, df_min_order_date, df_dim_date = map(pd.read_parquet, _PARQUET_INPUTS)

# RFM labels (one sheet shared by any module).
rfm_labels = pd.read_excel("OnlineShoppingLabelsUpdate.xlsx", engine='openpyxl')

In [26]:
# Define cluster categories
# For every value tier found in the `LowOrHigh` column, collect the distinct,
# non-null `clusters_labelFa` values in order of first appearance.
_cluster_labels_by_tier = {
    tier: (
        rfm_labels.loc[rfm_labels['LowOrHigh'] == tier, 'clusters_labelFa']
        .dropna()
        .unique()
        .tolist()
    )
    for tier in ('low', 'medium', 'high')
}

low_valued_clusters_labelFa = _cluster_labels_by_tier['low']
medium_valued_clusters_labelFa = _cluster_labels_by_tier['medium']
high_valued_clusters_labelFa = _cluster_labels_by_tier['high']

In [27]:

# First order date per user.
# `created_at` is shifted by +03:30, the earliest shifted timestamp per user is
# kept as `firstdate` and truncated to midnight, and `user_id` is cast to str.
# NOTE: this overwrites `df_min_order_date`; the result no longer has a
# `created_at` column, so the cell cannot be re-run without reloading the data.
df_min_order_date = (
    df_min_order_date
    .assign(date=lambda orders: orders["created_at"] + pd.Timedelta(hours=3, minutes=30))
    .groupby(["user_id"], as_index=False)
    .agg(firstdate=("date", "min"))
    .assign(
        firstdate=lambda firsts: firsts['firstdate'].dt.normalize(),
        user_id=lambda firsts: firsts['user_id'].astype('str'),
    )
)

# Gregorian -> Shamsi lookup built from the date dimension.
#   date       : `miladi_d` parsed as datetime (unparseable values become NaT),
#                truncated to midnight
#   ShamsiDate : `jalali_1_s` with the '/' separators removed, parsed as a
#                number (unparseable values become NaN)
df_dim_date_join = (
    df_dim_date[['miladi_d', 'jalali_1_s']]
    .rename(columns={'miladi_d': 'date', 'jalali_1_s': 'ShamsiDate'})
    .assign(
        date=lambda dim: pd.to_datetime(dim['date'], errors='coerce').dt.normalize(),
        ShamsiDate=lambda dim: pd.to_numeric(
            dim['ShamsiDate'].astype(str).str.replace('/', ''),
            errors='coerce'
        ),
    )
)

In [28]:
# Attach the Shamsi date of each user's first order as `shamsifirstdate`.
# The lookup columns are renamed up front so the join key has the same name on
# both sides and no helper column has to be dropped afterwards.
df_min_order_date_clean = (
    df_min_order_date
    .assign(firstdate=lambda firsts: pd.to_datetime(firsts['firstdate']).dt.normalize())
    .merge(
        df_dim_date_join.rename(columns={'date': 'firstdate', 'ShamsiDate': 'shamsifirstdate'}),
        on="firstdate",
        how='left',
    )
)
# add ShamsiMonth column to df_orders
# Shamsi months encoded as YYYYMM integers (140101 .. 140912) in calendar order;
# positions in this list are used to step from one month to the next.
ShamsiMonthRef = [int(f"{year}{month:02}") for year in range(1401, 1410) for month in range(1, 13)]


def _require_shamsi_date(orders):
    """Return `orders` unchanged, or raise if any row has no usable ShamsiDate.

    Without this check a missing ShamsiDate becomes the string 'nan' in the
    ShamsiMonth step below and fails with an unhelpful
    "invalid literal for int()" ValueError.

    Raises:
        ValueError: if `orders['ShamsiDate']` contains missing values.
    """
    unmatched = orders["ShamsiDate"].isna()
    if unmatched.any():
        unmatched_dates = orders.loc[unmatched, "date"]
        raise ValueError(
            f"{int(unmatched.sum())} order row(s) have no ShamsiDate after joining "
            f"df_dim_date_join (order dates {unmatched_dates.min()} .. {unmatched_dates.max()}); "
            "check that the date dimension covers these dates."
        )
    return orders


df_orders = (
    df_orders
    .assign(date=lambda df: pd.to_datetime(df['date'], errors='coerce'))
    .rename(columns={"_id": "factor_id"})
    # ObjectId values are stringified; anything else is kept as is.
    .assign(factor_id=lambda df: df['factor_id'].apply(lambda x: str(x) if isinstance(x, ObjectId) else x))
    .drop(columns=["module"])
    # Rows with an unparseable date (NaT) are dropped here too: NaT >= x is False.
    .loc[lambda df: df['date'] >= start_train_date_filter]
    .merge(df_dim_date_join, on="date", how="left")
    .pipe(_require_shamsi_date)
    # ShamsiMonth is the first six digits (YYYYMM) of ShamsiDate.
    .assign(ShamsiMonth=lambda df: df['ShamsiDate'].astype(str).str[:6].astype(int))
)

In [29]:
# Determine the first and last ShamsiMonth kept for the analysis, then filter.
# NOTE: `df_orders` is overwritten below, so re-running this cell recomputes
# the bounds from the already-filtered data and merges the first-order columns
# a second time.

# Lower bound: the earliest month present, unless no order falls on its first
# day (YYYYMM01), in which case the following month is used instead.
min_shamsi_month = df_orders['ShamsiMonth'].min()
if not df_orders['ShamsiDate'].eq(int(f"{min_shamsi_month}01")).any():
    idx = ShamsiMonthRef.index(min_shamsi_month)
    min_shamsi_month = ShamsiMonthRef[idx + 1]

# Upper bound: the month before the latest month present (the latest month is
# always left out).
idx = ShamsiMonthRef.index(df_orders['ShamsiMonth'].max())
max_shamsi_month = ShamsiMonthRef[idx - 1]

# Keep orders inside [min_shamsi_month, max_shamsi_month] and attach each
# user's first order date.
df_orders = (
    df_orders
    .loc[lambda orders: orders["ShamsiMonth"].between(min_shamsi_month, max_shamsi_month)]
    .merge(df_min_order_date_clean, on="user_id", how="left")
)

In [30]:
# Latest ShamsiDate that has at least one order, per ShamsiMonth.
df_orders_maxshamsidate = (
    df_orders
    .groupby("ShamsiMonth", as_index=False)
    .agg(max_shamsi_date=("ShamsiDate", "max"))
)

In [31]:
# create df_orders_with_period
def _stack_rolling_windows(orders, month_ref, first_month, window_len):
    """Stack every complete rolling window of months into a single frame.

    A window is `window_len` consecutive entries of `month_ref`. Windows start
    at `first_month` and advance one month at a time. A window is kept only
    when `orders` has rows in each of its months. Rows of a kept window get
    `period` = number of months between the window start and `first_month`,
    so an order appears once for every window that contains its month.

    Args:
        orders: frame with a `ShamsiMonth` column.
        month_ref: ordered list of month codes.
        first_month: month code of the first window start; must be in `month_ref`.
        window_len: number of months per window.

    Returns:
        The kept windows concatenated with a fresh RangeIndex.

    Raises:
        ValueError: if `first_month` is not in `month_ref`, or no window is complete.
    """
    first_pos = month_ref.index(first_month)
    frames = []
    for start_pos in range(first_pos, len(month_ref) - window_len + 1):
        window_months = month_ref[start_pos:start_pos + window_len]
        # Filter once per window and reuse the result for both the
        # completeness check and the output.
        window_orders = orders.loc[orders["ShamsiMonth"].isin(window_months)]
        if window_orders.empty or window_orders["ShamsiMonth"].nunique() != window_len:
            continue
        frames.append(window_orders.assign(period=start_pos - first_pos))

    if not frames:
        # pd.concat([]) would raise "No objects to concatenate"; say why instead.
        raise ValueError(
            f"No complete window of {window_len} month(s) found starting from {first_month}."
        )
    return pd.concat(frames, ignore_index=True)


df_orders_with_period = _stack_rolling_windows(
    df_orders, ShamsiMonthRef, min_shamsi_month, window_table_in_months
)

# Average gap, in whole days, between a user's consecutive orders within a period.
# `days_diff` is NaN for the first order of each (user, period) group, so a
# user with a single order in a period gets a NaN average.
_USER_PERIOD_KEYS = ["user_id", "period"]

df_orders_with_period = (
    df_orders_with_period
    .sort_values(by=_USER_PERIOD_KEYS + ["date"])
    .assign(
        days_diff=lambda orders: orders.groupby(_USER_PERIOD_KEYS)["date"].diff().dt.days,
        # Evaluated after `days_diff`, so it can read the column created above.
        avg_days_between_factors=lambda orders: (
            orders.groupby(_USER_PERIOD_KEYS)["days_diff"].transform("mean")
        ),
    )
)

In [32]:
# De-duplicated copy of rfm_labels that keeps the main labels under the
# column name `clusters_labelFa_main`.
rfm_labels_main = (
    rfm_labels
    .rename(columns={'clusters_labelFa': 'clusters_labelFa_main'})
    .drop_duplicates()
)

In [33]:
# First and last ShamsiDate observed in each period. The same bounds are used
# when flagging users whose first order falls inside a period (NewUserLabelAcc).
period_ranges = (
    df_orders_with_period
    .groupby("period", as_index=False)
    .agg(
        min_date=("ShamsiDate", "min"),
        max_date=("ShamsiDate", "max"),
    )
)

In [34]:
# make new column NewUserLabelAcc
# NewUserLabelAcc is 1 when the user's first order date (shamsifirstdate) lies
# between the first and last ShamsiDate of the row's period, else 0. A missing
# shamsifirstdate compares False and therefore yields 0.
df_orders_with_period = (
    df_orders_with_period
    # Discard the outputs of a previous run of this cell. Without this, a
    # re-run makes the merge emit min_date_x / min_date_y and the lookups
    # below raise KeyError. On a first run this is a no-op.
    .drop(columns=["min_date", "max_date", "NewUserLabelAcc"], errors="ignore")
    .merge(
        df_orders_with_period
        .groupby("period")["ShamsiDate"]
        .agg(["min", "max"])
        .reset_index()
        .rename(columns={"min": "min_date", "max": "max_date"}),
        on="period", how="left"
    )
    .assign(
        NewUserLabelAcc=lambda df: (
                (df["shamsifirstdate"] >= df["min_date"]) &
                (df["shamsifirstdate"] <= df["max_date"])
        ).astype(int)
    )
)


# RFM Function & Score Table

In [35]:
# Function to calculate RFM for each monthly period
def calc_rfm_monthly(period, df_orders):
    """Compute per-user RFM values and scores for one period.

    Args:
        period: value of the `period` column selecting the window to score.
        df_orders: order rows with at least the columns `period`, `user_id`,
            `date`, `factor_id`, `total_payment_price` and `ShamsiMonth`.

    Returns:
        An empty DataFrame when the period has no rows. Otherwise one row per
        user with:
          max_date  - date of the user's last order in the window
          F         - number of distinct `factor_id` values
          M         - sum of `total_payment_price`
          R         - days from the user's last order to the window's last
                      order date, plus one
          F_Score   - 1..5 from the fixed bins (0,1], (1,3], (3,10], (10,30], (30,inf)
          R_Score   - quantile-based, reversed so a smaller R scores higher
          M_Score   - quantile-based, a larger M scores higher
          period, start_month, end_month - the period and the smallest and
                      largest `ShamsiMonth` in the window
    """
    window_orders = df_orders.loc[df_orders["period"] == period]
    if window_orders.empty:
        return pd.DataFrame()

    per_user = window_orders.groupby("user_id", as_index=False).agg(
        max_date=("date", "max"),
        F=("factor_id", "nunique"),
        M=("total_payment_price", "sum"),
    )
    window_end = window_orders["date"].max()

    # NOTE(review): with duplicates='drop' qcut may produce fewer than five
    # bins, in which case R_Score does not reach 1 and M_Score does not reach 5.
    return per_user.assign(
        R=lambda users: (window_end - users["max_date"]).dt.days + 1,
        F_Score=lambda users: pd.cut(
            users["F"],
            bins=[0, 1, 3, 10, 30, float("inf")],
            labels=[1, 2, 3, 4, 5],
            right=True,
        ).astype(int),
        # qcut returns zero-based bin positions; 5 - position == 6 - (position + 1).
        R_Score=lambda users: 5 - pd.qcut(users["R"], q=5, labels=False, duplicates="drop"),
        M_Score=lambda users: pd.qcut(users["M"], q=5, labels=False, duplicates="drop") + 1,
        period=period,
        start_month=window_orders["ShamsiMonth"].min(),
        end_month=window_orders["ShamsiMonth"].max(),
    )

# Rules 1 & 2: Define ("New" + clusters_labelFa) and ("Passed" + clusters_labelFa)

In [36]:
# One row per distinct (user_id, period, shamsifirstdate, NewUserLabelAcc)
# combination found in df_orders_with_period.
_NUL_ACC_COLUMNS = ['user_id', 'period', 'shamsifirstdate', 'NewUserLabelAcc']

df_orders_with_period_NULAcc = (
    df_orders_with_period
    .loc[:, _NUL_ACC_COLUMNS]
    .drop_duplicates()
)

In [37]:
# Replacement label for each value tier found in the `LowOrHigh` column.
low_high_to_new_label = {
    "low": "مشتریان بازگشته کم ارزش",
    "medium": "مشتریان بازگشته فعال",
    "high": "مشتریان بازگشته ارزشمند",
}

# Distinct (cluster label, tier) pairs that have both values present.
_label_tiers = (
    rfm_labels[["clusters_labelFa", "LowOrHigh"]]
    .dropna()
    .drop_duplicates()
)

# cluster label -> replacement label of its tier. A tier missing from
# low_high_to_new_label maps to NaN; if a label occurs with several tiers,
# the last pair wins.
label_map_old = dict(
    zip(
        _label_tiers["clusters_labelFa"],
        _label_tiers["LowOrHigh"].map(low_high_to_new_label),
    )
)


In [38]:
# Map for new customers (Rule 3)
# These are users where NewUserLabelAcc == 1 in their first period.
# Keys are existing cluster labels (presumably `clusters_labelFa` values - TODO
# confirm); values are the labels to use for new customers.
# Labels that already contain "جدید" map to themselves. The others get a
# temporary "تازه" wording, which is meant to be changed to "جدید" later.
# NOTE(review): the original comment here started with a cut-off line,
# "Map for Passed customers (Rul"; it is not clear what it was meant to say.
label_map_new = {
    "مشتریان جدید": "مشتریان جدید",
    "مشتریان جدید با پتانسیل رشد": "مشتریان جدید با پتانسیل رشد",
    "مشتریان جدید در معرض خطر" : "مشتریان جدید در معرض خطر",
    "مشتریان جدید یا جدید ارزشمند و در معرض خطر" : "مشتریان جدید یا جدید ارزشمند و در معرض خطر",
    "مشتریان جدید و ارزشمند در آستانه ریزش" : "مشتریان جدید و ارزشمند در آستانه ریزش",
    "مشتریان ارزشمند یا پتانسیل ارزشمند و در معرض خطر" : "مشتریان تازه و ارزشمند یا پتانسیل ارزشمند و در معرض خطر",
    "مشتریان ارزشمند" : "مشتریان تازه و ارزشمند",
    "مشتریان کم ارزش" : "مشتریان تازه و کم ارزش",
    "مشتریان ارزشمند در آستانه ریزش" : "مشتریان تازه و ارزشمند در آستانه ریزش",
    "مشتریان پتانسیل ارزشمند" : "مشتریان تازه و پتانسیل ارزشمند"
}