In [1]:
import pandas as pd
import plotly.express as px
import unimib_snowit_project.utils as u

# Load Data

In [2]:
# Locate the folder with the model outputs, relative to the project root.
root_dir_path = u.get_root_dir()
df_in_dir = "models"
data_dir_path = root_dir_path.joinpath(df_in_dir)

# CSV files consumed by this notebook.
files = [
    'rfm_within_2024-2025.csv',
    'predicted_2024-2025_within_churn_df.csv',
    'predicted_sentiment.csv'
]

# Map each file (keyed by its name up to the first dot) to its full path.
paths = {
    csv_name.split('.')[0]: data_dir_path.joinpath(csv_name)
    for csv_name in files
}

# Read every CSV and report its dimensions as it is loaded.
dfs = {}
for name, path in paths.items():
    dfs[name] = pd.read_csv(path)
    n_rows, n_cols = dfs[name].shape
    print(f"{name} loaded: {n_rows} rows, {n_cols} columns")

# Convenient handles on the three frames used in the rest of the notebook.
rfm = dfs['rfm_within_2024-2025']
churn_df = dfs['predicted_2024-2025_within_churn_df']
sentiment_df = dfs['predicted_sentiment']

rfm_within_2024-2025 loaded: 72070 rows, 9 columns
predicted_2024-2025_within_churn_df loaded: 62229 rows, 4 columns
predicted_sentiment loaded: 93429 rows, 2 columns


# Merge Data: RFM and Churn

Here we merge the RFM data with the predicted churners for the 2024/2025 season.

In [3]:
# Inner join: keep only the users present in both the churn predictions and the RFM table.
merged_df = churn_df.merge(rfm, how='inner', on='user.uid')

In [4]:
# Row and column count of the churn/RFM join result.
merged_df.shape

(61346, 12)

In [5]:
# Column names available after the churn/RFM join.
merged_df.columns

Index(['user.uid', 'will_be_churn?', 'churn_prob', 'churn_ground_truth',
       'Recency', 'Frequency', 'Monetary', 'R_score', 'F_score', 'M_score',
       'RF_class', 'RFM_Category'],
      dtype='object')

Plot

In [6]:
# TP, FP, TN, FN
def classify_case(row):
    """Label one row with its confusion-matrix outcome.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row)
        Must expose ``"will_be_churn?"`` (predicted label) and
        ``"churn_ground_truth"`` (actual label).

    Returns
    -------
    str
        ``"True Positive"``, ``"False Positive"``, ``"True Negative"`` or
        ``"False Negative"``; ``"Other"`` when either label is neither 0 nor 1.
    """
    if row["will_be_churn?"] == 1:
        # Predicted churner: outcome depends on whether the user really churned.
        if row["churn_ground_truth"] == 1:
            return "True Positive"
        if row["churn_ground_truth"] == 0:
            return "False Positive"

    if row["will_be_churn?"] == 0:
        # Predicted non-churner.
        if row["churn_ground_truth"] == 0:
            return "True Negative"
        if row["churn_ground_truth"] == 1:
            return "False Negative"

    return "Other"

In [7]:
# Tag every row with its confusion-matrix outcome (TP / FP / TN / FN / Other).
merged_df["churn_case"] = merged_df.apply(classify_case, axis=1)

# Display order of the RFM categories (x-axis of the chart, rows of the table).
order = ["Cheap", "Tin", "Copper", "Bronze", "Silver", "Gold", "Diamond"]

# Plot settings, kept apart from the call so they are easy to tweak.
histogram_category_orders = {
    "RFM_Category": order,
    "churn_case": ["True Positive", "False Positive", "True Negative", "False Negative"],
}
histogram_labels = {"RFM_Category": "Category", "count": "Count", "churn_case": "Outcome"}

# Grouped histogram: one bar per outcome within each RFM category.
fig = px.histogram(
    merged_df,
    x="RFM_Category",
    color="churn_case",
    barmode="group",
    category_orders=histogram_category_orders,
    labels=histogram_labels,
    title="Churn Prediction Within Season 2024-2025 by RFM Category",
)

fig.update_layout(bargap=0.05)
fig.show()

# Same information as a table: RFM categories as rows, outcomes as columns.
counts_long = merged_df.groupby(["RFM_Category", "churn_case"]).size()
counts_wide = counts_long.unstack(fill_value=0)
churn_case_counts = counts_wide.reindex(order).reset_index()

churn_case_counts

churn_case,RFM_Category,False Negative,False Positive,True Negative,True Positive
0,Cheap,1,0,0,10289
1,Tin,9,499,0,5694
2,Copper,30,1360,4,11941
3,Bronze,51,323,7,7006
4,Silver,111,1461,28,3984
5,Gold,461,3880,1044,5078
6,Diamond,953,0,0,7132


# Merge with Sentiment Data

In [8]:
# Left join: keep every row of merged_df and attach the predicted sentiment where one exists.
merged_sentiment_df = merged_df.merge(
    sentiment_df,
    how='left',
    left_on='user.uid',
    right_on='user_ids',
)

In [9]:
# Column names available after attaching the sentiment predictions.
merged_sentiment_df.columns

Index(['user.uid', 'will_be_churn?', 'churn_prob', 'churn_ground_truth',
       'Recency', 'Frequency', 'Monetary', 'R_score', 'F_score', 'M_score',
       'RF_class', 'RFM_Category', 'churn_case', 'user_ids', 'y_test_pred'],
      dtype='object')

In [10]:
# Row and column count after the sentiment join.
merged_sentiment_df.shape

(61346, 15)

In [11]:
# order
# Display order of the RFM categories on the x-axis.
order = ["Cheap", "Tin", "Copper", "Bronze", "Silver", "Gold", "Diamond"]

# Translate the numeric predicted label into a readable sentiment name.
# Values not present in the map (including missing ones) become NaN.
sentiment_map = {0: "Positive", 1: "Neutral", 2: "Negative"}
merged_sentiment_df["sentiment"] = merged_sentiment_df["y_test_pred"].map(sentiment_map)

# Grouped histogram: one bar per sentiment label within each RFM category.
fig = px.histogram(
    merged_sentiment_df,
    x="RFM_Category",
    color="sentiment",
    category_orders={
        "RFM_Category": order,
        # The key must be the name of the colour column ("sentiment");
        # a key that matches no column is ignored and the order is not applied.
        "sentiment": ["Positive", "Neutral", "Negative"]
    },
    barmode="group",
    title="Sentiment label by RFM Category",
    labels={"RFM_Category": "Category", "count": "Count", "sentiment": "Outcome"}
)

fig.update_layout(bargap=0.05)
fig.show()

# Expected Results & Cost-Benefit Balance

Load Order_Details Data

In [12]:
# Locate the folder with the pickled source tables, relative to the project root.
root_dir_path = u.get_root_dir()
df_in_dir = "data_loaded"
data_pkl_dir_path = root_dir_path.joinpath(df_in_dir)

# Pickle files consumed by this section.
pkl_files = [
    'order_details.pkl',
]

# Map each file (keyed by its name up to the first dot) to its full path.
pkl_paths = {
    pkl_name.split('.')[0]: data_pkl_dir_path.joinpath(pkl_name)
    for pkl_name in pkl_files
}

# NOTE: unpickling can execute arbitrary code; only load files produced by this project.
dfs = {}
for name, path in pkl_paths.items():
    dfs[name] = pd.read_pickle(path)
    n_rows, n_cols = dfs[name].shape
    print(f"{name} loaded: {n_rows} rows, {n_cols} columns")

order_details_df = dfs['order_details']

order_details loaded: 993037 rows, 15 columns


Analysis

In [13]:
# customer data (from notebook05 and notebook07)
n_active_clients = 62229
n_predicted_churners = 59526

# chun probability higher than a certain threshold & top RFM class
churn_threshold = 0.80
n_user_to_offer = merged_sentiment_df[(merged_sentiment_df['churn_prob']>churn_threshold) & ( (merged_sentiment_df['sentiment']=='Positive') | (merged_sentiment_df['sentiment']=='Neutral') ) & ((merged_sentiment_df['RFM_Category']=='Diamond') | (merged_sentiment_df['RFM_Category']=='Gold'))].shape[0] 

# model metrics
precision = 0.873618    
recall = 0.969789        
response_rate = 0.15  # business hypothesis
percentage_discount = 0.05

# average revenue per client
avg_revenue_per_client = merged_df[(merged_sentiment_df['RFM_Category']=='Diamond') | (merged_sentiment_df['RFM_Category']=='Gold')]['Monetary'].mean()  

# average discount per client
skipass_prices = order_details_df[(order_details_df["product.type"]=='skipass~return')|
                                  (order_details_df["product.type"]=='skipass')|
                                  (order_details_df["product.type"]=='skipass')|
                                  (order_details_df["product.type"]=='voucher~skipass')|
                                  (order_details_df["product.type"]=='skipass~dynamic')]["item.amount"]
avg_skipass_price = skipass_prices.mean()
avg_skipass_discount_value = percentage_discount*avg_skipass_price        

# retained
retained_clients = int(n_user_to_offer * response_rate)

# saved revenues (without considering discount)
retained_revenue = retained_clients * avg_revenue_per_client

# costs due to discount 
cost_discounts = retained_clients * avg_skipass_discount_value

# false positive costs
false_positives = int(n_user_to_offer * (1 - precision))
cost_fp = false_positives * avg_skipass_discount_value

# operational campaign cost per client
cost_operational_per_client = 2
cost_operational = n_user_to_offer * cost_operational_per_client

# Net Impact
net_impact = retained_revenue - (cost_discounts + cost_fp + cost_operational)

# results
results = {
    "Active clients": n_active_clients,
    "Predicted churners": n_predicted_churners,
    "Target clients": n_user_to_offer,
    "Retained clients": retained_clients,
    "Retained revenue (€)": retained_revenue,
    "Cost discounts (€)": cost_discounts,
    "Cost false positives (€)": cost_fp,
    "Operative costs (€)": cost_operational,
    "Net Impact (€)": net_impact
}

df_results = pd.DataFrame([results])
df_results

Unnamed: 0,Active clients,Predicted churners,Target clients,Retained clients,Retained revenue (€),Cost discounts (€),Cost false positives (€),Operative costs (€),Net Impact (€)
0,62229,59526,1486,222,148967.769271,659.753418,555.738281,2972,144780.277572
