In [51]:
import pandas as pd
import numpy as np
from sklearn.metrics import recall_score, f1_score, precision_score, roc_auc_score

TARGET = "alive"
PRED = "pred_alive"


def calc_destribution(data: dict) -> dict:
    data_sum = sum(data.values())
    distribution = {}
    for key, value in data.items():
        distribution[key] = f"{round(value / data_sum, 3) * 100:.1f}%"
    return distribution

def data_balancing_by_col(data_any: pd.DataFrame, targ_col: str = TARGET) -> pd.DataFrame:
    val_cnt = data_any[targ_col].value_counts()
    minimal_counts = val_cnt.min()
    cols = val_cnt.index.to_list()
    balanced_data_dfs = []
    for col_val in cols: balanced_data_dfs.append(data_any[data_any[targ_col] == col_val].sample(minimal_counts))
    return pd.concat(balanced_data_dfs)

def calc_metrics(data_any: pd.DataFrame, target: str = TARGET, predicted: str = PRED) -> tuple[float]:
    recall = recall_score(data_any[target], data_any[predicted])
    f1 = f1_score(data_any[target], data_any[predicted])
    precision = precision_score(data_any[target], data_any[predicted])
    roc_auc = roc_auc_score(data_any[target], data_any[predicted])
    print(f"recall: {recall:.3f}, f1: {f1:.3f}, precision: {precision:.3f}, roc_auc: {roc_auc:.3f}")
    return (recall, f1, precision, roc_auc)

def calc_predictions_by_days(data_rfm: pd.DataFrame, days_for_die: int = 10) -> pd.DataFrame:
    local_data = data_rfm.copy()
    last_data_date = local_data['first_buy'].max()
    local_data[PRED] = local_data['last_buy'].apply(lambda x: (last_data_date - x).days < days_for_die)
    return local_data

In [50]:
rfm_train_data_180_days = pd.read_parquet("data/rfm.train.left.180.right.95.parquet.gzip")
balanced_rfm_train_data_180_days = data_balancing_by_col(rfm_train_data_180_days)

for day in range(30,91):
    print(f"Days: {day}:", end=' ')
    calc_metrics(calc_predictions_by_days(data_rfm=balanced_rfm_train_data_180_days, days_for_die=day))

Days: 30: recall: 0.745, f1: 0.757, precision: 0.768, roc_auc: 0.760
Days: 31: recall: 0.748, f1: 0.757, precision: 0.766, roc_auc: 0.760
Days: 32: recall: 0.750, f1: 0.758, precision: 0.766, roc_auc: 0.760
Days: 33: recall: 0.753, f1: 0.758, precision: 0.763, roc_auc: 0.760
Days: 34: recall: 0.755, f1: 0.759, precision: 0.762, roc_auc: 0.760
Days: 35: recall: 0.757, f1: 0.759, precision: 0.760, roc_auc: 0.759
Days: 36: recall: 0.759, f1: 0.759, precision: 0.759, roc_auc: 0.759
Days: 37: recall: 0.761, f1: 0.760, precision: 0.758, roc_auc: 0.759
Days: 38: recall: 0.762, f1: 0.760, precision: 0.758, roc_auc: 0.759
Days: 39: recall: 0.765, f1: 0.760, precision: 0.756, roc_auc: 0.759
Days: 40: recall: 0.775, f1: 0.763, precision: 0.751, roc_auc: 0.759
Days: 41: recall: 0.788, f1: 0.767, precision: 0.747, roc_auc: 0.760
Days: 42: recall: 0.797, f1: 0.769, precision: 0.742, roc_auc: 0.760
Days: 43: recall: 0.807, f1: 0.770, precision: 0.737, roc_auc: 0.759
Days: 44: recall: 0.808, f1: 0.770

Если человек 46 дней не совершает покупки, то значит уже не вернется

**recall: 0.819** <br/>
**f1: 0.771** <br/>
**precision: 0.729** <br/>
**roc_auc: 0.757** <br/>

In [86]:
from btyd.utils import calculate_alive_path, expected_cumulative_transactions
import pickle

def calc_alive_path_val(partner):
    t = rfm["T"].max()
    transactions = data[data["partner"]==partner]
    datetime_col = "rep_date"
    freq="D"

    customer_history = transactions[[datetime_col]].copy()
    customer_history.index = pd.DatetimeIndex(customer_history[datetime_col])
    customer_history["transactions"] = 1
    customer_history = customer_history.resample(freq).sum()
    path = calculate_alive_path(model, transactions, datetime_col, t, freq)
    return path.iloc[-1][0]

data = pd.read_parquet("data/raw.train.left.180.right.95.parquet.gzip")
rfm = pd.read_parquet("data/rfm.train.left.180.right.95.parquet.gzip")
model = pickle.load(open("model/beta.geo.model.180.days.part.pkl", "rb"))

persons_data = set(data["partner"])
persons_rfm = set(rfm.index.to_list())
persons_intersected = persons_data.intersection(persons_rfm)
persons_intersected_first_100 = list(persons_intersected)[:100]

print(f"Diff: {len(persons_rfm.difference(persons_data))}. Rfm: {len(persons_rfm)}, Raw: {len(persons_data)}")

rfm_100 = rfm.loc[persons_intersected_first_100]
rfm_100 = rfm_100.reset_index()
rfm_100["alive_prob"] = rfm_100['partner'].apply(lambda p: calc_alive_path_val(partner=p))

Diff: 29320. Rfm: 347497, Raw: 444541


In [88]:
rfm_100[PRED] = rfm_100["alive_prob"].apply(lambda x: x >= .5)
calc_metrics(rfm_100)

rfm_100[PRED] = rfm_100["alive_prob"].apply(lambda x: x >= .3)
calc_metrics(rfm_100)

rfm_100[PRED] = rfm_100["alive_prob"].apply(lambda x: x >= .1)
calc_metrics(rfm_100)

recall: 0.163, f1: 0.252, precision: 0.565, roc_auc: 0.331
recall: 0.237, f1: 0.333, precision: 0.559, roc_auc: 0.244
recall: 0.450, f1: 0.550, precision: 0.706, roc_auc: 0.350


(0.45, 0.549618320610687, 0.7058823529411765, 0.35)

In [106]:
model = pickle.load(open("model/beta.geo.model.180.days.part.pkl", "rb"))
rfm = pd.read_parquet("data/rfm.train.left.180.right.95.parquet.gzip")
rfm = rfm.reset_index()

rfm["pursaches"] = rfm.apply(lambda row: model.conditional_expected_number_of_purchases_up_to_time(
    row.loc["partner"], row.loc["frequency"], row.loc["recency"], row.loc["T"]
), axis = 1)

In [109]:
rfm[PRED] = rfm["pursaches"].apply(lambda x: x > 0)
calc_metrics(rfm_100)

rfm[PRED] = rfm["pursaches"].apply(lambda x: x > 3)
calc_metrics(rfm_100)

recall: 0.450, f1: 0.550, precision: 0.706, roc_auc: 0.350
recall: 0.450, f1: 0.550, precision: 0.706, roc_auc: 0.350


(0.45, 0.549618320610687, 0.7058823529411765, 0.35)

In [143]:
from sklearn.linear_model import LogisticRegression

def rfm_prepare(data_rfm: pd.DataFrame) -> pd.DataFrame:
    local_data = data_rfm.copy()
    local_data["first_buy_last_buy_delta_days"] = local_data["last_buy"] - local_data["first_buy"] 
    local_data["first_buy_last_buy_delta_days"] = local_data["first_buy_last_buy_delta_days"].apply(lambda x: x.days)
    local_data = data_rfm.drop(["first_buy", "last_buy"], axis=1)
    return local_data

rfm = pd.read_parquet("data/rfm.train.left.180.right.95.parquet.gzip")

rfm = rfm_prepare(rfm)
print(f"Из {rfm.shape[0]} строк {rfm[rfm.duplicated()].shape[0]} - дубликаты")
rfm = rfm.drop_duplicates()
y = rfm[TARGET]
X = rfm.drop(TARGET, axis=1)

model = LogisticRegression()
model.fit(X, y)

Из 347497 строк 21529 - дубликаты


In [149]:
rfm_test = pd.read_parquet("data/rfm.test.left.180.right.95.parquet.gzip")

rfm_test = rfm_prepare(rfm_test)
predictions = model.predict(rfm_test.drop(TARGET, axis=1))
rfm_test[PRED] = predictions

calc_metrics(rfm_test)

recall: 0.894, f1: 0.848, precision: 0.807, roc_auc: 0.716


(0.8937113098162524,
 0.8479833672080404,
 0.8067071115604988,
 0.7157919965680777)

In [148]:
rfm_test = pd.read_parquet("data/rfm.test.left.180.right.95.parquet.gzip")

rfm_test = data_balancing_by_col(rfm_prepare(rfm_test))
predictions = model.predict(rfm_test.drop(TARGET, axis=1))
rfm_test[PRED] = predictions

calc_metrics(rfm_test)

recall: 0.894, f1: 0.759, precision: 0.659, roc_auc: 0.716


(0.8939522456423088, 0.7588472477229362, 0.6592181391712275, 0.715912464481106)