In [29]:
import numpy as np
import pandas as pd
import cloudpickle
from pandas.core.series import Series
from sklearn.metrics import recall_score, f1_score, precision_score, roc_auc_score, accuracy_score

from consts import TEST_PATH

In [30]:
def _lorenz_gini(values) -> float:
    """Return the Lorenz-curve Gini coefficient of ``values``.

    The values are sorted ascending, their normalised cumulative sum (the
    Lorenz curve) is integrated with the trapezoidal rule, and that area is
    compared with the area under the line of perfect equality.

    Returns ``nan`` when the coefficient is undefined: empty input, a zero
    total (e.g. no positive predictions at all), or a single element (the
    reference area is then zero).
    """
    cumulative_sum = np.cumsum(np.sort(values))
    n = len(cumulative_sum)
    if n == 0 or cumulative_sum[-1] == 0:
        # Normalising by a zero total would be 0/0; report "undefined" instead.
        return float("nan")

    # np.trapz is deprecated since NumPy 2.0 in favour of np.trapezoid;
    # fall back to the old name on older NumPy versions.
    trapezoid = getattr(np, "trapezoid", None) or np.trapz

    lorenz_curve = cumulative_sum / cumulative_sum[-1]
    perfect_eq = np.linspace(0, 1, n)
    perfect_eq_area = trapezoid(perfect_eq, dx=1 / n)
    if perfect_eq_area == 0:
        return float("nan")
    lorenz_area = trapezoid(lorenz_curve, dx=1 / n)
    return (perfect_eq_area - lorenz_area) / perfect_eq_area


def calc_score(target: Series, predicted: Series) -> None:
    """Print classification metrics for ``predicted`` against ``target``.

    Recall, F1, precision, ROC AUC and accuracy come from scikit-learn; a
    Gini coefficient is computed from the Lorenz curve of ``predicted``.
    All values are printed with three decimal places, one per line.

    NOTE(review): the Gini printed here measures how unevenly the predicted
    values are distributed; it is not the classifier Gini (2 * ROC AUC - 1).
    Confirm which of the two is actually wanted.

    Args:
        target: ground-truth labels.
        predicted: predicted labels, in the same order as ``target``.

    Returns:
        None. The metrics are written to stdout.
    """
    recall = recall_score(target, predicted)
    f1 = f1_score(target, predicted)
    precision = precision_score(target, predicted)
    roc_auc = roc_auc_score(target, predicted)
    acc = accuracy_score(target, predicted)

    gini_coeff = _lorenz_gini(predicted)

    print(
        f"recall: {recall:.3f}\nf1: {f1:.3f}\nprecision: {precision:.3f}\nroc_auc: {roc_auc:.3f}\naccuracy: {acc:.3f}\nGini coeff: {gini_coeff:.3f}"
    )

In [31]:
# Load the evaluation table; the later cells read its "frequency", "recency",
# "T" and "alive" columns.
df = pd.read_parquet(TEST_PATH)

# NOTE(review): unpickling runs arbitrary code, so only load model files that
# come from a trusted source. The context manager makes sure the file handle
# is closed once the model has been read.
with open("./model/beta.get.model.all.but.without.90.days.pkl", "rb") as model_file:
    bgf = cloudpickle.load(model_file)

# Expected number of purchases per customer over a horizon of 90
# (presumably days, judging by the model file name — TODO confirm).
future_purchases_matrix = bgf.conditional_expected_number_of_purchases_up_to_time(
    90, df["frequency"], df["recency"], df["T"]
)
print(future_purchases_matrix)

partner
1149800    1.654763e-08
1149801    3.312361e-40
1149802    2.917833e-02
1149803    6.906924e-08
1149804    2.301588e-01
               ...     
1695379    8.596810e-02
1695380    2.141096e-02
1695381    4.215628e-01
1695382    1.508117e-17
1695383    7.502881e-27
Length: 487722, dtype: float64


In [32]:
# Preview the first rows of the loaded table.
df.head(5)

Unnamed: 0_level_0,monetary_value,first_buy,last_buy,count,alive,frequency,recency,T
partner,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1149800,1.811364e-10,2017-12-22,2018-05-29,9,False,8,158,1889
1149801,2.443281e-11,2018-01-10,2020-12-22,181,False,180,1077,1870
1149802,2.173049e-10,2018-03-14,2018-03-14,1,False,0,0,1807
1149803,4.207549e-12,2017-11-24,2018-02-27,7,False,6,95,1917
1149804,1.309592e-13,2022-07-29,2022-07-29,1,False,0,0,209


In [33]:
# A customer counts as predicted-alive when the expected number of purchases
# exceeds 0.5.
df["pred_alive"] = future_purchases_matrix.gt(0.5)
df.head(5)

Unnamed: 0_level_0,monetary_value,first_buy,last_buy,count,alive,frequency,recency,T,pred_alive
partner,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1149800,1.811364e-10,2017-12-22,2018-05-29,9,False,8,158,1889,False
1149801,2.443281e-11,2018-01-10,2020-12-22,181,False,180,1077,1870,False
1149802,2.173049e-10,2018-03-14,2018-03-14,1,False,0,0,1807,False
1149803,4.207549e-12,2017-11-24,2018-02-27,7,False,6,95,1917,False
1149804,1.309592e-13,2022-07-29,2022-07-29,1,False,0,0,209,False


In [34]:
# The classes are not balanced here.
calc_score(target=df["alive"], predicted=df["pred_alive"])

recall: 0.532
f1: 0.644
precision: 0.815
roc_auc: 0.727
accuracy: 0.768
Gini coeff: 0.742


In [35]:
# Class distribution of the ground-truth label.
df.loc[:, "alive"].value_counts()

False    295515
True     192207
Name: alive, dtype: int64

In [36]:
# Balance the classes by down-sampling each one to the size of the smaller
# class. The size is derived from the data rather than hard-coded, and a fixed
# seed keeps the sample identical across kernel restarts.
alive_rows = df[df["alive"].eq(True)]
not_alive_rows = df[df["alive"].eq(False)]
n = min(len(alive_rows), len(not_alive_rows))
df_true = alive_rows.sample(n, random_state=42)
df_false = not_alive_rows.sample(n, random_state=42)
df_new = pd.concat([df_true, df_false])
print(df_new['alive'].value_counts())

True     192207
False    192207
Name: alive, dtype: int64


In [37]:
# Preview the first rows of the balanced sample.
df_new.head(5)

Unnamed: 0_level_0,monetary_value,first_buy,last_buy,count,alive,frequency,recency,T,pred_alive
partner,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1222541,3.810268e-11,2018-11-13,2022-11-23,148,True,147,1471,1563,False
1222381,1.377133e-11,2018-10-04,2022-11-21,67,True,66,1509,1603,True
1193291,3.09588e-10,2018-06-19,2022-11-18,107,True,106,1613,1710,True
1581425,7.180154e-12,2022-06-29,2022-11-15,7,True,6,139,239,True
1580349,2.639935e-10,2022-06-09,2022-11-15,9,True,8,159,259,True


In [38]:
# NOTE(review): unpickling runs arbitrary code, so only load model files that
# come from a trusted source. The context manager makes sure the file handle
# is closed once the model has been read.
with open("./model/beta.get.model.all.but.without.90.days.pkl", "rb") as model_file:
    bgf = cloudpickle.load(model_file)

# Expected number of purchases per customer in the balanced sample over a
# horizon of 90 (presumably days, judging by the model file name — TODO confirm).
future_purchases_matrix_new = bgf.conditional_expected_number_of_purchases_up_to_time(
    90, df_new["frequency"], df_new["recency"], df_new["T"]
)
print(future_purchases_matrix_new)

partner
1222541    1.772227e-01
1222381    2.178266e+00
1193291    1.076265e+00
1581425    1.152026e+00
1580349    1.039051e+00
               ...     
1346641    2.161486e-03
1272598    1.221070e-02
1426353    2.043995e-04
1287636    8.629804e-12
1296594    6.054569e-02
Length: 384414, dtype: float64


In [39]:
# A customer counts as predicted-alive when the expected number of purchases
# exceeds 0.5.
df_new["pred_alive"] = future_purchases_matrix_new.gt(0.5)
df_new.head(5)

Unnamed: 0_level_0,monetary_value,first_buy,last_buy,count,alive,frequency,recency,T,pred_alive
partner,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1222541,3.810268e-11,2018-11-13,2022-11-23,148,True,147,1471,1563,False
1222381,1.377133e-11,2018-10-04,2022-11-21,67,True,66,1509,1603,True
1193291,3.09588e-10,2018-06-19,2022-11-18,107,True,106,1613,1710,True
1581425,7.180154e-12,2022-06-29,2022-11-15,7,True,6,139,239,True
1580349,2.639935e-10,2022-06-09,2022-11-15,9,True,8,159,259,True


In [40]:
# Same metrics, now on the class-balanced sample.
calc_score(target=df_new["alive"], predicted=df_new["pred_alive"])

recall: 0.532
f1: 0.661
precision: 0.871
roc_auc: 0.727
accuracy: 0.727
Gini coeff: 0.694
