# Модель классификации: вероятность покупки в ближайшие N дней

In [8]:
import pandas as pd
import numpy as np
import xgboost as xgb
from category_encoders import TargetEncoder
from sklearn.model_selection import TimeSeriesSplit

In [3]:
# Load the prepared dataset for next-purchase prediction from CSV.
data_path = "../dataframe/prepared_data_for_predict_next_purchase.csv"
df = pd.read_csv(data_path)

In [5]:
# Display the column names of the loaded frame.
df.columns

Index(['user_id', 'countPoints', 'Item', 'Date', 'Price', 'Fromat_Offline',
       'Format_Trade_In', 'sertificate_6_months', 'Ag_800', 'Ag_925', 'Au_375',
       'Au_585', 'Other_materials', 'Steal_0', 'Jewelry', 'IsHoliday', 'Month',
       'Weekday', 'Quarter', 'Month_sin', 'Month_cos', 'Weekday_sin',
       'Weekday_cos', 'Quarter_sin', 'Quarter_cos', 'NextPurchaseDate',
       'DaysUntilNextPurchase', 'Purchase_in_next_7d', 'Purchase_in_next_14d',
       'Purchase_in_next_30d', 'Purchase_in_next_60d', 'Purchase_in_next_90d',
       'Price_Segment', 'avg_user_item_interval', 'avg_price', 'total_points',
       'purchase_count', 'DaysSinceLastPurchase', 'Item_popularity',
       'Days_Since_Last_Purchase', 'Average_Check', 'Price_Preference',
       'Category', 'Is_Favorite_Category', 'Category_freq', 'User_Item_Count',
       'days_since_last_purchase', 'days_since_last_purchase_avg',
       'days_since_last_purchase_max', 'days_since_last_purchase_min'],
      dtype='object')

In [33]:
# Keep only rows that have a DaysUntilNextPurchase value.
# The explicit .copy() detaches the filtered frame from the original so the
# column assignments in later cells act on an independent object and cannot
# trigger pandas' chained-assignment (SettingWithCopy) warning.
# NOTE(review): presumably the dropped rows are purchases with no recorded
# follow-up purchase; removing them also removes potential negatives for the
# Purchase_in_next_* targets -- confirm this is intended.
df = df.dropna(subset=["DaysUntilNextPurchase"]).copy()

In [34]:
# Parse the Date column into pandas datetimes (raises on unparseable values).
parsed_dates = pd.to_datetime(df["Date"])
df["Date"] = parsed_dates

In [28]:
# Summarise the gap to the next purchase and report the share of rows
# whose gap is exactly zero days.
days_until_next = df["DaysUntilNextPurchase"]
print(days_until_next.describe())
print(days_until_next.eq(0).mean())

count    21451.000000
mean        26.587478
std         72.125469
min          0.000000
25%          0.000000
50%          0.000000
75%          2.000000
max       1027.000000
Name: DaysUntilNextPurchase, dtype: float64
0.7288704489301198


In [None]:
# Re-parse dates leniently: values that cannot be parsed become NaT.
df["Date"] = pd.to_datetime(df["Date"], errors="coerce")

# Discard rows without a valid date, then order the frame chronologically and
# renumber it 0..n-1 (the TimeSeriesSplit used later splits by row position).
has_valid_date = df["Date"].notna()
df = df.loc[has_valid_date].sort_values(by="Date").reset_index(drop=True)

In [None]:
# Prediction horizons in days. The binary target column for each horizon is
# named "Purchase_in_next_<days>d", so the column list is derived from the
# horizons to keep both sequences aligned position by position.
days_intervals = np.array([7, 14, 30, 60, 90])
target_cols = [f"Purchase_in_next_{days}d" for days in days_intervals]

In [None]:
# Feature groups. Names are spelled exactly as in the dataset header
# (including "Fromat_Offline", "sertificate_6_months" and "Steal_0").

# Categorical columns; these are target-encoded before model fitting.
cat_cols = ["Item", "Category", "Price_Segment", "Price_Preference"]

# Flag columns (presumably 0/1 indicators -- verify against the data).
bin_cols = [
    "Is_Favorite_Category",
    "Fromat_Offline", "Format_Trade_In", "sertificate_6_months",
    "Ag_800", "Ag_925", "Au_375", "Au_585", "Other_materials", "Steal_0",
    "Jewelry", "IsHoliday",
]

# Numeric columns.
# NOTE(review): "DaysSinceLastPurchase", "Days_Since_Last_Purchase" and
# "days_since_last_purchase" look like they may carry the same information --
# confirm they are not duplicates.
num_cols = [
    "countPoints", "Price", "avg_user_item_interval", "avg_price",
    "total_points", "purchase_count",
    "DaysSinceLastPurchase", "Days_Since_Last_Purchase",
    "Average_Check", "User_Item_Count",
    "days_since_last_purchase",
    *(f"days_since_last_purchase_{stat}" for stat in ("avg", "max", "min")),
    "Item_popularity",
]

# Sine/cosine encodings of month, weekday and quarter.
season_cols = [
    f"{period}_{wave}"
    for period in ("Month", "Weekday", "Quarter")
    for wave in ("sin", "cos")
]

# Final model input, in the order: numeric, flags, seasonal, categorical.
features = [*num_cols, *bin_cols, *season_cols, *cat_cols]

In [None]:
# Split the frame into the model inputs and the multi-horizon targets.
# Both are independent copies, so later edits do not touch df.
X = df.loc[:, features].copy()
y = df.loc[:, target_cols].copy()

In [None]:
# Walk-forward cross-validation: for every split of TimeSeriesSplit, fit one
# XGBoost classifier per horizon on the training rows and record Precision@5
# on the test rows. Scores are collected per target in precision_folds.
n_splits = 5
tscv = TimeSeriesSplit(n_splits=n_splits)
precision_folds = {col: [] for col in target_cols}

# Hyperparameters are identical for every fold and horizon, so build them once.
xgb_params = {
    "n_estimators": 400,
    "learning_rate": 0.05,
    "max_depth": 6,
    "random_state": 42,
    "eval_metric": "logloss",
    "tree_method": "hist",
    "use_label_encoder": False,
}

for train_idx, test_idx in tscv.split(X):
    # Positional slices; they are only read, never modified.
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

    # Target-encode the categorical columns using training rows only. The
    # encoding target is the row-wise mean of all horizon labels, so a single
    # encoding is shared by every per-horizon model.
    te = TargetEncoder(cols=cat_cols)
    te.fit(X_train[cat_cols], y_train.mean(axis=1))
    train_encoded = te.transform(X_train[cat_cols])
    test_encoded = te.transform(X_test[cat_cols])
    # assign() returns a new frame with the categorical columns replaced.
    X_train_enc = X_train.assign(**{name: train_encoded[name] for name in cat_cols})
    X_test_enc = X_test.assign(**{name: test_encoded[name] for name in cat_cols})

    for col in target_cols:
        model = xgb.XGBClassifier(**xgb_params)
        model.fit(X_train_enc, y_train[col])
        pred_probs = model.predict_proba(X_test_enc)[:, 1]

        # Precision@5: share of true positives among the five test rows with
        # the highest predicted probability (ranked over the whole fold).
        top5 = (
            pd.Series(pred_probs, index=X_test.index)
            .sort_values(ascending=False)
            .head(5)
        )
        precision = y_test.loc[top5.index, col].mean()
        precision_folds[col].append(precision)

In [None]:
# Report the mean Precision@5 across folds for every horizon.
# precision_folds was built from target_cols, so it iterates in that order.
print("Средний Precision@5 по фолдам:")
for col, fold_scores in precision_folds.items():
    print(f"{col}: {np.mean(fold_scores):.3f}")

In [None]:
# Fit the final per-horizon models on the full dataset and score every row.
# NOTE(review): the encoder and the models are fitted on the same rows they
# score, so pred_probs_all holds in-sample probabilities; they will look
# better than the cross-validated estimates and are not an unbiased check.

# Target-encode the categorical columns against the row-wise mean of all
# horizon labels (one shared encoding for every model).
te = TargetEncoder(cols=cat_cols)
te.fit(X[cat_cols], y.mean(axis=1))
encoded_cats = te.transform(X[cat_cols])
X_enc = X.assign(**{name: encoded_cats[name] for name in cat_cols})

# One probability column per horizon, indexed like df.
pred_probs_all = pd.DataFrame(index=df.index)
# Container and cut-off consumed by the horizon-selection cell further down.
pred_days_all = []
threshold = 0.5

# Hyperparameters are the same for every horizon, so build them once.
xgb_params = {
    "n_estimators": 400,
    "learning_rate": 0.05,
    "max_depth": 6,
    "random_state": 42,
    "eval_metric": "logloss",
    "tree_method": "hist",
    "use_label_encoder": False,
}

for col in target_cols:
    model = xgb.XGBClassifier(**xgb_params)
    model.fit(X_enc, y[col])
    # Probability of the positive class (a purchase within the horizon).
    pred_probs_all[col] = model.predict_proba(X_enc)[:, 1]

In [None]:
# Convert the per-horizon probabilities into one predicted horizon per row:
# the shortest horizon whose probability reaches the threshold.
# The list is rebuilt from scratch on every run, so re-executing this cell
# cannot grow it beyond one entry per row.
# NOTE(review): rows where no horizon reaches the threshold fall back to 90
# days as well, so they cannot be told apart from a predicted purchase within
# 90 days -- confirm this is the intended meaning.
fallback_days = 90

# Boolean matrix (rows x horizons); NaN probabilities compare as False.
reaches_threshold = pred_probs_all[target_cols].to_numpy() >= threshold
# argmax returns the first True per row (0 when the row has none, which the
# any() mask below replaces with the fallback).
first_hit = reaches_threshold.argmax(axis=1)
pred_days_all = np.where(
    reaches_threshold.any(axis=1),
    days_intervals[first_hit],
    fallback_days,
).tolist()

In [None]:
# Attach the predicted horizon to df and turn it into a calendar date by
# adding it, as a number of days, to the purchase date.
df["Pred_DaysUntilNextPurchase"] = pred_days_all
predicted_gap = pd.to_timedelta(df["Pred_DaysUntilNextPurchase"], unit="D")
df["Pred_NextPurchaseDate"] = df["Date"] + predicted_gap

# Compact table for inspection: user id plus the two prediction columns.
comparison_columns = [
    "user_id",
    "Pred_DaysUntilNextPurchase",
    "Pred_NextPurchaseDate",
]
comparison_df = df[comparison_columns].copy()

In [None]:
# For every horizon, place the true label next to the predicted probability.
# Values are assigned positionally (as arrays), not aligned by index.
for target in target_cols:
    observed = y[target].to_numpy()
    predicted = pred_probs_all[target].to_numpy()
    comparison_df[f"true_{target}"] = observed
    comparison_df[f"pred_{target}"] = predicted

print("\nПример сравнения реальных и предсказанных значений:")
# Last expression of the cell: shows the first ten rows in the notebook.
comparison_df.head(10)