In [4]:
import pandas as pd
import numpy as np

import lightgbm as lgb

from category_encoders import TargetEncoder
from sklearn.model_selection import TimeSeriesSplit, train_test_split

In [5]:
df = pd.read_csv("dataframe/prepared_data_for_predict_next_purchase.csv")

In [83]:
df.columns

Index(['user_id', 'countPoints', 'Item', 'Date', 'Price', 'Fromat_Offline',
       'Format_Trade_In', 'sertificate_6_months', 'Ag_800', 'Ag_925', 'Au_375',
       'Au_585', 'Other_materials', 'Steal_0', 'Jewelry', 'IsHoliday', 'Month',
       'Weekday', 'Quarter', 'Month_sin', 'Month_cos', 'Weekday_sin',
       'Weekday_cos', 'Quarter_sin', 'Quarter_cos', 'NextPurchaseDate',
       'DaysUntilNextPurchase', 'Purchase_in_next_7d', 'Purchase_in_next_14d',
       'Purchase_in_next_30d', 'Purchase_in_next_60d', 'Purchase_in_next_90d',
       'Price_Segment', 'avg_user_item_interval', 'avg_price', 'total_points',
       'purchase_count', 'DaysSinceLastPurchase', 'Item_popularity',
       'Days_Since_Last_Purchase', 'Average_Check', 'Price_Preference',
       'Category', 'Is_Favorite_Category', 'Category_freq', 'User_Item_Count',
       'days_since_last_purchase', 'days_since_last_purchase_avg',
       'days_since_last_purchase_max', 'days_since_last_purchase_min'],
      dtype='object')

In [55]:
df["Date"] = pd.to_datetime(df["Date"])
df = df.dropna(subset=["Date"])

start_date = pd.to_datetime("2025-10-01")
end_date = pd.to_datetime("2025-10-10")

period_df = df[(df["Date"] >= start_date) & (df["Date"] <= end_date)]

print("Количество покупок в период:", len(period_df))

unique_users = period_df["user_id"].nunique()
print("Количество уникальных пользователей с покупкой в период:", unique_users)

print("Пользователи:", period_df["user_id"].unique())

Количество покупок в период: 632
Количество уникальных пользователей с покупкой в период: 386
Пользователи: ['USER_101036' 'USER_101607' 'USER_102762' 'USER_103199' 'USER_107361'
 'USER_108620' 'USER_109748' 'USER_113133' 'USER_113999' 'USER_117617'
 'USER_118907' 'USER_119220' 'USER_124353' 'USER_125382' 'USER_134524'
 'USER_134976' 'USER_137798' 'USER_139406' 'USER_143216' 'USER_144025'
 'USER_144890' 'USER_153607' 'USER_156997' 'USER_161851' 'USER_164926'
 'USER_166943' 'USER_169066' 'USER_169491' 'USER_170537' 'USER_171309'
 'USER_172781' 'USER_174447' 'USER_175558' 'USER_176082' 'USER_176135'
 'USER_178771' 'USER_180118' 'USER_189307' 'USER_189396' 'USER_189885'
 'USER_191917' 'USER_195107' 'USER_196104' 'USER_197455' 'USER_199478'
 'USER_200512' 'USER_202074' 'USER_202401' 'USER_214887' 'USER_215764'
 'USER_216954' 'USER_217272' 'USER_219410' 'USER_219720' 'USER_221017'
 'USER_224771' 'USER_225121' 'USER_226075' 'USER_226314' 'USER_226707'
 'USER_226769' 'USER_227683' 'USER_22938

## Проверка на уже известных данных, для проверки правильности работы модели

In [None]:
df["Date"] = pd.to_datetime(df["Date"])
df = df.dropna(subset=["Date"])

In [None]:
train_end = pd.Timestamp("2025-06-30")
test_start = train_end + pd.Timedelta(days=1)
test_end = pd.Timestamp("2025-09-30")

In [None]:
train_df = df[df["Date"] <= train_end].copy()
test_df = df[(df["Date"] >= test_start) & (df["Date"] <= test_end)].copy()

In [None]:
target_users = test_df["user_id"].unique()
print(f"Количество пользователей с покупкой в тестовом периоде: {len(target_users)}")

In [None]:
num_cols = [
    "countPoints",
    "Price",
    "DaysUntilNextPurchase",
    "avg_user_item_interval",
    "avg_price",
    "total_points",
    "purchase_count",
    "DaysSinceLastPurchase",
    "Item_popularity",
    "Average_Check",
    "Category_freq",
    "User_Item_Count",
    "days_since_last_purchase_avg",
    "days_since_last_purchase_max",
    "days_since_last_purchase_min",
]

cat_cols = ["Item", "Category", "Is_Favorite_Category", "Price_Segment"]

In [None]:
user_features = train_df.groupby("user_id")[num_cols].mean().reset_index()
for c in cat_cols:
    user_features[c] = train_df.groupby("user_id")[c].last().values

user_features["target"] = user_features["user_id"].isin(target_users).astype(int)

In [None]:
X = user_features[num_cols + cat_cols]
y = user_features["target"]

te = TargetEncoder(cols=cat_cols)
X[cat_cols] = te.fit_transform(X[cat_cols], y)

In [None]:
tscv = TimeSeriesSplit(n_splits=3)
for fold, (train_idx, val_idx) in enumerate(tscv.split(X)):
    print(f"\n===== Fold {fold+1} =====")
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    scale_pos_weight = y_train.value_counts()[0] / y_train.value_counts()[1]

    params = {
        "objective": "binary",
        "metric": "binary_logloss",
        "boosting_type": "gbdt",
        "learning_rate": 0.05,
        "feature_fraction": 0.8,
        "bagging_fraction": 0.8,
        "bagging_freq": 5,
        "seed": 42,
        "scale_pos_weight": scale_pos_weight,
    }

    lgb_train = lgb.Dataset(X_train, label=y_train)
    lgb_val = lgb.Dataset(X_val, label=y_val)

    model = lgb.train(
        params,
        lgb_train,
        valid_sets=[lgb_val],
        num_boost_round=300,
    )

    y_pred_val = model.predict(X_val)
    df_val = pd.DataFrame(
        {
            "user_id": user_features.iloc[val_idx]["user_id"],
            "pred_prob": y_pred_val,
            "target": y_val,
        }
    )

In [None]:
def precision_at_k(df, k=5):
    return df.head(k)["target"].sum() / k

def hit_rate_at_k(df, k=10):
    return 1.0 if df.head(k)["target"].sum() > 0 else 0.0

In [None]:
df_val = df_val.sort_values("pred_prob", ascending=False).reset_index(drop=True)
p5 = precision_at_k(df_val, 5)
p10 = precision_at_k(df_val, 10)
hr10 = hit_rate_at_k(df_val, 10)

print(f"Precision@5: {p5:.3f}")
print(f"Precision@10: {p10:.3f}")
print(f"HitRate@10: {hr10:.3f}")

final_model = lgb.train(params, lgb.Dataset(X, label=y), num_boost_round=300)
user_features["pred_prob"] = final_model.predict(X)

df_eval = user_features[["user_id", "pred_prob", "target"]].copy()
df_eval = df_eval.sort_values("pred_prob", ascending=False).reset_index(drop=True)

p5 = df_eval.head(5)["target"].sum() / 5
p10 = df_eval.head(10)["target"].sum() / 10
hr10 = 1.0 if df_eval.head(10)["target"].sum() > 0 else 0.0

print("\n===== Финальные метрики =====")
print(f"Precision@5: {p5:.3f}")
print(f"Precision@10: {p10:.3f}")
print(f"HitRate@10: {hr10:.3f}")

print("\nТоп-20 клиентов с наибольшей вероятностью покупки:")
print(df_eval.head(20))

Количество пользователей с покупкой в тестовом периоде: 5141

===== Fold 1 =====
[LightGBM] [Info] Number of positive: 266, number of negative: 3155
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000820 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2818
[LightGBM] [Info] Number of data points in the train set: 3421, number of used features: 19
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.077755 -> initscore=-2.473247
[LightGBM] [Info] Start training from score -2.473247


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[cat_cols] = te.fit_transform(X[cat_cols], y)


Precision@5: 1.000
Precision@10: 1.000
HitRate@10: 1.000

===== Fold 2 =====
[LightGBM] [Info] Number of positive: 541, number of negative: 6300
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002320 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3194
[LightGBM] [Info] Number of data points in the train set: 6841, number of used features: 19
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.079082 -> initscore=-2.454886
[LightGBM] [Info] Start training from score -2.454886
Precision@5: 1.000
Precision@10: 1.000
HitRate@10: 1.000

===== Fold 3 =====
[LightGBM] [Info] Number of positive: 785, number of negative: 9476
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002078 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3227
[LightGBM] [Info] Number of data points in the train set: 10261, number of used features: 19

## Предикты с 11 октября по сегодшний день (26 октября)

In [7]:
df = pd.read_csv("dataframe/prepared_data_for_predict_next_purchase.csv")

In [8]:
df["Date"] = pd.to_datetime(df["Date"])
df = df.dropna(subset=["Date"])

In [9]:
train_end = pd.Timestamp("2025-06-30")
test_start = train_end + pd.Timedelta(days=1)
test_end = pd.Timestamp("2025-09-30")

In [10]:
train_df = df[df["Date"] <= train_end].copy()
test_df = df[(df["Date"] >= test_start) & (df["Date"] <= test_end)].copy()

target_users = test_df["user_id"].unique()
print(f"Количество пользователей с покупкой в тестовом периоде: {len(target_users)}")

Количество пользователей с покупкой в тестовом периоде: 5141


In [11]:
num_cols = [
    "countPoints",
    "Price",
    "DaysUntilNextPurchase",
    "avg_user_item_interval",
    "avg_price",
    "total_points",
    "purchase_count",
    "DaysSinceLastPurchase",
    "Item_popularity",
    "Average_Check",
    "Category_freq",
    "User_Item_Count",
    "days_since_last_purchase_avg",
    "days_since_last_purchase_max",
    "days_since_last_purchase_min",
]

cat_cols = ["Item", "Category", "Is_Favorite_Category", "Price_Segment"]

In [12]:
user_features = train_df.groupby("user_id")[num_cols].mean().reset_index()
for c in cat_cols:
    user_features[c] = train_df.groupby("user_id")[c].last().values

user_features["target"] = user_features["user_id"].isin(target_users).astype(int)

In [13]:
X = user_features[num_cols + cat_cols]
y = user_features["target"]

te = TargetEncoder(cols=cat_cols)
X[cat_cols] = te.fit_transform(X[cat_cols], y)

X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[cat_cols] = te.fit_transform(X[cat_cols], y)


In [14]:
scale_pos_weight = y_train.value_counts()[0] / y_train.value_counts()[1]

In [15]:
params = {
    "objective": "binary",
    "metric": "binary_logloss",
    "boosting_type": "gbdt",
    "learning_rate": 0.05,
    "feature_fraction": 0.8,
    "bagging_fraction": 0.8,
    "bagging_freq": 5,
    "seed": 42,
    "scale_pos_weight": scale_pos_weight,
}

lgb_train = lgb.Dataset(X_train, label=y_train)
lgb_val = lgb.Dataset(X_val, label=y_val)

In [16]:
model = lgb.train(
    params,
    lgb_train,
    valid_sets=[lgb_val],
    num_boost_round=300,
)

[LightGBM] [Info] Number of positive: 835, number of negative: 10109
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000733 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3232
[LightGBM] [Info] Number of data points in the train set: 10944, number of used features: 19
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.076298 -> initscore=-2.493750
[LightGBM] [Info] Start training from score -2.493750


In [17]:
y_pred_val = model.predict(X_val)
df_val = pd.DataFrame(
    {
        "user_id": user_features.iloc[y_val.index]["user_id"],
        "pred_prob": y_pred_val,
        "target": y_val,
    }
)

threshold = 0.7
predicted_users = df_val[df_val["pred_prob"] >= threshold][
    ["user_id", "pred_prob"]
].sort_values("pred_prob", ascending=False)
predicted_users.to_csv("predicted_users.csv", index=False)
print(f"Количество предсказанных пользователей: {len(predicted_users)}")

Количество предсказанных пользователей: 212
