In [1]:
import re
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, RobustScaler
import holidays
from sklearn.preprocessing import OneHotEncoder
from datetime import datetime, timedelta

In [2]:
dataframe = pd.read_csv("../data_anonymized_with_user_ids.csv")

In [3]:
dataframe = dataframe.drop(columns=["FirstName"])
dataframe = dataframe.drop(columns=["town"])
dataframe = dataframe.drop(columns=["FullName"])

In [4]:
one_hot_enc = OneHotEncoder(sparse_output=False)
encoded_array = one_hot_enc.fit_transform(dataframe[["Format"]])

encoded_df = pd.DataFrame(
    encoded_array, columns=one_hot_enc.get_feature_names_out(["Format"])
)
dataframe = pd.concat([dataframe.drop(columns=["Format"]), encoded_df], axis=1)

In [5]:
dataframe.rename(
    columns={"Format_Оффлайн": "Fromat_Offline", "Format_Трейд Ин": "Format_Trade_In"},
    inplace=True,
)

In [6]:
dataframe["Price"] = (
    dataframe["Price"].astype(str).str.replace(r"[\s,]", "", regex=True).astype(float)
)

In [7]:
dataframe["countPoints"] = (
    dataframe["countPoints"]
    .astype(str)
    .str.replace(r"[\s,]", "", regex=True)
    .astype(float)
)

In [8]:
dataframe["Date"] = pd.to_datetime(
    dataframe["Date"], format="%Y-%m-%d", errors="coerce"
)
dataframe = dataframe.sort_values(["user_id", "Date"])

In [9]:
products_type = [
    "Серьги",
    "Серьга",
    "Кольцо",
    "Обручальное кольцо",
    "Цепь",
    "Подвеска",
    "Браслет",
    "Коробка универсальная черная SOKOLOV",
    "Колье",
    "Брошь",
    "сертификат",
    "Пирсинг",
    "Булавка",
    "Сюрприз-бокс",
    "Коробка универсальная белая SOKOLOV",
    "Часы наручные",
    "Пакет малый бумажный черный SOKOLOV",
    "Пакет малый бумажный белый SOKOLOV",
    "Ложка",
    "Шарм",
    "Часы ювелирные",
    "Коробка красная квадратная",
    "Подарочная подвеска",
    "Запонки",
    "Коробка черная квадратная",
    "Сувенир",
    "Столовый прибор",
    "Коробка красная длинная",
    "Шнур",
    "Удлинитель",
    "Зажимы для галстука",
    "Кружка",
    "Брелок",
    "Вилка",
]

pattern_products = "|".join([re.escape(item) for item in products_type])


def clean_item(text):
    if pd.isna(text):
        return text

    text = str(text)

    match = re.search(rf"({pattern_products})", text, flags=re.IGNORECASE)
    if match:
        start_idx = match.start()
        text = text[start_idx:]

    text = re.sub(r"[_]+", " ", text)
    text = re.sub(r"\s{2,}", " ", text)

    text = re.sub(r"(?:\s*\d[\d\s\xa0]*)$", "", text)

    text = text.strip(".- ")

    return text

In [10]:
dataframe["Item"] = dataframe["Item"].apply(clean_item)

In [11]:
def count_occurrences(data, patterns):
    counts = {p: 0 for p in patterns}

    for item in data:
        if isinstance(item, str):
            lower_item = item.lower()
            for p in patterns:
                if p.lower() in lower_item:
                    counts[p] += 1

    return counts


patterns = ["Сталь 0", "Ag", "Au", "ювелирное изделие", "месяц"]
materials = dataframe["Item"].tolist()

result = count_occurrences(materials, patterns)
print(result)

{'Сталь 0': 303, 'Ag': 22147, 'Au': 12791, 'ювелирное изделие': 2019, 'месяц': 229}


In [12]:
def extract_material_full(item):
    if pd.isna(item):
        return None

    match = re.search(r"\(([^)]+)\)", item)
    if match:
        return match.group(1).strip()
    return None


dataframe["Material"] = dataframe["Item"].apply(extract_material_full)

In [13]:
dataframe["Item"] = dataframe["Item"].str.replace(r"\s*\([^)]*\)", "", regex=True).str.strip()

In [14]:
dataframe["Material"] = dataframe["Material"].fillna("Other")

In [15]:
encoded_array = one_hot_enc.fit_transform(dataframe[["Material"]])

encoded_df = pd.DataFrame(
    encoded_array, columns=one_hot_enc.get_feature_names_out(["Material"])
)
dataframe = pd.concat([dataframe.drop(columns=["Material"]), encoded_df], axis=1)

In [16]:
dataframe.rename(
    columns={
        "Material_6 месяцев": "sertificate_6_months",
        "Material_Ag 800": "Ag_800",
        "Material_Ag 925": "Ag_925",
        "Material_Au 375": "Au_375",
        "Material_Au 585": "Au_585",
        "Material_Other": "Other_materials",
        "Material_Сталь 0": "Steal_0",
        "Material_ювелирное изделие": "Jewelry",
    },
    inplace=True,
)

In [17]:
dataframe["Date"] = pd.to_datetime(dataframe["Date"], dayfirst=True)

dates = dataframe["Date"]
kz_holidays = holidays.KZ(years=dates.dt.year.unique())

dataframe["IsHoliday"] = dataframe["Date"].isin(kz_holidays)

  dataframe["IsHoliday"] = dataframe["Date"].isin(kz_holidays)


In [18]:
dataframe["Month"] = dataframe["Date"].dt.month
dataframe["Weekday"] = dataframe["Date"].dt.weekday
dataframe["Quarter"] = dataframe["Date"].dt.quarter

In [19]:
dataframe["Month_sin"] = np.sin(2 * np.pi * dataframe["Month"] / 12)
dataframe["Month_cos"] = np.cos(2 * np.pi * dataframe["Month"] / 12)
dataframe["Weekday_sin"] = np.sin(2 * np.pi * dataframe["Weekday"] / 7)
dataframe["Weekday_cos"] = np.cos(2 * np.pi * dataframe["Weekday"] / 7)
dataframe["Quarter_sin"] = np.sin(2 * np.pi * dataframe["Quarter"] / 4)
dataframe["Quarter_cos"] = np.cos(2 * np.pi * dataframe["Quarter"] / 4)

In [20]:
dataframe["NextPurchaseDate"] = dataframe.groupby("user_id")["Date"].shift(-1)

In [21]:
dataframe["DaysUntilNextPurchase"] = (dataframe["NextPurchaseDate"] - dataframe["Date"]).dt.days

In [22]:
N_days = [7, 14, 30, 60, 90]
for n in N_days:
    dataframe[f"Purchase_in_next_{n}d"] = (dataframe["DaysUntilNextPurchase"] <= n).astype(int)

In [23]:
df = dataframe.copy()
real_prices = df[df["Price"] > 1]["Price"]

In [24]:
low_q = real_prices.quantile(0.25)
mid_q = real_prices.quantile(0.50)

In [25]:
def price_segment(price):
    if price == 1:
        return "Gift"
    elif price < low_q:
        return "Economy"
    elif price < mid_q:
        return "Middle"
    else:
        return "Premium"


dataframe["Price_Segment"] = df["Price"].apply(price_segment)

In [26]:
user_stats = (
    dataframe.groupby("user_id")
    .agg(
        avg_user_item_interval=("DaysUntilNextPurchase", "mean"),
        avg_price=("Price", "mean"),
        total_points=("countPoints", "max"),
        purchase_count=("Date", "count"),
    )
    .reset_index()
)
dataframe = dataframe.merge(user_stats, on="user_id", how="left")

In [27]:
sort_user_date = dataframe.sort_values(["user_id", "Date"])

dataframe["LastPurchaseDate"] = dataframe.groupby("user_id")["Date"].shift(1)
dataframe["DaysSinceLastPurchase"] = (dataframe["Date"] - dataframe["LastPurchaseDate"]).dt.days

dataframe["DaysSinceLastPurchase"] = dataframe["DaysSinceLastPurchase"].fillna(0)

In [28]:
freq_map = dataframe["Item"].value_counts(normalize=True)
dataframe["Item_popularity"] = dataframe["Item"].map(freq_map)

In [29]:
windows = [7, 14, 30, 60, 90]


def compute_frequency(df, days):
    freq = []
    for user_id, group in df.groupby("user_id"):
        group = group.sort_values("Date").reset_index(drop=True)
        user_counts = []
        dates = group["Date"]
        for current_idx, current_date in enumerate(dates):
            start_date = current_date - timedelta(days=days)
            count = group[
                (group["Date"] > start_date) & (group["Date"] <= current_date)
            ].shape[0]
            user_counts.append(count)
        freq.extend(user_counts)
    return freq


for w in windows:
    df[f"Frequency_{w}d"] = compute_frequency(df, w)

In [30]:
dataframe = dataframe.sort_values(['user_id', 'Date'])
dataframe['Last_Purchase_Date'] = dataframe.groupby('user_id')['Date'].shift(1)
dataframe['Days_Since_Last_Purchase'] = (dataframe['Date'] - dataframe['Last_Purchase_Date']).dt.days
dataframe['Days_Since_Last_Purchase'] = dataframe['Days_Since_Last_Purchase'].fillna(0)

In [31]:
avg_check = dataframe.groupby("user_id")["Price"].mean().reset_index()
avg_check = avg_check.rename(columns={"Price": "Average_Check"})
dataframe = dataframe.merge(avg_check, on="user_id", how="left")

In [32]:
price_pref = (
    dataframe.groupby("user_id")["Price"]
    .median()
    .reset_index(name="Price_Preference")
)
dataframe = dataframe.merge(price_pref, on="user_id", how="left")

In [33]:
category = {
    **dict.fromkeys(
        [
            "Серьги",
            "Серьга",
            "Кольцо",
            "Цепь",
            "Подвеска",
            "Браслет",
            "Колье",
            "Брошь",
            "Пирсинг",
            "Шарм",
        ],
        "Jewelries",
    ),
    **dict.fromkeys(
        [
            "Коробка универсальная черная sokolov",
            "Коробка универсальная белая sokolov",
            "Коробка красная квадратная",
            "Коробка черная квадратная",
            "Коробка красная длинная",
            "Пакет малый бумажный черный SOKOLOV",
            "Пакет малый бумажный белый SOKOLOV",
        ],
        "Packages",
    ),
    **dict.fromkeys(
        [
            "Сюрприз-бокс",
            "Сертификат",
            "Сувенир",
        ],
        "Gifts",
    ),
    **dict.fromkeys(
        [
            "Часы наручные",
            "Часы ювелирные",
            "Зажимы для галстука",
            "Запонки",
            "Брелок",
            "Булавка",
        ],
        "Accessories",
    ),
    **dict.fromkeys(["Ложка", "Вилка", "Кружка", "Столовый прибор"], "Tablewares"),
    **dict.fromkeys(["Шнур", "Удлинитель"], "Electronics"),
}

In [34]:
dataframe["Category"] = dataframe["Item"].map(category)

In [35]:
favorite_category = (
    dataframe.groupby(["user_id", "Category"])["Item"]
    .count()
    .reset_index(name="Category_Count")
)

favorite_category = favorite_category.loc[
    favorite_category.groupby("user_id")["Category_Count"].idxmax()
][["user_id", "Category"]].rename(columns={"Category": "Favorite_Category"})
dataframe = dataframe.merge(favorite_category, on="user_id", how="left")

In [36]:
dataframe['Is_Favorite_Category'] = (dataframe['Category'] == dataframe['Favorite_Category']).astype(int)

In [37]:
label_enc = LabelEncoder()

dataframe["Price_Segment"] = label_enc.fit_transform(dataframe["Price_Segment"])
print(label_enc.classes_)

['Economy' 'Gift' 'Middle' 'Premium']


In [38]:
freq_encoding = dataframe['Category'].value_counts()

dataframe['Category_freq'] = dataframe['Category'].map(freq_encoding)

In [39]:
dataframe["IsHoliday"] = dataframe["IsHoliday"].astype(int)

In [40]:
user_item_count = dataframe.groupby(["user_id","Item"]).size().reset_index(name="User_Item_Count")
dataframe = dataframe.merge(user_item_count, on=["user_id","Item"], how="left")

In [41]:
dataframe["prev_date"] = dataframe.groupby(["user_id", "Item"])["Date"].shift(1)
dataframe["days_since_last_purchase"] = (
    dataframe["Date"] - dataframe["prev_date"]
).dt.days

In [42]:
dataframe["days_since_last_purchase"] = dataframe.groupby("user_id")[
    "days_since_last_purchase"
].transform(lambda x: x.fillna(x.median()))

  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, ou

In [43]:
agg = (
    dataframe.groupby(["user_id", "Item"])["days_since_last_purchase"]
    .agg(
        days_since_last_purchase_avg="mean",
        days_since_last_purchase_max="max",
        days_since_last_purchase_min="min",
    )
    .reset_index()
)

dataframe = dataframe.merge(agg, on=["user_id", "Item"], how="left")

In [44]:
dataframe.head()

Unnamed: 0,user_id,countPoints,Item,Date,Price,Fromat_Offline,Format_Trade_In,sertificate_6_months,Ag_800,Ag_925,...,Category,Favorite_Category,Is_Favorite_Category,Category_freq,User_Item_Count,prev_date,days_since_last_purchase,days_since_last_purchase_avg,days_since_last_purchase_max,days_since_last_purchase_min
0,USER_100054,0.0,Кольцо,2024-03-30,51175.0,1.0,0.0,0.0,0.0,1.0,...,Jewelries,Jewelries,1,33562.0,1,NaT,,,,
1,USER_100070,1444.0,Сюрприз-бокс,2025-07-20,4950.0,1.0,0.0,0.0,0.0,0.0,...,Gifts,Gifts,1,2027.0,3,NaT,0.0,0.0,0.0,0.0
2,USER_100070,1444.0,Сюрприз-бокс,2025-07-20,4950.0,1.0,0.0,0.0,0.0,1.0,...,Gifts,Gifts,1,2027.0,3,2025-07-20,0.0,0.0,0.0,0.0
3,USER_100070,1444.0,Сюрприз-бокс,2025-07-20,4950.0,1.0,0.0,0.0,0.0,0.0,...,Gifts,Gifts,1,2027.0,3,2025-07-20,0.0,0.0,0.0,0.0
4,USER_100196,605.0,Серьги,2025-07-01,20172.0,1.0,0.0,0.0,0.0,0.0,...,Jewelries,Jewelries,1,33562.0,1,NaT,,,,


In [45]:
dataframe.columns

Index(['user_id', 'countPoints', 'Item', 'Date', 'Price', 'Fromat_Offline',
       'Format_Trade_In', 'sertificate_6_months', 'Ag_800', 'Ag_925', 'Au_375',
       'Au_585', 'Other_materials', 'Steal_0', 'Jewelry', 'IsHoliday', 'Month',
       'Weekday', 'Quarter', 'Month_sin', 'Month_cos', 'Weekday_sin',
       'Weekday_cos', 'Quarter_sin', 'Quarter_cos', 'NextPurchaseDate',
       'DaysUntilNextPurchase', 'Purchase_in_next_7d', 'Purchase_in_next_14d',
       'Purchase_in_next_30d', 'Purchase_in_next_60d', 'Purchase_in_next_90d',
       'Price_Segment', 'avg_user_item_interval', 'avg_price', 'total_points',
       'purchase_count', 'LastPurchaseDate', 'DaysSinceLastPurchase',
       'Item_popularity', 'Last_Purchase_Date', 'Days_Since_Last_Purchase',
       'Average_Check', 'Price_Preference', 'Category', 'Favorite_Category',
       'Is_Favorite_Category', 'Category_freq', 'User_Item_Count', 'prev_date',
       'days_since_last_purchase', 'days_since_last_purchase_avg',
       'days_si

In [46]:
dataframe['Ag_800'].unique()

array([0., 1.])

In [47]:
dataframe.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 39480 entries, 0 to 39479
Data columns (total 54 columns):
 #   Column                        Non-Null Count  Dtype         
---  ------                        --------------  -----         
 0   user_id                       39480 non-null  object        
 1   countPoints                   39480 non-null  float64       
 2   Item                          39480 non-null  object        
 3   Date                          39480 non-null  datetime64[ns]
 4   Price                         39480 non-null  float64       
 5   Fromat_Offline                39480 non-null  float64       
 6   Format_Trade_In               39480 non-null  float64       
 7   sertificate_6_months          39480 non-null  float64       
 8   Ag_800                        39480 non-null  float64       
 9   Ag_925                        39480 non-null  float64       
 10  Au_375                        39480 non-null  float64       
 11  Au_585                      

In [None]:
dataframe.drop(
    columns=[
        "prev_date",
        "Favorite_Category",
        "LastPurchaseDate",
        "Last_Purchase_Date",
    ],
    inplace=True,
)

In [None]:
dataframe.to_csv("../dataframe/prepared_data_for_predict_next_purchase.csv", index=False)