Тетрадка для обучения модели.

# Import

In [29]:
import os
import sys
from dotenv import load_dotenv
import time
from pickle import load
import warnings

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import shap


import sklearn
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import FunctionTransformer
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold


# Notebook-wide setup: environment variables, pandas output from sklearn
# transformers, unlimited column display, project root on the import path.
load_dotenv()
sklearn.set_config(transform_output="pandas")
pd.options.display.max_columns = None
ROOT_PATH = os.getenv("ROOT_PATH")
# Skip the append when ROOT_PATH is unset (it would put None on sys.path) and
# when it is already present (re-running this cell would add duplicates).
if ROOT_PATH and ROOT_PATH not in sys.path:
    sys.path.append(ROOT_PATH)
# NOTE(review): this silences every warning for the whole notebook session.
warnings.filterwarnings("ignore")

In [2]:
from src.metrics import get_smoothed_mean_log_accuracy_ratio

# Config

In [3]:
# Data directories are taken from the environment; each is None when unset.
INPUT_DATA_PATH = os.environ.get("INPUT_DATA_PATH")
OUTPUT_DATA_PATH = os.environ.get("OUTPUT_DATA_PATH")

# Classes and functions

Функции для перевода временной метки в день недели и час

In [4]:
def hour_to_weekday(hour):
    """Convert an absolute hour counter into a day-of-week index in 0..6.

    The hour is reduced to a whole-day number (24 hours per day), which is
    then wrapped into a 7-day week.
    """
    hours_per_day, days_per_week = 24, 7
    return (hour // hours_per_day) % days_per_week


def hour_to_daytime(hour):
    """Convert an absolute hour counter into the hour of the day (0..23)."""
    hours_per_day = 24
    return hour % hours_per_day

Перевод строки в формате "1,2,3,4" в формат [1, 2, 3, 4]

In [5]:
def str_to_list(X):
    """Parse comma-separated integer strings into lists of ints, column by column.

    Every cell of every column is expected to be a string such as ``"1,2,3,4"``
    and is turned into ``[1, 2, 3, 4]``.

    Args:
        X: DataFrame whose columns all hold comma-separated integer strings.

    Returns:
        A new DataFrame with the same index and columns holding the parsed
        lists. ``X`` itself is left unchanged.
    """
    # Work on a copy: overwriting the caller's frame would make a second call
    # on the same data fail, because lists have no ``split``.
    parsed = X.copy()
    for col in parsed.columns:
        parsed[col] = parsed[col].apply(
            lambda cell: [int(value) for value in cell.split(",")]
        )

    return parsed

Функции для генерации новых признаков в validate.tsv

In [6]:
def get_interval_times(mask, history):
    """Return the cosine-encoded most frequent weekday and hour of the selected rows.

    Args:
        mask: Boolean row selector for ``history``.
        history: DataFrame with ``weekday`` and ``daytime`` columns.

    Returns:
        Tuple ``(top_weekday, top_daytime)``: the modal weekday mapped through
        ``cos(2*pi*weekday/7)`` and the modal hour mapped through
        ``cos(2*pi*hour/24)``. A component is NaN when no mode exists, e.g.
        when the mask selects no rows.
    """
    users_history = history.loc[mask, ["weekday", "daytime"]]

    def _cyclic_mode(column, period):
        # mode() is empty when there is nothing to count; indexing it would raise
        modes = users_history[column].mode()
        if modes.empty:
            return np.nan
        # With several equally frequent values the first listed mode is used
        return np.cos(2 * np.pi * modes.iloc[0] / period)

    top_weekday = _cyclic_mode("weekday", 7)
    top_daytime = _cyclic_mode("daytime", 24)

    return top_weekday, top_daytime


def get_sessions_features(mask, history, max_gap_hours=6):
    """Split the selected activity into per-user sessions and summarise them.

    Each user's records are ordered by ``hour``. A record stays in the current
    session while it is at most ``max_gap_hours`` after that user's previous
    record; otherwise it opens a new session.

    Args:
        mask: Boolean row selector for ``history``.
        history: DataFrame with ``user_id``, ``hour`` and ``cpm`` columns.
        max_gap_hours: Largest gap between consecutive records of one user
            that still counts as the same session.

    Returns:
        Tuple ``(sessions_duration, num_sessions, sessions_cpm)``:
        the rounded mean session length in hours (last hour - first hour + 1),
        the total number of sessions over all users, and the mean of the
        per-session mean cpm. When no session can be built (e.g. the mask
        selects no rows) the result is ``(nan, 0, nan)``.
    """
    # Take only the activity history of interest
    users_hours = history.loc[mask, ["user_id", "hour", "cpm"]]

    # One entry per session: the hours it covers and the cpm values seen in it
    users_sessions = []
    users_cpms = []
    for _, user_data in users_hours.groupby(by="user_id"):
        user_data = user_data.sort_values(by="hour")
        hours = list(user_data["hour"])
        cpms = list(user_data["cpm"])

        current_hours = [hours[0]]
        current_cpms = [cpms[0]]
        for hour, cpm in zip(hours[1:], cpms[1:]):
            if not (hour - current_hours[-1] <= max_gap_hours):
                # Gap too large: close the running session and open a new one
                users_sessions.append(current_hours)
                users_cpms.append(current_cpms)
                current_hours = []
                current_cpms = []
            current_hours.append(hour)
            current_cpms.append(cpm)

        users_sessions.append(current_hours)
        users_cpms.append(current_cpms)

    # Nothing selected: averaging an empty list gives NaN and round(NaN) raises
    if not users_sessions:
        return np.nan, 0, np.nan

    sessions_duration = round(np.mean([np.max(
        user_sessions) - np.min(user_sessions) + 1 for user_sessions in users_sessions]))
    num_sessions = len(users_sessions)
    sessions_cpm = np.mean([np.mean(user_cpms) for user_cpms in users_cpms])

    return sessions_duration, num_sessions, sessions_cpm

In [7]:
def add_num_publishers(X):
    """Count the publishers listed in each row.

    ``X["publishers"]`` is expected to hold a sized collection per row
    (e.g. a list); the result is a Series with the length of each one.
    """
    publishers = X["publishers"]
    return publishers.apply(len)


def add_used_publishers_ratio(X, history):
    """Share of an ad's publishers that its target users typically appear on.

    For every row of ``X`` the history is restricted to the row's users and
    publishers. The number of distinct publishers is counted per user, the most
    common count is taken and divided by the number of publishers in the row.

    Args:
        X: DataFrame with list-valued ``user_ids`` and ``publishers`` columns.
        history: DataFrame with ``user_id`` and ``publisher`` columns.

    Returns:
        DataFrame with one column, ``used_publishers_ration``, indexed like
        ``X``. A row with no matching history gets NaN.
    """
    ratios = []
    for user_ids, publishers in zip(X["user_ids"], X["publishers"]):
        # Keep the records of the ad's users on the ad's publishers
        mask = history["user_id"].isin(user_ids) & history["publisher"].isin(publishers)
        users_publishers = history.loc[mask, ["publisher", "user_id"]]
        if users_publishers.empty:
            # No history to count: the mode below would be empty and indexing it would raise
            ratios.append(np.nan)
            continue

        # Distinct publishers per user, then the most common such count
        publishers_per_user = users_publishers.groupby("user_id")["publisher"].apply(
            lambda pubs: len(np.unique(pubs))
        )
        modes = publishers_per_user.mode()
        if modes.empty:
            ratios.append(np.nan)
            continue

        ratios.append(modes.iloc[0] / len(publishers))

    # Reuse X's index so the feature lines up with X when frames are concatenated
    return pd.DataFrame({"used_publishers_ration": ratios}, index=X.index)


def add_top_intervals(X, history):
    """Add the most frequent activity weekday/hour of each ad's users.

    For every row of ``X`` two selections of ``history`` are passed to
    ``get_interval_times``: all records of the row's users, and those records
    restricted to the row's publishers.

    Args:
        X: DataFrame with list-valued ``user_ids`` and ``publishers`` columns.
        history: DataFrame with ``user_id`` and ``publisher`` columns, plus
            whatever ``get_interval_times`` reads.

    Returns:
        DataFrame indexed like ``X`` with columns ``top_weekday``,
        ``top_daytime``, ``top_weekday_on_pubs`` and ``top_daytime_on_pubs``.
    """
    new_feature = {
        "top_weekday": [],
        "top_daytime": [],
        "top_weekday_on_pubs": [],
        "top_daytime_on_pubs": []
    }
    for user_ids, publishers in zip(X["user_ids"], X["publishers"]):
        # The user selection is shared by both variants, so build it once
        users_mask = history["user_id"].isin(user_ids)
        top_weekday, top_daytime = get_interval_times(users_mask, history)

        pubs_mask = users_mask & history["publisher"].isin(publishers)
        top_weekday_on_pubs, top_daytime_on_pubs = get_interval_times(pubs_mask, history)

        new_feature["top_weekday"].append(top_weekday)
        new_feature["top_daytime"].append(top_daytime)
        new_feature["top_weekday_on_pubs"].append(top_weekday_on_pubs)
        new_feature["top_daytime_on_pubs"].append(top_daytime_on_pubs)

    # Reuse X's index so the features line up with X when frames are concatenated
    return pd.DataFrame(new_feature, index=X.index)


def add_session_features(X, history):
    """Add session statistics of each ad's users.

    For every row of ``X`` two selections of ``history`` are passed to
    ``get_sessions_features``: all records of the row's users, and those
    records restricted to the row's publishers.

    Args:
        X: DataFrame with list-valued ``user_ids`` and ``publishers`` columns.
        history: DataFrame with ``user_id`` and ``publisher`` columns, plus
            whatever ``get_sessions_features`` reads.

    Returns:
        DataFrame indexed like ``X`` with columns ``sessions_duration``,
        ``num_sessions`` and ``sessions_cpm``, each also in an ``_on_pubs``
        variant.
    """
    new_feature = {
        "sessions_duration": [],
        "sessions_duration_on_pubs": [],

        "num_sessions": [],
        "num_sessions_on_pubs": [],

        "sessions_cpm": [],
        "sessions_cpm_on_pubs": []
    }
    for user_ids, publishers in zip(X["user_ids"], X["publishers"]):
        # On all publishers; the user selection is reused for the second variant
        users_mask = history["user_id"].isin(user_ids)
        sessions_duration, num_sessions, sessions_cpm = get_sessions_features(users_mask, history)
        # On the ad's publishers only
        pubs_mask = users_mask & history["publisher"].isin(publishers)
        sessions_duration_on_pubs, num_sessions_on_pubs, sessions_cpm_on_pubs = get_sessions_features(pubs_mask, history)

        new_feature["sessions_duration"].append(sessions_duration)
        new_feature["sessions_duration_on_pubs"].append(sessions_duration_on_pubs)

        new_feature["num_sessions"].append(num_sessions)
        new_feature["num_sessions_on_pubs"].append(num_sessions_on_pubs)

        new_feature["sessions_cpm"].append(sessions_cpm)
        new_feature["sessions_cpm_on_pubs"].append(sessions_cpm_on_pubs)

    # Reuse X's index so the features line up with X when frames are concatenated
    return pd.DataFrame(new_feature, index=X.index)


def add_views(X, history):
    """Count history records of each ad's users, overall and on the ad's publishers.

    Args:
        X: DataFrame with list-valued ``user_ids`` and ``publishers`` columns.
        history: DataFrame with ``user_id`` and ``publisher`` columns.

    Returns:
        DataFrame indexed like ``X`` with ``num_views`` (records of the row's
        users) and ``num_views_on_pubs`` (those records whose publisher is
        also in the row's publishers).
    """
    new_feature = {
        "num_views": [],
        "num_views_on_pubs": []
    }
    for user_ids, publishers in zip(X["user_ids"], X["publishers"]):
        # The user selection is shared by both counts, so build it once
        users_mask = history["user_id"].isin(user_ids)
        pubs_mask = users_mask & history["publisher"].isin(publishers)

        new_feature["num_views"].append(users_mask.sum())
        new_feature["num_views_on_pubs"].append(pubs_mask.sum())

    # Reuse X's index so the features line up with X when frames are concatenated
    return pd.DataFrame(new_feature, index=X.index)


def add_times(X):
    """Derive weekday, hour-of-day and duration features from the ad's time window.

    Args:
        X: DataFrame with ``hour_start`` and ``hour_end`` columns holding
            absolute hour counters.

    Returns:
        A new DataFrame with the columns of ``X`` followed by
        ``weekday_start``, ``weekday_end``, ``daytime_start``, ``daytime_end``
        and ``duration`` (``hour_end - hour_start``). ``X`` is not modified.
    """
    start, end = X["hour_start"], X["hour_end"]
    # assign() builds a new frame, so the caller's data is left as it was
    return X.assign(
        weekday_start=start // 24 % 7,
        weekday_end=end // 24 % 7,
        daytime_start=start % 24,
        daytime_end=end % 24,
        duration=end - start,
    )


def add_publishers_data(X, history):
    """Summarise the cpm of each ad's users on the ad's publishers.

    Args:
        X: DataFrame with list-valued ``user_ids`` and ``publishers`` columns.
        history: DataFrame with ``user_id``, ``publisher`` and ``cpm`` columns.

    Returns:
        DataFrame indexed like ``X`` with ``publishers_mean_cpm_bid`` (mean
        cpm of the matching records) and ``publishers_IQR_cpm_bid`` (their
        0.75 quantile minus their 0.25 quantile).
    """
    new_features = {
        "publishers_mean_cpm_bid": [],
        "publishers_IQR_cpm_bid": []
    }
    for user_ids, publishers in zip(X["user_ids"], X["publishers"]):
        mask = history["user_id"].isin(user_ids) & history["publisher"].isin(publishers)
        cpm = history.loc[mask, "cpm"]

        new_features["publishers_mean_cpm_bid"].append(cpm.mean())
        new_features["publishers_IQR_cpm_bid"].append(cpm.quantile(0.75) - cpm.quantile(0.25))

    # Reuse X's index so the features line up with X when frames are concatenated
    return pd.DataFrame(new_features, index=X.index)

Функция для подсчета метрики

In [8]:
def get_smoothed_log_mape_column_value(responses_column, answers_column, epsilon):
    """Mean absolute log-ratio between smoothed predictions and smoothed targets.

    Both columns are shifted by ``epsilon`` before the ratio is taken.
    """
    smoothed_ratio = (responses_column + epsilon) / (answers_column + epsilon)
    abs_log_ratio = np.abs(np.log(smoothed_ratio))
    return abs_log_ratio.mean()


def get_smoothed_mean_log_accuracy_ratio_np(answers, responses, epsilon=0.005):
    """Smoothed mean log accuracy ratio over three target columns, as a percentage.

    ``answers`` is converted with ``to_numpy()`` and reshaped to
    ``(len(responses), 3)``; ``responses`` is indexed as a 2-D array. The
    per-column smoothed log error is averaged over the three columns, mapped
    through ``100 * (exp(x) - 1)`` and rounded to two decimals.
    """
    n_targets = 3
    answers = answers.to_numpy().reshape(len(responses), n_targets)

    per_target_errors = [
        get_smoothed_log_mape_column_value(responses[:, col], answers[:, col], epsilon)
        for col in range(n_targets)
    ]
    log_accuracy_ratio_mean = np.array(per_target_errors).mean()

    percentage_error = 100 * (np.exp(log_accuracy_ratio_mean) - 1)
    return percentage_error.round(decimals=2)

Функции для визуализации

In [9]:
def get_shap_values(model, X_test, y_test, save_dir_path=None):
    """Draw SHAP bar and beeswarm plots for every target of a multi-output model.

    For the i-th column of ``y_test`` an explainer is built from
    ``model.estimators_[i]`` and evaluated on ``X_test``. Each figure is saved
    into ``save_dir_path`` when it is given, and closed either way.
    Nothing is returned.
    """
    n_features = len(X_test.columns)

    def _finish_figure(file_name):
        # Shared tail of both plots: fix the y-range, optionally save, close
        plt.ylim(0.5, X_test.shape[1] + 1)
        if save_dir_path:
            plt.savefig(os.path.join(save_dir_path, file_name), dpi=300, bbox_inches="tight")

        plt.close()

    # One explainer per target variable
    for idx, target_name in enumerate(y_test.columns):
        explainer = shap.Explainer(model.estimators_[idx])
        shap_values = explainer(X_test)

        # Bar plot of the SHAP values
        plt.figure(figsize=(8, 6))
        shap.plots.bar(shap_values, show=False, max_display=n_features)
        _finish_figure(f"barplot_{target_name}.png")

        # Beeswarm plot of the same values
        plt.figure(figsize=(8, 6))
        shap.plots.beeswarm(shap_values, show=False, max_display=n_features)
        _finish_figure(f"beeswarm_{target_name}.png")

Функция для кросс-валидации

In [10]:
def ccross_val(model, X, y, verbose=True):
    """Run 5-fold shuffled cross-validation and collect timings and the metric.

    On every fold the model is fitted on the training part, used to predict
    the held-out part, negative predictions are set to zero and the result is
    scored with ``get_smoothed_mean_log_accuracy_ratio_np``.

    Args:
        model: Estimator with ``fit`` and ``predict``. It is fitted in place,
            so after the call it holds the fit of the last fold.
        X: Feature DataFrame (rows are selected with ``iloc``).
        y: Target DataFrame aligned with ``X`` by position.
        verbose: When true, print fold sizes and the mean timings and metric.

    Returns:
        Dict with lists ``train_times``, ``inference_times`` and ``metrics``,
        one entry per fold.
    """
    cv = KFold(n_splits=5, shuffle=True, random_state=42)
    scores = {
        "train_times": [],
        "inference_times": [],
        "metrics": []
    }
    for train_idx, test_idx in cv.split(X):
        # Fit the model; perf_counter is a monotonic clock meant for measuring intervals
        start = time.perf_counter()

        model.fit(X.iloc[train_idx], y.iloc[train_idx])

        train_time = time.perf_counter() - start
        # Predict the held-out fold
        start = time.perf_counter()

        y_pred = model.predict(X.iloc[test_idx])

        inference_time = time.perf_counter() - start
        # Post-processing: negative predictions are replaced with zero
        y_pred[y_pred < 0.] = 0.
        # Score the fold
        metric = get_smoothed_mean_log_accuracy_ratio_np(y.iloc[test_idx], y_pred)

        scores["train_times"].append(train_time)
        scores["inference_times"].append(inference_time)
        scores["metrics"].append(metric)

    if verbose:
        # Fold sizes are those of the last split
        print(f"Train batch size: {len(train_idx)}\nTest batch size: {len(test_idx)}")
        print(f"Mean train time: {round(np.mean(scores['train_times']), 2)} s")
        print(f"Mean inference time: {round(np.mean(scores['inference_times']), 2)} s")
        print(f"Mean metric: {round(np.mean(scores['metrics']), 2)}")

    # Hand the per-fold numbers back so callers can use them, not just read the printout
    return scores

# Load data

### load

Загружаем данные. В целях экономии ресурсов мы загружаем предобработанные users и history.

In [15]:
# Preprocessed user/history tables are read from the output directory,
# the validation set and its answers from the input directory.
ext_user_ds_path, ext_history_ds_path = (
    os.path.join(OUTPUT_DATA_PATH, file_name)
    for file_name in ("ext_users.csv", "ext_history.csv")
)
validate_ds_path, validate_answers_ds_path = (
    os.path.join(INPUT_DATA_PATH, file_name)
    for file_name in ("validate.tsv", "validate_answers.tsv")
)

# Comma-separated files first, then the tab-separated ones
ext_user_ds, ext_history_ds = (
    pd.read_csv(csv_path) for csv_path in (ext_user_ds_path, ext_history_ds_path)
)
validate_ds, validate_answers_ds = (
    pd.read_csv(tsv_path, sep="\t")
    for tsv_path in (validate_ds_path, validate_answers_ds_path)
)

### preprocess

Делаем предобработку данных и добавляем новые признаки.

In [12]:
# Stage 1: run str_to_list over the "publishers" and "user_ids" columns,
# turning their comma-separated id strings into lists of ints.
ct = make_column_transformer(
    (FunctionTransformer(str_to_list), ["publishers", "user_ids"]),

    # All other columns are forwarded unchanged
    remainder='passthrough',
    # Output columns keep their plain names (no transformer-name prefix)
    verbose_feature_names_out=False
)

# Stage 2: feature engineering on top of the parsed lists.
# Each step wraps one add_* function in a FunctionTransformer and declares the
# names of the columns it produces; the steps that need the activity history
# receive it through kw_args.
# NOTE(review): the step order fixes the column order of the result (see the
# header of the captured output) — presumably the pickled model expects this
# exact layout; confirm before reordering.
fe_ct = ColumnTransformer([
    # Number of publishers per ad
    ("add_num_publishers",
     FunctionTransformer(func=add_num_publishers,
                         feature_names_out=lambda *_: ["num_publishers"]),
     ["publishers"]),

     # Typical share of the ad's publishers used by its users
     ("add_used_publishers_ratio",
     FunctionTransformer(func=add_used_publishers_ratio,
                         feature_names_out=lambda *_: ["used_publishers_ration"],
                         kw_args={"history": ext_history_ds}),
     ["user_ids", "publishers"]),

     # Most frequent weekday / hour of the users' activity
     ("add_top_intervals",
     FunctionTransformer(func=add_top_intervals,
                         feature_names_out=lambda *_: ["top_weekday", "top_daytime", "top_weekday_on_pubs", "top_daytime_on_pubs"],
                         kw_args={"history": ext_history_ds}),
    ["user_ids", "publishers"]),

    # Session statistics (by far the slowest step in the captured run log)
    ("add_session_features",
     FunctionTransformer(func=add_session_features,
                         feature_names_out=lambda *_: ["sessions_duration", "sessions_duration_on_pubs", "num_sessions", "num_sessions_on_pubs", "sessions_cpm", "sessions_cpm_on_pubs"],
                         kw_args={"history": ext_history_ds}),
    ["user_ids", "publishers"]),
     
    # View counts, overall and on the ad's publishers
    ("add_views",
     FunctionTransformer(func=add_views,
                         feature_names_out=lambda *_: ["num_views", "num_views_on_pubs"],
                         kw_args={"history": ext_history_ds}),
    ["user_ids", "publishers"]),
    # Weekday / hour / duration of the ad's time window; the two input columns are kept
    ("add_times",
     FunctionTransformer(func=add_times,
                         feature_names_out=lambda *_: ["hour_start", "hour_end", "weekday_start", "weekday_end", "daytime_start", "daytime_end", "duration"]),
    ["hour_start", "hour_end"]),
    # Mean and interquartile range of cpm on the ad's publishers
    ("add_publishers_data",
     FunctionTransformer(func=add_publishers_data,
                         feature_names_out=lambda *_: ["publishers_mean_cpm_bid", "publishers_IQR_cpm_bid"],
                         kw_args={"history": ext_history_ds}),
     ["user_ids", "publishers"])
    ],
    # Columns not consumed by any step are forwarded unchanged
    remainder='passthrough',
    # Steps are run as parallel jobs
    n_jobs=12,
    verbose=True,
    verbose_feature_names_out=False
)

In [16]:
# First parse the list columns, then build the engineered features on top of them
parsed_validate_ds = ct.fit_transform(validate_ds)
ext_validate_ds = fe_ct.fit_transform(parsed_validate_ds)
ext_validate_ds.head()

[ColumnTransformer]  (1 of 8) Processing add_num_publishers, total=   0.0s
[ColumnTransformer] ..... (6 of 8) Processing add_times, total=   0.0s
[ColumnTransformer] ..... (8 of 8) Processing remainder, total=   0.0s
[ColumnTransformer]  (7 of 8) Processing add_publishers_data, total=  35.9s
[ColumnTransformer] ..... (5 of 8) Processing add_views, total=  50.6s
[ColumnTransformer]  (3 of 8) Processing add_top_intervals, total=  59.1s
[ColumnTransformer]  (2 of 8) Processing add_used_publishers_ratio, total= 1.0min
[ColumnTransformer]  (4 of 8) Processing add_session_features, total=15.0min


Unnamed: 0,num_publishers,used_publishers_ration,top_weekday,top_daytime,top_weekday_on_pubs,top_daytime_on_pubs,sessions_duration,sessions_duration_on_pubs,num_sessions,num_sessions_on_pubs,sessions_cpm,sessions_cpm_on_pubs,num_views,num_views_on_pubs,hour_start,hour_end,weekday_start,weekday_end,daytime_start,daytime_end,duration,publishers_mean_cpm_bid,publishers_IQR_cpm_bid,cpm,audience_size
0,2,0.5,-0.900969,0.258819,1.0,-0.965926,2,2,37327,2661,221.736712,304.767717,77321,4598,1058,1153,2,6,2,1,95,253.183282,197.415,220.0,1906
1,2,0.5,-0.222521,0.258819,-0.900969,0.5,3,3,70595,3922,182.222638,189.190416,178007,10967,1295,1301,4,5,23,5,6,173.083709,151.78,312.0,1380
2,6,0.166667,-0.900969,0.5,0.62349,0.5,2,2,18025,16165,231.532268,232.619346,39052,34691,1229,1249,2,3,5,1,20,197.274887,150.0,70.0,888
3,2,0.5,-0.222521,-0.965926,-0.222521,0.258819,2,2,8304,5054,215.579714,224.737393,17013,10146,1295,1377,4,1,23,9,82,195.917368,160.855,240.0,440
4,4,0.25,-0.222521,0.5,1.0,0.5,2,2,25286,20591,266.37006,272.764926,52434,42564,752,990,3,6,8,6,238,234.729084,188.665,262.0,1476


# XGBRegressor

### setup

Инициализируем модель. Ниже представлена используемая модель с параметрами.

In [None]:
# xgb = XGBRegressor(
#     device="cuda",
#     random_state=42,

#     n_estimators=1000,
#     learning_rate=0.05,
#     gamma=0,
#     max_depth=6,
#     reg_lambda=1,
#     tree_method="hist",

#     objective="reg:gamma"
# )

# model = MultiOutputRegressor(
#     estimator=xgb,
#     n_jobs=8
# )

In [32]:
# Load the previously trained model from the output directory.
# NOTE(review): unpickling executes code stored in the file — only load
# model files produced by this project.
model_path = os.path.join(OUTPUT_DATA_PATH, "xgbr_cek2.pkl")
with open(model_path, "rb") as model_file:
    MODEL = load(model_file)

In [33]:
# Hold out 20% of the validation ads with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    ext_validate_ds,
    validate_answers_ds,
    test_size=0.2,
    random_state=42,
)

### evaluate

Запускаем модель и считаем метрику качества.

In [39]:
# Predict the held-out ads, replace negative predictions with zero, print the metric
y_pred = MODEL.predict(X_test)
negative_predictions = y_pred < 0.
y_pred[negative_predictions] = 0.
holdout_metric = get_smoothed_mean_log_accuracy_ratio_np(y_test, y_pred)
print(holdout_metric)

29.47


Визуализация SHAP значений.

In [None]:
# Make sure the plot directory exists, then render the SHAP plots into it
save_dir_path = os.path.join(OUTPUT_DATA_PATH, "plots/shap")
os.makedirs(save_dir_path, exist_ok=True)
get_shap_values(
    model=MODEL,
    X_test=X_test,
    y_test=y_test,
    save_dir_path=save_dir_path,
)

Прогон модели на фолдах.

In [40]:
# Cross-validate on the full feature table; the targets are shifted by 0.0005
ccross_val(
    model=MODEL,
    X=ext_validate_ds,
    y=validate_answers_ds + 0.0005,
)

Train batch size: 807
Test batch size: 201
Mean train time: 9.56 s
Mean inference time: 0.08 s
Mean metric: 25.91
