Тетрадка для обучения модели.

# Import

In [78]:
import os
import sys
from dotenv import load_dotenv
from tqdm import tqdm
import time
from pickle import dump, load
import warnings

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import shap
import ast
from scipy import stats

import sklearn
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import FunctionTransformer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import TimeSeriesSplit
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.metrics import make_scorer
from sklearn.model_selection import KFold
from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import HalvingRandomSearchCV

from xgboost import XGBRegressor, DeviceQuantileDMatrix


# Load variables from a .env file into the process environment.
load_dotenv()
# Ask scikit-learn transformers to return pandas objects from transform().
sklearn.set_config(transform_output="pandas")
# Do not truncate columns when a DataFrame is displayed.
pd.options.display.max_columns = None
# ROOT_PATH is appended to sys.path so that project modules (e.g. src.metrics) can be imported.
# NOTE(review): if ROOT_PATH is not set, None is appended to sys.path — confirm the .env file defines it.
ROOT_PATH = os.getenv("ROOT_PATH")
sys.path.append(ROOT_PATH)
# NOTE(review): this silences every warning for the rest of the session.
warnings.filterwarnings("ignore")

In [3]:
from src.metrics import get_smoothed_mean_log_accuracy_ratio

# Config

In [4]:
# Directories for raw inputs and generated artifacts, taken from environment variables.
# Both are None if the corresponding variable is not set.
INPUT_DATA_PATH = os.getenv("INPUT_DATA_PATH")
OUTPUT_DATA_PATH = os.getenv("OUTPUT_DATA_PATH")

# Classes and functions

Функции для перевода временной метки в день недели и час

In [5]:
def hour_to_weekday(hour):
    """Convert an hour counter into a weekday index in 0..6.

    The hour is first reduced to a whole-day number (24 hours per day) and
    the day number is then wrapped modulo 7.
    """
    return (hour // 24) % 7


def hour_to_daytime(hour):
    """Convert an hour counter into the hour of the day (0..23)."""
    daytime = hour % 24
    return daytime

Перевод строки в формате "1,2,3,4" в формат [1, 2, 3, 4]

In [6]:
def str_to_list(X):
    """Parse comma-separated integer strings into lists of ints.

    Every column of ``X`` is expected to hold strings such as ``"1,2,3,4"``;
    each cell becomes ``[1, 2, 3, 4]``.

    Args:
        X: DataFrame whose columns all contain comma-separated integers.

    Returns:
        A new DataFrame with the same index and columns holding lists of
        ints. ``X`` itself is left untouched, so the transformer can be
        applied repeatedly to the same raw frame.
    """
    # Work on a copy: assigning into X would overwrite the caller's data
    # (and a second call would then fail on list.split).
    parsed = X.copy()
    for col in parsed.columns:
        parsed[col] = parsed[col].apply(lambda x: [int(v) for v in x.split(",")])

    return parsed

Функции для генерации новых признаков в validate.tsv

In [None]:
def get_interval_times(mask, history):
    """Return cosine-encoded most frequent weekday and hour of day.

    Args:
        mask: Boolean selector for the rows of ``history`` to consider.
        history: DataFrame with ``weekday`` and ``daytime`` columns.

    Returns:
        Tuple ``(top_weekday, top_daytime)``: the mode of each column mapped
        through ``cos(2*pi*value/period)`` with periods 7 and 24. When
        several values tie, the smallest one is used (first entry of
        ``Series.mode``). If the mask selects no rows, ``(nan, nan)`` is
        returned instead of raising.
    """
    users_history = history.loc[mask, ["weekday", "daytime"]]
    # mode() of an empty selection is empty and [0] would raise.
    if users_history.empty:
        return np.nan, np.nan

    top_weekday = np.cos(2 * np.pi * users_history["weekday"].mode()[0] / 7)
    top_daytime = np.cos(2 * np.pi * users_history["daytime"].mode()[0] / 24)

    return top_weekday, top_daytime


def get_sessions_features(mask, history, max_gap=6):
    """Aggregate per-user viewing sessions for the selected history rows.

    A user's views are ordered by hour; a new session starts whenever the
    gap to that user's previous view exceeds ``max_gap`` hours.

    Args:
        mask: Boolean selector for the rows of ``history`` to consider.
        history: DataFrame with ``user_id``, ``hour`` and ``cpm`` columns.
        max_gap: Largest gap in hours between consecutive views that still
            belongs to the same session (default 6, the original constant).

    Returns:
        Tuple ``(sessions_duration, num_sessions, sessions_cpm)``:
        the rounded mean session length in hours (last hour - first hour + 1),
        the total number of sessions across all users, and the mean of the
        per-session mean cpm. When no rows are selected the result is
        ``(nan, 0, nan)`` instead of a ValueError from ``round(nan)``.
    """
    users_hours = history.loc[mask, ["user_id", "hour", "cpm"]]

    durations = []
    mean_cpms = []
    for _, user_data in users_hours.groupby(by="user_id"):
        user_data = user_data.sort_values(by="hour")
        session_hours = []
        session_cpms = []
        for hour, cpm in zip(user_data["hour"], user_data["cpm"]):
            # Close the current session when the gap to the previous view is too large.
            if session_hours and not hour - session_hours[-1] <= max_gap:
                # Hours are sorted ascending, so first/last are min/max.
                durations.append(session_hours[-1] - session_hours[0] + 1)
                mean_cpms.append(np.mean(session_cpms))
                session_hours = []
                session_cpms = []
            session_hours.append(hour)
            session_cpms.append(cpm)

        # Flush the user's last (still open) session.
        durations.append(session_hours[-1] - session_hours[0] + 1)
        mean_cpms.append(np.mean(session_cpms))

    if not durations:
        return np.nan, 0, np.nan

    sessions_duration = round(np.mean(durations))
    num_sessions = len(durations)
    sessions_cpm = np.mean(mean_cpms)

    return sessions_duration, num_sessions, sessions_cpm

In [None]:
def add_num_publishers(X):
    """Return, for each row, the length of the list stored in ``X["publishers"]``."""
    publishers = X["publishers"]
    return publishers.apply(len)


def add_used_publishers_ratio(X, history):
    """Share of a campaign's publishers that its users typically visited.

    For every row of ``X`` the history is restricted to the row's users and
    publishers; for each user the number of distinct publishers is counted,
    the most common count is taken and divided by the number of publishers
    in the row.

    Args:
        X: DataFrame with list-valued ``user_ids`` and ``publishers`` columns.
        history: DataFrame with ``user_id`` and ``publisher`` columns.

    Returns:
        DataFrame with the single column ``used_publishers_ration``, indexed
        like ``X``. Rows whose users have no matching history get ``0.0``.
    """
    ratios = []
    for _, row in X.iterrows():
        # Keep history rows that match both the row's users and its publishers.
        mask = (history["user_id"].isin(row["user_ids"])) & (history["publisher"].isin(row["publishers"]))
        # Number of distinct matching publishers seen by each user.
        used_per_user = history.loc[mask].groupby("user_id")["publisher"].nunique()
        if used_per_user.empty:
            # No user touched any of the publishers; mode()[0] would raise here.
            ratios.append(0.0)
            continue

        n_used_publishers = used_per_user.mode()[0]
        ratios.append(n_used_publishers / len(row["publishers"]))

    # Reuse X's index so the feature stays aligned with the input rows.
    return pd.DataFrame({"used_publishers_ration": ratios}, index=X.index)


def add_top_intervals(X, history):
    """Most frequent weekday/hour features for each campaign row.

    For every row of ``X`` the features are computed twice with
    ``get_interval_times``: over the whole history of the row's users, and
    over that history restricted to the row's publishers (``*_on_pubs``).

    Args:
        X: DataFrame with list-valued ``user_ids`` and ``publishers`` columns.
        history: DataFrame with ``user_id`` and ``publisher`` columns plus
            whatever columns ``get_interval_times`` reads.

    Returns:
        DataFrame with columns ``top_weekday``, ``top_daytime``,
        ``top_weekday_on_pubs`` and ``top_daytime_on_pubs``, indexed like ``X``.
    """
    new_feature = {
        "top_weekday": [],
        "top_daytime": [],
        "top_weekday_on_pubs": [],
        "top_daytime_on_pubs": []
    }
    for _, row in X.iterrows():
        # The user mask is shared by both selections, so compute it once.
        users_mask = history["user_id"].isin(row["user_ids"])
        top_weekday, top_daytime = get_interval_times(users_mask, history)

        pubs_mask = users_mask & (history["publisher"].isin(row["publishers"]))
        top_weekday_on_pubs, top_daytime_on_pubs = get_interval_times(pubs_mask, history)

        new_feature["top_weekday"].append(top_weekday)
        new_feature["top_daytime"].append(top_daytime)
        new_feature["top_weekday_on_pubs"].append(top_weekday_on_pubs)
        new_feature["top_daytime_on_pubs"].append(top_daytime_on_pubs)

    # Reuse X's index so the features stay aligned with the input rows.
    return pd.DataFrame(new_feature, index=X.index)


def add_session_features(X, history):
    """Session statistics of each campaign's users on its publishers.

    For every row of ``X`` the history is restricted to the row's users and
    publishers and passed to ``get_sessions_features``.

    Args:
        X: DataFrame with list-valued ``user_ids`` and ``publishers`` columns.
        history: DataFrame with ``user_id`` and ``publisher`` columns plus
            whatever columns ``get_sessions_features`` reads.

    Returns:
        DataFrame with columns ``sessions_duration_on_pubs``,
        ``num_sessions_on_pubs`` and ``sessions_cpm_on_pubs``, indexed like ``X``.
    """
    new_feature = {
        "sessions_duration_on_pubs": [],
        "num_sessions_on_pubs": [],
        "sessions_cpm_on_pubs": []
    }
    for _, row in X.iterrows():
        mask = (history["user_id"].isin(row["user_ids"])) & (history["publisher"].isin(row["publishers"]))
        duration, num_sessions, mean_cpm = get_sessions_features(mask, history)

        new_feature["sessions_duration_on_pubs"].append(duration)
        new_feature["num_sessions_on_pubs"].append(num_sessions)
        new_feature["sessions_cpm_on_pubs"].append(mean_cpm)

    # Reuse X's index so the features stay aligned with the input rows.
    return pd.DataFrame(new_feature, index=X.index)


def add_views(X, history):
    """Count history rows (views) for each campaign row.

    Args:
        X: DataFrame with list-valued ``user_ids`` and ``publishers`` columns.
        history: DataFrame with ``user_id`` and ``publisher`` columns; each
            row is counted as one view.

    Returns:
        DataFrame indexed like ``X`` with ``num_views`` (rows of the row's
        users) and ``num_views_on_pubs`` (the same, restricted to the row's
        publishers).
    """
    new_feature = {
        "num_views": [],
        "num_views_on_pubs": []
    }
    for _, row in X.iterrows():
        # The user mask is needed for both counts, so compute it once.
        users_mask = history["user_id"].isin(row["user_ids"])
        pubs_mask = history["publisher"].isin(row["publishers"])

        new_feature["num_views"].append(users_mask.sum())
        new_feature["num_views_on_pubs"].append((users_mask & pubs_mask).sum())

    # Reuse X's index so the features stay aligned with the input rows.
    return pd.DataFrame(new_feature, index=X.index)


def add_times(X):
    """Return the campaign duration: ``hour_end`` minus ``hour_start`` per row.

    Only the duration is produced; weekday/daytime start and end features
    were tried earlier and are no longer generated.
    """
    start = X["hour_start"]
    end = X["hour_end"]
    return end - start


def add_publishers_data(X, history):
    """Mean and interquartile range of cpm for each campaign row.

    For every row of ``X`` the history is restricted to the row's users and
    publishers, and the ``cpm`` values of that slice are summarised.

    Args:
        X: DataFrame with list-valued ``user_ids`` and ``publishers`` columns.
        history: DataFrame with ``user_id``, ``publisher`` and ``cpm`` columns.

    Returns:
        DataFrame indexed like ``X`` with ``publishers_mean_cpm_bid`` and
        ``publishers_IQR_cpm_bid`` (75th minus 25th percentile). Both are NaN
        for rows with no matching history.
    """
    new_features = {
        "publishers_mean_cpm_bid": [],
        "publishers_IQR_cpm_bid": []
    }
    for _, row in X.iterrows():
        mask = (history["user_id"].isin(row["user_ids"])) & (history["publisher"].isin(row["publishers"]))
        cpms = history.loc[mask, "cpm"]

        new_features["publishers_mean_cpm_bid"].append(cpms.mean())
        new_features["publishers_IQR_cpm_bid"].append(cpms.quantile(0.75) - cpms.quantile(0.25))

    # Reuse X's index so the features stay aligned with the input rows.
    return pd.DataFrame(new_features, index=X.index)


def add_users_data(X, users, history):
    """Demographic view statistics for each campaign row.

    For every row of ``X`` the users table is joined with the history of the
    row's users on the row's publishers, and weekly aggregates are computed.

    Args:
        X: DataFrame with list-valued ``user_ids`` and ``publishers`` columns.
        users: DataFrame with ``user_id``, ``sex`` and ``age`` columns.
        history: DataFrame with ``user_id``, ``publisher``, ``cpm`` and
            ``week`` columns.

    Returns:
        DataFrame indexed like ``X`` with:
        ``views_mean_week_male`` / ``views_mean_week_female`` — mean number of
        history rows per week for users with ``sex == 0`` / ``sex == 1``;
        ``age_bins_cpv_week_mean`` — mean over age groups of
        (mean weekly view count) / (mean weekly cpm).
    """
    new_features = {
        "views_mean_week_male": [],
        "views_mean_week_female": [],
        "age_bins_cpv_week_mean": []
    }
    for _, row in X.iterrows():
        users_slice = users[users["user_id"].isin(row["user_ids"])]
        mask = history["user_id"].isin(row["user_ids"]) & history["publisher"].isin(row["publishers"])
        history_slice = history.loc[mask]
        # Join the users with their history; users without history rows are dropped by dropna.
        merged_df = users_slice.merge(history_slice, on="user_id", how="left")
        merged_df = merged_df.dropna()

        # Mean number of rows (views) per week for each sex value.
        views_mean_week_male = merged_df[merged_df["sex"] == 0].groupby(['week'], observed=True)['cpm'].size().mean()
        views_mean_week_female = merged_df[merged_df["sex"] == 1].groupby(['week'], observed=True)['cpm'].size().mean()

        new_features["views_mean_week_male"].append(views_mean_week_male)
        new_features["views_mean_week_female"].append(views_mean_week_female)

        # Split users into 5 age groups.
        merged_df['age_group'] = pd.cut(merged_df['age'], bins=[0, 18, 25, 35, 50, 90], labels=['0-18', '19-25', '26-35', '36-50', '50+'])
        # Per age group: weekly mean cpm and weekly view count, each averaged over weeks.
        age_bins_cpm_week_mean = merged_df.groupby(['age_group', 'week'], observed=True)['cpm'].mean().reset_index().groupby('age_group', observed=True)['cpm'].mean()
        age_bins_views_week_mean = merged_df.groupby(['age_group', 'week'], observed=True).size().reset_index(name='size').groupby('age_group', observed=True)['size'].mean()
        # NOTE(review): the value is views / cpm, although the name reads like "cost per view" — confirm the intended direction.
        cpv = age_bins_views_week_mean / age_bins_cpm_week_mean
        cpv = cpv.fillna(0)

        new_features["age_bins_cpv_week_mean"].append(cpv.mean())

    # Reuse X's index so the features stay aligned with the input rows.
    return pd.DataFrame(new_features, index=X.index)


Функция для подсчета метрики

In [95]:
def get_smoothed_log_mape_column_value(responses_column, answers_column, epsilon):
    """Mean absolute log of the smoothed ratio between responses and answers.

    ``epsilon`` is added to both numerator and denominator before the ratio
    is taken.
    """
    smoothed_ratio = (responses_column + epsilon) / (answers_column + epsilon)
    abs_log_ratio = np.abs(np.log(smoothed_ratio))
    return abs_log_ratio.mean()


def get_smoothed_mean_log_accuracy_ratio_np(answers, responses, epsilon=0.005):
    """Smoothed mean log accuracy ratio over the three targets, in percent.

    ``answers`` must provide ``to_numpy()`` (e.g. a DataFrame) and is reshaped
    to ``(len(responses), 3)``; ``responses`` is indexed as a 2-D array.

    Returns:
        Tuple ``(overall, per_target)``: the overall percentage error rounded
        to 2 decimals, and a list with the rounded percentage error of each
        of the three targets.
    """
    answers = answers.to_numpy().reshape(len(responses), 3)

    per_target_ratios = []
    for target_idx in range(3):
        per_target_ratios.append(
            get_smoothed_log_mape_column_value(responses[:, target_idx], answers[:, target_idx], epsilon)
        )
    log_accuracy_ratios = np.array(per_target_ratios)

    percentage_error = 100 * (np.exp(log_accuracy_ratios.mean()) - 1)

    per_target_errors = []
    for log_accuracy_ratio in log_accuracy_ratios:
        per_target_errors.append(round(100 * (np.exp(log_accuracy_ratio) - 1), 2))

    return percentage_error.round(decimals=2), per_target_errors

Функции для визуализации

In [11]:
def get_shap_values(model, X_test, y_test, save_dir_path=None):
    """Draw SHAP bar and beeswarm plots for every target of a multi-output model.

    For each column of ``y_test`` the estimator at the same position in
    ``model.estimators_`` is explained on ``X_test``. Figures are written to
    ``save_dir_path`` when it is given and are always closed afterwards;
    nothing is returned.
    """
    n_features = len(X_test.columns)

    def finish_figure(file_name):
        # Shared tail of both plots: fix the y-range, optionally save, then close.
        plt.ylim(0.5, X_test.shape[1] + 1)
        if save_dir_path:
            plt.savefig(os.path.join(save_dir_path, file_name), dpi=300, bbox_inches="tight")

        plt.close()

    for idx, target_name in enumerate(y_test.columns):
        # One explainer per target-specific estimator.
        shap_values = shap.Explainer(model.estimators_[idx])(X_test)

        plt.figure(figsize=(8, 6))
        shap.plots.bar(shap_values, show=False, max_display=n_features)
        finish_figure(f"barplot_{target_name}.png")

        plt.figure(figsize=(8, 6))
        shap.plots.beeswarm(shap_values, show=False, max_display=n_features)
        finish_figure(f"beeswarm_{target_name}.png")

Функция для кросс-валидации

In [12]:
def ccross_val(model, X, y, verbose=True):
    """Run 5-fold shuffled cross-validation and report timings and the metric.

    Args:
        model: Estimator with ``fit`` and ``predict``; it is refitted on every fold.
        X: Feature DataFrame (indexed positionally with ``iloc``).
        y: Target DataFrame with the three target columns.
        verbose: When true, print fold sizes and the mean timings and metric.

    Returns:
        Dict with the per-fold lists ``train_times``, ``inference_times``
        (seconds) and ``metrics`` (overall percentage error of each fold).
    """
    cv = KFold(n_splits=5, shuffle=True, random_state=42)
    scores = {
        "train_times": [],
        "inference_times": [],
        "metrics": []
    }
    for train_idx, test_idx in cv.split(X):
        # Fit the model
        start = time.perf_counter()
        model.fit(X.iloc[train_idx], y.iloc[train_idx])
        train_time = time.perf_counter() - start

        # Predict on the held-out fold
        start = time.perf_counter()
        y_pred = model.predict(X.iloc[test_idx])
        inference_time = time.perf_counter() - start

        # Post-processing: the targets are non-negative, so clip negative predictions
        y_pred[y_pred < 0.] = 0.
        # The metric returns (overall, per_target); only the scalar overall
        # value can be averaged across folds.
        metric, _ = get_smoothed_mean_log_accuracy_ratio_np(y.iloc[test_idx], y_pred)

        scores["train_times"].append(train_time)
        scores["inference_times"].append(inference_time)
        scores["metrics"].append(metric)

    if verbose:
        # Sizes are those of the last fold.
        print(f"Train batch size: {len(train_idx)}\nTest batch size: {len(test_idx)}")
        print(f"Mean train time: {round(np.mean(scores['train_times']), 2)} s")
        print(f"Mean inference time: {round(np.mean(scores['inference_times']), 2)} s")
        print(f"Mean metric: {round(np.mean(scores['metrics']), 2)}")

    return scores

# Load data

### load

In [16]:
# Extended users table (generated artifact).
ext_user_ds_path = os.path.join(OUTPUT_DATA_PATH, "final/ext_users.csv")
ext_user_ds = pd.read_csv(ext_user_ds_path)

# Extended history table (generated artifact).
ext_history_ds_path = os.path.join(OUTPUT_DATA_PATH, "final/ext_history.csv")
ext_history_ds = pd.read_csv(ext_history_ds_path)

# Validation campaigns (tab-separated input).
validate_ds_path = os.path.join(INPUT_DATA_PATH, "validate.tsv")
validate_ds = pd.read_csv(validate_ds_path, sep="\t")

# Answers for the validation campaigns (tab-separated input).
validate_answers_ds_path = os.path.join(INPUT_DATA_PATH, "validate_answers.tsv")
validate_answers_ds = pd.read_csv(validate_answers_ds_path, sep="\t")

### preprocess

Считаем признаки заново (занимает несколько минут)

In [24]:
# First stage: parse the comma-separated "publishers" and "user_ids" strings
# into lists of ints; all other columns are passed through unchanged and
# keep their original names.
ct = make_column_transformer(
    (FunctionTransformer(str_to_list), ["publishers", "user_ids"]),

    remainder='passthrough',
    verbose_feature_names_out=False
)

# Second stage: feature engineering. Each entry wraps one add_* function in a
# FunctionTransformer, names its output columns through feature_names_out and
# receives the history (and users) tables through kw_args.
# The order of the entries defines the order of the output columns; columns
# not consumed by any entry are passed through at the end.
fe_ct = ColumnTransformer([
    # Number of publishers in the campaign.
    ("add_num_publishers",
     FunctionTransformer(func=add_num_publishers,
                         feature_names_out=lambda *_: ["num_publishers"]),
     ["publishers"]),

    # Disabled feature: used publishers ratio.
    #  ("add_used_publishers_ratio",
    #  FunctionTransformer(func=add_used_publishers_ratio,
    #                      feature_names_out=lambda *_: ["used_publishers_ration"],
    #                      kw_args={"history": ext_history_ds}),
    #  ["user_ids", "publishers"]),

    # Disabled feature: most frequent weekday / hour of day.
    #  ("add_top_intervals",
    #  FunctionTransformer(func=add_top_intervals,
    #                      feature_names_out=lambda *_: ["top_weekday", "top_daytime", "top_weekday_on_pubs", "top_daytime_on_pubs"],
    #                      kw_args={"history": ext_history_ds}),
    # ["user_ids", "publishers"]),

    # Session statistics on the campaign's publishers.
    ("add_session_features",
     FunctionTransformer(func=add_session_features,
                         feature_names_out=lambda *_: ["sessions_duration_on_pubs", "num_sessions_on_pubs", "sessions_cpm_on_pubs"],
                         kw_args={"history": ext_history_ds}),
    ["user_ids", "publishers"]),
     
    # View counts overall and on the campaign's publishers.
    ("add_views",
     FunctionTransformer(func=add_views,
                         feature_names_out=lambda *_: ["num_views", "num_views_on_pubs"],
                         kw_args={"history": ext_history_ds}),
    ["user_ids", "publishers"]),

    # Campaign duration (hour_end - hour_start).
    ("add_times",
     FunctionTransformer(func=add_times,
                         feature_names_out=lambda *_: ["duration"]),
    ["hour_start", "hour_end"]),
    
    # Mean and IQR of cpm on the campaign's publishers.
    ("add_publishers_data",
     FunctionTransformer(func=add_publishers_data,
                         feature_names_out=lambda *_: ["publishers_mean_cpm_bid", "publishers_IQR_cpm_bid"],
                         kw_args={"history": ext_history_ds}),
     ["user_ids", "publishers"]),

    # Demographic weekly aggregates; needs the users table as well.
    ("add_users_data",
     FunctionTransformer(func=add_users_data,
                         feature_names_out=lambda *_: ["views_mean_week_male", "views_mean_week_female", "age_bins_cpv_week_mean"],
                         kw_args={"users": ext_user_ds, "history": ext_history_ds}),
     ["user_ids", "publishers"])
    ],
    remainder='passthrough',
    # Transformers are run in parallel; the slowest one dominates the wall time.
    n_jobs=12,
    verbose=True,
    verbose_feature_names_out=False
)

In [None]:
# validate_ds = pd.read_csv(validate_ds_path, sep="\t")
# ext_history_ds = pd.read_csv(ext_history_ds_path)
# ext_history_ds["week"] = ext_history_ds["hour"]//24//7

# fe_ct2 = ColumnTransformer([
#     ("add_users_data",
#      FunctionTransformer(func=add_users_data,
#                          feature_names_out=lambda *_: ["views_mean_week_male", "views_mean_week_female", "age_bins_cpv_week_mean"],
#                          kw_args={"users": ext_user_ds, "history": ext_history_ds}),
#      ["user_ids", "publishers"])
#      ],
#     remainder='passthrough',
#     n_jobs=6,
#     verbose=True,
#     verbose_feature_names_out=False
# )

# validate_ds2 = ct.fit_transform(validate_ds)
# validate_ds3 = fe_ct2.fit_transform(validate_ds2)
# validate_ds3.head()

[ColumnTransformer] ..... (2 of 2) Processing remainder, total=   0.0s
[ColumnTransformer]  (1 of 2) Processing add_users_data, total=  34.3s


Unnamed: 0,views_mean_week_male,views_mean_week_female,age_bins_cpv_week_mean,cpm,hour_start,hour_end,audience_size
0,251.111111,259.777778,0.436576,220.0,1058,1153,1906
1,703.444444,515.111111,1.447312,312.0,1295,1301,1380
2,2164.666667,1689.888889,4.101722,70.0,1229,1249,888
3,490.333333,637.0,1.239636,240.0,1295,1377,440
4,3107.555556,1621.777778,20.15534,262.0,752,990,1476


In [25]:
# Stage 1: parse the list columns; stage 2: compute the engineered features.
ext_validate_ds = ct.fit_transform(validate_ds)
ext_validate_ds = fe_ct.fit_transform(ext_validate_ds)
ext_validate_ds.head()

[ColumnTransformer]  (1 of 7) Processing add_num_publishers, total=   0.0s
[ColumnTransformer] ..... (4 of 7) Processing add_times, total=   0.0s
[ColumnTransformer] ..... (7 of 7) Processing remainder, total=   0.0s
[ColumnTransformer]  (5 of 7) Processing add_publishers_data, total=  28.9s
[ColumnTransformer] ..... (3 of 7) Processing add_views, total=  39.2s
[ColumnTransformer]  (6 of 7) Processing add_users_data, total=  40.0s
[ColumnTransformer]  (2 of 7) Processing add_session_features, total= 4.8min


Unnamed: 0,num_publishers,sessions_duration_on_pubs,num_sessions_on_pubs,sessions_cpm_on_pubs,num_views,num_views_on_pubs,duration,publishers_mean_cpm_bid,publishers_IQR_cpm_bid,views_mean_week_male,views_mean_week_female,age_bins_cpv_week_mean,cpm,audience_size
0,2,2,2661,304.767717,77321,4598,95,253.183282,197.415,251.111111,259.777778,0.436608,220.0,1906
1,2,3,3922,189.190416,178007,10967,6,173.083709,151.78,703.444444,515.111111,1.416218,312.0,1380
2,6,2,16165,232.619346,39052,34691,20,197.274887,150.0,2164.666667,1689.888889,4.104932,70.0,888
3,2,2,5054,224.737393,17013,10146,82,195.917368,160.855,490.333333,637.0,1.216811,240.0,440
4,4,2,20591,272.764926,52434,42564,238,234.729084,188.665,3107.555556,1621.777778,20.15534,262.0,1476


In [32]:
# Persist the engineered features twice: comma-separated and tab-separated.
ext_validate_ds.to_csv(os.path.join(OUTPUT_DATA_PATH, "final/ext_validate1.csv"), index=False)
ext_validate_ds.to_csv(os.path.join(OUTPUT_DATA_PATH, "final/ext_validate1.tsv"), sep="\t", index=False)

Берем готовые, заранее посчитанные данные

In [None]:
# Reload the history and keep only rows whose cpm is strictly below the
# 95th percentile of cpm.
ext_history_ds = pd.read_csv(ext_history_ds_path)
ext_history_ds = ext_history_ds[ext_history_ds['cpm'] < ext_history_ds['cpm'].quantile(0.95)]
ext_history_ds.head()

Unnamed: 0,hour,cpm,publisher,user_id,weekday,daytime,week
0,10,30.00,1,15661,0,10,0
1,8,41.26,1,8444,0,8,0
2,7,360.00,1,15821,0,7,0
3,18,370.00,1,21530,0,18,0
4,8,195.00,2,22148,0,8,0
...,...,...,...,...,...,...,...
1147852,382,45.00,1,7871,1,22,2
1147853,360,33.35,1,7992,1,0,2
1147854,381,205.00,1,21516,1,21,2
1147855,383,37.50,1,8224,1,23,2


In [18]:
# Load a previously computed feature table instead of recomputing it.
# NOTE(review): this reads "ext_validate2.csv" from OUTPUT_DATA_PATH itself, while the
# save cell writes "final/ext_validate1.csv" — confirm this is the intended file.
ext_validate_ds_path = os.path.join(OUTPUT_DATA_PATH, "ext_validate2.csv")
ext_validate_ds = pd.read_csv(ext_validate_ds_path)
ext_validate_ds.head()

Unnamed: 0,num_publishers,used_publishers_ration,top_weekday,top_daytime,top_weekday_on_pubs,top_daytime_on_pubs,sessions_duration,sessions_duration_on_pubs,num_sessions,num_sessions_on_pubs,sessions_cpm,sessions_cpm_on_pubs,num_views,num_views_on_pubs,hour_start,hour_end,weekday_start,weekday_end,daytime_start,daytime_end,duration,publishers_mean_cpm_bid,publishers_IQR_cpm_bid,cpm,audience_size
0,2,0.5,-0.900969,0.258819,1.0,-0.965926,2,2,35269,2392,152.900209,169.738951,73168,4195,1058,1153,2,6,2,1,95,155.743058,154.96,220.0,1906
1,2,0.5,-0.222521,0.707107,0.62349,0.5,3,3,68553,3792,143.340792,144.089259,171390,10461,1295,1301,4,5,23,5,6,133.912896,130.0,312.0,1380
2,6,0.166667,-0.900969,0.5,0.62349,0.5,2,2,17092,15352,153.585641,155.48393,37121,33025,1229,1249,2,3,5,1,20,141.745454,134.16,70.0,888
3,2,0.5,-0.222521,-0.965926,-0.222521,0.258819,2,2,7847,4739,149.736174,149.314399,16108,9537,1295,1377,4,1,23,9,82,137.367787,140.24,240.0,440
4,4,0.25,-0.222521,0.5,1.0,0.5,2,2,23326,18976,172.669745,175.835427,48440,39236,752,990,3,6,8,6,238,160.046668,151.625,262.0,1476


Если хотим посчитать не доли, а количества

In [None]:
# Convert the answer fractions into absolute counts by multiplying each
# target column by the campaign's audience size.
q_validate_answers_ds = validate_answers_ds.copy()
for _target in ("at_least_one", "at_least_two", "at_least_three"):
    q_validate_answers_ds[_target] = q_validate_answers_ds[_target] * ext_validate_ds["audience_size"]

# Train model

## XGBRegressor

### setup

In [79]:
# Load a previously trained model from disk into MODEL.
# NOTE(review): pickle executes arbitrary code on load — only open files produced by this notebook.
with open(os.path.join(OUTPUT_DATA_PATH, "final/xgbr_cek_full.pkl"), "rb") as f:
    MODEL = load(f)

In [72]:
# Base regressor: one copy is trained per target by MultiOutputRegressor.
# NOTE(review): device="cuda" combined with n_jobs=8 trains several boosters on the GPU
# in parallel; the "Potential solutions: ... inplace_predict" messages in the cell outputs
# presumably come from predicting on CPU data with a CUDA booster — confirm.
xgb = XGBRegressor(
    device="cuda",
    random_state=42,

    n_estimators=1000,
    learning_rate=0.05,
    gamma=0,
    max_depth=6,
    reg_lambda=1,
    tree_method="hist",

    objective="reg:gamma"
)

# Wrap the regressor so that each target column gets its own estimator.
model = MultiOutputRegressor(
    estimator=xgb,
    n_jobs=8
)

In [88]:
# Доли
X_train, X_test, y_train, y_test = train_test_split(ext_validate_ds, validate_answers_ds, test_size=0.2, random_state=42)

# Кол-ва
# X_train, X_test, y_train, y_test = train_test_split(ext_validate_ds, q_validate_answers_ds, test_size=0.2, random_state=42)

### train

Обучение модели

In [89]:
# The targets are shifted by 0.0005 before fitting.
# NOTE(review): presumably needed because the reg:gamma objective requires strictly
# positive labels and the answers contain zeros — confirm.
model.fit(X_train, y_train+0.0005)

In [75]:
# Save the fitted model; this overwrites the file that the load cell reads into MODEL.
with open(os.path.join(OUTPUT_DATA_PATH, "final/xgbr_cek_full.pkl"), "wb") as f:
    dump(model, f, protocol=5)

Подбор гиперпараметров модели

In [None]:
# Hyperparameter search space
param_dist = {
    "estimator__n_estimators": [100, 300, 500, 1000],
    # "estimator__learning_rate": [0.005, 0.01, 0.05, 0.1],
    # "estimator__max_depth": [3, 5, 6, 8, 10],
    # "estimator__gamma": [0, 0.1, 0.2, 0.5],
    # "estimator__reg_lambda": [0.1, 1, 10],
    # "estimator__subsample": [0.6, 0.8, 1.0],
    # "estimator__colsample_bytree": [0.6, 0.8, 1.0],
    "estimator__objective": ["reg:logistic", "reg:squarederror", "reg:pseudohubererror", "reg:gamma"]
}


def smlar_scalar(y_true, y_pred, epsilon=0.005):
    """Scalar version of the competition metric, usable with make_scorer.

    get_smoothed_mean_log_accuracy_ratio_np returns (overall, per_target);
    a scorer needs a single number, so only the overall value is returned.
    Negative predictions are clipped to zero, as in the evaluation cells.
    """
    y_pred = np.clip(y_pred, 0., None)
    overall, _ = get_smoothed_mean_log_accuracy_ratio_np(pd.DataFrame(y_true), y_pred, epsilon=epsilon)
    return overall


# Lower is better, so the scorer negates the metric (best_score_ will be negative).
smlar = make_scorer(smlar_scalar, greater_is_better=False, epsilon=0.005)
# HalvingRandomSearchCV setup.
# `scoring` must be a scorer with signature (estimator, X, y); a plain metric
# function such as mean_squared_error(y_true, y_pred) does not fit that contract.
# NOTE(review): max_resources=1000 is larger than the training split
# (about 800 rows according to the cross-validation output) — confirm this is accepted.
search = HalvingRandomSearchCV(
    estimator=model,
    param_distributions=param_dist,
    factor=3,
    resource="n_samples",
    max_resources=1000,
    cv=3,
    scoring=smlar,
    return_train_score=True,
    random_state=42,
    n_jobs=-1,
    verbose=1
)

# Run the search on the same shifted targets that model.fit uses.
search.fit(X_train, y_train + 0.0005)

In [None]:
# Print the best parameters and the best cross-validation score of the search
print("Best parameters:", search.best_params_)
print("Best score:", search.best_score_)

# Evaluate the refitted best estimator on the hold-out split
# NOTE(review): estimator.score() is presumably the default regression score (R^2),
# not the metric used elsewhere in this notebook — confirm before comparing numbers.
best_model = search.best_estimator_
test_score = best_model.score(X_test, y_test)
print("Test score:", test_score)

### evaluate

In [96]:
# Predict on the hold-out split and clip negative predictions to zero.
y_pred = model.predict(X_test)
y_pred[y_pred < 0.] = 0.
# Overall percentage error and the per-target percentage errors.
# NOTE(review): this rebinds the name `smlar`, which the hyperparameter-search cell
# also uses for its scorer — rerun that cell before searching again.
smlar_mean, smlar = get_smoothed_mean_log_accuracy_ratio_np(y_test, y_pred)
print(smlar_mean, smlar)

28.74 [34.77, 27.47, 24.21]


In [66]:
# Доли
ccross_val(model, ext_validate_ds, validate_answers_ds+0.0005)

# Кол-ва
# ccross_val(model, ext_validate_ds, q_validate_answers_ds+0.0005)

Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.


Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.


Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.


Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.


Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.




Train batch size: 807
Test batch size: 201
Mean train time: 12.31 s
Mean inference time: 0.38 s
Mean metric: 24.69


Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.




In [31]:
# Create the output directory if needed and write the SHAP plots for every target into it.
save_dir_path = os.path.join(OUTPUT_DATA_PATH, "plots/shap4")
os.makedirs(save_dir_path, exist_ok=True)
get_shap_values(model, X_test, y_test, save_dir_path)

In [82]:
# First 10 predictions of the model loaded from disk (MODEL), labelled with the target names.
pd.DataFrame(MODEL.predict(X_test), columns=y_test.columns).head(10)

Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.


Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.




Unnamed: 0,at_least_one,at_least_two,at_least_three
0,0.014291,0.0005,0.000501
1,0.027759,0.000499,0.0005
2,0.028271,0.009044,0.00319
3,0.030876,0.021777,0.014137
4,0.051112,0.016406,0.004482
5,0.587958,0.459326,0.379574
6,0.049186,0.017596,0.004165
7,0.026079,0.000499,0.0005
8,0.027095,0.010248,0.006086
9,0.495688,0.421957,0.385774


In [85]:
# First 10 true answers, re-indexed from 0 so they line up with the prediction preview.
y_test.head(10).reset_index(drop=True)

Unnamed: 0,at_least_one,at_least_two,at_least_three
0,0.0138,0.0,0.0
1,0.0272,0.0,0.0
2,0.0279,0.0086,0.0027
3,0.0305,0.0213,0.0137
4,0.0507,0.016,0.004
5,0.5879,0.4592,0.3787
6,0.0485,0.017,0.0036
7,0.0256,0.0,0.0
8,0.0267,0.0098,0.0056
9,0.4944,0.4222,0.3847
