# Загрузка данных с кагла

Перед этим необходимо скопировать ключ со своего акка

In [16]:
!pip install -q kaggle

from google.colab import files
files.upload()


!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

!kaggle competitions download -c hse-rec-sys-challenge-2024

!unzip hse-rec-sys-challenge-2024.zip

Saving kaggle.json to kaggle (1).json
hse-rec-sys-challenge-2024.zip: Skipping, found more recently modified local copy (use --force to force download)
Archive:  hse-rec-sys-challenge-2024.zip
replace events.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: y
  inflating: events.csv              
replace item_features.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: y
  inflating: item_features.csv       
replace submission_sample.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: y
  inflating: submission_sample.csv   
replace user_features.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: y
  inflating: user_features.csv       


Ставим пакеты

In [17]:
!pip install lightgbm
!pip install catboost
!pip install implicit
!pip install optuna



В нашем случае будем использовать ALS - как baseline

Catboost и LGBM - наши основные модели

Код разделил на несколько ячеек где реализовал все необходимые функции для работы. Какого-то строгого порядка нет, точнее он был, но из-за постоянной редакции очередность функций сбилась. Это не особо важно, но красоты стало меньше

In [18]:
import pandas as pd
import numpy as np
from catboost import CatBoostClassifier, Pool
from scipy.sparse import csr_matrix
from typing import Tuple
from sklearn.preprocessing import LabelEncoder
from implicit.als import AlternatingLeastSquares
from lightgbm import LGBMClassifier
import optuna

# Загрузка данных
events = pd.read_csv('events.csv')
user = pd.read_csv('user_features.csv')
item = pd.read_csv('item_features.csv')

# Преобразование временных данных в числовой формат
def convert_timedelta_to_numeric(df):
    for col in df.select_dtypes(include=['timedelta64[ns]']).columns:
        df[col] = df[col].dt.total_seconds()  # Преобразование в секунды
    return df

# Функция для оптимизации гиперпараметров с использованием Optuna
def optimize_lightgbm(X_train, y_train):
    def objective(trial):
        # Гиперпараметры, которые будут оптимизироваться
        param = {
            'n_estimators': trial.suggest_int('n_estimators', 50, 200),
            'max_depth': trial.suggest_int('max_depth', 4, 12),
            'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
            'num_leaves': trial.suggest_int('num_leaves', 20, 100),
            'min_child_samples': trial.suggest_int('min_child_samples', 5, 30),
            'subsample': trial.suggest_float('subsample', 0.5, 1.0),
            'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        }

        # Обучение модели с текущими параметрами
        model = LGBMClassifier(**param)
        model.fit(X_train, y_train)
        # Оценка точности модели (например, по метрике AUC)
        score = model.score(X_train, y_train)
        return score

    # Создание и запуск исследования Optuna
    study = optuna.create_study(direction='maximize')
    study.optimize(objective, n_trials=50)

    return study.best_params

# LightGBM model
def fit_lightgbm(
    candidates: pd.DataFrame,
    test_events: pd.DataFrame,
    events: pd.DataFrame
) -> LGBMClassifier:
    print('Labeling candidates for LightGBM')
    candidates_labeled = (
        candidates
        .merge(
            (
                test_events[['user_id', 'item_id']]
                .assign(label=1)
            ),
            on=['user_id', 'item_id'],
            how='left',
        )
    )

    candidates_labeled['label'] = (
        candidates_labeled['label']
        .fillna(0)
        .astype('int32')
    )

    positive_classes_rate = candidates_labeled['label'].sum() / len(candidates_labeled)
    print(f'Positive classes for LightGBM = {positive_classes_rate * 100:.2f}%')

    print('Extracting features for candidates for LightGBM')
    lgb_features = enrich_interactions(candidates_labeled, events)
    del candidates_labeled

    # Преобразование временных данных в числовой формат
    lgb_features = convert_timedelta_to_numeric(lgb_features)

    X_train = lgb_features.drop(columns=['user_id', 'item_id', 'label'])
    for col in X_train.columns:
        if X_train[col].dtype == 'object':
            X_train[col] = pd.to_numeric(X_train[col], errors='coerce')
            X_train[col].fillna(0, inplace=True)

    y_train = lgb_features['label']
    del lgb_features

    print('Training LightGBM')
    best_params = optimize_lightgbm(X_train, y_train)
    best_params['class_weight'] = {0: positive_classes_rate, 1: 1 - positive_classes_rate}
    best_params['metric'] = 'binary_logloss'

    lgb_model = LGBMClassifier(**best_params)

    lgb_model.fit(X_train, y_train)

    return lgb_model


# предикт LightGBM model с проверкой данных
def predict_lightgbm(
    candidates: pd.DataFrame,
    events: pd.DataFrame,
    lgb_model: LGBMClassifier
) -> pd.DataFrame:
    interactions_featurized = enrich_interactions(candidates, events)

    print('Running LightGBM scoring')
    interactions = interactions_featurized[['user_id', 'item_id']].copy()
    features = interactions_featurized.drop(columns=['user_id', 'item_id'])

    # Преобразование временных данных в числовой формат
    features = convert_timedelta_to_numeric(features)

    for col in features.columns:
        if features[col].dtype == 'object':
            features[col] = pd.to_numeric(features[col], errors='coerce')
            features[col].fillna(0, inplace=True)

    scores = lgb_model.predict_proba(features)[:, 1].flatten()
    interactions['lightgbm_score'] = scores
    return interactions


Шаманим с данными и ALS

In [19]:
# Объединение метаинформации о фильмах с транзакциями
def add_meta_to_events(events, item):
    events_with_meta = (
        events
        .merge(item,
               on='item_id', how='left')
    )
    # Пример вычисления отношения просмотров
    events_with_meta['watched_ratio'] = events_with_meta['rating'] / 5.0

    def score_event(row):
        score = 1
        if row['rating'] >= 3:  # Пример: учитываем рейтинг от 3 и выше
            score = int(row['rating'] * 2)
        return min(score, 10)

    events_with_meta['score'] = events_with_meta.apply(score_event, axis=1)
    return events_with_meta

# Добавление рейтингов к событиям
def add_ratings_to_events(events):
    events['score'].fillna(events['rating'], inplace=True)
    return events

# Кодирование TF-IDF
def encode_tfidf_coo(events: pd.DataFrame) -> pd.DataFrame:
    score_sum_per_user = (
        events
        .groupby('user_id')['score']
        .transform('sum')
    )
    user_count_per_item = (
        events
        .groupby('item_id')['user_id']
        .transform('size')
    )
    tf = events['score'].values / score_sum_per_user.values
    idf = np.log(len(score_sum_per_user) / user_count_per_item.values)

    tfidf = events[['user_id', 'item_id']].copy()
    tfidf['value'] = tf * idf

    return tfidf

# Преобразование данных в формат TF-IDF для ALS
def encode_tfidf(events: pd.DataFrame) -> Tuple[LabelEncoder, LabelEncoder, csr_matrix]:
    tfidf = encode_tfidf_coo(events)

    user_encoder = LabelEncoder()
    item_encoder = LabelEncoder()

    user_index = user_encoder.fit_transform(events['user_id'].values)
    item_index = item_encoder.fit_transform(events['item_id'].values)

    tfidf_csr = csr_matrix(
        (
            tfidf['value'].astype('float32').values,
            (user_index, item_index)
        ),
        shape=(len(user_encoder.classes_), len(item_encoder.classes_))
    )

    return user_encoder, item_encoder, tfidf_csr

# Функция для запуска ALS
def als_fit_predict(events_csr: csr_matrix):
    als = AlternatingLeastSquares(factors=128, iterations=30, alpha=40.0, calculate_training_loss=True)
    als.fit(events_csr)

    recommendations_matrix, recommendations_scores = als.recommend(
        np.arange(0, events_csr.shape[0]),
        events_csr,
        N=10,
        filter_already_liked_items=True
    )

    return recommendations_matrix, recommendations_scores

# Преобразование рекомендаций в DataFrame
def als_recommendations_to_df(
    recommendations_matrix: np.ndarray,
    recommendations_scores: np.ndarray,
    user_encoder: LabelEncoder,
    item_encoder: LabelEncoder,
    user_key = 'user_id',
    item_key = 'item_id'
) -> pd.DataFrame:
    recommendations_indices = pd.DataFrame({
        'user_index': np.arange(0, len(recommendations_matrix)),
        'item_index': list(recommendations_matrix),
        'score': list(recommendations_scores),
    })

    user_mapping = pd.DataFrame({
        'user_index': np.arange(0, len(user_encoder.classes_)),
        user_key: user_encoder.classes_,
    })

    item_mapping = pd.DataFrame({
        'item_index': np.arange(0, len(item_encoder.classes_)),
        item_key: item_encoder.classes_,
    })

    recommendations = (
        recommendations_indices
        .merge(
            user_mapping,
            on='user_index',
            how='left',
        )
        .drop(columns=['user_index'])
        .explode(['item_index', 'score'], ignore_index=True)
        .merge(
            item_mapping,
            on='item_index',
            how='left',
        )
        .drop(columns=['item_index'])
    )

    return recommendations

# Главная функция для запуска ALS
def run_als(events: pd.DataFrame, item: pd.DataFrame) -> pd.DataFrame:
    print('Preprocess events')
    events = add_meta_to_events(events, item)
    events = add_ratings_to_events(events)

    print('Compute TF-IDF')
    user_encoder, item_encoder, events_csr = encode_tfidf(events)

    print('Run ALS')
    recommendations_item_indices, recommendations_scores = als_fit_predict(events_csr)

    print('Postprocess ALS prediction')
    recommendations = als_recommendations_to_df(
        recommendations_item_indices,
        recommendations_scores,
        user_encoder,
        item_encoder,
        user_key='user_id',
        item_key='item_id',
    )

    return recommendations

# Выполнение рекомендаций
recommendations = run_als(events, item)
print(recommendations.head())

Preprocess events


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  events['score'].fillna(events['rating'], inplace=True)


Compute TF-IDF
Run ALS


  0%|          | 0/30 [00:00<?, ?it/s]

Postprocess ALS prediction
      score  user_id  item_id
0  0.990891        0     1811
1  0.972504        0     2256
2  0.741381        0     1545
3  0.722323        0      331
4   0.71254        0     2732


Еще несколько функций для обработки данных

In [20]:
# Функция для объединения новых фич с item_features
def merge_item_feature(item_features, new_feature):
    item_features = item_features.merge(
        new_feature,
        on='item_id',
        how='left'
    )
    return item_features

# Функция для объединения новых фич с user_features
def merge_user_feature(user_features, new_feature):
    user_features = user_features.merge(
        new_feature,
        on='user_id',
        how='left'
    )
    return user_features

# Добавление фичи популярности фильмов
def add_item_popularity_feature(item_features: pd.DataFrame, events: pd.DataFrame):
    item_occurrences = (
        events
        .groupby('item_id')
        .size()
        .reset_index(name='item_occurrences')
    )

    item_occurrences['item_popularity'] = (
        item_occurrences['item_occurrences'] /
        events['item_id'].nunique()
    )
    item_occurrences.drop(columns=['item_occurrences'], inplace=True)

    return merge_item_feature(item_features, item_occurrences)

# Добавление фичи числа закладок на фильмы (если применимо)
def add_item_bookmark_count_feature(item_features: pd.DataFrame, events: pd.DataFrame):
    bookmarks_per_item = (
        events
        .groupby('item_id')
        .size()
        .reset_index(name='item_bookmark_count')
    )

    item_features = merge_item_feature(item_features, bookmarks_per_item)
    item_features['item_bookmark_count'].fillna(0, inplace=True)

    return item_features

# Добавление фичи количества просмотров для пользователя
def add_user_watch_count_feature(user_features: pd.DataFrame, events: pd.DataFrame):
    user_watch_count = (
        events
        .groupby('user_id')
        .size()
        .reset_index(name='user_watch_count')
    )
    return merge_user_feature(user_features, user_watch_count)

# Добавление фичи стандартного отклонения времени просмотра для пользователя
def add_user_watch_time_std(user_features: pd.DataFrame, events: pd.DataFrame):
    user_watch_time_std = (
        events
        .groupby('user_id')['timestamp']
        .std()
        .reset_index(name='user_watch_time_std')
    )
    return merge_user_feature(user_features, user_watch_time_std)

# Генерация фич для фильмов
def generate_item_features(events: pd.DataFrame) -> pd.DataFrame:
    item_features = pd.DataFrame({
        'item_id': events['item_id'].unique()
    })
    item_features = add_item_popularity_feature(item_features, events)
    item_features = add_item_bookmark_count_feature(item_features, events)
    return item_features

# Убедитесь, что timestamp имеет тип datetime
events['timestamp'] = pd.to_datetime(events['timestamp'], errors='coerce')

# Добавление фичи количества просмотров за последнюю неделю для пользователей
def add_recent_watch_count(user_features: pd.DataFrame, events: pd.DataFrame):
    current_time = pd.Timestamp.now()  # Текущее время
    recent_watches = (
        events
        .loc[events['timestamp'] >= current_time - pd.Timedelta(weeks=1)]  # Изменение с использованием loc
        .groupby('user_id')
        .size()
        .reset_index(name='recent_watch_count')
    )
    user_features = merge_user_feature(user_features, recent_watches)
    user_features['recent_watch_count'].fillna(0, inplace=True)
    return user_features
# Добавление фичи средней оценки пользователя
def add_user_avg_rating(user_features: pd.DataFrame, events: pd.DataFrame):
    avg_rating = (
        events
        .groupby('user_id')['rating']
        .mean()
        .reset_index(name='user_avg_rating')
    )
    user_features = merge_user_feature(user_features, avg_rating)
    user_features['user_avg_rating'].fillna(0, inplace=True)
    return user_features

# Добавление этих фич в генерацию фич для пользователей
def generate_user_features(events: pd.DataFrame) -> pd.DataFrame:
    user_features = pd.DataFrame({
        'user_id': events['user_id'].unique()
    })
    user_features = add_user_watch_count_feature(user_features, events)
    user_features = add_user_watch_time_std(user_features, events)
    user_features = add_recent_watch_count(user_features, events)  # Новая фича
    user_features = add_user_avg_rating(user_features, events)  # Новая фича
    return user_features

# Обогащение взаимодействий (интеракций) фичами
def enrich_interactions(
    interactions: pd.DataFrame,
    events: pd.DataFrame
) -> pd.DataFrame:
    print('Generating item features')
    item_features = generate_item_features(events)

    print('Generating user features')
    user_features = generate_user_features(events)

    print('Merging features to interactions')
    interactions_featurized = (
        interactions
        .merge(
            item_features,
            on='item_id',
            how='left'
        )
        .merge(
            user_features,
            on='user_id',
            how='left'
        )
    )

    return interactions_featurized

# Функция для оценки интеракций с использованием CatBoost
def score_interactions(
    interactions_featurized: pd.DataFrame,
    catboost_model: CatBoostClassifier,
) -> pd.DataFrame:
    interactions = interactions_featurized[['user_id', 'item_id']].copy()
    features = interactions_featurized.drop(columns=['user_id', 'item_id'])
    scores = catboost_model.predict_proba(features)[:, 1].flatten()
    interactions['catboost_score'] = scores
    return interactions

Переходим к catboost

In [21]:


# Обучение модели CatBoost
def fit_catboost(
    candidates: pd.DataFrame,
    test_events: pd.DataFrame,
    events: pd.DataFrame
):
    print('Labeling candidates')
    candidates_labeled = (
        candidates
        .merge(
            (
                test_events[['user_id', 'item_id']]
                .assign(label=1)
            ),
            on=['user_id', 'item_id'],
            how='left',
        )
    )

    candidates_labeled['label'] = (
        candidates_labeled['label']
        .fillna(0)
        .astype('int32')
    )

    positive_classes_rate = candidates_labeled['label'].sum() / len(candidates_labeled)
    print(f'Positive classes = {positive_classes_rate * 100:.2f}%')

    print('Extracting features for candidates')
    cb_features = enrich_interactions(candidates_labeled, events)
    del candidates_labeled

    cb_pool = Pool(
        cb_features.drop(columns=['user_id', 'item_id', 'label']),
        cb_features['label']
    )
    del cb_features

    print('Training CatBoost')
    cb_cls = CatBoostClassifier(
        iterations=100,
        class_weights=[positive_classes_rate, 1 - positive_classes_rate],
        eval_metric='BalancedAccuracy',
        objective='Logloss'
    )

    cb_cls.fit(cb_pool, verbose=1)

    return cb_cls

# Предсказание с использованием обученной модели CatBoost
def predict_catboost(
    candidates: pd.DataFrame,
    events: pd.DataFrame,
    catboost_model: CatBoostClassifier
) -> pd.DataFrame:
    interactions_featurized = enrich_interactions(candidates, events)

    print('Running CatBoost scoring')
    scored_interactions = score_interactions(interactions_featurized, catboost_model)

    return scored_interactions


Переходим к обучению

In [22]:

# Добавляем ранжирование событий для каждого пользователя по времени
events['split_rank_per_user'] = (
    events
    .groupby('user_id')['timestamp']
    .rank('first', ascending=False)
    .astype('int32')
)

# Разделение данных на тренировочный и тестовый наборы для ALS
als_events_train = (
    events
    .query('split_rank_per_user > 2')
    .drop(columns=['split_rank_per_user'])
)

als_events_test = (
    events
    .query('split_rank_per_user <= 2')
    .drop(columns=['split_rank_per_user'])
)[["item_id", "user_id"]]

# Запуск ALS для создания кандидатов для обучения модели CatBoost
als_candidates_to_train_catboost = run_als(als_events_train, item)

# Обучение модели CatBoost с использованием кандидатов, сгенерированных ALS
catboost_model = fit_catboost(
    als_candidates_to_train_catboost,
    als_events_test,
    events
)

# Training LightGBM
lightgbm_model = fit_lightgbm(
    als_candidates_to_train_catboost,
    als_events_test,
    events
)

# Запуск ALS на всех данных для генерации кандидатов
als_candidates = run_als(events, item)

# Prediction with CatBoost
catboost_prediction_full = predict_catboost(als_candidates, events, catboost_model)

# Prediction with LightGBM
lightgbm_prediction_full = predict_lightgbm(als_candidates, events, lightgbm_model)

# Combining predictions
combined_predictions = (
    catboost_prediction_full[['user_id', 'item_id', 'catboost_score']]
    .merge(
        lightgbm_prediction_full[['user_id', 'item_id', 'lightgbm_score']],
        on=['user_id', 'item_id'],
        how='inner'
    )
)

# Ensemble by averaging the scores
combined_predictions['final_score'] = (
    combined_predictions['catboost_score'] * 0.5 +
    combined_predictions['lightgbm_score'] * 0.5
)

# Select the top 10 recommendations per user based on the final ensemble score
final_recommendations = (
    combined_predictions
    .sort_values('final_score', ascending=False, ignore_index=True)
    .groupby('user_id')
    .head(10)
)

Preprocess events


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  events['score'].fillna(events['rating'], inplace=True)


Compute TF-IDF
Run ALS


  0%|          | 0/30 [00:00<?, ?it/s]

Postprocess ALS prediction
Labeling candidates
Positive classes = 1.45%
Extracting features for candidates
Generating item features
Generating user features
Merging features to interactions


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  item_features['item_bookmark_count'].fillna(0, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  user_features['recent_watch_count'].fillna(0, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on 

Training CatBoost
Learning rate set to 0.490272
0:	learn: 0.6448712	total: 9ms	remaining: 891ms
1:	learn: 0.6550154	total: 15.8ms	remaining: 773ms
2:	learn: 0.6594212	total: 22.6ms	remaining: 730ms
3:	learn: 0.6637732	total: 28.9ms	remaining: 694ms
4:	learn: 0.6669034	total: 36.1ms	remaining: 686ms
5:	learn: 0.6706427	total: 41.9ms	remaining: 657ms
6:	learn: 0.6723709	total: 48.1ms	remaining: 639ms
7:	learn: 0.6742945	total: 54.3ms	remaining: 624ms
8:	learn: 0.6770661	total: 60.9ms	remaining: 615ms
9:	learn: 0.6795008	total: 70.3ms	remaining: 633ms
10:	learn: 0.6827075	total: 78.3ms	remaining: 633ms
11:	learn: 0.6818524	total: 85.2ms	remaining: 625ms
12:	learn: 0.6883926	total: 103ms	remaining: 690ms
13:	learn: 0.6908110	total: 109ms	remaining: 670ms
14:	learn: 0.6970391	total: 116ms	remaining: 656ms
15:	learn: 0.6994721	total: 122ms	remaining: 641ms
16:	learn: 0.7021656	total: 130ms	remaining: 636ms
17:	learn: 0.7047183	total: 136ms	remaining: 622ms
18:	learn: 0.7065482	total: 143ms	r

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  item_features['item_bookmark_count'].fillna(0, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  user_features['recent_watch_count'].fillna(0, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on 

Generating user features
Merging features to interactions
Training LightGBM
[LightGBM] [Info] Number of positive: 876, number of negative: 59524
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000854 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1510
[LightGBM] [Info] Number of data points in the train set: 60400, number of used features: 6
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.014503 -> initscore=-4.218769
[LightGBM] [Info] Start training from score -4.218769


[I 2024-10-25 12:31:06,436] Trial 0 finished with value: 0.9854966887417218 and parameters: {'n_estimators': 109, 'max_depth': 5, 'learning_rate': 0.024524950218597054, 'num_leaves': 96, 'min_child_samples': 12, 'subsample': 0.6439710131662767, 'colsample_bytree': 0.8512253456136183}. Best is trial 0 with value: 0.9854966887417218.


[LightGBM] [Info] Number of positive: 876, number of negative: 59524
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002655 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1510
[LightGBM] [Info] Number of data points in the train set: 60400, number of used features: 6
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.014503 -> initscore=-4.218769
[LightGBM] [Info] Start training from score -4.218769


[I 2024-10-25 12:31:07,192] Trial 1 finished with value: 0.9854966887417218 and parameters: {'n_estimators': 103, 'max_depth': 6, 'learning_rate': 0.017415228949531056, 'num_leaves': 96, 'min_child_samples': 21, 'subsample': 0.6637171934281979, 'colsample_bytree': 0.6459163548031435}. Best is trial 0 with value: 0.9854966887417218.


[LightGBM] [Info] Number of positive: 876, number of negative: 59524
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000812 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1510
[LightGBM] [Info] Number of data points in the train set: 60400, number of used features: 6
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.014503 -> initscore=-4.218769
[LightGBM] [Info] Start training from score -4.218769


[I 2024-10-25 12:31:08,048] Trial 2 finished with value: 0.9855132450331126 and parameters: {'n_estimators': 136, 'max_depth': 9, 'learning_rate': 0.030760054796375353, 'num_leaves': 54, 'min_child_samples': 29, 'subsample': 0.6934413155346986, 'colsample_bytree': 0.7888661695683745}. Best is trial 2 with value: 0.9855132450331126.


[LightGBM] [Info] Number of positive: 876, number of negative: 59524
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002685 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1510
[LightGBM] [Info] Number of data points in the train set: 60400, number of used features: 6
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.014503 -> initscore=-4.218769
[LightGBM] [Info] Start training from score -4.218769


[I 2024-10-25 12:31:08,994] Trial 3 finished with value: 0.9855132450331126 and parameters: {'n_estimators': 158, 'max_depth': 12, 'learning_rate': 0.021522124477455728, 'num_leaves': 61, 'min_child_samples': 16, 'subsample': 0.9073581370475525, 'colsample_bytree': 0.7313110577054731}. Best is trial 2 with value: 0.9855132450331126.


[LightGBM] [Info] Number of positive: 876, number of negative: 59524
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000832 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1510
[LightGBM] [Info] Number of data points in the train set: 60400, number of used features: 6
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.014503 -> initscore=-4.218769
[LightGBM] [Info] Start training from score -4.218769


[I 2024-10-25 12:31:09,800] Trial 4 finished with value: 0.9887417218543046 and parameters: {'n_estimators': 153, 'max_depth': 5, 'learning_rate': 0.2889911880643383, 'num_leaves': 27, 'min_child_samples': 18, 'subsample': 0.6750569077441041, 'colsample_bytree': 0.8151985504586552}. Best is trial 4 with value: 0.9887417218543046.


[LightGBM] [Info] Number of positive: 876, number of negative: 59524
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002766 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1510
[LightGBM] [Info] Number of data points in the train set: 60400, number of used features: 6
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.014503 -> initscore=-4.218769
[LightGBM] [Info] Start training from score -4.218769


[I 2024-10-25 12:31:10,501] Trial 5 finished with value: 0.9854966887417218 and parameters: {'n_estimators': 98, 'max_depth': 8, 'learning_rate': 0.014853986351551302, 'num_leaves': 72, 'min_child_samples': 17, 'subsample': 0.6852639757807222, 'colsample_bytree': 0.7868836378952342}. Best is trial 4 with value: 0.9887417218543046.


[LightGBM] [Info] Number of positive: 876, number of negative: 59524
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002811 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1510
[LightGBM] [Info] Number of data points in the train set: 60400, number of used features: 6
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.014503 -> initscore=-4.218769
[LightGBM] [Info] Start training from score -4.218769


[I 2024-10-25 12:31:11,746] Trial 6 finished with value: 0.9856953642384106 and parameters: {'n_estimators': 185, 'max_depth': 6, 'learning_rate': 0.05379159927309013, 'num_leaves': 64, 'min_child_samples': 21, 'subsample': 0.5277775045442208, 'colsample_bytree': 0.7642478596024374}. Best is trial 4 with value: 0.9887417218543046.


[LightGBM] [Info] Number of positive: 876, number of negative: 59524
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002590 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1510
[LightGBM] [Info] Number of data points in the train set: 60400, number of used features: 6
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.014503 -> initscore=-4.218769
[LightGBM] [Info] Start training from score -4.218769


[I 2024-10-25 12:31:12,930] Trial 7 finished with value: 0.9854966887417218 and parameters: {'n_estimators': 141, 'max_depth': 9, 'learning_rate': 0.013160901377327856, 'num_leaves': 87, 'min_child_samples': 18, 'subsample': 0.634503663090503, 'colsample_bytree': 0.6449130219196724}. Best is trial 4 with value: 0.9887417218543046.


[LightGBM] [Info] Number of positive: 876, number of negative: 59524
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.004071 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1510
[LightGBM] [Info] Number of data points in the train set: 60400, number of used features: 6
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.014503 -> initscore=-4.218769
[LightGBM] [Info] Start training from score -4.218769


[I 2024-10-25 12:31:13,689] Trial 8 finished with value: 0.9854966887417218 and parameters: {'n_estimators': 83, 'max_depth': 6, 'learning_rate': 0.05447909885318983, 'num_leaves': 46, 'min_child_samples': 24, 'subsample': 0.7285191958433939, 'colsample_bytree': 0.790116726183959}. Best is trial 4 with value: 0.9887417218543046.


[LightGBM] [Info] Number of positive: 876, number of negative: 59524
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003838 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1510
[LightGBM] [Info] Number of data points in the train set: 60400, number of used features: 6
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.014503 -> initscore=-4.218769
[LightGBM] [Info] Start training from score -4.218769


[I 2024-10-25 12:31:14,845] Trial 9 finished with value: 0.9954470198675497 and parameters: {'n_estimators': 185, 'max_depth': 8, 'learning_rate': 0.21411572993218017, 'num_leaves': 44, 'min_child_samples': 13, 'subsample': 0.5443380314499489, 'colsample_bytree': 0.6733173041066864}. Best is trial 9 with value: 0.9954470198675497.
[I 2024-10-25 12:31:15,096] Trial 10 finished with value: 0.985976821192053 and parameters: {'n_estimators': 51, 'max_depth': 12, 'learning_rate': 0.2744309226803894, 'num_leaves': 24, 'min_child_samples': 5, 'subsample': 0.8562974402016483, 'colsample_bytree': 0.9826169789617228}. Best is trial 9 with value: 0.9954470198675497.


[LightGBM] [Info] Number of positive: 876, number of negative: 59524
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000994 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1510
[LightGBM] [Info] Number of data points in the train set: 60400, number of used features: 6
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.014503 -> initscore=-4.218769
[LightGBM] [Info] Start training from score -4.218769
[LightGBM] [Info] Number of positive: 876, number of negative: 59524
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000444 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1510
[LightGBM] [Info] Number of data points in the train set: 60400, number of used features: 6
[LightGBM] [Info] [binar

[I 2024-10-25 12:31:16,037] Trial 11 finished with value: 0.9866059602649007 and parameters: {'n_estimators': 196, 'max_depth': 4, 'learning_rate': 0.2621577343454688, 'num_leaves': 28, 'min_child_samples': 10, 'subsample': 0.5129894725732316, 'colsample_bytree': 0.5488294536393077}. Best is trial 9 with value: 0.9954470198675497.


[LightGBM] [Info] Number of positive: 876, number of negative: 59524
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001019 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1510
[LightGBM] [Info] Number of data points in the train set: 60400, number of used features: 6
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.014503 -> initscore=-4.218769
[LightGBM] [Info] Start training from score -4.218769


[I 2024-10-25 12:31:16,865] Trial 12 finished with value: 0.9891887417218543 and parameters: {'n_estimators': 169, 'max_depth': 8, 'learning_rate': 0.13718568879366147, 'num_leaves': 38, 'min_child_samples': 12, 'subsample': 0.80896859083202, 'colsample_bytree': 0.9117120089280684}. Best is trial 9 with value: 0.9954470198675497.


[LightGBM] [Info] Number of positive: 876, number of negative: 59524
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000883 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1510
[LightGBM] [Info] Number of data points in the train set: 60400, number of used features: 6
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.014503 -> initscore=-4.218769
[LightGBM] [Info] Start training from score -4.218769


[I 2024-10-25 12:31:17,793] Trial 13 finished with value: 0.9908112582781456 and parameters: {'n_estimators': 177, 'max_depth': 8, 'learning_rate': 0.14728012086535874, 'num_leaves': 41, 'min_child_samples': 11, 'subsample': 0.8117406487367473, 'colsample_bytree': 0.926141246282339}. Best is trial 9 with value: 0.9954470198675497.


[LightGBM] [Info] Number of positive: 876, number of negative: 59524
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002644 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1510
[LightGBM] [Info] Number of data points in the train set: 60400, number of used features: 6
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.014503 -> initscore=-4.218769
[LightGBM] [Info] Start training from score -4.218769


[I 2024-10-25 12:31:18,646] Trial 14 finished with value: 0.9919039735099338 and parameters: {'n_estimators': 177, 'max_depth': 10, 'learning_rate': 0.14073654980981368, 'num_leaves': 42, 'min_child_samples': 7, 'subsample': 0.9704367870206019, 'colsample_bytree': 0.6821977671991496}. Best is trial 9 with value: 0.9954470198675497.


[LightGBM] [Info] Number of positive: 876, number of negative: 59524
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002688 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1510
[LightGBM] [Info] Number of data points in the train set: 60400, number of used features: 6
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.014503 -> initscore=-4.218769
[LightGBM] [Info] Start training from score -4.218769


[I 2024-10-25 12:31:19,662] Trial 15 finished with value: 0.9925662251655629 and parameters: {'n_estimators': 195, 'max_depth': 10, 'learning_rate': 0.12054558732060175, 'num_leaves': 48, 'min_child_samples': 5, 'subsample': 0.9957135059246962, 'colsample_bytree': 0.6744208161789839}. Best is trial 9 with value: 0.9954470198675497.


[LightGBM] [Info] Number of positive: 876, number of negative: 59524
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002400 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1510
[LightGBM] [Info] Number of data points in the train set: 60400, number of used features: 6
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.014503 -> initscore=-4.218769
[LightGBM] [Info] Start training from score -4.218769


[I 2024-10-25 12:31:20,962] Trial 16 finished with value: 0.991158940397351 and parameters: {'n_estimators': 195, 'max_depth': 10, 'learning_rate': 0.09175198381947018, 'num_leaves': 75, 'min_child_samples': 8, 'subsample': 0.984283959464505, 'colsample_bytree': 0.5478262106494942}. Best is trial 9 with value: 0.9954470198675497.


[LightGBM] [Info] Number of positive: 876, number of negative: 59524
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002744 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1510
[LightGBM] [Info] Number of data points in the train set: 60400, number of used features: 6
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.014503 -> initscore=-4.218769
[LightGBM] [Info] Start training from score -4.218769


[I 2024-10-25 12:31:22,077] Trial 17 finished with value: 0.9980132450331126 and parameters: {'n_estimators': 199, 'max_depth': 10, 'learning_rate': 0.19151115857503206, 'num_leaves': 54, 'min_child_samples': 14, 'subsample': 0.562835640155244, 'colsample_bytree': 0.6003107261333598}. Best is trial 17 with value: 0.9980132450331126.


[LightGBM] [Info] Number of positive: 876, number of negative: 59524
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002422 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1510
[LightGBM] [Info] Number of data points in the train set: 60400, number of used features: 6
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.014503 -> initscore=-4.218769
[LightGBM] [Info] Start training from score -4.218769


[I 2024-10-25 12:31:22,788] Trial 18 finished with value: 0.9857450331125828 and parameters: {'n_estimators': 158, 'max_depth': 11, 'learning_rate': 0.07876318731257755, 'num_leaves': 34, 'min_child_samples': 14, 'subsample': 0.58669235114959, 'colsample_bytree': 0.5007830426097011}. Best is trial 17 with value: 0.9980132450331126.


[LightGBM] [Info] Number of positive: 876, number of negative: 59524
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002649 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1510
[LightGBM] [Info] Number of data points in the train set: 60400, number of used features: 6
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.014503 -> initscore=-4.218769
[LightGBM] [Info] Start training from score -4.218769


[I 2024-10-25 12:31:23,614] Trial 19 finished with value: 0.9910430463576159 and parameters: {'n_estimators': 125, 'max_depth': 7, 'learning_rate': 0.2123062391688646, 'num_leaves': 54, 'min_child_samples': 14, 'subsample': 0.5773135183212138, 'colsample_bytree': 0.590880566315756}. Best is trial 17 with value: 0.9980132450331126.


[LightGBM] [Info] Number of positive: 876, number of negative: 59524
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002668 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1510
[LightGBM] [Info] Number of data points in the train set: 60400, number of used features: 6
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.014503 -> initscore=-4.218769
[LightGBM] [Info] Start training from score -4.218769


[I 2024-10-25 12:31:25,206] Trial 20 finished with value: 0.9989403973509934 and parameters: {'n_estimators': 199, 'max_depth': 11, 'learning_rate': 0.20487302054658751, 'num_leaves': 68, 'min_child_samples': 28, 'subsample': 0.5754557523561638, 'colsample_bytree': 0.7228307089100235}. Best is trial 20 with value: 0.9989403973509934.


[LightGBM] [Info] Number of positive: 876, number of negative: 59524
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003844 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1510
[LightGBM] [Info] Number of data points in the train set: 60400, number of used features: 6
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.014503 -> initscore=-4.218769
[LightGBM] [Info] Start training from score -4.218769


[I 2024-10-25 12:31:26,894] Trial 21 finished with value: 0.9992218543046357 and parameters: {'n_estimators': 198, 'max_depth': 11, 'learning_rate': 0.20210710492955167, 'num_leaves': 71, 'min_child_samples': 30, 'subsample': 0.5736084367176674, 'colsample_bytree': 0.7068876199298062}. Best is trial 21 with value: 0.9992218543046357.


[LightGBM] [Info] Number of positive: 876, number of negative: 59524
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002653 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1510
[LightGBM] [Info] Number of data points in the train set: 60400, number of used features: 6
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.014503 -> initscore=-4.218769
[LightGBM] [Info] Start training from score -4.218769


[I 2024-10-25 12:31:28,620] Trial 22 finished with value: 0.9983443708609272 and parameters: {'n_estimators': 199, 'max_depth': 11, 'learning_rate': 0.18207600048164138, 'num_leaves': 72, 'min_child_samples': 30, 'subsample': 0.5997526001363914, 'colsample_bytree': 0.7299806103728124}. Best is trial 21 with value: 0.9992218543046357.


[LightGBM] [Info] Number of positive: 876, number of negative: 59524
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003878 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1510
[LightGBM] [Info] Number of data points in the train set: 60400, number of used features: 6
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.014503 -> initscore=-4.218769
[LightGBM] [Info] Start training from score -4.218769


[I 2024-10-25 12:31:30,130] Trial 23 finished with value: 0.9890397350993377 and parameters: {'n_estimators': 168, 'max_depth': 11, 'learning_rate': 0.08521278614191356, 'num_leaves': 78, 'min_child_samples': 30, 'subsample': 0.6104803725293763, 'colsample_bytree': 0.7244244314352958}. Best is trial 21 with value: 0.9992218543046357.


[LightGBM] [Info] Number of positive: 876, number of negative: 59524
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002677 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1510
[LightGBM] [Info] Number of data points in the train set: 60400, number of used features: 6
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.014503 -> initscore=-4.218769
[LightGBM] [Info] Start training from score -4.218769


[I 2024-10-25 12:31:31,349] Trial 24 finished with value: 0.9966556291390728 and parameters: {'n_estimators': 183, 'max_depth': 11, 'learning_rate': 0.16731136216796894, 'num_leaves': 68, 'min_child_samples': 27, 'subsample': 0.5006821214449424, 'colsample_bytree': 0.7153251150057512}. Best is trial 21 with value: 0.9992218543046357.


[LightGBM] [Info] Number of positive: 876, number of negative: 59524
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000822 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1510
[LightGBM] [Info] Number of data points in the train set: 60400, number of used features: 6
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.014503 -> initscore=-4.218769
[LightGBM] [Info] Start training from score -4.218769


[I 2024-10-25 12:31:32,642] Trial 25 finished with value: 0.9929139072847682 and parameters: {'n_estimators': 168, 'max_depth': 12, 'learning_rate': 0.1036783226057307, 'num_leaves': 83, 'min_child_samples': 27, 'subsample': 0.608012859832838, 'colsample_bytree': 0.8767498699383802}. Best is trial 21 with value: 0.9992218543046357.


[LightGBM] [Info] Number of positive: 876, number of negative: 59524
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000842 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1510
[LightGBM] [Info] Number of data points in the train set: 60400, number of used features: 6
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.014503 -> initscore=-4.218769
[LightGBM] [Info] Start training from score -4.218769


[I 2024-10-25 12:31:34,323] Trial 26 finished with value: 0.989271523178808 and parameters: {'n_estimators': 200, 'max_depth': 11, 'learning_rate': 0.06283024416172218, 'num_leaves': 84, 'min_child_samples': 27, 'subsample': 0.7319768242701645, 'colsample_bytree': 0.8358386755920538}. Best is trial 21 with value: 0.9992218543046357.


[LightGBM] [Info] Number of positive: 876, number of negative: 59524
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002718 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1510
[LightGBM] [Info] Number of data points in the train set: 60400, number of used features: 6
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.014503 -> initscore=-4.218769
[LightGBM] [Info] Start training from score -4.218769


[I 2024-10-25 12:31:34,865] Trial 27 finished with value: 0.988046357615894 and parameters: {'n_estimators': 72, 'max_depth': 9, 'learning_rate': 0.19376403393749372, 'num_leaves': 66, 'min_child_samples': 24, 'subsample': 0.6259645251246125, 'colsample_bytree': 0.7154999571473976}. Best is trial 21 with value: 0.9992218543046357.


[LightGBM] [Info] Number of positive: 876, number of negative: 59524
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000820 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1510
[LightGBM] [Info] Number of data points in the train set: 60400, number of used features: 6
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.014503 -> initscore=-4.218769
[LightGBM] [Info] Start training from score -4.218769


[I 2024-10-25 12:31:36,058] Trial 28 finished with value: 0.9858940397350994 and parameters: {'n_estimators': 140, 'max_depth': 12, 'learning_rate': 0.03913187437861758, 'num_leaves': 89, 'min_child_samples': 30, 'subsample': 0.5473337172086158, 'colsample_bytree': 0.7598952826335534}. Best is trial 21 with value: 0.9992218543046357.


[LightGBM] [Info] Number of positive: 876, number of negative: 59524
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003799 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1510
[LightGBM] [Info] Number of data points in the train set: 60400, number of used features: 6
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.014503 -> initscore=-4.218769
[LightGBM] [Info] Start training from score -4.218769


[I 2024-10-25 12:31:37,174] Trial 29 finished with value: 0.9886258278145695 and parameters: {'n_estimators': 122, 'max_depth': 11, 'learning_rate': 0.1091493561308604, 'num_leaves': 71, 'min_child_samples': 25, 'subsample': 0.7734539918533418, 'colsample_bytree': 0.6176243058213016}. Best is trial 21 with value: 0.9992218543046357.


[LightGBM] [Info] Number of positive: 876, number of negative: 59524
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003889 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1510
[LightGBM] [Info] Number of data points in the train set: 60400, number of used features: 6
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.014503 -> initscore=-4.218769
[LightGBM] [Info] Start training from score -4.218769


[I 2024-10-25 12:31:38,896] Trial 30 finished with value: 0.9998013245033113 and parameters: {'n_estimators': 186, 'max_depth': 10, 'learning_rate': 0.2308241918845794, 'num_leaves': 80, 'min_child_samples': 28, 'subsample': 0.5930624396802083, 'colsample_bytree': 0.6966282685184425}. Best is trial 30 with value: 0.9998013245033113.


[LightGBM] [Info] Number of positive: 876, number of negative: 59524
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002756 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1510
[LightGBM] [Info] Number of data points in the train set: 60400, number of used features: 6
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.014503 -> initscore=-4.218769
[LightGBM] [Info] Start training from score -4.218769


[I 2024-10-25 12:31:40,369] Trial 31 finished with value: 0.9996688741721854 and parameters: {'n_estimators': 188, 'max_depth': 10, 'learning_rate': 0.22591045614726213, 'num_leaves': 79, 'min_child_samples': 28, 'subsample': 0.5851275540407161, 'colsample_bytree': 0.7017899837622263}. Best is trial 30 with value: 0.9998013245033113.


[LightGBM] [Info] Number of positive: 876, number of negative: 59524
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002650 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1510
[LightGBM] [Info] Number of data points in the train set: 60400, number of used features: 6
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.014503 -> initscore=-4.218769
[LightGBM] [Info] Start training from score -4.218769


[I 2024-10-25 12:31:42,192] Trial 32 finished with value: 0.9999834437086093 and parameters: {'n_estimators': 188, 'max_depth': 10, 'learning_rate': 0.24414879018066415, 'num_leaves': 100, 'min_child_samples': 28, 'subsample': 0.6558116687610593, 'colsample_bytree': 0.6920605429385731}. Best is trial 32 with value: 0.9999834437086093.


[LightGBM] [Info] Number of positive: 876, number of negative: 59524
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002704 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1510
[LightGBM] [Info] Number of data points in the train set: 60400, number of used features: 6
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.014503 -> initscore=-4.218769
[LightGBM] [Info] Start training from score -4.218769


[I 2024-10-25 12:31:44,032] Trial 33 finished with value: 0.9999834437086093 and parameters: {'n_estimators': 187, 'max_depth': 9, 'learning_rate': 0.2460765816142785, 'num_leaves': 99, 'min_child_samples': 22, 'subsample': 0.661669402766543, 'colsample_bytree': 0.6407430160435253}. Best is trial 32 with value: 0.9999834437086093.


[LightGBM] [Info] Number of positive: 876, number of negative: 59524
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002640 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1510
[LightGBM] [Info] Number of data points in the train set: 60400, number of used features: 6
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.014503 -> initscore=-4.218769
[LightGBM] [Info] Start training from score -4.218769


[I 2024-10-25 12:31:45,662] Trial 34 finished with value: 0.9998675496688741 and parameters: {'n_estimators': 177, 'max_depth': 9, 'learning_rate': 0.24629647042407782, 'num_leaves': 95, 'min_child_samples': 21, 'subsample': 0.6468872498208817, 'colsample_bytree': 0.639867980160166}. Best is trial 32 with value: 0.9999834437086093.


[LightGBM] [Info] Number of positive: 876, number of negative: 59524
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002642 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1510
[LightGBM] [Info] Number of data points in the train set: 60400, number of used features: 6
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.014503 -> initscore=-4.218769
[LightGBM] [Info] Start training from score -4.218769


[I 2024-10-25 12:31:47,064] Trial 35 finished with value: 0.9999172185430464 and parameters: {'n_estimators': 148, 'max_depth': 9, 'learning_rate': 0.26650882847698426, 'num_leaves': 100, 'min_child_samples': 21, 'subsample': 0.662317427269006, 'colsample_bytree': 0.6491395888173809}. Best is trial 32 with value: 0.9999834437086093.


[LightGBM] [Info] Number of positive: 876, number of negative: 59524
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002659 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1510
[LightGBM] [Info] Number of data points in the train set: 60400, number of used features: 6
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.014503 -> initscore=-4.218769
[LightGBM] [Info] Start training from score -4.218769


[I 2024-10-25 12:31:48,577] Trial 36 finished with value: 0.999751655629139 and parameters: {'n_estimators': 152, 'max_depth': 9, 'learning_rate': 0.26720063998394655, 'num_leaves': 100, 'min_child_samples': 21, 'subsample': 0.6562377632377491, 'colsample_bytree': 0.6400038036924921}. Best is trial 32 with value: 0.9999834437086093.


[LightGBM] [Info] Number of positive: 876, number of negative: 59524
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003428 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1510
[LightGBM] [Info] Number of data points in the train set: 60400, number of used features: 6
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.014503 -> initscore=-4.218769
[LightGBM] [Info] Start training from score -4.218769


[I 2024-10-25 12:31:50,375] Trial 37 finished with value: 0.9998841059602649 and parameters: {'n_estimators': 160, 'max_depth': 9, 'learning_rate': 0.29817025181918144, 'num_leaves': 93, 'min_child_samples': 20, 'subsample': 0.7045603504286949, 'colsample_bytree': 0.5689650200307164}. Best is trial 32 with value: 0.9999834437086093.


[LightGBM] [Info] Number of positive: 876, number of negative: 59524
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002769 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1510
[LightGBM] [Info] Number of data points in the train set: 60400, number of used features: 6
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.014503 -> initscore=-4.218769
[LightGBM] [Info] Start training from score -4.218769


[I 2024-10-25 12:31:51,549] Trial 38 finished with value: 0.989569536423841 and parameters: {'n_estimators': 149, 'max_depth': 7, 'learning_rate': 0.16105074248042653, 'num_leaves': 94, 'min_child_samples': 19, 'subsample': 0.7081263484968023, 'colsample_bytree': 0.585305323662451}. Best is trial 32 with value: 0.9999834437086093.


[LightGBM] [Info] Number of positive: 876, number of negative: 59524
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002381 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1510
[LightGBM] [Info] Number of data points in the train set: 60400, number of used features: 6
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.014503 -> initscore=-4.218769
[LightGBM] [Info] Start training from score -4.218769


[I 2024-10-25 12:31:52,550] Trial 39 finished with value: 0.9854966887417218 and parameters: {'n_estimators': 116, 'max_depth': 7, 'learning_rate': 0.021495439628771925, 'num_leaves': 100, 'min_child_samples': 23, 'subsample': 0.6898804788136956, 'colsample_bytree': 0.5655228900344649}. Best is trial 32 with value: 0.9999834437086093.


[LightGBM] [Info] Number of positive: 876, number of negative: 59524
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002356 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1510
[LightGBM] [Info] Number of data points in the train set: 60400, number of used features: 6
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.014503 -> initscore=-4.218769
[LightGBM] [Info] Start training from score -4.218769


[I 2024-10-25 12:31:53,645] Trial 40 finished with value: 0.9996854304635762 and parameters: {'n_estimators': 133, 'max_depth': 9, 'learning_rate': 0.2936998712668536, 'num_leaves': 90, 'min_child_samples': 19, 'subsample': 0.7734274239565215, 'colsample_bytree': 0.508046257087283}. Best is trial 32 with value: 0.9999834437086093.


[LightGBM] [Info] Number of positive: 876, number of negative: 59524
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002685 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1510
[LightGBM] [Info] Number of data points in the train set: 60400, number of used features: 6
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.014503 -> initscore=-4.218769
[LightGBM] [Info] Start training from score -4.218769


[I 2024-10-25 12:31:55,061] Trial 41 finished with value: 0.9999834437086093 and parameters: {'n_estimators': 159, 'max_depth': 9, 'learning_rate': 0.290771721753066, 'num_leaves': 94, 'min_child_samples': 22, 'subsample': 0.6669765659891476, 'colsample_bytree': 0.6400884438060099}. Best is trial 32 with value: 0.9999834437086093.


[LightGBM] [Info] Number of positive: 876, number of negative: 59524
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002629 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1510
[LightGBM] [Info] Number of data points in the train set: 60400, number of used features: 6
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.014503 -> initscore=-4.218769
[LightGBM] [Info] Start training from score -4.218769


[I 2024-10-25 12:31:56,526] Trial 42 finished with value: 0.9999172185430464 and parameters: {'n_estimators': 165, 'max_depth': 8, 'learning_rate': 0.29746668735993576, 'num_leaves': 92, 'min_child_samples': 16, 'subsample': 0.677957379410502, 'colsample_bytree': 0.6610758973930178}. Best is trial 32 with value: 0.9999834437086093.


[LightGBM] [Info] Number of positive: 876, number of negative: 59524
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002662 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1510
[LightGBM] [Info] Number of data points in the train set: 60400, number of used features: 6
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.014503 -> initscore=-4.218769
[LightGBM] [Info] Start training from score -4.218769


[I 2024-10-25 12:31:58,025] Trial 43 finished with value: 0.9989900662251656 and parameters: {'n_estimators': 163, 'max_depth': 8, 'learning_rate': 0.2413328333559336, 'num_leaves': 98, 'min_child_samples': 17, 'subsample': 0.6719763896470975, 'colsample_bytree': 0.6674764327878281}. Best is trial 32 with value: 0.9999834437086093.


[LightGBM] [Info] Number of positive: 876, number of negative: 59524
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002615 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1510
[LightGBM] [Info] Number of data points in the train set: 60400, number of used features: 6
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.014503 -> initscore=-4.218769
[LightGBM] [Info] Start training from score -4.218769


[I 2024-10-25 12:31:59,213] Trial 44 finished with value: 0.9925993377483444 and parameters: {'n_estimators': 147, 'max_depth': 8, 'learning_rate': 0.1715667574465274, 'num_leaves': 91, 'min_child_samples': 22, 'subsample': 0.6350146070084688, 'colsample_bytree': 0.6220216228294634}. Best is trial 32 with value: 0.9999834437086093.


[LightGBM] [Info] Number of positive: 876, number of negative: 59524
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002717 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1510
[LightGBM] [Info] Number of data points in the train set: 60400, number of used features: 6
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.014503 -> initscore=-4.218769
[LightGBM] [Info] Start training from score -4.218769


[I 2024-10-25 12:32:00,940] Trial 45 finished with value: 0.9854966887417218 and parameters: {'n_estimators': 174, 'max_depth': 9, 'learning_rate': 0.010785071075589837, 'num_leaves': 96, 'min_child_samples': 16, 'subsample': 0.6714342653396338, 'colsample_bytree': 0.6555216601122392}. Best is trial 32 with value: 0.9999834437086093.


[LightGBM] [Info] Number of positive: 876, number of negative: 59524
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003834 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1510
[LightGBM] [Info] Number of data points in the train set: 60400, number of used features: 6
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.014503 -> initscore=-4.218769
[LightGBM] [Info] Start training from score -4.218769


[I 2024-10-25 12:32:02,284] Trial 46 finished with value: 0.9965562913907284 and parameters: {'n_estimators': 137, 'max_depth': 7, 'learning_rate': 0.2940223358879056, 'num_leaves': 86, 'min_child_samples': 16, 'subsample': 0.7360069164062335, 'colsample_bytree': 0.6160925710518022}. Best is trial 32 with value: 0.9999834437086093.


[LightGBM] [Info] Number of positive: 876, number of negative: 59524
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002663 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1510
[LightGBM] [Info] Number of data points in the train set: 60400, number of used features: 6
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.014503 -> initscore=-4.218769
[LightGBM] [Info] Start training from score -4.218769


[I 2024-10-25 12:32:03,696] Trial 47 finished with value: 0.9856125827814569 and parameters: {'n_estimators': 157, 'max_depth': 8, 'learning_rate': 0.033823777145017116, 'num_leaves': 98, 'min_child_samples': 25, 'subsample': 0.7566310724833678, 'colsample_bytree': 0.6584562941255101}. Best is trial 32 with value: 0.9999834437086093.


[LightGBM] [Info] Number of positive: 876, number of negative: 59524
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000810 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1510
[LightGBM] [Info] Number of data points in the train set: 60400, number of used features: 6
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.014503 -> initscore=-4.218769
[LightGBM] [Info] Start training from score -4.218769


[I 2024-10-25 12:32:05,029] Trial 48 finished with value: 0.999453642384106 and parameters: {'n_estimators': 146, 'max_depth': 9, 'learning_rate': 0.2437544458022232, 'num_leaves': 92, 'min_child_samples': 23, 'subsample': 0.7121902010774463, 'colsample_bytree': 0.7777989781291647}. Best is trial 32 with value: 0.9999834437086093.


[LightGBM] [Info] Number of positive: 876, number of negative: 59524
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002646 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1510
[LightGBM] [Info] Number of data points in the train set: 60400, number of used features: 6
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.014503 -> initscore=-4.218769
[LightGBM] [Info] Start training from score -4.218769


[I 2024-10-25 12:32:06,117] Trial 49 finished with value: 0.9900496688741722 and parameters: {'n_estimators': 131, 'max_depth': 8, 'learning_rate': 0.1486201422376987, 'num_leaves': 87, 'min_child_samples': 19, 'subsample': 0.6555620874191265, 'colsample_bytree': 0.7454664074422042}. Best is trial 32 with value: 0.9999834437086093.


[LightGBM] [Info] Number of positive: 876, number of negative: 59524
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002676 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1510
[LightGBM] [Info] Number of data points in the train set: 60400, number of used features: 6
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000
Preprocess events


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  events['score'].fillna(events['rating'], inplace=True)


Compute TF-IDF
Run ALS


  0%|          | 0/30 [00:00<?, ?it/s]

Postprocess ALS prediction
Generating item features
Generating user features
Merging features to interactions
Running CatBoost scoring


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  item_features['item_bookmark_count'].fillna(0, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  user_features['recent_watch_count'].fillna(0, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on 

Generating item features
Generating user features
Merging features to interactions
Running LightGBM scoring


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  features[col].fillna(0, inplace=True)


Получаем наш сабмишен

In [23]:
# Создание DataFrame для сабмита
submission = final_recommendations[['user_id', 'item_id']].copy()

# Группировка по user_id и объединение item_id в строку, разделенную пробелами
submission = (
    submission
    .groupby('user_id')['item_id']
    .apply(lambda x: ' '.join(map(str, x)))
    .reset_index()
)

# Сохранение сабмита в CSV файл
submission.to_csv('submission_bobs.csv', index=False)

In [24]:
submission

Unnamed: 0,user_id,item_id
0,0,785 1001 2732 760 1545 2256 3463 1248 1811 331
1,1,3205 106 232 3656 2175 1491 1246 1801 1039 1686
2,2,1781 1809 2774 2185 2639 2338 3005 452 2342 2354
3,3,2194 3318 3528 2186 1861 3472 3435 183 1617 3327
4,4,487 37 2076 2774 1858 1809 3677 188 1337 1191
...,...,...
6035,6035,1855 1011 113 717 2366 1375 1216 1374 1403 3013
6036,6036,1379 2798 3473 3013 494 3153 3142 772 2370 2297
6037,6037,1968 1840 1775 3212 2603 618 2335 2784 3583 1747
6038,6038,1289 1808 2626 1100 361 1070 3105 2587 182 2920
