In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
train = pd.read_csv('data/train.csv')
member = pd.read_csv('data/members.csv')
songs = pd.read_csv('data/songs.csv')
extra_song = pd.read_csv('data/song_extra_info.csv')

# Исследование данных

# Тренировочные данные

In [None]:
train.head(5)

Unnamed: 0,msno,song_id,source_system_tab,source_screen_name,source_type,target
0,FGtllVqz18RPiwJj/edr2gV78zirAiY/9SmYvia+kCg=,BBzumQNXUHKdEBOB7mAJuzok+IJA1c2Ryg/yzTF6tik=,explore,Explore,online-playlist,1.0
1,Xumu+NIjS6QYVxDS4/t3SawvJ7viT9hPKXmf0RtLNx8=,bhp/MpSNoqoxOIB+/l8WPqu6jldth4DIpCm3ayXnJqM=,my library,Local playlist more,local-playlist,1.0
2,Xumu+NIjS6QYVxDS4/t3SawvJ7viT9hPKXmf0RtLNx8=,JNWfrrC7zNN7BdMpsISKa4Mw+xVJYNnxXh3/Epw7QgY=,my library,Local playlist more,local-playlist,1.0
3,Xumu+NIjS6QYVxDS4/t3SawvJ7viT9hPKXmf0RtLNx8=,2A87tzfnJTSWqD7gIZHisolhe4DMdzkbd6LzO1KHjNs=,my library,Local playlist more,local-playlist,1.0
4,FGtllVqz18RPiwJj/edr2gV78zirAiY/9SmYvia+kCg=,3qm6XTZ6MOCU11x8FIVbAGH5l5uMkT3/ZalWG1oo2Gc=,explore,Explore,online-playlist,1.0


msno - пользователь\
song_id = композиция\
sourse_system_tab, source_screen_name, source_type - контекстная информация\
target - повторное прослушивание в течении месяца

In [None]:
train.describe()

Unnamed: 0,target
count,23371.0
mean,0.761029
std,0.426464
min,0.0
25%,1.0
50%,1.0
75%,1.0
max,1.0


In [None]:
def summary(df):
    """
    ввод: Функция принимает фрейм данных
    вывод: Подсчитывается общее количество записей и уникальных записей, присутствующих в каждом столбце
    """

    print("Total number of records: ",len(df))
    for i in df.columns:
        print('Distinct {} in dataframe: {}'.format(i,len(np.unique(df[i].astype('str')))))

def count_na(df):
    """
    входные данные: Функция принимает фрейм данных
    выходные данные: Подсчитывает количество значений NA в каждом столбце и % значений NA
    """
    new=pd.DataFrame(df.isnull().astype('int').sum(axis=0),columns=["NA_count"])
    new['Percentage']=df.isnull().astype('int').sum(axis=0)*100/len(df)
    return new

def test_train_diff(train, test):
    count = dict()
    columns = set(train.columns).intersection(test.columns)
    for col in columns:
        set1 = set(train[col])
        set2 = set(test[col])
        new_item = len(set2-set1)
        count[col] = new_item
    return count

In [None]:
summary(train)

Total number of records:  85990
Distinct msno in dataframe: 5732
Distinct song_id in dataframe: 26388
Distinct source_system_tab in dataframe: 9
Distinct source_screen_name in dataframe: 19
Distinct source_type in dataframe: 12
Distinct target in dataframe: 3


In [None]:
np.mean(train.target) # проверяем - сбалансированы ли данные

0.7529218853574294

In [None]:
count_na(train) # проверяем - наличие пропущенных значений

Unnamed: 0,NA_count,Percentage
msno,0,0.0
song_id,1,0.001163
source_system_tab,254,0.295383
source_screen_name,3494,4.063263
source_type,158,0.183742
target,1,0.001163


# Данные member

In [None]:
member.head()

Unnamed: 0,msno,city,bd,gender,registered_via,registration_init_time,expiration_date
0,XQxgAYj3klVKjR3oxPPXYYFp4soD4TuBghkhMTD4oTw=,1,0,,7,20110820,20170920
1,UizsfmJb9mV54qE9hCYyU07Va97c0lCRLEQX3ae+ztM=,1,0,,7,20150628,20170622
2,D8nEhsIOBSoE6VthTaqDX8U6lqjJ7dLdr72mOyLya2A=,1,0,,4,20160411,20170712
3,mCuD+tZ1hERA/o5GPqk38e041J8ZsBaLcu7nGoIIvhI=,1,0,,9,20150906,20150907
4,q4HRBfVSssAFS9iRfxWrohxuk9kCYMKjHOEagUMV6rQ=,1,0,,4,20170126,20170613


city - город\
bd - возраст\
gender - пол\
registered_via - метод регистрации\
registered_init_time - время регистрации\
expiration_data - окончание подписки

In [None]:
member.registration_init_time = pd.to_datetime(
    member.registration_init_time.astype('str'))
member.expiration_date = pd.to_datetime(member.expiration_date.astype('str'))

In [None]:
member.dtypes

msno                              object
city                               int64
bd                                 int64
gender                            object
registered_via                     int64
registration_init_time    datetime64[ns]
expiration_date           datetime64[ns]
dtype: object

In [None]:
summary(member)

Total number of records:  34403
Distinct msno in dataframe: 34403
Distinct city in dataframe: 21
Distinct bd in dataframe: 95
Distinct gender in dataframe: 3
Distinct registered_via in dataframe: 6
Distinct registration_init_time in dataframe: 3862
Distinct expiration_date in dataframe: 1484


In [None]:
member.describe()

Unnamed: 0,city,bd,registered_via
count,34403.0,34403.0,34403.0
mean,5.371276,12.280935,5.953376
std,6.243929,18.170251,2.287534
min,1.0,-43.0,3.0
25%,1.0,0.0,4.0
50%,1.0,0.0,7.0
75%,10.0,25.0,9.0
max,22.0,1051.0,16.0


In [None]:
count_na(member)

Unnamed: 0,NA_count,Percentage
msno,0,0.0
city,0,0.0
bd,0,0.0
gender,19902,57.849606
registered_via,0,0.0
registration_init_time,0,0.0
expiration_date,0,0.0


# Данные songs

In [None]:
songs.head()

Unnamed: 0,song_id,song_length,genre_ids,artist_name,composer,lyricist,language
0,CXoTN1eb7AI+DntdU1vbcwGRV4SCIDxZu+YD8JP8r4E=,247640,465,張信哲 (Jeff Chang),董貞,何啟弘,3.0
1,o0kFgae9QtnYgRkVPqLJwa05zIhRlUjfF7O1tDw0ZDU=,197328,444,BLACKPINK,TEDDY| FUTURE BOUNCE| Bekuh BOOM,TEDDY,31.0
2,DwVvVurfpuz+XPuFvucclVQEyPqcpUkHR0ne1RQzPs0=,231781,465,SUPER JUNIOR,,,31.0
3,dKMBWoZyScdxSkihKG+Vf47nc18N9q4m58+b4e7dSSE=,273554,465,S.H.E,湯小康,徐世珍,3.0
4,W3bqWd3T+VeHFzHAUfARgW9AvVRaF4N5Yzm4Mr6Eo/o=,140329,726,貴族精選,Traditional,Traditional,52.0


song_length - длина композиции\
genre_ids - жанр\
artist_name - исполнитель\
composer - композитор\
lyricist - автор текста\
language - язык композиции

In [None]:
summary(songs)

Total number of records:  31674
Distinct song_id in dataframe: 31674
Distinct song_length in dataframe: 14541
Distinct genre_ids in dataframe: 330
Distinct artist_name in dataframe: 11215
Distinct composer in dataframe: 12224
Distinct lyricist in dataframe: 5778
Distinct language in dataframe: 11


In [None]:
count_na(songs)

Unnamed: 0,NA_count,Percentage
song_id,0,0.0
song_length,0,0.0
genre_ids,532,1.679611
artist_name,0,0.0
composer,13230,41.769274
lyricist,22820,72.046473
language,1,0.003157


# Данные extra_song

In [None]:
extra_song.head()

Unnamed: 0,song_id,name,isrc
0,LP7pLJoJFBvyuUwvu+oLzjT+bI+UeBPURCecJsX1jjs=,我們,TWUM71200043
1,ClazTFnk6r0Bnuie44bocdNMM3rdlrq0bCGAsGUWcHE=,Let Me Love You,QMZSY1600015
2,u2ja/bZE3zhCGxvbbOB3zOoUjx27u40cf5g09UXMoKQ=,原諒我,TWA530887303
3,92Fqsy0+p6+RHe2EoLKjHahORHR1Kq1TBJoClW9v+Ts=,Classic,USSM11301446
4,0QFmz/+rJy1Q56C1DuYqT9hKKqi5TUqx0sN0IwvoHrw=,愛投羅網,TWA471306001


In [None]:
summary(extra_song)

Total number of records:  40926
Distinct song_id in dataframe: 40926
Distinct name in dataframe: 35997
Distinct isrc in dataframe: 34665


In [None]:
count_na(extra_song)

Unnamed: 0,NA_count,Percentage
song_id,0,0.0
name,0,0.0
isrc,5486,13.404682


# Тестовые данные

In [None]:
test = pd.read_csv("data/test.csv")
test.head()

Unnamed: 0,id,msno,song_id,source_system_tab,source_screen_name,source_type
0,0,V8ruy7SGk7tDm3zA51DPpn6qutt+vmKMBKa21dp54uM=,WmHKgKMlp1lQMecNdNvDMkvIycZYHnFwDT72I5sIssc=,my library,Local playlist more,local-library
1,1,V8ruy7SGk7tDm3zA51DPpn6qutt+vmKMBKa21dp54uM=,y/rsZ9DC7FwK5F2PK2D5mj+aOBUJAjuu3dZ14NgE0vM=,my library,Local playlist more,local-library
2,2,/uQAlrAkaczV+nWCd2sPF2ekvXPRipV7q0l+gbLuxjw=,8eZLFOdGVdXBSqoAv5nsLigeH2BvKXzTQYtUM53I0k4=,discover,,song-based-playlist
3,3,1a6oo/iXKatxQx4eS9zTVD+KlSVaAFbTIqVvwLC1Y0k=,ztCf8thYsS4YN3GcIL/bvoxLm/T5mYBVKOO4C9NiVfQ=,radio,Radio,radio
4,4,1a6oo/iXKatxQx4eS9zTVD+KlSVaAFbTIqVvwLC1Y0k=,MKVMpslKcQhMaFEgcEQhEfi5+RZhMYlU3eRDpySrH8Y=,radio,Radio,radio


In [None]:
summary(test)

Total number of records:  224443
Distinct id in dataframe: 224443
Distinct msno in dataframe: 12209
Distinct song_id in dataframe: 53386
Distinct source_system_tab in dataframe: 9
Distinct source_screen_name in dataframe: 19
Distinct source_type in dataframe: 14


In [None]:
test_train_diff(train, test)

{'source_type': 2,
 'source_screen_name': 1,
 'source_system_tab': 1,
 'msno': 10350,
 'song_id': 45965}

Итак, мы видим, что у нас есть 10350 новых msn, которых не было в обучающих данных. У нас также есть 1 новый source_screen_name и 45965 новых песни.

# Подготовка признаков

In [3]:
date_columns = ['expiration_date', 'registration_init_time']

train_data = pd.read_csv('data/train.csv')
train_data = train_data.drop(train_data.index[-1]) # у последней строчки target == None
test_data = pd.read_csv('data/test.csv', index_col=0)
item_data = pd.read_csv('data/songs.csv')
user_data = pd.read_csv('data/members.csv', parse_dates=date_columns)

In [4]:
all_data = pd.concat([train_data, test_data])

all_data = all_data.merge(item_data, on='song_id', how='left')
all_data = all_data.merge(user_data, on='msno', how='left')

Теперь преобразуем категориальные фичи в числовой формат и пофиксим пропущенные значения

In [5]:
from sklearn.preprocessing import LabelEncoder

enc = LabelEncoder()

for col in [
    'msno', 'song_id', 'source_screen_name',
    'source_system_tab', 'source_type', 'genre_ids',
    'artist_name', 'composer', 'lyricist', 'gender'
]:
    all_data[col] = enc.fit_transform(all_data[col].fillna('nan'))

for col in ['language', 'city', 'registered_via']:
    all_data[col] = enc.fit_transform(all_data[col].fillna(-2))

Есть все данные - all_data, располженные по времени. В конце - test часть. После нее небольшая val часть, для проверки решения. train часть - последний кусок из размеченных данных (с уже выкинутой валидационной частью), такого же размера, что и test и еще один train немного сдвинутый по времени влево. Также использую историю для генерации новых фичей.

In [6]:
all_data['time'] = all_data.index / len(all_data)

n = len(train_data)
train_data = all_data[:int(n * 0.8)]
val_data = all_data[int(n * 0.8) : n]
test_data = all_data[n:]

train_data.to_hdf('data/train_data.hdf', key='wsdm')
val_data.to_hdf('data/val_data.hdf', key='wsdm')
test_data.to_hdf('data/test_data.hdf', key='wsdm')

# Теперь создадим дополнительные признаки

train и test разбиты по времени, значит хочется сделать валидацию также разбитую по времени, но временных меток изначально не было, но таблица с данными отсортирована по времени, значит можно использовать индекс, как время. Это приводит к ликам, потому что можно посмотреть - слушал ли этот пользователь этого исполнителя в будущем и т. п. То есть мы получаем фичи из будущего.

In [7]:
from itertools import combinations
from scipy.sparse import coo_matrix
import scipy.sparse as sp

In [8]:
date_cols = ['expiration_date', 'registration_init_time']

train_data = pd.read_hdf('data/train_data.hdf', parse_dates=date_cols)
val_data = pd.read_hdf('data/val_data.hdf', parse_dates=date_cols)
test_data = pd.read_hdf('data/test_data.hdf', parse_dates=date_cols)

In [9]:
all_data = pd.concat([train_data, val_data, test_data])

Сделаем два тренировочных датасета, которые будут меньшего размера и сдвинуты относительно друг друга. Также у нас будет храниться история, так как есть связь со временем. История храниться для каждого датасета.

In [10]:
df_test = test_data
df_history_test = train_data

df_val = val_data
df_history_val = train_data

df_trains = []
df_history_trains = []

n = len(test_data) + len(val_data)
shift = int(0.05*len(train_data))

for i in range(2):
    m = -i*shift
    if m == 0:
        m = None
    df_trains.append(train_data[-(n + i*shift):m])
    df_history_trains.append(train_data[:-(n + i*shift)])

Нормализуем некатегориальные фичи

In [11]:
not_categorical_columns = [
    'target',
    'song_length',
    'registration_init_time',
    'expiration_date',
    'time',
    'bd',
]
categorical_columns = all_data.columns.difference(not_categorical_columns)

orders = {}

for col in categorical_columns:
    orders[col] = 10 ** (int(np.log(all_data[col].max() + 1) / np.log(10)) + 1)

Придумаем новые признаки:\
В основном у нас признаки категориальные. Основная идея - группировать признаки по парам и тройкам и применять некоторую функцию. Самое первое, что можно сделать с категориальными фичами - это посчитать среднее и количество, затем можно генерировать разные фичи связнные со временем прослушивания и в конце - фича про долю уникальных песен. \
count_from_future - count только из будущего\
count_from_past - count только из прошлого

In [12]:
def get_group(df, cols):

    group = df[cols[0]].copy()
    for col in cols[1:]:
        group = group * orders[col] + df[col]

    return group


def mean(df_history, df, cols):

    group = get_group(df, cols)
    group_history = get_group(df_history, cols)

    mean_map = df_history.groupby(group_history).target.mean()

    return group.map(mean_map).fillna(-1)


def count(df_history, df, cols):

    group = get_group(df, cols)
    group_all = get_group(all_data, cols)

    count_map = group_all.value_counts()

    return group.map(count_map).fillna(0)

def time_from_prev_heard(df_history, df, cols):

    group = get_group(df, cols)
    group_history = get_group(df_history, cols)

    last_heard = df_history.groupby(group_history).time.last().to_dict()

    result = []
    for t, g in zip(df.time, group):
        if g in last_heard:
            result.append(t - last_heard[g])
        else:
            result.append(-1)
        last_heard[g] = t

    return result


def time_to_next_heard(df_history, df, cols):

    result = []
    df_reverse = df.sort_index(ascending=False)
    group = get_group(df_reverse,  cols)

    next_heard = {}
    for g, t in zip(group, df_reverse.time):
        if g in next_heard:
            result.append(t - next_heard[g])
        else:
            result.append(-1)
        next_heard[g] = t

    result.reverse()
    return result


def count_from_future(df_history, df, cols):

    result = []
    df_reverse = df.sort_index(ascending=False)
    group = get_group(df_reverse,  cols)

    count = {}
    for g in group.values:
        if g in count:
            result.append(count[g])
            count[g] += 1
        else:
            result.append(0)
            count[g] = 1

    result.reverse()
    return result


def count_from_past(df_history, df, cols):

    group = get_group(df, cols)

    count = {}
    result = []
    for g in group.values:
        if g not in count:
            count[g] = 0
        else:
            count[g] += 1
        result.append(count[g])

    return result


def last_time_diff(df_history, df, cols):

    group = get_group(df, cols)

    last_time = df.groupby(group).time.last()

    return group.map(last_time) - df.time


def part_of_unique_song(df):

    group = get_group(all_data, ['msno', 'artist_name'])
    group_df = get_group(df, ['msno', 'artist_name'])

    num_song_by_artist = all_data.groupby('artist_name').song_id.nunique()
    num_song_by_user_artist = all_data.groupby(group).song_id.nunique()

    s1 = df.artist_name.map(num_song_by_artist)
    s2 = group_df.map(num_song_by_user_artist)

    return s2 / s1

Как видно выше, довольно много разных алгоритмов генерации признаков, но к сожалению, если использовать больше чем 2, то у меня заканчивается ОЗУ. Тем самым я методом перебора выбрал два самых лучших признака - mean (среднее) и last_time_diff (разница времен).

In [13]:
def col_name(cols, func):
    return '_'.join(cols) + '_' + func.__name__


def create_features(df, df_history):

    X = pd.DataFrame()

    for num_col in [1, 2]:
        for cols in combinations(categorical_columns, num_col):
            for func in [
                mean,
                # count,
                # time_to_next_heard,
                # count_from_future,
                last_time_diff,
                # count_from_past
            ]:
                X[col_name(cols, func)] = func(df_history, df, list(cols))

    return X

In [14]:
Xtest = create_features(df_test, df_history_test)
Xval = create_features(df_val, df_history_val)
Xtrain0 = create_features(df_trains[0], df_history_trains[0])
Xtrain1 = create_features(df_trains[1], df_history_trains[1])

  X[col_name(cols, func)] = func(df_history, df, list(cols))
  X[col_name(cols, func)] = func(df_history, df, list(cols))
  X[col_name(cols, func)] = func(df_history, df, list(cols))
  X[col_name(cols, func)] = func(df_history, df, list(cols))
  X[col_name(cols, func)] = func(df_history, df, list(cols))
  X[col_name(cols, func)] = func(df_history, df, list(cols))
  X[col_name(cols, func)] = func(df_history, df, list(cols))
  X[col_name(cols, func)] = func(df_history, df, list(cols))
  X[col_name(cols, func)] = func(df_history, df, list(cols))
  X[col_name(cols, func)] = func(df_history, df, list(cols))
  X[col_name(cols, func)] = func(df_history, df, list(cols))
  X[col_name(cols, func)] = func(df_history, df, list(cols))
  X[col_name(cols, func)] = func(df_history, df, list(cols))
  X[col_name(cols, func)] = func(df_history, df, list(cols))
  X[col_name(cols, func)] = func(df_history, df, list(cols))
  X[col_name(cols, func)] = func(df_history, df, list(cols))
  X[col_name(cols, func)

In [15]:
Xtest.to_hdf('data/Xtest.hdf', key='abc')

Xval.to_hdf('data/Xval.hdf', key='abc')
df_val.target.to_hdf('data/yval.hdf', key='abc')

Xtrain0.to_hdf('data/Xtrain0.hdf', key='abc')
Xtrain1.to_hdf('data/Xtrain1.hdf', key='abc')

df_trains[0].target.to_hdf('data/ytrain0.hdf', key='abc')
df_trains[1].target.to_hdf('data/ytrain1.hdf', key='abc')

# Обучение
Использую 4 модели - CatBoost и XGBoost на первом и втором тренировочном датасете и затем их совмещаю с некоторыми коэффициентами.

In [16]:
!pip install catboost

Collecting catboost
  Downloading catboost-1.2.3-cp310-cp310-manylinux2014_x86_64.whl (98.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.5/98.5 MB[0m [31m7.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: catboost
Successfully installed catboost-1.2.3


In [18]:
import joblib
import xgboost
import catboost

In [19]:
Xtrain0 = pd.read_hdf('data/Xtrain0.hdf')
ytrain0 = pd.read_hdf('data/ytrain0.hdf')
Xtrain1 = pd.read_hdf('data/Xtrain1.hdf')
ytrain1 = pd.read_hdf('data/ytrain1.hdf')

Xtest = pd.read_hdf('data/Xtest.hdf')

Xval = pd.read_hdf('data/Xval.hdf')
yval = pd.read_hdf('data/yval.hdf')

In [27]:
model_xgb = xgboost.XGBClassifier(
    learning_rate=0.03,
    max_depth=7,
    nthread=50,
    seed=1,
    n_estimators=750,
)
model_cb = catboost.CatBoostClassifier(
    iterations=2000,
    learning_rate=0.03,
    depth=7,
    loss_function='Logloss',
    thread_count=50,
    random_seed=1,
)

In [28]:
model_cb.fit(Xtrain0, ytrain0)
p = model_cb.predict_proba(Xval)[:,1]
joblib.dump(p, 'p0_cb_prob_val')
p = model_cb.predict_proba(Xtest)[:,1]
joblib.dump(p, 'p0_cb_prob_test')

0:	learn: 0.6832703	total: 264ms	remaining: 8m 47s
1:	learn: 0.6739180	total: 445ms	remaining: 7m 24s
2:	learn: 0.6651828	total: 614ms	remaining: 6m 49s
3:	learn: 0.6571305	total: 796ms	remaining: 6m 37s
4:	learn: 0.6493433	total: 970ms	remaining: 6m 27s
5:	learn: 0.6421037	total: 1.16s	remaining: 6m 24s
6:	learn: 0.6354120	total: 1.33s	remaining: 6m 20s
7:	learn: 0.6290926	total: 1.51s	remaining: 6m 15s
8:	learn: 0.6231428	total: 1.69s	remaining: 6m 13s
9:	learn: 0.6175479	total: 1.86s	remaining: 6m 11s
10:	learn: 0.6123702	total: 2.04s	remaining: 6m 9s
11:	learn: 0.6075275	total: 2.22s	remaining: 6m 7s
12:	learn: 0.6028237	total: 2.4s	remaining: 6m 7s
13:	learn: 0.5985934	total: 2.57s	remaining: 6m 4s
14:	learn: 0.5946168	total: 2.74s	remaining: 6m 2s
15:	learn: 0.5907483	total: 2.93s	remaining: 6m 3s
16:	learn: 0.5871494	total: 3.12s	remaining: 6m 3s
17:	learn: 0.5837778	total: 3.3s	remaining: 6m 3s
18:	learn: 0.5804888	total: 3.47s	remaining: 6m 2s
19:	learn: 0.5776683	total: 3.65s

['p0_cb_prob_test']

In [29]:
model_cb.fit(Xtrain1, ytrain1)
p = model_cb.predict_proba(Xval)[:,1]
joblib.dump(p, 'p1_cb_prob_val')
p = model_cb.predict_proba(Xtest)[:,1]
joblib.dump(p, 'p1_cb_prob_test')

0:	learn: 0.6831082	total: 223ms	remaining: 7m 26s
1:	learn: 0.6736468	total: 411ms	remaining: 6m 50s
2:	learn: 0.6647910	total: 581ms	remaining: 6m 26s
3:	learn: 0.6566441	total: 760ms	remaining: 6m 19s
4:	learn: 0.6487152	total: 946ms	remaining: 6m 17s
5:	learn: 0.6414057	total: 1.14s	remaining: 6m 19s
6:	learn: 0.6346424	total: 1.32s	remaining: 6m 16s
7:	learn: 0.6281957	total: 1.5s	remaining: 6m 13s
8:	learn: 0.6221896	total: 1.7s	remaining: 6m 15s
9:	learn: 0.6165965	total: 1.86s	remaining: 6m 9s
10:	learn: 0.6113503	total: 2.03s	remaining: 6m 7s
11:	learn: 0.6064205	total: 2.23s	remaining: 6m 8s
12:	learn: 0.6016900	total: 2.4s	remaining: 6m 7s
13:	learn: 0.5974180	total: 2.58s	remaining: 6m 6s
14:	learn: 0.5932739	total: 2.76s	remaining: 6m 4s
15:	learn: 0.5893960	total: 2.94s	remaining: 6m 4s
16:	learn: 0.5857292	total: 3.11s	remaining: 6m 2s
17:	learn: 0.5823180	total: 3.29s	remaining: 6m 2s
18:	learn: 0.5791794	total: 3.48s	remaining: 6m 2s
19:	learn: 0.5763125	total: 3.67s	r

['p1_cb_prob_test']

In [30]:
model_xgb.fit(Xtrain0, ytrain0)
p = model_xgb.predict_proba(Xval)[:,1]
joblib.dump(p, 'p0_xgb_prob_val')
p = model_cb.predict_proba(Xtest)[:,1]
joblib.dump(p, 'p0_xgb_prob_test')

['p0_xgb_prob_test']

In [31]:
model_xgb.fit(Xtrain1, ytrain1)
p = model_xgb.predict_proba(Xval)[:,1]
joblib.dump(p, 'p1_xgb_prob_val')
p = model_cb.predict_proba(Xtest)[:,1]
joblib.dump(p, 'p1_xgb_prob_test')

['p1_xgb_prob_test']

In [32]:
from sklearn.metrics import ndcg_score

p0_cb = joblib.load('p0_cb_prob_val')
p1_cb = joblib.load('p1_cb_prob_val')
p0_xgb = joblib.load('p0_xgb_prob_val')
p1_xgb = joblib.load('p1_xgb_prob_val')

p_cb = 0.6 * p0_cb + 0.4 * p1_cb
p_xgb = 0.6 * p0_xgb + 0.4 * p1_xgb

p = 0.6 * p_cb + 0.4 * p_xgb

def ndcg_at_k(y_true, y_pred, k = 20):
  sorted_indices = np.argsort(y_pred)[::-1]
  y_true = y_true[sorted_indices][:k]
  y_pred = y_pred[sorted_indices][:k]

  return ndcg_score([y_true], [y_pred])

print("Xval score (used XGBoost): ", ndcg_at_k(yval.values, p_xgb))
print("Xval score (used CatBoost): ", ndcg_at_k(yval.values, p_cb))
print("Xval score (used XGBoost + CatBoost): ", ndcg_at_k(yval.values, p))

Xval score (used XGBoost):  0.9764802592558764
Xval score (used CatBoost):  0.9967222855559115
Xval score (used XGBoost + CatBoost):  0.9829517823894169


Теперь получим вероятности для тестовых данных:

In [34]:
p0_cb = joblib.load('p0_cb_prob_test')
p1_cb = joblib.load('p1_cb_prob_test')
p0_xgb = joblib.load('p0_xgb_prob_test')
p1_xgb = joblib.load('p1_xgb_prob_test')

p_cb = 0.6 * p0_cb + 0.4 * p1_cb
p_xgb = 0.6 * p0_xgb + 0.4 * p1_xgb

p = 0.6 * p_cb + 0.4 * p_xgb
p[:5]

array([0.92377323, 0.94636091, 0.53638291, 0.04081557, 0.25016513])