In [1]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, cross_val_score, KFold

from sklearn.metrics import ndcg_score, make_scorer

from sklearn.linear_model import LogisticRegression
import catboost as cb
import xgboost as xgb

In [2]:
# в сыром виде набор обучающих данных занимает порядка 1.1 ГБ, 
# Сократим объём памяти с помощью изменения типа данных
def read_data_memory_efficiently(path_to_csv, eff = True):
    df = pd.read_csv(path_to_csv)

    if eff:
    
        for column in df.columns:
            column_dtype = df[column].dtype
            
            if column_dtype != object:
                column_min = df[column].min()
                column_max = df[column].max()
                
                if str(column_dtype)[:3] == 'int':
                    if column_min > np.iinfo(np.int8).min and column_max < np.iinfo(np.int8).max:
                        df[column] = df[column].astype(np.int8)
                    elif column_min > np.iinfo(np.int16).min and column_max < np.iinfo(np.int16).max:
                        df[column] = df[column].astype(np.int16)
                    elif column_min > np.iinfo(np.int32).min and column_max < np.iinfo(np.int32).max:
                        df[column] = df[column].astype(np.int32)
                    elif column_min > np.iinfo(np.int64).min and column_max < np.iinfo(np.int64).max:
                        df[column] = df[column].astype(np.int64)  
                else:
                    if column_min > np.finfo(np.float16).min and column_max < np.finfo(np.float16).max:
                        df[column] = df[column].astype(np.float16)
                    elif column_min > np.finfo(np.float32).min and column_max < np.finfo(np.float32).max:
                        df[column] = df[column].astype(np.float32)
                    else:
                        df[column] = df[column].astype(np.float64)

        result_mem_usage = df.memory_usage().sum() / 1024**2
        print('Memory usage of {0} dataframe: {1} MB'.format(path_to_csv, result_mem_usage))
    
    return df

def show_missing_percent(df):
    C=len(df.columns)
    L=len(df.index)
    CN=df.count() #количество
    NP=((L-CN)/L)*100 #процент пропущенных значений
    f = pd.DataFrame([NP], index=['Процент пропусков']).style.apply(
                lambda x: ["background-color:tomato" if i>=35 else "" for i in x], axis=1, subset=(["Процент пропусков"], slice(None))
        ).apply(
                lambda x: ["background-color:steelblue" if (i>5 and i<35) else "" for i in x], axis=1, subset=(["Процент пропусков"], slice(None))
        )
    return f

In [3]:
songs_df = read_data_memory_efficiently('data//songs.csv')
members_df = read_data_memory_efficiently('data//members.csv')
train_df = read_data_memory_efficiently('data//train.csv')

Memory usage of data//songs.csv dataframe: 100.73743057250977 MB
Memory usage of data//members.csv dataframe: 0.9187850952148438 MB
Memory usage of data//train.csv dataframe: 288.4619426727295 MB


### Обрабатываем пользователей

In [4]:
members_df.head()

Unnamed: 0,msno,city,bd,gender,registered_via,registration_init_time,expiration_date
0,XQxgAYj3klVKjR3oxPPXYYFp4soD4TuBghkhMTD4oTw=,1,0,,7,20110820,20170920
1,UizsfmJb9mV54qE9hCYyU07Va97c0lCRLEQX3ae+ztM=,1,0,,7,20150628,20170622
2,D8nEhsIOBSoE6VthTaqDX8U6lqjJ7dLdr72mOyLya2A=,1,0,,4,20160411,20170712
3,mCuD+tZ1hERA/o5GPqk38e041J8ZsBaLcu7nGoIIvhI=,1,0,,9,20150906,20150907
4,q4HRBfVSssAFS9iRfxWrohxuk9kCYMKjHOEagUMV6rQ=,1,0,,4,20170126,20170613


In [5]:
members_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 34403 entries, 0 to 34402
Data columns (total 7 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   msno                    34403 non-null  object
 1   city                    34403 non-null  int8  
 2   bd                      34403 non-null  int16 
 3   gender                  14501 non-null  object
 4   registered_via          34403 non-null  int8  
 5   registration_init_time  34403 non-null  int32 
 6   expiration_date         34403 non-null  int32 
dtypes: int16(1), int32(2), int8(2), object(2)
memory usage: 940.8+ KB


In [4]:
show_missing_percent(members_df)

Unnamed: 0,msno,city,bd,gender,registered_via,registration_init_time,expiration_date
Процент пропусков,0.0,0.0,0.0,57.849606,0.0,0.0,0.0


In [34]:
# создаём фичу - время пользования пребываения пользователем платформы
members_df['registration_init_time'] = pd.to_datetime(members_df['registration_init_time'], format='%Y%m%d', errors='ignore')
members_df['registration_init_time_year'] = members_df['registration_init_time'].dt.year

members_df['expiration_date'] = pd.to_datetime(members_df['expiration_date'], format='%Y%m%d', errors='ignore')
members_df['period_of_membership'] = (members_df['expiration_date'] - members_df['registration_init_time']).dt.days.astype(int)
members_df['expiration_date_year'] = members_df['expiration_date'].dt.year

members_df.drop(columns=['registration_init_time', 'expiration_date'], inplace=True)
members_df.head()

  members_df['registration_init_time'] = pd.to_datetime(members_df['registration_init_time'], format='%Y%m%d', errors='ignore')
  members_df['expiration_date'] = pd.to_datetime(members_df['expiration_date'], format='%Y%m%d', errors='ignore')


Unnamed: 0,msno,city,bd,gender,registered_via,registration_init_time_year,period_of_membership,expiration_date_year
0,XQxgAYj3klVKjR3oxPPXYYFp4soD4TuBghkhMTD4oTw=,1,0,,7,2011,2223,2017
1,UizsfmJb9mV54qE9hCYyU07Va97c0lCRLEQX3ae+ztM=,1,0,,7,2015,725,2017
2,D8nEhsIOBSoE6VthTaqDX8U6lqjJ7dLdr72mOyLya2A=,1,0,,4,2016,457,2017
3,mCuD+tZ1hERA/o5GPqk38e041J8ZsBaLcu7nGoIIvhI=,1,0,,9,2015,1,2015
4,q4HRBfVSssAFS9iRfxWrohxuk9kCYMKjHOEagUMV6rQ=,1,0,,4,2017,138,2017


In [None]:
# удаляем гендер так как слишком много пропусков
members_df.drop(columns=['gender'], inplace=True)

In [None]:
# мин макс нормализация
ms = MinMaxScaler()
members_df['period_of_membership'] = ms.fit_transform(members_df['period_of_membership'])

In [5]:
def preprocess_members(members_df_):
    # создаём фичу - время пользования пребываения пользователем платформы
    members_df_['registration_init_time'] = pd.to_datetime(members_df_['registration_init_time'], format='%Y%m%d', errors='ignore')
    members_df_['registration_init_time_year'] = members_df_['registration_init_time'].dt.year

    members_df_['expiration_date'] = pd.to_datetime(members_df_['expiration_date'], format='%Y%m%d', errors='ignore')
    members_df_['period_of_membership'] = (members_df_['expiration_date'] - members_df_['registration_init_time']).dt.days.astype(int)
    members_df_['expiration_date_year'] = members_df_['expiration_date'].dt.year

    members_df_.drop(columns=['registration_init_time', 'expiration_date'], inplace=True)
    members_df_.drop(columns=['gender'], inplace=True)
    return members_df_

members_df = preprocess_members(members_df)
members_df.head()

  members_df_['registration_init_time'] = pd.to_datetime(members_df_['registration_init_time'], format='%Y%m%d', errors='ignore')
  members_df_['expiration_date'] = pd.to_datetime(members_df_['expiration_date'], format='%Y%m%d', errors='ignore')


Unnamed: 0,msno,city,bd,registered_via,registration_init_time_year,period_of_membership,expiration_date_year
0,XQxgAYj3klVKjR3oxPPXYYFp4soD4TuBghkhMTD4oTw=,1,0,7,2011,2223,2017
1,UizsfmJb9mV54qE9hCYyU07Va97c0lCRLEQX3ae+ztM=,1,0,7,2015,725,2017
2,D8nEhsIOBSoE6VthTaqDX8U6lqjJ7dLdr72mOyLya2A=,1,0,4,2016,457,2017
3,mCuD+tZ1hERA/o5GPqk38e041J8ZsBaLcu7nGoIIvhI=,1,0,9,2015,1,2015
4,q4HRBfVSssAFS9iRfxWrohxuk9kCYMKjHOEagUMV6rQ=,1,0,4,2017,138,2017


In [8]:
show_missing_percent(members_df)

Unnamed: 0,msno,city,bd,registered_via,registration_init_time_year,period_of_membership,expiration_date_year
Процент пропусков,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Обрабатываем песни

In [6]:
songs_df.head()

Unnamed: 0,song_id,song_length,genre_ids,artist_name,composer,lyricist,language
0,CXoTN1eb7AI+DntdU1vbcwGRV4SCIDxZu+YD8JP8r4E=,247640,465,張信哲 (Jeff Chang),董貞,何啟弘,3.0
1,o0kFgae9QtnYgRkVPqLJwa05zIhRlUjfF7O1tDw0ZDU=,197328,444,BLACKPINK,TEDDY| FUTURE BOUNCE| Bekuh BOOM,TEDDY,31.0
2,DwVvVurfpuz+XPuFvucclVQEyPqcpUkHR0ne1RQzPs0=,231781,465,SUPER JUNIOR,,,31.0
3,dKMBWoZyScdxSkihKG+Vf47nc18N9q4m58+b4e7dSSE=,273554,465,S.H.E,湯小康,徐世珍,3.0
4,W3bqWd3T+VeHFzHAUfARgW9AvVRaF4N5Yzm4Mr6Eo/o=,140329,726,貴族精選,Traditional,Traditional,52.0


In [10]:
songs_df['genre_ids'][5]

'864|857|850|843'

In [11]:
songs_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2296320 entries, 0 to 2296319
Data columns (total 7 columns):
 #   Column       Dtype  
---  ------       -----  
 0   song_id      object 
 1   song_length  int32  
 2   genre_ids    object 
 3   artist_name  object 
 4   composer     object 
 5   lyricist     object 
 6   language     float16
dtypes: float16(1), int32(1), object(5)
memory usage: 100.7+ MB


In [12]:
show_missing_percent(songs_df)

Unnamed: 0,song_id,song_length,genre_ids,artist_name,composer,lyricist,language
Процент пропусков,0.0,0.0,4.098558,0.0,46.655431,84.714064,4.4e-05


In [35]:
# заполняем числовые данные
songs_df['song_length'] = songs_df['song_length'].astype(float)
songs_df['language'] = songs_df['language'].astype(float)

songs_df['song_length'] = songs_df['song_length'].fillna(songs_df['song_length'].mean(),inplace=False)
songs_df['language'] = songs_df['language'].fillna(songs_df['language'].mean(), inplace=False)

songs_df['song_length'] = songs_df['song_length'].astype(np.int32)
songs_df['language'] = songs_df['language'].astype(np.float16)

In [7]:
def count_genre_ids(genre):
    if genre == 'unknown':
        return 0
    else:
        return genre.count('|') + 1

def count_content(col):
    if col == 'unknown':
        return 0 
    return col.count('|') + col.count("\\") + col.count("/") + col.count(";") + 1

def fill_and_count(df: pd.DataFrame, column_name: str):
    if df[column_name].dtype == 'category':
        df[column_name] = df[column_name].astype('object') 
    df[column_name] = df[column_name].fillna('unknown')
    df[column_name+'_count'] = df[column_name].apply(func=count_content).astype(np.uint8)
    return df

In [40]:
for col in ['genre_ids', 'composer', 'artist_name']:
    songs_df = fill_and_count(songs_df, col)
songs_df.head()

Unnamed: 0,song_id,song_length,genre_ids,artist_name,composer,lyricist,language,genre_ids_count,composer_count,artist_name_count
0,CXoTN1eb7AI+DntdU1vbcwGRV4SCIDxZu+YD8JP8r4E=,247640,465,張信哲 (Jeff Chang),董貞,何啟弘,3.0,1,1,1
1,o0kFgae9QtnYgRkVPqLJwa05zIhRlUjfF7O1tDw0ZDU=,197328,444,BLACKPINK,TEDDY| FUTURE BOUNCE| Bekuh BOOM,TEDDY,31.0,1,3,1
2,DwVvVurfpuz+XPuFvucclVQEyPqcpUkHR0ne1RQzPs0=,231781,465,SUPER JUNIOR,unknown,,31.0,1,0,1
3,dKMBWoZyScdxSkihKG+Vf47nc18N9q4m58+b4e7dSSE=,273554,465,S.H.E,湯小康,徐世珍,3.0,1,1,1
4,W3bqWd3T+VeHFzHAUfARgW9AvVRaF4N5Yzm4Mr6Eo/o=,140329,726,貴族精選,Traditional,Traditional,52.0,1,1,1


In [8]:
def preprocess_songs(songs_df_):
    """Предобработка песен без удаления информации о жанрах"""
    # заполняем числовые данные
    songs_df_['song_length'] = songs_df_['song_length'].astype(float)
    songs_df_['language'] = songs_df_['language'].astype(float)

    songs_df_['song_length'] = songs_df_['song_length'].fillna(songs_df_['song_length'].mean(),inplace=False)
    songs_df_['language'] = songs_df_['language'].fillna(songs_df_['language'].mean(), inplace=False)

    songs_df_['song_length'] = songs_df_['song_length'].astype(np.int32)
    songs_df_['language'] = songs_df_['language'].astype(np.float16)

    for col in ['genre_ids', 'composer', 'artist_name']:
        songs_df_ = fill_and_count(songs_df_, col)
    
    songs_df_.drop(columns=['lyricist', 'artist_name', 'composer'], inplace=True)
    return songs_df_
songs_df = preprocess_songs(songs_df) 

In [9]:
show_missing_percent(songs_df)

Unnamed: 0,song_id,song_length,genre_ids,language,genre_ids_count,composer_count,artist_name_count
Процент пропусков,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [16]:
songs_df.drop(columns=['lyricist', 'artist_name', 'composer'], inplace=True)


In [10]:
# пока что удалим genre_ids
songs_df.drop(columns=['genre_ids'], inplace=True)

In [14]:
songs_df.head()

Unnamed: 0,song_id,song_length,language,genre_ids_count,composer_count,artist_name_count
0,CXoTN1eb7AI+DntdU1vbcwGRV4SCIDxZu+YD8JP8r4E=,247640,3.0,1,1,1
1,o0kFgae9QtnYgRkVPqLJwa05zIhRlUjfF7O1tDw0ZDU=,197328,31.0,1,3,1
2,DwVvVurfpuz+XPuFvucclVQEyPqcpUkHR0ne1RQzPs0=,231781,31.0,1,0,1
3,dKMBWoZyScdxSkihKG+Vf47nc18N9q4m58+b4e7dSSE=,273554,3.0,1,1,1
4,W3bqWd3T+VeHFzHAUfARgW9AvVRaF4N5Yzm4Mr6Eo/o=,140329,52.0,1,1,1


### Songs extra info

In [34]:
songs_ei_df.head()

Unnamed: 0,song_id,name,isrc
0,LP7pLJoJFBvyuUwvu+oLzjT+bI+UeBPURCecJsX1jjs=,我們,TWUM71200043
1,ClazTFnk6r0Bnuie44bocdNMM3rdlrq0bCGAsGUWcHE=,Let Me Love You,QMZSY1600015
2,u2ja/bZE3zhCGxvbbOB3zOoUjx27u40cf5g09UXMoKQ=,原諒我,TWA530887303
3,92Fqsy0+p6+RHe2EoLKjHahORHR1Kq1TBJoClW9v+Ts=,Classic,USSM11301446
4,0QFmz/+rJy1Q56C1DuYqT9hKKqi5TUqx0sN0IwvoHrw=,愛投羅網,TWA471306001


### Обрабатываем train data

In [11]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7377418 entries, 0 to 7377417
Data columns (total 6 columns):
 #   Column              Dtype 
---  ------              ----- 
 0   msno                object
 1   song_id             object
 2   source_system_tab   object
 3   source_screen_name  object
 4   source_type         object
 5   target              int8  
dtypes: int8(1), object(5)
memory usage: 288.5+ MB


In [12]:
show_missing_percent(train_df)

Unnamed: 0,msno,song_id,source_system_tab,source_screen_name,source_type,target
Процент пропусков,0.0,0.0,0.336825,5.622618,0.291959,0.0


In [13]:
# кодируем категориальные признаки
# заполним пропуски модой
for col in ['source_system_tab', 'source_screen_name', 'source_type']:
    train_df[col] = train_df[col].fillna(train_df[col].mode().values[0])
    

for col in ['source_system_tab', 'source_screen_name', 'source_type']:
    lb = LabelEncoder()
    lb.fit(list(train_df[col].values))
    train_df[col] = lb.transform(list(train_df[col].values))

In [14]:
train_df.head()

Unnamed: 0,msno,song_id,source_system_tab,source_screen_name,source_type,target
0,FGtllVqz18RPiwJj/edr2gV78zirAiY/9SmYvia+kCg=,BBzumQNXUHKdEBOB7mAJuzok+IJA1c2Ryg/yzTF6tik=,1,7,6,1
1,Xumu+NIjS6QYVxDS4/t3SawvJ7viT9hPKXmf0RtLNx8=,bhp/MpSNoqoxOIB+/l8WPqu6jldth4DIpCm3ayXnJqM=,3,8,4,1
2,Xumu+NIjS6QYVxDS4/t3SawvJ7viT9hPKXmf0RtLNx8=,JNWfrrC7zNN7BdMpsISKa4Mw+xVJYNnxXh3/Epw7QgY=,3,8,4,1
3,Xumu+NIjS6QYVxDS4/t3SawvJ7viT9hPKXmf0RtLNx8=,2A87tzfnJTSWqD7gIZHisolhe4DMdzkbd6LzO1KHjNs=,3,8,4,1
4,FGtllVqz18RPiwJj/edr2gV78zirAiY/9SmYvia+kCg=,3qm6XTZ6MOCU11x8FIVbAGH5l5uMkT3/ZalWG1oo2Gc=,1,7,6,1


### Мерджим данные

In [None]:
# # мерджим все данные и удаляем столбики с айди
train_df = pd.merge(train_df, songs_df, how='left', on='song_id')
train_df = pd.merge(train_df, members_df, how='left', on='msno')
train_df.drop(columns=['msno', 'song_id'], inplace=True)

del(songs_df)
del(members_df)

In [16]:
train_df.head()

Unnamed: 0,source_system_tab,source_screen_name,source_type,target,song_length,language,genre_ids_count,composer_count,artist_name_count,city,bd,registered_via,registration_init_time_year,period_of_membership,expiration_date_year
0,1,7,6,1,206471.0,52.0,1.0,2.0,1.0,1,0,7,2012,2103,2017
1,3,8,4,1,284584.0,52.0,1.0,0.0,1.0,13,24,9,2011,2301,2017
2,3,8,4,1,225396.0,52.0,1.0,1.0,1.0,13,24,9,2011,2301,2017
3,3,8,4,1,255512.0,-1.0,1.0,1.0,1.0,13,24,9,2011,2301,2017
4,1,7,6,1,187802.0,52.0,1.0,3.0,1.0,1,0,7,2012,2103,2017


In [17]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7377418 entries, 0 to 7377417
Data columns (total 15 columns):
 #   Column                       Dtype  
---  ------                       -----  
 0   source_system_tab            int32  
 1   source_screen_name           int32  
 2   source_type                  int32  
 3   target                       int8   
 4   song_length                  float64
 5   language                     float16
 6   genre_ids_count              float64
 7   composer_count               float64
 8   artist_name_count            float64
 9   city                         int8   
 10  bd                           int16  
 11  registered_via               int8   
 12  registration_init_time_year  int32  
 13  period_of_membership         int32  
 14  expiration_date_year         int32  
dtypes: float16(1), float64(4), int16(1), int32(6), int8(3)
memory usage: 443.2 MB


In [20]:
show_missing_percent(train_df)

Unnamed: 0,source_system_tab,source_screen_name,source_type,target,song_length,language,genre_ids_count,composer_count,artist_name_count,city,bd,registered_via,registration_init_time_year,period_of_membership,expiration_date_year
Процент пропусков,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [19]:
# удалим несколько строчек, для которых не найдена песня
train_df.dropna(inplace=True)

## Обучение

In [22]:
# готовим данные
X_train = train_df.drop(columns=['target'])
y_train = train_df['target']
X_train = X_train.to_numpy()
y_train = y_train.to_numpy()

X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, test_size=0.3, random_state=43)
y_test_dummy = pd.get_dummies(y_test).values.astype(int)

In [23]:
def train_and_test_model(model, parameters) -> float:
    model.fit(**parameters)
    print('Finished fitting\nStarted prediction')
    y_pred = model.predict(X_test)
    y_pred = pd.get_dummies(y_pred).astype(int).values
    score = ndcg_score(y_test_dummy, y_pred)
    return score

### Случайный лес

In [30]:
from sklearn.ensemble import RandomForestClassifier

In [28]:
X_train = train_df.drop(columns=['target'])
X_train.to_numpy()
y_train = train_df['target'].values

In [31]:
rf = RandomForestClassifier()
rf.fit(X_train, y_train)

### Логистическая регрессия

In [79]:
from sklearn.linear_model import LogisticRegression

In [80]:
log_reg = LogisticRegression(penalty='l2', tol=1e-3, n_jobs=2)

In [84]:
parameters_logreg = {
    'X': X_train,
    'y': y_train
}
log_reg_score = train_and_test_model(log_reg, parameters_logreg)
log_reg_score

Finished fitting
Started prediction


0.8444671459282674

### CatBoost

In [91]:
import catboost as cb

In [97]:
cb_model = cb.CatBoostClassifier(
    loss_function='CrossEntropy',
    n_estimators=250,
    learning_rate=0.05,
    max_depth=10,
)

In [98]:
parameters_cb = {
    'X': X_train,
    'y': y_train,
    'early_stopping_rounds': 5,
    'eval_set': [(X_test, y_test)],
    'verbose' :False
}
cb_score = train_and_test_model(cb_model, parameters_cb) 
cb_score

Finished fitting
Started prediction


0.8669356207478021

### XGBoost

In [45]:
import xgboost as xgb

In [89]:
xgb_model = xgb.XGBClassifier(
    objective='binary:logistic', 
    n_estimators=250,
    learning_rate=0.05,
    colsample_bytree=0.70,
    max_depth=10,
    n_jobs=4
)

In [90]:
parameters_xgboost = {
    'X': X_train,
    'y': y_train,
    'early_stopping_rounds': 5,
    'eval_set': [(X_test, y_test)],
    'verbose' :False
}
xgboost_score = train_and_test_model(xgb_model, parameters_xgboost) 
xgboost_score



Finished fitting
Started prediction


0.8731065469345449

### Промежуточный вывод

Лучше всех себя показал классификатор XGBoost. Далее будем использовать его. Здесь попробуем происпользовать оптимизацию гиперпараметров

#### попробуем оптимизировать параметры

In [None]:
parameters_xgboost = {
    'X': X_train,
    'y': y_train,
    'early_stopping_rounds': 5,
    'eval_set': [(X_test, y_test)],
    'verbose' :False
}
xgboost_score = train_and_test_model(xgb_model, parameters_xgboost) 
xgboost_score

In [122]:
import time
import optuna
from sklearn.metrics import roc_auc_score
from sklearn.pipeline import Pipeline
def custom_ndcg_score(y, y_pred):
    y = pd.get_dummies(y).values.astype(int)
    y_pred = pd.get_dummies(y_pred).values.astype(int)
    score = ndcg_score(y, y_pred)
    return score

def objective(trial):
    # Define hyperparameters to be optimized
    params_xgboost= {
        'tree_method': 'hist',
        'enable_categorical': 'True',
        'objective': 'binary:logistic',
        'eval_metric': 'error',
        'n_estimators': trial.suggest_int('n_estimators', 30, 5000),
        'learning_rate': trial.suggest_float('learning_rate', 0.001, 0.01, log=True),
        'max_depth': trial.suggest_int('max_depth', 4, 15),
        'random_state': 897465162
    } 
        
    model = xgb.XGBClassifier(**params_xgboost)
    
    # Create pipeline
    pipeline = Pipeline(steps=[('model', model)])
    
    # Evaluate model using cross-validation
    kf = KFold(n_splits=10, shuffle=True, random_state=2376326)
    ndcg_scorer = make_scorer(custom_ndcg_score, response_method="predict") 
    scores = cross_val_score(pipeline, X_train, y_train, scoring=ndcg_scorer, cv=kf, n_jobs=-1)
    return scores.mean()

# Create study object and optimize hyperparameters
start_time = time.time()

best_xgboost = optuna.create_study(study_name='xgboost_opt', direction='maximize')
best_xgboost.optimize(objective, n_trials=5)

# Get the best hyperparameters for random forest 
best_params_xgboost = best_xgboost.best_params
end_time = time.time()
elapsed_time = (end_time - start_time)/60

print(f"elapsed time: {elapsed_time:.2f} minutes:")
print("Best hyperparameters found:", best_params_xgboost)

model = xgb.XGBClassifier(n_jobs=4, **best_params_xgboost)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
ncgd_score_best = custom_ndcg_score(y_test, y_pred)
print('Metric with best_params: ', ncgd_score_best)

[I 2024-04-30 14:18:45,886] A new study created in memory with name: xgboost_opt
[I 2024-04-30 14:19:01,425] Trial 0 finished with value: 0.8376090915714413 and parameters: {'n_estimators': 984, 'learning_rate': 0.001783756848661729, 'max_depth': 15}. Best is trial 0 with value: 0.8376090915714413.
[I 2024-04-30 14:19:02,055] Trial 1 finished with value: 0.85311004192144 and parameters: {'n_estimators': 286, 'learning_rate': 0.008480171411220884, 'max_depth': 4}. Best is trial 1 with value: 0.85311004192144.
[I 2024-04-30 14:19:08,365] Trial 2 finished with value: 0.8412997940357266 and parameters: {'n_estimators': 965, 'learning_rate': 0.0013094771555968096, 'max_depth': 8}. Best is trial 1 with value: 0.85311004192144.
[I 2024-04-30 14:19:23,245] Trial 3 finished with value: 0.8302276866428702 and parameters: {'n_estimators': 2546, 'learning_rate': 0.008970767515309857, 'max_depth': 13}. Best is trial 1 with value: 0.85311004192144.
[I 2024-04-30 14:19:36,406] Trial 4 finished with v

elapsed time: 0.84 minutes:
Best hyperparameters found: {'n_estimators': 286, 'learning_rate': 0.008480171411220884, 'max_depth': 4}
Metric with best_params:  0.8619910421134808


Вывод: Слишком долгие вычисления. Нецелесообразно

#### Подбор функции ошибки

In [25]:
xgb_model = xgb.XGBClassifier(
    objective='binary:logitraw', 
    n_estimators=250,
    learning_rate=0.05,
    colsample_bytree=0.70,
    max_depth=10,
    n_jobs=4
)

In [None]:
parameters_xgboost = {
    'X': X_train,
    'y': y_train,
    'early_stopping_rounds': 5,
    'eval_set': [(X_test, y_test)],
    'verbose' :False
}
xgboost_score = train_and_test_model(xgb_model, parameters_xgboost) 
xgboost_score

### Разбиение жанров на отдельные фичи

**Комментарий к просиходящему:** Музыкальные жанры описаны через вертикальную черту. Разделим их на отдельные признаки и попробуем обучить XGBoostClassifier

# Заключение

Лучшая метрика качества была достигнута при использовании XGBoostClassifier и составила **0.87**. В качестве метрики использовалась logistic error. 
Результаты feature engineering:
1. Добавлены фичи:
- дата регистрации и окончания действия подписки у пользователя;
- время продления подписки;
- количество композиторов, исполлнителей, жанров произведения;
2. Проверена производительность моделей:
- LogistirRegression (ndcg_score = 0.8444671459282674)
- CatBoostClassifier (ndcg_score = 0.8669356207478021)
- XGBoostClassifier (ndcg_score = 0.8731065469345449)

Улучшение метрик возможно при:
- более тщательном подборе функции ошибки;
- разбиении музыкальных жанров на отдельные фичи для участия их в обучении;
- использовании дополнительной информации о музыке (isrc)