In [32]:
import numpy as np 
import pandas as pd
import datetime as dt
import seaborn as sns
import matplotlib.pyplot as plt
import lightgbm as lgb
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split, StratifiedKFold
from tqdm import tqdm 
import multiprocessing
from copy import deepcopy 
from gensim.models import Word2Vec

%config InlineBackend.figure_format = 'retina'

### Построение эмбеддингов:

In [2]:
# Read the three input CSV files, in the same order as before.
train, songs, songs_extra = (
    pd.read_csv(csv_name)
    for csv_name in ('train.csv', 'songs.csv', 'song_extra_info.csv')
)

In [3]:
# Build a per-song key of the form "<artist_name>+<name>".
# The join key is given once (it was previously listed twice).
merged_songs = songs.merge(songs_extra, on='song_id', how='left')
# Series-level concatenation replaces the row-wise apply(axis=1); element-wise
# str() keeps the previous handling of missing values (NaN becomes 'nan').
merged_songs["artist_song"] = (
    merged_songs['artist_name'].map(str) + '+' + merged_songs['name'].map(str)
)
# Keep only the columns needed downstream and attach the song key to every row.
train = train[['msno', 'song_id', 'target']].merge(
    merged_songs[['song_id', 'artist_song']], on='song_id', how='left'
)

In [4]:
# Keep an independent snapshot of the full table for the stacking step,
# plus a plain (msno, artist_song) array for fast row iteration.
frame = deepcopy(train)
frame_np = frame[['msno', 'artist_song']].to_numpy()

In [5]:
# Keep only the rows with target == 1 and the two columns used for embeddings.
# .copy() makes the result an independent frame, so the column assignments done
# on `train` later do not operate on a slice of the original table.
train = train.loc[train.target == 1, ['msno', 'artist_song']].copy()

In [8]:
# Swap the hashes / string keys for compact integer ids so later steps run faster.
# Two lookup tables are kept: ids are assigned in order of first appearance.
udict = {user_hash: idx for idx, user_hash in enumerate(pd.unique(train["msno"]))}
sdict = {song_key: idx for idx, song_key in enumerate(pd.unique(train["artist_song"]))}
train["msno"] = train["msno"].map(udict.__getitem__)
train["artist_song"] = train["artist_song"].map(sdict.__getitem__)

In [9]:
# Build the corpus: one "sentence" per user, made of the (stringified) ids of
# the songs that user listened to.
# A single order-preserving groupby replaces the per-user boolean mask over the
# whole array, which cost O(users x rows). sort=False keeps users in order of
# first appearance (the same order as train.msno.unique()), and rows keep their
# original order inside each user's list.
text = (
    train["artist_song"].astype(str)
    .groupby(train["msno"], sort=False)
    .apply(list)
    .tolist()
)


                                                      

### Эмбеддинги для песен

In [10]:
# Set up a CBOW Word2Vec model (sg=0) and build its vocabulary from the
# per-user song sequences; training happens in the next cell.
model = Word2Vec(
    sg=0,
    size=100,
    window=10,
    negative=15,
    min_count=1,
    workers=multiprocessing.cpu_count(),
)

model.build_vocab(text)

In [11]:
%%time
# Train the model on the per-user song sequences for 100 epochs; the value
# returned by train() is shown as the cell output.
model.train(text, total_examples=model.corpus_count, epochs=100, report_delay=1)

CPU times: user 34min, sys: 4.15 s, total: 34min 4s
Wall time: 3min 49s


(371304309, 371465600)

In [20]:
# for s in set(sdict):
#     try:
#         if 'Eminem' in s:
#             print(s)
#     except:
#         pass


In [18]:
# Look up the integer id assigned to an Eminem track, to inspect its nearest neighbours below.
sdict['Eminem+Beautiful']

21387

In [16]:
# Reverse lookup: integer song id -> "artist+name" key.
inverse_sdict = dict(zip(sdict.values(), sdict.keys()))

def get_most_similar(idx, topn=5, wv=None, names=None):
    """Print the songs whose embeddings are closest to the given song token.

    Args:
        idx: Vocabulary key of the query song (the integer song id as a string).
        topn: Number of neighbours to print. Defaults to 5.
        wv: Object exposing ``most_similar(key, topn=...)``. Defaults to the
            notebook-level ``model.wv``.
        names: Mapping from integer song id to its printable label. Defaults
            to the notebook-level ``inverse_sdict``.

    Returns:
        None. One ``SCORE: <similarity>, <label>`` line is printed per neighbour.
    """
    vectors = model.wv if wv is None else wv
    labels = inverse_sdict if names is None else names
    # Request exactly `topn` neighbours instead of slicing the default top-10.
    for token, similarity in vectors.most_similar(idx, topn=topn):
        print(f"SCORE: {similarity:.2f}, {labels[int(token)]}")

In [19]:
# Derive the vocabulary token from sdict instead of hard-coding the id printed
# by an earlier cell, so the call stays valid if the id assignment changes.
get_most_similar(str(sdict['Eminem+Beautiful']))

SCORE: 0.62, Eminem+Mockingbird
SCORE: 0.60, Eminem+When I'm Gone
SCORE: 0.57, Eminem+Legacy
SCORE: 0.57, Eminem+Cleanin' Out My Closet
SCORE: 0.56, Eminem+Survival


In [21]:
# User embeddings: the mean of the embeddings of the songs the user listened to.
# Each user id is paired with its own sentence explicitly; `text` holds one
# sentence per user in the order of train.msno.unique(). The previous version
# reused the name `i` for both the user and the song and relied on the
# enumerate index coinciding with the user id.
user_embedings = {
    int(user_id): np.array([model.wv[song_token] for song_token in user_songs]).mean(axis=0)
    for user_id, user_songs in tqdm(zip(train.msno.unique(), text),
                                    total=len(text), position=0, leave=False)
}

                             

In [22]:
# Extra feature: dot product between the user embedding and the song embedding.
# Rows whose user or song has no embedding keep the default score of 0.
scores = []

for user_hash, song_key in tqdm(frame_np, position=0, leave=False):
    score = 0
    user_id = udict.get(user_hash)
    song_id = sdict.get(song_key)
    if user_id is not None and song_id is not None:
        song_token = str(song_id)
        if user_id in user_embedings and song_token in model.wv:
            score = np.sum(user_embedings[user_id] * model.wv[song_token])
    scores.append(score)

                                                             

In [23]:
# Attach the embedding scores as a new column; `scores` holds one value per
# row of frame_np, in the same row order as `frame`.
frame['scores'] = scores

### Стекинг эмбеддингов и фичей посчитанных в 1-ой части

In [24]:
# Columns to be parsed straight into the pandas 'category' dtype.
types = dict.fromkeys(
    ('source_system_tab',
     'source_screen_name',
     'source_type',
     'genre_ids',
     'artist_name'),
    'category',
)

part_1 = pd.read_csv('part_1.csv', dtype=types)

In [25]:
# Left-join the embedding scores onto the part-1 table by (msno, song_id);
# frame's own target / artist_song columns are dropped before the join.
part_embs = part_1.merge(
    frame.drop(columns=['target', 'artist_song']),
    on=['msno', 'song_id'],
    how='left',
)

In [26]:
# Feature matrix: everything except the label, the identifiers and the columns
# excluded from the model. 'country' was listed twice before; each label now
# appears once.
X = part_embs.drop(columns=['target', 'msno', 'song_id', 'msno_artist', 'msno_genre', 'index',
                            'country', 'genres_pop', 'gender', 'language'])
# Binary label predicted by the model.
y = part_embs['target']

In [33]:
# Hyper-parameters are not re-tuned here: the values selected in part 1 are reused for the comparison.
def k_fold_training(X, y, k=5, stratify_labels=None, random_state=10):
    """Cross-validate a LightGBM binary classifier and print the ROC AUC per fold.

    For every fold the training part is further split 80/20 (stratified on the
    label) into a fit set and a monitoring set; the held-out fold is used only
    for the reported AUC.

    Args:
        X: Feature DataFrame, indexed positionally via ``.iloc``.
        y: Label Series aligned with ``X``.
        k: Number of folds. Defaults to 5.
        stratify_labels: Labels handed to ``StratifiedKFold.split`` to stratify
            the folds. Defaults to the notebook-level ``part_embs['msno']``,
            which is what this function always used.
            NOTE(review): this stratifies on the user id rather than on ``y`` -
            confirm that this is intended.
        random_state: Seed for the shuffled fold assignment, so that runs are
            reproducible. Defaults to 10, the seed already used for the inner
            train/validation split.

    Returns:
        None. Per-fold AUCs and their mean are printed.
    """
    if stratify_labels is None:
        # Fall back to the column that was previously read from the global scope.
        stratify_labels = part_embs['msno']

    # Fixed hyper-parameters; built once rather than on every fold.
    params = {'objective': 'binary',
              'learning_rate': 0.27,
              'metric': 'auc',
              'max_depth': 10,
              'num_leaves': 256,
              'lambda_l1': 2.0,
              'lambda_l2': 2.5}

    aucs = []
    kf = StratifiedKFold(n_splits=k, shuffle=True, random_state=random_state)

    for i, (train_ind, test_ind) in enumerate(kf.split(X, stratify_labels)):
        print(f"Folder {i+1}:")

        X_train, y_train = X.iloc[train_ind], y.iloc[train_ind]
        X_test, y_test = X.iloc[test_ind], y.iloc[test_ind]

        # Carve a monitoring set out of the training part of the fold.
        X_train, X_val, y_train, y_val = train_test_split(
            X_train, y_train, test_size=0.2, random_state=10, stratify=y_train)

        lgbtrain = lgb.Dataset(X_train, y_train)
        lgbval = lgb.Dataset(X_val, y_val)

        # A fresh copy of the parameters is passed on every fold.
        gbm = lgb.train(dict(params),
                        train_set=lgbtrain,
                        num_boost_round=100,
                        verbose_eval=25,
                        valid_sets=[lgbtrain, lgbval],)
        y_pred = gbm.predict(X_test)
        auc = roc_auc_score(y_test, y_pred)
        print(auc)
        aucs.append(auc)

    print(f'Mean AUC_ROC: {np.mean(aucs):.3f} with {k} folds')

In [34]:
# Run the cross-validation with the default number of folds (k=5) on the stacked features.
k_fold_training(X, y)



Folder 1:
[LightGBM] [Info] Number of positive: 2376403, number of negative: 2345144
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 6355
[LightGBM] [Info] Number of data points in the train set: 4721547, number of used features: 21




[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.503310 -> initscore=0.013241
[LightGBM] [Info] Start training from score 0.013241
[25]	training's auc: 0.792832	valid_1's auc: 0.783156
[50]	training's auc: 0.802695	valid_1's auc: 0.788673
[75]	training's auc: 0.81002	valid_1's auc: 0.792416
[100]	training's auc: 0.816749	valid_1's auc: 0.795604
0.7950830382067704
Folder 2:
[LightGBM] [Info] Number of positive: 2377393, number of negative: 2344154
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 6327
[LightGBM] [Info] Number of data points in the train set: 4721547, number of used features: 21




[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.503520 -> initscore=0.014080
[LightGBM] [Info] Start training from score 0.014080
[25]	training's auc: 0.793437	valid_1's auc: 0.782486
[50]	training's auc: 0.803673	valid_1's auc: 0.788359
[75]	training's auc: 0.81123	valid_1's auc: 0.792229
[100]	training's auc: 0.81641	valid_1's auc: 0.794382
0.7944390120304488
Folder 3:
[LightGBM] [Info] Number of positive: 2378097, number of negative: 2343450
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 6401
[LightGBM] [Info] Number of data points in the train set: 4721547, number of used features: 21




[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.503669 -> initscore=0.014676
[LightGBM] [Info] Start training from score 0.014676
[25]	training's auc: 0.794379	valid_1's auc: 0.783358
[50]	training's auc: 0.804251	valid_1's auc: 0.788754
[75]	training's auc: 0.810564	valid_1's auc: 0.791815
[100]	training's auc: 0.815114	valid_1's auc: 0.793526
0.7930116659601798
Folder 4:
[LightGBM] [Info] Number of positive: 2377281, number of negative: 2344267
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 6339
[LightGBM] [Info] Number of data points in the train set: 4721548, number of used features: 21




[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.503496 -> initscore=0.013985
[LightGBM] [Info] Start training from score 0.013985
[25]	training's auc: 0.79443	valid_1's auc: 0.782945
[50]	training's auc: 0.803582	valid_1's auc: 0.787912
[75]	training's auc: 0.810582	valid_1's auc: 0.791412
[100]	training's auc: 0.816204	valid_1's auc: 0.793949
0.7942354398990903
Folder 5:
[LightGBM] [Info] Number of positive: 2377725, number of negative: 2343823
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 6427
[LightGBM] [Info] Number of data points in the train set: 4721548, number of used features: 21




[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.503590 -> initscore=0.014361
[LightGBM] [Info] Start training from score 0.014361
[25]	training's auc: 0.793948	valid_1's auc: 0.783415
[50]	training's auc: 0.803816	valid_1's auc: 0.78891
[75]	training's auc: 0.810728	valid_1's auc: 0.792304
[100]	training's auc: 0.817861	valid_1's auc: 0.79604
0.7960176247623457
Mean AUC_ROC: 0.795 with 5 folds


In [35]:
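#УРА! Добавление эмбеддингов помогло увеличить скор с 0.748 до 0.795, что говорит о том, что это очень важная фича.
#ВНИМАНИЕ: эмбеддинги обучались на всех строках train с target == 1 ещё до разбиения на фолды, поэтому признак scores может содержать утечку таргета; прирост стоит перепроверить, обучая эмбеддинги только на обучающей части каждого фолда.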