In [1]:
import pandas as pd

import warnings
warnings.filterwarnings("ignore")


In [2]:
df_src = pd.read_csv('data_mini.csv', index_col=1, parse_dates={'datetime': ['Date']})
df = df_src.copy()
print(df.shape)
df.head(2)

(344034, 7)


Unnamed: 0,datetime,Position,Track Name,Artist,Streams,URL,Region
0,2017-04-10,1,Shape of You,Ed Sheeran,4721,https://open.spotify.com/track/7qiZfU4dY1lWllz...,sk
1,2017-09-17,1,Mi Gente,J Balvin,202918,https://open.spotify.com/track/2rb5MvYT7ZIxbKW...,ar


In [3]:
df.Position.max()

200

## Признаки:
- запрос-независимые - Track Name, Artist, Streams
- запросовые - Date
- запросо-зависимые - Национальность Artist соотв региону запроса, или Язык, 
на котором записан Title матчится с регионом запроса

# Предобработка данных

In [4]:
df.isna().sum()

datetime       0
Position       0
Track Name    67
Artist        67
Streams        0
URL            1
Region         0
dtype: int64

In [5]:
df = df.dropna()
df.shape

(343967, 7)

### Postion preprocessing
`Делаем важное предположение: Позиция в чартах (Position) является мерой релеватности`

In [6]:
from utils import bin

<img src="bin.webp" width="400">


In [7]:
df['label'] = df.Position.apply(bin)
df = df.drop('Position', axis=1)
df.head(2)

Unnamed: 0,datetime,Track Name,Artist,Streams,URL,Region,label
0,2017-04-10,Shape of You,Ed Sheeran,4721,https://open.spotify.com/track/7qiZfU4dY1lWllz...,sk,10
1,2017-09-17,Mi Gente,J Balvin,202918,https://open.spotify.com/track/2rb5MvYT7ZIxbKW...,ar,10


### Track Name preprocessing


In [8]:
from category_encoders.hashing import HashingEncoder



In [9]:
y = df.label

he = HashingEncoder(cols=['Track Name'], verbose=1).fit(df, y)

df = he.transform(df)

In [10]:
df.head(2)

Unnamed: 0,col_0,col_1,col_2,col_3,col_4,col_5,col_6,col_7,datetime,Artist,Streams,URL,Region,label
0,1,0,0,0,0,0,0,0,2017-04-10,Ed Sheeran,4721,https://open.spotify.com/track/7qiZfU4dY1lWllz...,sk,10
1,0,0,0,1,0,0,0,0,2017-09-17,J Balvin,202918,https://open.spotify.com/track/2rb5MvYT7ZIxbKW...,ar,10


### Artist - имя исполнителя или группы
`Leave One Out technique` - заменяем категориальный признак средним значением целевой (label)
переменной, соотв. данныму конкрентому значению категориального признака

<img src="loo.webp" width="400">


In [11]:
from category_encoders.leave_one_out import LeaveOneOutEncoder


In [12]:
y = df.label

loo = LeaveOneOutEncoder(cols=['Artist'], verbose=1).fit(df, y)

df = loo.transform(df)

### URL 
Здесь важно знать внутрянку - по факту Url соотв. уникальной паре песня-исполниель, информация о которых уже представлена в полях Track Name и Artist 

In [13]:
df = df.drop('URL', axis=1)

### Date
Логично разбить дату на дни, месяцы и годы. Например для учета сезонности

In [14]:
df.head(2)

Unnamed: 0,col_0,col_1,col_2,col_3,col_4,col_5,col_6,col_7,datetime,Artist,Streams,Region,label
0,1,0,0,0,0,0,0,0,2017-04-10,2.470445,4721,sk,10
1,0,0,0,1,0,0,0,0,2017-09-17,3.50915,202918,ar,10


In [15]:
df['year'] = df.datetime.dt.year
df['month'] = df.datetime.dt.month
df['day'] = df.datetime.dt.day
df = df.drop('datetime', axis=1)
df.head(2)

Unnamed: 0,col_0,col_1,col_2,col_3,col_4,col_5,col_6,col_7,Artist,Streams,Region,label,year,month,day
0,1,0,0,0,0,0,0,0,2.470445,4721,sk,10,2017,4,10
1,0,0,0,1,0,0,0,0,3.50915,202918,ar,10,2017,9,17


### Region
Очень важный момент, какой признак мы будем считать как query - запрос.
В случае данных, которые предоставил Spotify таким признаком является регион пользователя.
Логично сделать такой признак запросом

In [16]:
queries, queries_unique = pd.factorize(df.Region)
df['queries'] = queries
df = df.drop(['Region'], axis=1)

In [17]:
df.head(2)

Unnamed: 0,col_0,col_1,col_2,col_3,col_4,col_5,col_6,col_7,Artist,Streams,label,year,month,day,queries
0,1,0,0,0,0,0,0,0,2.470445,4721,10,2017,4,10,0
1,0,0,0,1,0,0,0,0,3.50915,202918,10,2017,9,17,1


## Тренировка модели

In [18]:
from utils import split_train_test

import xgboost


In [19]:
train, test = split_train_test(df, p=0.8)
print(train.shape)
print(test.shape)

100%|██████████| 589/589 [00:10<00:00, 58.69it/s]

(274943, 15)
(69024, 15)





In [20]:
X_train = train[train.columns.difference(['queries','label'])]
train_query_groups = train.queries.value_counts(sort=False)
y_train = train.label

X_test = test[train.columns.difference(['queries','label'])]
test_query_groups = test.queries.value_counts(sort=False)
y_test = test.label

In [21]:
training_xgb_matrix = xgboost.DMatrix(X_train, label=y_train)
training_xgb_matrix.set_group(train_query_groups)
test_xgb_matrix = xgboost.DMatrix(X_test, label=y_test)
test_xgb_matrix.set_group(test_query_groups)


In [79]:
eval_metrics = ['ndcg@10', 'map@10', 'pre@10']

params = {'objective': 'rank:ndcg', 'eval_metric': eval_metrics, 'verbosity': 0}
watch_list = [(training_xgb_matrix, 'train'), (test_xgb_matrix, 'eval')]

xgb_model = xgboost.train(params, training_xgb_matrix, num_boost_round=100, evals=watch_list)
predictions = xgb_model.predict(test_xgb_matrix)

[0]	train-ndcg@10:0.89254	train-map@10:0.29851	train-pre@10:0.68519	eval-ndcg@10:0.89722	eval-map@10:0.30468	eval-pre@10:0.68519
[1]	train-ndcg@10:0.89254	train-map@10:0.29851	train-pre@10:0.68519	eval-ndcg@10:0.89722	eval-map@10:0.30468	eval-pre@10:0.68519


In [23]:
test['label_predicted'] = predictions

In [74]:
query = 20
cols = [
    'datetime', 'Track Name',
    'URL', 'Region', 'label_predicted',
    'label', 'Position'
]
df_src.loc[test[test.queries == query].index, :] \
    .merge(test[test.queries == query], left_index=True, right_index=True)[cols] \
    .sort_values('label_predicted', ascending=False)

Unnamed: 0,datetime,Track Name,URL,Region,label_predicted,label,Position
1932,2017-08-24,More Than You Know,https://open.spotify.com/track/3PEgB3fkiojxms3...,cz,0.519738,10,1
94457,2017-10-06,It Ain't Me (with Selena Gomez),https://open.spotify.com/track/3eR23VReFzcdmS7...,cz,0.519738,3,49
72767,2017-11-19,Man's Not Hot,https://open.spotify.com/track/2nUJvBO87SkxCVi...,cz,0.519738,3,38
94803,2017-02-11,Chantaje,https://open.spotify.com/track/6mICuAdrwEjh6Y6...,cz,0.519738,3,49
91418,2017-09-30,Despacito - Remix,https://open.spotify.com/track/5CtI0qwDJkDQGwX...,cz,0.519738,3,47
...,...,...,...,...,...,...,...
294703,2017-03-10,Numb,https://open.spotify.com/track/2nLtzopw4rPResz...,cz,0.479730,0,167
299428,2017-05-26,Ciao Adios,https://open.spotify.com/track/3EfugazgSddQvzZ...,cz,0.479730,0,170
260869,2017-04-23,By Your Side,https://open.spotify.com/track/1D3ODoXHBLpdxol...,cz,0.479730,0,145
240558,2017-06-14,History,https://open.spotify.com/track/0WCEaydwN65cvwa...,cz,0.479730,0,132
