In [1]:
import pandas as pd
import numpy as np

import lightgbm
from sklearn.metrics import ndcg_score

import warnings
warnings.filterwarnings("ignore")



In [2]:
df = pd.read_csv('intern_task.csv')
df

Unnamed: 0,rank,query_id,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,...,feature_134,feature_135,feature_136,feature_137,feature_138,feature_139,feature_140,feature_141,feature_142,feature_143
0,0,10,1.0,0.0,1.0,3.0,3.0,0.333333,0.0,0.333333,...,0.0,0.000000,0.454545,0.890238,8.655534,1.000000,0.077778,0.002222,1.0,0.333333
1,1,10,3.0,0.0,3.0,0.0,3.0,1.000000,0.0,1.000000,...,0.0,0.000000,0.000000,0.773976,23.130514,0.000000,0.027826,0.000430,44.0,14.666667
2,0,10,3.0,0.0,2.0,0.0,3.0,1.000000,0.0,0.666667,...,0.0,0.000000,0.000000,0.918308,13.351339,0.000000,0.014925,0.000104,22.0,7.333333
3,1,10,3.0,0.0,3.0,0.0,3.0,1.000000,0.0,1.000000,...,0.0,0.000000,0.000000,0.975355,18.240926,0.000000,0.053140,0.000255,8.0,2.666667
4,2,10,3.0,0.0,3.0,1.0,3.0,1.000000,0.0,1.000000,...,273.0,79.670665,0.200000,0.990119,31.786048,0.333333,0.046512,0.000307,24.0,8.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
235253,2,29995,1.0,0.0,0.0,0.0,1.0,0.500000,0.0,0.000000,...,0.0,0.000000,0.000000,0.471409,0.000000,0.000000,0.001350,0.000002,3.0,1.500000
235254,2,29995,1.0,0.0,1.0,0.0,1.0,0.500000,0.0,0.500000,...,0.0,0.000000,0.000000,0.471409,39.908056,0.000000,0.004850,0.000014,9.0,4.500000
235255,1,29995,1.0,0.0,0.0,0.0,1.0,0.500000,0.0,0.000000,...,0.0,0.000000,0.000000,0.471409,0.000000,0.000000,0.001064,0.000001,1.0,0.500000
235256,2,29995,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,...,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000


In [3]:
df['rank'].unique()

array([0, 1, 2, 4, 3], dtype=int64)

In [4]:
len(df['query_id'].unique())

2000

In [5]:
# минимально допустимое количество документов внутри одной сессии
min_docs_in_query = 5

# минимально допустимый разброс значений rank внутри одной сессии
min_range_in_rank = 1

In [6]:
query_lens = df.groupby('query_id')['rank'].apply(list).apply(len)
print(f'Сессии для которых имеется менее {min_docs_in_query} документов:')
query_lens[query_lens < min_docs_in_query]

Сессии для которых имеется менее 5 документов:


query_id
5920     3
8665     4
9265     1
10525    4
11410    3
14350    2
20560    1
22780    2
23215    4
25120    4
25885    3
26170    4
26395    4
26545    4
28285    1
Name: rank, dtype: int64

In [7]:
# удаление сессий для которых имеется менее 5 документов
queris_less_min = set(query_lens[query_lens < min_docs_in_query].index)
df = df[~df['query_id'].isin(queris_less_min)]

In [8]:
query_range = df.groupby('query_id')['rank'].apply(list).apply(lambda x: max(x) - min(x))
print(f'Сессии для которых разница между максимальным и минимальным значениями rank менее чем {min_range_in_rank}:')
query_range[query_range < min_range_in_rank].index

Сессии для которых разница между максимальным и минимальным значениями rank менее чем 1:


Index([  355,  4765,  6235,  7195,  9475, 10045, 10300, 11995, 12280, 12655,
       13315, 14455, 15235, 15310, 15385, 15640, 15655, 16300, 16705, 16825,
       16855, 16885, 17095, 17590, 19180, 19315, 19720, 19930, 20020, 20395,
       20815, 20980, 20995, 21070, 21565, 21970, 22435, 25585, 25990, 26890,
       26905, 27100, 27370, 27700, 27790, 28345, 29260, 29740],
      dtype='int64', name='query_id')

In [9]:
# удаление сессий для которых разница между максимальным и минимальным
#                           значениями rank менее чем min_range_in_rank

queris_same_rank = set(query_range[query_range < min_range_in_rank].index)
df = df[~df['query_id'].isin(queris_same_rank)]

In [10]:
# отбор признаков, значимость которых выше чем у случайного признака
df['random'] = np.random.rand(len(df))

model = lightgbm.LGBMRanker(
    objective='lambdarank'
)

model.fit(
    X=df.drop(['query_id', 'rank'], axis=1),
    y=df['rank'],
    group=df.groupby('query_id')['query_id'].count().to_numpy()
)

feature_importance = model.feature_importances_
feature_names = model.feature_name_

importance_df = pd.DataFrame({'feature': feature_names, 'importance': feature_importance}).\
                                sort_values(by='importance', ascending=False)

random_importance = importance_df[importance_df['feature'] == 'random']['importance'].item()
selected_features = list(importance_df[importance_df['importance'] > random_importance]['feature'])
print(f'\n\nВыбрано {len(selected_features)} из {len(df.columns) - 2} признаков')

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.092493 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 26706
[LightGBM] [Info] Number of data points in the train set: 233805, number of used features: 141


Выбрано 21 из 145 признаков


In [11]:
df = df[['rank', 'query_id'] + selected_features]
df

Unnamed: 0,rank,query_id,feature_129,feature_132,feature_13,feature_107,feature_128,feature_130,feature_126,feature_133,...,feature_10,feature_109,feature_127,feature_125,feature_108,feature_135,feature_122,feature_49,feature_8,feature_116
0,0,10,153.0,104.0,11.0,7.738967,0.0,3866.0,76.0,0.0,...,10.0,22.927740,0.0,3.0,26.350077,0.000000,-20.421283,0.233333,0.000000,-24.810069
1,1,10,266.0,2.0,11.0,21.651734,9.0,56137.0,73.0,0.0,...,557.0,24.339780,0.0,3.0,0.000000,0.000000,-5.459244,0.083478,0.004251,-24.810069
2,0,10,541.0,11.0,8.0,15.027157,0.0,12621.0,54.0,0.0,...,522.0,21.160657,8.0,4.0,0.000000,0.000000,-12.939705,0.044776,0.008501,-24.810069
3,1,10,14687.0,3.0,5.0,22.468922,0.0,40205.0,36.0,0.0,...,59.0,24.653968,6.0,2.0,0.000000,0.000000,-5.143084,0.159420,0.012752,-24.810069
4,2,10,10577.0,1.0,5.0,26.522979,1.0,34605.0,21.0,2175.0,...,203.0,26.353970,1896.0,2.0,12.696249,79.670665,-4.766107,0.139535,0.017003,-24.810069
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
235253,2,29995,5131.0,1.0,9.0,0.000000,62.0,65535.0,36.0,0.0,...,1091.0,16.290441,103.0,3.0,0.000000,0.000000,-2.670310,0.002700,999.982997,-7.602780
235254,2,29995,1940.0,4.0,11.0,17.924727,2.0,54880.0,58.0,0.0,...,1210.0,25.834071,428.0,4.0,0.000000,0.000000,-1.779337,0.009701,999.987248,-7.602780
235255,1,29995,6135.0,4.0,12.0,0.000000,27.0,51819.0,61.0,0.0,...,439.0,12.740497,24242.0,6.0,0.000000,0.000000,-2.670310,0.002128,999.991499,-7.602780
235256,2,29995,3121.0,6.0,9.0,0.000000,0.0,61234.0,42.0,0.0,...,0.0,0.000000,100.0,4.0,0.000000,0.000000,-2.670310,0.000000,999.995749,-7.602780


In [12]:
# разбиение на train и val
unique_queries = df['query_id'].unique()
np.random.shuffle(unique_queries)


split_index = int(len(unique_queries) * 0.75)
train_queries = set(unique_queries[:split_index])
val_queries = set(unique_queries[split_index:])


train_df = df[df['query_id'].isin(train_queries)]
val_df = df[df['query_id'].isin(val_queries)]

In [13]:
qids_train = train_df.groupby('query_id')['query_id'].count().to_numpy()
X_train = train_df.drop(['query_id', 'rank'], axis=1)
y_train = train_df['rank']

qids_val = val_df.groupby('query_id')['query_id'].count().to_numpy()
X_val = val_df.drop(['query_id', 'rank'], axis=1)
y_val = val_df['rank']

In [14]:
model = lightgbm.LGBMRanker(
    objective='lambdarank',
    n_estimators=12,
    num_leaves=15
)

model.fit(
    X=X_train,
    y=y_train,
    group=qids_train
)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.007551 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4751
[LightGBM] [Info] Number of data points in the train set: 175641, number of used features: 21


In [15]:
train_df['preds'] = model.predict(X_train)
val_df['preds'] = model.predict(X_val)

sorted_train = train_df.sort_values('rank', ascending=False)
top_ranks_train = list(sorted_train.groupby('query_id')['rank'].apply(list).apply(lambda x: x[:5]))
preds_per_top_ranks_train = list(sorted_train.groupby('query_id')['preds'].apply(list).apply(lambda x: x[:5]))

sorted_val = val_df.sort_values('rank', ascending=False)
top_ranks_val = list(sorted_val.groupby('query_id')['rank'].apply(list).apply(lambda x: x[:5]))
preds_per_top_ranks_val = list(sorted_val.groupby('query_id')['preds'].apply(list).apply(lambda x: x[:5]))

print(f'train ndcg@5: {ndcg_score(top_ranks_train, preds_per_top_ranks_train, k=5):.5f}')
print(f'val ndcg@5: {ndcg_score(top_ranks_val, preds_per_top_ranks_val, k=5):.5f}')

train ndcg@5: 0.96176
val ndcg@5: 0.96172
