In [162]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import  StratifiedKFold, GroupShuffleSplit
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import GroupKFold, cross_val_score
from sklearn.metrics import make_scorer, ndcg_score, dcg_score
from sklearn.preprocessing import StandardScaler, LabelBinarizer
from lightgbm import LGBMRanker
import lightgbm as lgb

In [163]:
train_df = pd.read_csv('train_df.csv').drop_duplicates()
test_df = pd.read_csv('test_df.csv').drop_duplicates()

scaler = StandardScaler()
features = train_df.drop(['search_id', 'target'], axis=1)
features_test = test_df.drop(['search_id', 'target'], axis=1)
features_norm = scaler.fit_transform(features)
df_norm = pd.DataFrame(features_norm, columns=features.columns)
features_test_norm = scaler.fit_transform(features_test)
df_test_norm = pd.DataFrame(features_test_norm, columns=features_test.columns)

X_train = df_norm
Y_train = train_df['target']
group_train = train_df.groupby('search_id').size().to_numpy()

X_test = df_test_norm
Y_test = test_df['target']
group_test = test_df.groupby('search_id').size().to_numpy()

ranker = LGBMRanker(n_estimators = 200,
                         learning_rate = 0.1,
                         random_state = 33,
                         n_jobs = -1,
                         colsample_bytree= 0.844,
                         max_depth= 62,
                         min_child_samples= 1500,
                         min_child_weight=0.00415,
                         min_split_gain= 0.0279,
                         num_leaves= 256,
                         reg_alpha= 0.3605,
                         reg_lambda= 0.4198,
                         subsample= 0.2429)
early_stopping_callback = lgb.early_stopping(30, first_metric_only=True, verbose=True)

ranker.fit(
    X_train, Y_train, 
    group=group_train, 
    eval_set=[(X_test, Y_test)], 
    eval_group=[group_test], 
    eval_at=[1, 5, 10, 20, 30], 
    eval_metric = 'ndcg',
    callbacks=[early_stopping_callback],
)




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.014934 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 11705
[LightGBM] [Info] Number of data points in the train set: 15069, number of used features: 68
Training until validation scores don't improve for 30 rounds
Early stopping, best iteration is:
[10]	valid_0's ndcg@1: 0.87	valid_0's ndcg@5: 0.902341	valid_0's ndcg@10: 0.919737	valid_0's ndcg@20: 0.92519	valid_0's ndcg@30: 0.92519
Evaluated only: ndcg@1


In [164]:
df_test_ = pd.read_csv('test_df.csv')
df_test_unique = df_test_.drop_duplicates()
features_t = df_test_unique.drop(['search_id', 'target'], axis=1)
features_norm_t = scaler.fit_transform(features_t)
df_t_norm = pd.DataFrame(features_norm_t, columns=features_t.columns)

X_test_t = df_t_norm
Y_pred = ranker.predict(X_test_t)

df_test_unique['predicted_score'] = Y_pred

df_test_unique.sort_values(by=['search_id', 'predicted_score'], ascending=[True, False], inplace=True)


ndcg_scores = []

for group_id, group in df_test_unique.groupby('search_id'):
    if len(group) <= 1:
        continue
    true_relevance = group['target'].values
    predicted_scores = group['predicted_score'].values
    ndcg = ndcg_score([true_relevance], [predicted_scores], k=20)
    if ndcg > 0.0:
        ndcg_scores.append(ndcg)

average_ndcg = np.mean(ndcg_scores)
print(f'Average NDCG score (for all documents): {average_ndcg}')

Average NDCG score (for all documents): 0.6541264559427336


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test_unique['predicted_score'] = Y_pred
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test_unique.sort_values(by=['search_id', 'predicted_score'], ascending=[True, False], inplace=True)
