In [None]:
!pip install catboost

In [103]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from catboost import CatBoostRanker, Pool
from sklearn.metrics import ndcg_score

In [104]:
# Read the training split from the CSV file next to the notebook.
train_df = pd.read_csv(filepath_or_buffer='train_df.csv')

In [105]:
# Show the training frame: one row per (search_id, document) with feature_* columns and a target.
train_df

Unnamed: 0,search_id,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,...,feature_70,feature_71,feature_72,feature_73,feature_74,feature_75,feature_76,feature_77,feature_78,target
0,758,9,0,0,1,20,3,40,0,3,...,0.204682,0.271755,0.055623,0,0,0,0.38648,0.0,0.0,0
1,758,9,0,0,1,20,3,40,0,3,...,0.195531,0.188787,0.036914,0,0,0,0.10982,0.0,0.0,0
2,758,9,0,0,1,20,3,40,0,3,...,0.148609,0.186517,0.027718,0,0,0,0.03674,0.0,0.0,0
3,758,9,0,0,1,20,3,40,0,3,...,0.223748,0.229039,0.051247,0,0,0,0.00000,0.0,0.0,0
4,758,9,0,0,1,20,3,40,0,3,...,0.170935,0.249031,0.042568,0,0,0,0.00000,0.0,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15076,494693,9,0,0,0,9,4,38,6,6,...,0.309672,0.921060,0.285226,0,0,0,0.98807,0.0,0.0,0
15077,494693,9,0,0,0,9,4,38,6,6,...,0.303805,0.995086,0.302312,0,0,0,0.87146,0.0,0.0,0
15078,494693,9,0,0,0,9,4,38,6,6,...,0.346538,0.993070,0.344137,0,0,0,0.49999,0.0,0.0,0
15079,494693,9,0,0,0,9,4,38,6,6,...,0.243154,0.994833,0.241898,0,0,0,0.67614,0.0,0.0,0


In [106]:
# Read the held-out split from the CSV file next to the notebook.
test_df = pd.read_csv(filepath_or_buffer='test_df.csv')

In [107]:
# Show the test frame; it has the same columns as the training frame.
test_df

Unnamed: 0,search_id,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,...,feature_70,feature_71,feature_72,feature_73,feature_74,feature_75,feature_76,feature_77,feature_78,target
0,10655,9,0,0,1,20,4,40,0,0,...,0.148830,0.196644,0.029267,0,0,0,0.03674,0.0,0.0,0
1,10655,9,0,0,1,20,4,40,0,0,...,0.119724,0.174199,0.020856,0,0,0,0.00000,0.0,0.0,0
2,10655,9,0,0,1,20,4,40,0,0,...,0.160606,0.198780,0.031925,0,0,0,0.00000,0.0,0.0,0
3,10655,9,0,0,1,20,4,40,0,0,...,0.180191,0.187882,0.033855,0,0,0,0.00000,0.0,0.0,0
4,10655,9,0,0,1,20,4,40,0,0,...,0.117308,0.153586,0.018017,0,0,0,0.00000,0.0,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1524,493078,9,0,0,0,9,4,35,0,0,...,0.341683,0.067348,0.023012,0,0,0,0.46108,0.0,0.0,0
1525,493078,9,0,0,0,9,4,35,0,0,...,0.270293,0.049000,0.013244,0,0,0,0.03674,0.0,0.0,0
1526,493078,9,0,0,0,9,4,35,0,0,...,0.372268,0.069882,0.026015,0,0,0,0.14540,0.0,0.0,1
1527,493078,9,0,0,0,9,4,35,0,0,...,0.355755,0.077469,0.027560,0,0,0,0.21288,0.0,0.0,1


# Обучение без обработки данных

**Используем catboost**

In [108]:
# Distinct search sessions in the training data; each search_id is one ranking group.
groups = train_df['search_id'].unique()
len(groups)

1000

In [109]:
# Columns that are not model inputs: the group identifier and the label.
id_and_label = ['search_id', 'target']

# Feature matrix, labels and group ids for the training split.
X_train, y_train, groups_train = (
    train_df.drop(columns=id_and_label).values,
    train_df['target'].values,
    train_df['search_id'].values,
)

# The same three arrays for the test split.
X_test, y_test, groups_test = (
    test_df.drop(columns=id_and_label).values,
    test_df['target'].values,
    test_df['search_id'].values,
)

In [110]:
# Hyperparameters passed unchanged to every CatBoostRanker built in this notebook.
# NOTE(review): fit() is called without an eval_set below, so early_stopping_rounds
# presumably never triggers - confirm against the CatBoost documentation.
parameters = dict(
    iterations=120,
    custom_metric=['NDCG', 'PFound', 'AverageGain:top=5'],
    verbose=False,
    random_seed=0,
    early_stopping_rounds=100,
    loss_function='YetiRankPairwise',
    depth=3,
    learning_rate=0.1,
)

In [111]:
# Build the ranker from the shared hyperparameters and fit it on the training split;
# group_id tells CatBoost which rows belong to the same search.
model = CatBoostRanker(**parameters).fit(X_train, y_train, group_id=groups_train)

In [112]:
# Score the fitted ranker on both splits first, then report the two numbers together.
ndcg_train = model.score(X_train, y_train, group_id=groups_train)
ndcg_test = model.score(X_test, y_test, group_id=groups_test)

print(f"Train NDCG: {ndcg_train}")
print(f"Test NDCG: {ndcg_test}")

Train NDCG: 0.9274772630239582
Test NDCG: 0.9257640240734762


# Обучение с обработкой данных

In [113]:
# Columns with a single distinct value in the training data carry no signal.
# The list is derived from train only and then applied to both splits.
unique_counts = train_df.nunique()
to_drop = unique_counts.index[unique_counts == 1]
train_df = train_df.drop(columns=to_drop)
test_df = test_df.drop(columns=to_drop)

In [114]:
# Show which constant columns were removed from both frames.
to_drop

Index(['feature_0', 'feature_73', 'feature_74', 'feature_75'], dtype='object')

In [116]:
def remove_highly_correlated_features(df, df_test, threshold, exclude=('search_id', 'target')):
    """Drop the later feature of every pair whose absolute correlation exceeds ``threshold``.

    Correlations are computed on ``df`` only; the resulting column list is then
    removed from both ``df`` and ``df_test`` so the two frames keep identical columns.

    Columns named in ``exclude`` never take part in the correlation analysis.
    Without this, a feature could be discarded merely for correlating with the
    group identifier, and the label column itself could be discarded when a
    feature tracks it closely.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame used to estimate the correlations (the training data).
    df_test : pandas.DataFrame
        Frame that receives the same column removal.
    threshold : float
        A column is dropped when its absolute correlation with any column to
        its left is strictly greater than this value.
    exclude : str or iterable of str, optional
        Non-feature columns to keep out of the analysis. Names missing from
        ``df`` are ignored. Pass ``()`` to analyse every column.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        Copies of ``df`` and ``df_test`` without the dropped columns.
    """
    if exclude is None:
        exclude = ()
    elif isinstance(exclude, str):
        # A bare string would otherwise be split into single characters.
        exclude = (exclude,)

    feature_frame = df.drop(columns=list(exclude), errors='ignore')
    corr_matrix = feature_frame.corr().abs()

    # Keep only the strict upper triangle: every pair is inspected once and the
    # column that appears later in the frame is the one that gets dropped.
    upper_mask = np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1)
    upper_tri = corr_matrix.where(upper_mask)

    to_drop = [column for column in upper_tri.columns if (upper_tri[column] > threshold).any()]
    df_filtered = df.drop(columns=to_drop)
    df_filtered_test = df_test.drop(columns=to_drop)
    print(to_drop)
    return df_filtered, df_filtered_test

# Absolute-correlation cut-off above which the later column of a pair is discarded.
threshold = 0.9

# Prune both frames using correlations estimated on the training frame.
train_df, test_df = remove_highly_correlated_features(
    df=train_df,
    df_test=test_df,
    threshold=threshold,
)

# NOTE(review): shape[1] counts every remaining column, so the printed number
# still includes search_id and target, which are only removed in a later cell.
remaining_columns = test_df.shape[1]
print("Количество признаков после удаления коррелирующих:", remaining_columns)

['feature_4', 'feature_60', 'feature_63', 'feature_65', 'feature_72', 'feature_78']
Количество признаков после удаления коррелирующих: 71


In [124]:
# Rebuild the model inputs from the pruned frames: features, labels and group ids.
drop_cols = ['search_id', 'target']
train_features, test_features = (frame.drop(columns=drop_cols) for frame in (train_df, test_df))

X_train, X_test = train_features.values, test_features.values
y_train, y_test = train_df['target'].values, test_df['target'].values
groups_train, groups_test = train_df['search_id'].values, test_df['search_id'].values

In [125]:
# Retrain with exactly the same hyperparameters so only the feature set differs.
ranker = CatBoostRanker(**parameters)
model = ranker.fit(X_train, y_train, group_id=groups_train)

In [126]:
# Evaluate the retrained ranker on both splits, then print the pair of scores.
ndcg_train, ndcg_test = (
    model.score(X_train, y_train, group_id=groups_train),
    model.score(X_test, y_test, group_id=groups_test),
)
print(f"Train NDCG: {ndcg_train}")
print(f"Test NDCG: {ndcg_test}")

Train NDCG: 0.9281688453531405
Test NDCG: 0.908842357475919


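**При тех же параметрах модели NDCG на тестовой выборке ухудшился (0.926 → 0.909), а на обучающей практически не изменился (0.927 → 0.928)**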