In [4]:
!pip install imblearn catboost -q
import pandas as pd
from sklearn.preprocessing import StandardScaler
import numpy as np
from collections import Counter
from sklearn import metrics
from sklearn.metrics import ndcg_score
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split

In [5]:
train = pd.read_csv('data/train_df.csv')
test = pd.read_csv('data/test_df.csv')
print(len(train))
train.sample(5)

15081


Unnamed: 0,search_id,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,...,feature_70,feature_71,feature_72,feature_73,feature_74,feature_75,feature_76,feature_77,feature_78,target
7638,279529,9,0,0,1,20,4,14,0,0,...,0.188362,0.483445,0.091063,0,0,0,0.07308,0.0,0.0,0
11126,112347,9,0,0,1,20,4,32,0,0,...,0.421172,0.845054,0.355913,0,0,0,0.49997,0.393772,0.504774,0
6051,128030,9,0,0,1,20,4,26,6,3,...,0.265745,0.536045,0.142451,0,0,0,0.07348,0.0,0.0,0
12123,212842,9,0,0,0,9,3,46,0,0,...,0.219178,0.188343,0.04128,0,0,0,0.81339,0.0,0.0,0
549,70958,9,0,0,1,20,4,26,0,0,...,0.21691,0.329615,0.071497,0,0,0,0.85404,0.0,0.0,0


In [6]:
print(len(test))
test.sample(5)

1529


Unnamed: 0,search_id,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,...,feature_70,feature_71,feature_72,feature_73,feature_74,feature_75,feature_76,feature_77,feature_78,target
1425,388365,9,0,0,0,9,4,26,0,0,...,0.232539,0.982801,0.228539,0,0,0,0.53669,0.0,0.0,0
751,321122,9,0,0,1,20,3,45,0,1,...,0.304524,0.660256,0.201064,0,0,0,0.48646,0.0,0.0,0
907,371907,9,0,0,1,20,4,34,0,3,...,0.172906,0.473143,0.081809,0,0,0,0.0,0.0,0.0,0
805,342632,9,2,0,1,20,4,25,4,4,...,0.18173,0.595006,0.108131,0,0,0,0.29001,0.0,0.0,0
475,485825,9,0,0,0,9,3,32,0,5,...,0.131499,0.1061,0.013952,0,0,0,0.0,0.0,0.0,0


In [7]:
numerical_columns_train = train.drop(['search_id', 'target'], axis = 1).select_dtypes(include=[np.number]).columns
numerical_columns_test = test.drop(['search_id', 'target'], axis = 1).select_dtypes(include=[np.number]).columns
scaler = StandardScaler()
train[numerical_columns_train] = scaler.fit_transform(train.drop(['search_id', 'target'], axis = 1))
test[numerical_columns_test] = scaler.fit_transform(test.drop(['search_id', 'target'], axis = 1))
X_train = train.drop(columns = ['search_id', 'target'])
y_train = train['target']
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size = 0.2, random_state = 42)
X_test = test.drop(columns = ['search_id', 'target'])
y_test = test['target']

In [8]:
counter = Counter(y_test)
counter

Counter({0: 1495, 1: 34})

In [14]:
best_model = CatBoostClassifier(iterations = 100, learning_rate = 0.1, depth = 15, loss_function = 'Logloss', random_state = 42)
best_model.fit(X_train, y_train, eval_set = (X_val, y_val), verbose = False)

<catboost.core.CatBoostClassifier at 0x2535e89f820>

In [15]:
y_pred_test = best_model.predict(X_test)
y_pred_train = best_model.predict(X_train)
print("Accuracy_train:", metrics.accuracy_score(y_train, y_pred_train))
print("Accuracy_test:", metrics.accuracy_score(y_test, y_pred_test))
print("Precision_train:", metrics.precision_score(y_train, y_pred_train, average='macro', zero_division = 1))
print("Precision_test:", metrics.precision_score(y_test, y_pred_test, average='macro', zero_division = 1))
print("Recall_train:", metrics.recall_score(y_train, y_pred_train, average='macro'))
print("Recall_test:", metrics.recall_score(y_test, y_pred_test, average='macro'))
print("F1-score_train:", metrics.f1_score(y_train, y_pred_train, average='macro'))
print("F1-score_test:", metrics.f1_score(y_test, y_pred_test, average='macro'))

Accuracy_train: 0.9988395225464191
Accuracy_test: 0.9777632439502943
Precision_train: 0.9994080338266385
Precision_test: 0.9888816219751472
Recall_train: 0.9723320158102766
Recall_test: 0.5
F1-score_train: 0.9854761993211403
F1-score_test: 0.49437830687830686


In [16]:
test['y_pred_test'] = y_pred_test

In [17]:
unique_search_ids = test['search_id'].unique()
df = pd.DataFrame([], columns = ['search_id', 'ndcg'])

for i, search_id in enumerate(unique_search_ids):
    subset = test[test['search_id'] == search_id]
    if len(subset) < 2:
        continue
    ndcg = ndcg_score([subset['target']], [subset['y_pred_test']])
    df.loc[i] = [str(search_id), ndcg]
df

Unnamed: 0,search_id,ndcg
0,10655,0.000000
1,13321,0.000000
2,75931,0.000000
3,86207,0.000000
4,92013,0.365623
...,...,...
95,453825,0.000000
96,461744,0.000000
97,462834,0.000000
98,488362,0.000000


In [18]:
df.to_csv('ndcgs.csv', index = False)