In [80]:
import warnings
from sklearn.exceptions import ConvergenceWarning

warnings.filterwarnings("ignore", category=ConvergenceWarning)

* Importing data + Exploration analysis

In [7]:
# Mounting Google Drive
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [164]:
# Import data
import pandas as pd

df_train = pd.read_csv('/content/drive/MyDrive/Тестовое задание на стажировку ML-инженер рекомендательные системы/train_df.csv')
df_test = pd.read_csv('/content/drive/MyDrive/Тестовое задание на стажировку ML-инженер рекомендательные системы/test_df.csv')

# Constant features which make no sense -> remove them
constant_features_train = [col for col in df_train.columns if df_train[col].nunique() == 1]
constant_features_test = [col for col in df_test.columns if df_test[col].nunique() == 1]

if constant_features_train == constant_features_test:
  x_train = df_train.drop(['target'] + constant_features_train, axis=1)
  y_train = df_train['target'].to_numpy()

  df_test = df_test.drop(constant_features_train, axis=1)


train_value_counts = x_train['search_id'].value_counts()
train_dict = {id: train_value_counts.get(id, 0) for id in x_train['search_id'].unique()}
unique_ids_train = x_train['search_id'].unique()

test_value_counts = df_test['search_id'].value_counts()
test_dict = {id: test_value_counts.get(id, 0) for id in df_test['search_id'].unique()}
unique_ids_test = df_test['search_id'].unique()

x_train = x_train.drop('search_id', axis=1)

In [139]:
x_train.head()

Unnamed: 0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,feature_10,...,feature_66,feature_67,feature_68,feature_69,feature_70,feature_71,feature_72,feature_76,feature_77,feature_78
0,0,0,1,20,3,40,0,3,1,0,...,0.054861,0.433005,0.188131,0.0734,0.204682,0.271755,0.055623,0.38648,0.0,0.0
1,0,0,1,20,3,40,0,3,0,0,...,0.853441,0.290734,0.048554,0.0382,0.195531,0.188787,0.036914,0.10982,0.0,0.0
2,0,0,1,20,3,40,0,3,1,0,...,0.8578,0.287074,0.045502,0.151,0.148609,0.186517,0.027718,0.03674,0.0,0.0
3,0,0,1,20,3,40,0,3,1,1,...,0.0,0.469645,0.145031,0.0699,0.223748,0.229039,0.051247,0.0,0.0,0.0
4,0,0,1,20,3,40,0,3,1,1,...,0.0,0.200816,0.061071,0.0382,0.170935,0.249031,0.042568,0.0,0.0,0.0


In [140]:
df_test.head()

Unnamed: 0,search_id,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,...,feature_67,feature_68,feature_69,feature_70,feature_71,feature_72,feature_76,feature_77,feature_78,target
0,10655,0,0,1,20,4,40,0,0,1,...,0.180215,0.011001,0.0462,0.14883,0.196644,0.029267,0.03674,0.0,0.0,0
1,10655,0,0,1,20,4,40,0,0,1,...,0.156478,0.008737,0.0462,0.119724,0.174199,0.020856,0.0,0.0,0.0,0
2,10655,0,0,1,20,4,40,0,0,1,...,0.20025,0.015134,0.0462,0.160606,0.19878,0.031925,0.0,0.0,0.0,0
3,10655,0,0,1,20,4,40,0,0,1,...,0.214825,0.014288,0.0462,0.180191,0.187882,0.033855,0.0,0.0,0.0,0
4,10655,0,0,1,20,4,40,0,0,1,...,0.160748,0.008197,0.0462,0.117308,0.153586,0.018017,0.0,0.0,0.0,0


In [95]:
# Checking class imbalance

# We can see that we should use scale_pos_weight for predicting positive label
# Thinking, that adding minority class will somehow negativly affect on positioning on training set -> we will use weighting approach
from collections import Counter

print('Train dataset: ', Counter(y_train))
print('Test dataset: ', Counter(df_test['target']))

Train dataset:  Counter({0: 14759, 1: 322})
Test dataset:  Counter({0: 1495, 1: 34})


In [102]:
# Checking missing values

print('Train set: ', sum(x_train.isna().sum()))
print('Test set: ', sum(df_test.isna().sum()))

Train set:  0
Test set:  0


In [165]:
# IQR outliers detection -> replacing with mean values in each feature
for column in x_train.columns:
    Q1 = x_train[column].quantile(0.25)
    Q3 = x_train[column].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 3 * IQR
    upper_bound = Q3 + 3 * IQR

    # Calculate mean or median without outliers in the training data
    mean_value = x_train[(x_train[column] >= lower_bound) & (x_train[column] <= upper_bound)][column].mean()

    # Replace outliers in both training and test data
    x_train.loc[(x_train[column] < lower_bound) | (x_train[column] > upper_bound), column] = mean_value
    df_test.loc[(df_test[column] < lower_bound) | (df_test[column] > upper_bound), column] = mean_value

* Approach: XGBClassifier

In [36]:
!pip install imbalanced-learn
!pip install bayesian-optimization
!pip install xgboost

Collecting bayesian-optimization
  Downloading bayesian_optimization-1.4.3-py3-none-any.whl (18 kB)
Collecting colorama>=0.4.6 (from bayesian-optimization)
  Downloading colorama-0.4.6-py2.py3-none-any.whl (25 kB)
Installing collected packages: colorama, bayesian-optimization
Successfully installed bayesian-optimization-1.4.3 colorama-0.4.6


In [174]:
import xgboost as xgb
import numpy as np
from bayes_opt import BayesianOptimization
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import BorderlineSMOTE
from sklearn.metrics import ndcg_score, roc_auc_score
from sklearn.preprocessing import StandardScaler

def xgb_bo(max_depth, learning_rate, n_estimators, gamma, subsample, colsample_bytree):

    params = {
        'xgbclassifier__max_depth': int(max_depth),
        'xgbclassifier__learning_rate': learning_rate,
        'xgbclassifier__n_estimators': int(n_estimators),
        'xgbclassifier__gamma': gamma,
        'xgbclassifier__subsample': subsample,
        'xgbclassifier__colsample_bytree': colsample_bytree,
        'xgbclassifier__use_label_encoder': False,
        'xgbclassifier__eval_metric': 'logloss'
    }

    pipe = Pipeline(steps=[
      ('smote', BorderlineSMOTE(random_state=52, kind='borderline-1')),
      ('standardscaler', StandardScaler()),
      ('xgbclassifier', xgb.XGBClassifier(random_state=52))
    ])

    #Train the model
    pipe.set_params(**params)
    pipe.fit(x_train.to_numpy(), y_train)

    x_test = df_test.drop(['target', 'search_id'], axis=1).to_numpy()
    y_test = df_test['target'].to_numpy()
    preds = pipe.predict_proba(x_test)[:, 1]

    auc = roc_auc_score(y_test, preds)
    ndcg = ndcg_score(np.asarray([y_test]), np.asarray([preds]))
    print(auc)

    return ndcg

#Hyperparameter space
params_xgb = {
    'max_depth': (3, 20),
    'learning_rate': (0.01, 0.5),
    'n_estimators': (100, 2000),
    'gamma': (0, 10),
    'subsample': (0.5, 1),
    'colsample_bytree': (0.5, 1),
}

# Initialize Bayesian Optimization
xgb_bo = BayesianOptimization(xgb_bo, params_xgb, random_state=52)
xgb_bo.maximize(init_points=10, n_iter=50)

best = xgb_bo.max['params']
print(best)

|   iter    |  target   | colsam... |   gamma   | learni... | max_depth | n_esti... | subsample |
-------------------------------------------------------------------------------------------------
0.7615581349596695
| [0m1        [0m | [0m0.463    [0m | [0m0.9116   [0m | [0m0.2612   [0m | [0m0.1133   [0m | [0m13.51    [0m | [0m286.7    [0m | [0m0.8101   [0m |
0.5974916387959867
| [95m2        [0m | [95m0.4986   [0m | [95m0.5269   [0m | [95m9.607    [0m | [95m0.4904   [0m | [95m11.86    [0m | [95m1.309e+03[0m | [95m0.8824   [0m |
0.7042986425339366
| [0m3        [0m | [0m0.4967   [0m | [0m0.8825   [0m | [0m4.177    [0m | [0m0.3867   [0m | [0m10.19    [0m | [0m1.86e+03 [0m | [0m0.841    [0m |
0.6213948455636435
| [95m4        [0m | [95m0.5529   [0m | [95m0.6842   [0m | [95m8.589    [0m | [95m0.1964   [0m | [95m4.614    [0m | [95m717.3    [0m | [95m0.7076   [0m |
0.7342219161912257
| [0m5        [0m | [0m0.5226   [0m | [0

* So, in this approach we found the best parameters, which maximize nDCG metric in some way.  