Решение соревнования https://cleancodecup.ru/problem/promopurchase

In [34]:
import numpy as np
import pandas as pd
import sklearn
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.model_selection import train_test_split, cross_val_score
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score, confusion_matrix
from imblearn.under_sampling import RandomUnderSampler
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.decomposition import PCA
from sklearn.metrics import mean_squared_log_error, accuracy_score
import optuna
import xgboost as xgb
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import Binarizer
from imblearn.under_sampling import CondensedNearestNeighbour
from tqdm import tqdm

In [169]:
cls_oct = pd.read_csv('clients_promo_october.csv')
mobile_events = pd.read_csv('mobile_events.csv')
orders = pd.read_csv('orders.csv')
train = pd.read_csv('train_target.csv')
test = pd.read_csv('test.csv')

In [274]:
def get_df(train, orders, mobile_events, cls_oct, test=False):
    
    ord = orders[['ClientUUId', 'OrderUUId', 'OrderTotalPrice', 'apply_promo']].drop_duplicates()
    nn = pd.read_csv('new_col.csv') #Датасет, агреггированный с clients_promo_october.csv
    
    sas = pd.merge(ord.groupby('ClientUUId')[['OrderTotalPrice', 'apply_promo']].sum().reset_index(), nn, on='ClientUUId')
    sas = pd.merge(ord.ClientUUId.value_counts(), sas, on='ClientUUId')

    oks = pd.get_dummies(cls_oct.OrderType + ',' + cls_oct.Discount.apply(lambda x: str(x)) + ',' + cls_oct.OrderPrice.apply(lambda x: str(x)), dtype=int)
    oks = pd.concat((cls_oct.ClientUUId, oks), axis=1)
    oks = oks.groupby('ClientUUId').sum().reset_index()

    new_train = train.drop(['LocalBeginDate', 'LocalEndDate'], axis=1)
    new_train = pd.get_dummies(new_train, columns=['Id', 'OrderType'], dtype=int)
    mobile_events_ = pd.concat((mobile_events, pd.get_dummies(mobile_events.EventName, dtype=int)), axis=1).drop(['EventName'], axis=1)
    ev1 = mobile_events_.groupby(['ClientUUId'])[['apply_personal_offer', 'create_order', 'open_bonusaction']].sum().reset_index()
    ev2 = mobile_events_.groupby(['ClientUUId'])[['apply_personal_offer', 'create_order', 'open_bonusaction']].mean().reset_index()
    ev3 = mobile_events_.groupby(['ClientUUId'])[['apply_personal_offer', 'create_order', 'open_bonusaction']].count().reset_index()
    ev = pd.merge(pd.merge(ev1, ev2, on='ClientUUId'), ev3, on='ClientUUId')
    X = pd.merge(new_train, sas, on='ClientUUId', how='left')
    X = pd.merge(X, oks, on='ClientUUId', how='left')
    X = pd.merge(X, ev, on='ClientUUId', how='left').drop(['ClientUUId'], axis=1)
    if not test:
        y = X.apply_promo_x
        X = X.drop(['apply_promo_x'], axis=1).fillna(0)
        return X.to_numpy(), y.to_numpy()
    else:
        return X.fillna(0).to_numpy()

Извлекаем признаки из данных. Бинаризуем размеры скидок и порог срабатывания промокода, помимо всего прочего извлекаем из истории предыдущих покупок суммы покупок, основанные на них признаки, а также из истории использования приложением его интерес к промокодам и стремление заказать.

In [246]:
X, y = get_df(train, orders, mobile_events, cls_oct, test=False)
X_test = get_df(test, orders, mobile_events, cls_oct, test=True)

In [276]:
params = {'learning_rate': 0.02585593198584217, 'max_depth': 10, 'subsample': 0.5102954089862447, 'colsample_bytree': 0.5066879444200845, 'gamma': 0.008018887885891779, 
          'reg_alpha': 0.13779374848933656, 'reg_lambda': 2.5602362510574066, 'n_estimators': 987}

model = XGBClassifier(**params)

X1, X2, y1, y2 = train_test_split(X, y, test_size=0.2, random_state=14)
X1_, y1_ = RandomUnderSampler(sampling_strategy='majority', random_state=42).fit_resample(X1, y1)

model.fit(X1_, y1_)
y1_pred = model.predict(X1)
y2_pred = model.predict(X2)
print(roc_auc_score(y1, y1_pred))
print(roc_auc_score(y2, y2_pred))
print(confusion_matrix(y1, y1_pred))
print(confusion_matrix(y2, y2_pred))

0.8589600742804085
0.7661534371843266
[[15464  6076]
 [    0   652]]
[[3874 1500]
 [  33  142]]


In [242]:
def objective(trial):

    params = {
    'objective': 'reg:squarederror',
    'learning_rate': trial.suggest_float('learning_rate', 0.001, 0.1),
    'max_depth': trial.suggest_int('max_depth', 3, 10),
    'subsample': trial.suggest_float('subsample', 0.5, 1.0),
    'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
    'gamma': trial.suggest_float('gamma', 0, 5),
    'reg_alpha': trial.suggest_float('reg_alpha', 0, 5),
    'reg_lambda': trial.suggest_float('reg_lambda', 0, 5),
    }

    model = xgb.train(params, X_train, num_boost_round=trial.suggest_int('n_estimators', 100, 1000))

    y_pred = model.predict(X_test)

    return roc_auc_score(y2, y_pred)

In [None]:
scal = StandardScaler()
X1 = scal.fit_transform(X1)
X_train = xgb.DMatrix(X1, label=y1)
X_test = xgb.DMatrix(X2, label=y2)

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=100)

print('Best Parameters:', study.best_params)
print('Best ROC_AUC:', study.best_value)

Trial 21 finished with value: 0.7630240842150035 and parameters: {'learning_rate': 0.02585593198584217, 'max_depth': 10, 'subsample': 0.5102954089862447, 'colsample_bytree': 0.5066879444200845, 'gamma': 0.008018887885891779, 
          'reg_alpha': 0.13779374848933656, 'reg_lambda': 2.5602362510574066, 'n_estimators': 987}

In [278]:
X, y = get_df(train, orders, mobile_events, cls_oct, test=False)
X_test = get_df(test, orders, mobile_events, cls_oct, test=True)

In [279]:
params = {'learning_rate': 0.02585593198584217, 'max_depth': 10, 'subsample': 0.5102954089862447, 'colsample_bytree': 0.5066879444200845, 'gamma': 0.008018887885891779, 
          'reg_alpha': 0.13779374848933656, 'reg_lambda': 2.5602362510574066, 'n_estimators': 987}

model = XGBClassifier(**params)
X1, y1 = RandomUnderSampler(sampling_strategy='majority', random_state=42).fit_resample(X, y)

In [280]:
model.fit(X1, y1)

In [281]:
pred_test = model.predict_proba(X_test)[:, 1]

In [282]:
result = pd.concat((pd.read_csv('test.csv'), pd.DataFrame(pred_test, columns=['apply_promo'])), axis=1)

In [284]:
result.to_csv('submission.csv', index=False)

Результирующая метрика = 72342558,80

11/37 место