Решение соревнования https://cleancodecup.ru/problem/promopurchase

In [223]:
import numpy as np
import pandas as pd
import sklearn
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.model_selection import train_test_split, cross_val_score
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score, confusion_matrix
from imblearn.under_sampling import RandomUnderSampler
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.decomposition import PCA
from sklearn.metrics import mean_squared_log_error, accuracy_score
import optuna
import xgboost as xgb
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import Binarizer
from imblearn.under_sampling import CondensedNearestNeighbour

In [273]:
cls_oct = pd.read_csv('clients_promo_october.csv')
mobile_events = pd.read_csv('mobile_events.csv')
orders = pd.read_csv('orders.csv')
train = pd.read_csv('train_target.csv')
test = pd.read_csv('test.csv')

Извлекаем признаки из данных. Бинаризуем размеры скидок и порог срабатывания промокода, помимо всего прочего извлекаем из истории предыдущих покупок суммы покупок, основанные на них признаки, а также из истории использования приложением его интерес к промокодам и стремление заказать.

In [286]:
def get_data(orders, train_, mobile_events_, cls_oct_):
    orders_, train, mobile_events, cls_oct = orders.copy(), train_.copy(), mobile_events_.copy(), cls_oct_.copy()
    
    lst = set(list(train.ClientUUId)) - set(list(orders_.ClientUUId))
    lst = list(lst)

    orders_.Date = pd.to_datetime(orders_.Date).dt.dayofyear
    orders_.drop(['addressId', 'deliverySectorId', 'SaleDate', 
                'UnitUUId', 'ClientOrderNumber', 'ProductUUId', 'MenuPrice', 'OrderPaymentType'], axis=1, inplace=True)

    eg1 = orders_.groupby(['ClientUUId', 'OrderUUId'])[['OrderTotalPrice']].mean().reset_index()
    eg2 = orders_.groupby(['ClientUUId', 'OrderUUId'])[['ProductTotalPrice']].sum().reset_index()
    eg3 = orders_.groupby(['ClientUUId', 'OrderUUId'])[['Date']].mean().astype(int).reset_index()
    eg4 = orders_.groupby(['ClientUUId', 'OrderUUId'])[['OrderType', 'OrderState', 'apply_promo']].mean().astype(int).reset_index()

    new_ord = pd.merge(pd.merge(pd.merge(eg1, eg2, on=['ClientUUId', 'OrderUUId']), eg3, on=['ClientUUId', 'OrderUUId']), eg4, on=['ClientUUId', 'OrderUUId'])
    new_ord.OrderTotalPrice = (new_ord.ProductTotalPrice - new_ord.OrderTotalPrice).astype(int)
    new_ord.ProductTotalPrice = new_ord.ProductTotalPrice.astype(int)

    bins_ = [1299, 1249,  849, 1399,  949, 1149,  999, 1199, 1099, 1349, 1049, 899,  799,  749,  699,  599,  649, -np.inf, np.inf]
    bins_.sort()
    bins_2 = [ 25, 300, 200,  30,  20, 250, 150,  15,  10, 130, 100, -np.inf, np.inf]
    bins_2.sort()

    new_ord.ProductTotalPrice = pd.cut(new_ord.ProductTotalPrice, bins=bins_, labels=False)
    new_ord.OrderTotalPrice = pd.cut(new_ord.OrderTotalPrice, bins=bins_2, labels=False)

    new_ord = pd.concat((new_ord, pd.get_dummies(new_ord.ProductTotalPrice, dtype=int, prefix='ProductTotalPrice')), axis=1).drop(['ProductTotalPrice'], axis=1)
    new_ord = pd.concat((new_ord, pd.get_dummies(new_ord.OrderTotalPrice, dtype=int, prefix='OrderTotalPrice')), axis=1).drop(['OrderTotalPrice'], axis=1)
    new_ord = pd.concat((new_ord, pd.get_dummies(new_ord.OrderState, dtype=int, prefix='OrderState')), axis=1).drop(['OrderState'], axis=1)
    new_ord = pd.concat((new_ord, pd.get_dummies(new_ord.OrderType, dtype=int, prefix='OrderType')), axis=1).drop(['OrderType'], axis=1)

    new_ord.drop(['OrderUUId'], axis=1, inplace=True)
    gh = (new_ord.groupby('ClientUUId').Date.max() - new_ord.groupby('ClientUUId').Date.min()).reset_index()
    new_ord = pd.concat((new_ord.drop(['Date'], axis=1).groupby('ClientUUId').sum().reset_index(), gh.Date), axis=1)
    new_ord = pd.concat((new_ord, new_ord.drop(['Date'], axis=1).groupby('ClientUUId').mean().reset_index().drop(['ClientUUId'], axis=1)), axis=1)

    mobile_events = pd.concat((mobile_events, pd.get_dummies(mobile_events.EventName, dtype=int)), axis=1).drop(['EventName'], axis=1)
    ev1 = mobile_events.groupby(['ClientUUId'])[['apply_personal_offer', 'create_order', 'open_bonusaction']].sum().reset_index()
    ev2 = mobile_events.groupby(['ClientUUId'])[['apply_personal_offer', 'create_order', 'open_bonusaction']].mean().reset_index()
    ev3 = mobile_events.groupby(['ClientUUId'])[['apply_personal_offer', 'create_order', 'open_bonusaction']].count().reset_index()

    ev1 = pd.merge(ev1, ev2, on='ClientUUId')
    ev1 = pd.merge(ev1, ev3, on='ClientUUId')
    csad = cls_oct.OrderPrice.unique().tolist()
    csad.extend([-np.inf, np.inf])
    csad.sort()
    cls_oct.OrderPrice = pd.cut(cls_oct.OrderPrice, bins=csad, labels=False)
    csad = cls_oct.Discount.unique().tolist()
    csad.extend([-np.inf, np.inf])
    csad.sort()
    cls_oct.Discount = pd.cut(cls_oct.Discount, bins=csad, labels=False)

    cls_oct = pd.concat((cls_oct, pd.get_dummies(cls_oct.OrderType, dtype=int, prefix='OrderType')), axis=1).drop(['OrderType'], axis=1)
    cls_oct = pd.concat((cls_oct, pd.get_dummies(cls_oct.OrderPrice, dtype=int, prefix='OrderPrice')), axis=1).drop(['OrderPrice'], axis=1)
    cls_oct = pd.concat((cls_oct, pd.get_dummies(cls_oct.Discount, dtype=int, prefix='Discount')), axis=1).drop(['Discount'], axis=1)

    cls_oct.drop(['LocalBeginDate', 'LocalEndDate'], axis=1, inplace=True)
    cls_oct = pd.concat((cls_oct, pd.get_dummies(cls_oct.Id, dtype=int, prefix='Discount')), axis=1).drop(['Id'], axis=1)
    clss = cls_oct.groupby(['ClientUUId']).sum().reset_index()
    new_train = train.drop(['LocalBeginDate', 'LocalEndDate'], axis=1)
    new_train = pd.get_dummies(new_train, columns=['Id', 'OrderType'], dtype=int)


    if 'apply_promo' in new_train.columns.to_list():
        y = new_train.apply_promo
        new_train = new_train.drop(['apply_promo'], axis=1)
        new_train = pd.merge(new_train, ev1, on='ClientUUId', how='left')
        new_train = pd.merge(new_train, clss, on='ClientUUId', how='left')

        X = pd.merge(new_train, new_ord, on='ClientUUId', how='left')
        X.drop(['ClientUUId'], axis=1, inplace=True)
        
        return X.to_numpy(), y.to_numpy()
    else:
        new_train = pd.merge(new_train, ev1, on='ClientUUId', how='left')
        new_train = pd.merge(new_train, clss, on='ClientUUId', how='left')

        X = pd.merge(new_train, new_ord, on='ClientUUId', how='left')
        X.drop(['ClientUUId'], axis=1, inplace=True)       
        return X.to_numpy()

In [287]:
X, y = get_data(orders, train, mobile_events, cls_oct)

In [275]:
X1, X2, y1, y2 = train_test_split(X, y, test_size=0.2, random_state=14)
model = XGBClassifier()
X1_, y1_ = RandomUnderSampler(sampling_strategy='majority', random_state=42).fit_resample(X1, y1)

model.fit(X1_, y1_)
y1_pred = model.predict(X1)
y2_pred = model.predict(X2)
print(roc_auc_score(y1, y1_pred))
print(roc_auc_score(y2, y2_pred))
print(confusion_matrix(y1, y1_pred))
print(confusion_matrix(y2, y2_pred))

0.7714980262146044
0.7432420649688979
[[15165  6375]
 [  105   547]]
[[3812 1562]
 [  39  136]]


In [123]:
def objective(trial):

    params = {
    'objective': 'reg:squarederror',
    'learning_rate': trial.suggest_float('learning_rate', 0.001, 0.1),
    'max_depth': trial.suggest_int('max_depth', 3, 10),
    'subsample': trial.suggest_float('subsample', 0.5, 1.0),
    'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
    'gamma': trial.suggest_float('gamma', 0, 5),
    'reg_alpha': trial.suggest_float('reg_alpha', 0, 5),
    'reg_lambda': trial.suggest_float('reg_lambda', 0, 5),
    }

    model = xgb.train(params, X_train, num_boost_round=trial.suggest_int('n_estimators', 100, 1000))

    y_pred = model.predict(X_test)

    return roc_auc_score(y2, y_pred)

In [None]:
scal = StandardScaler()
X1 = scal.fit_transform(X1)
X_train = xgb.DMatrix(X1, label=y1)
X_test = xgb.DMatrix(X2, label=y2)

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=100)

print('Best Parameters:', study.best_params)
print('Best ROC_AUC:', study.best_value)

Trial 21 finished with value: 0.7630240842150035 and parameters: {'learning_rate': 0.0721584366956849, 'max_depth': 5, 'subsample': 0.5891893757533981, 'colsample_bytree': 0.81349156603394, 'gamma': 0.5461995811665656, 'reg_alpha': 0.701904559076098, 'reg_lambda': 1.7980227501439583, 'n_estimators': 479}. Best is trial 21 with value: 0.7630240842150035.

In [290]:
params = {'learning_rate': 0.0721584366956849, 'max_depth': 5, 'subsample': 0.5891893757533981, 'colsample_bytree': 0.81349156603394, 'gamma': 0.5461995811665656, 'reg_alpha': 0.701904559076098, 'reg_lambda': 1.7980227501439583, 'n_estimators': 479}

xgb_ = XGBClassifier(**params)

In [291]:
X, y = RandomUnderSampler().fit_resample(X, y)
xgb_.fit(X, y)

In [292]:
X_test = get_data(orders, test, mobile_events, cls_oct)

In [293]:
pred_test = xgb_.predict_proba(X_test)[:, 1]

In [294]:
result = pd.concat((pd.read_csv('test.csv'), pd.DataFrame(pred_test, columns=['apply_promo'])), axis=1)

In [295]:
result.to_csv('submission.csv', index=False)

Результирующая метрика = 70636421,9

15/34 место