In [1]:
import pandas as pd
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import KFold
import numpy as np
import xgboost as xgb
import seaborn as sns
import matplotlib.pyplot as plt
from datetime import datetime
import catboost as cb
import lightgbm as lgb
pd.options.display.max_columns = None
%matplotlib inline

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


In [2]:
def _shuffle_columns(block):
    """Return a copy of ``block`` whose columns are each permuted independently.

    Uses the global NumPy RNG (``np.random.shuffle``), so call
    ``np.random.seed`` beforehand if reproducible output is required.
    """
    shuffled = block.copy()
    ids = np.arange(block.shape[0])
    for c in range(block.shape[1]):
        # `ids` is reshuffled in place, so every column gets its own row order.
        np.random.shuffle(ids)
        # Index the single column directly instead of copying the whole matrix.
        shuffled[:, c] = block[ids, c]
    return shuffled


def augment(x, y, t=2):
    """Augment a binary-labelled feature matrix with column-shuffled rows.

    Synthetic rows are built separately for each class: within the rows of one
    class, every feature column is permuted independently, so each synthetic
    row combines feature values taken from different real rows of that class.

    Parameters
    ----------
    x : np.ndarray, 2-D
        Feature matrix (rows are samples).
    y : np.ndarray, 1-D
        Labels aligned with the rows of ``x``; ``y > 0`` is treated as the
        positive class and ``y == 0`` as the negative class.
    t : int, default 2
        Number of shuffled copies of the positive rows to append;
        ``t // 2`` shuffled copies of the negative rows are appended.

    Returns
    -------
    (np.ndarray, np.ndarray)
        The original rows followed by the positive copies and then the
        negative copies, with the matching label vector (ones for the positive
        copies, zeros for the negative copies).

    Notes
    -----
    ``t < 2`` (no negative copies) and ``t == 0`` (no copies at all) are
    supported; in those cases the missing groups are simply omitted.
    """
    positives = x[y > 0]
    negatives = x[y == 0]

    xs = [_shuffle_columns(positives) for _ in range(t)]
    xn = [_shuffle_columns(negatives) for _ in range(t // 2)]

    ys = np.ones(sum(block.shape[0] for block in xs))
    yn = np.zeros(sum(block.shape[0] for block in xn))

    # Stacking everything in one call tolerates empty `xs` / `xn` lists.
    x = np.vstack([x] + xs + xn)
    y = np.concatenate([y, ys, yn])
    return x, y

def calc_smooth_mean(df, by, on, weight):
    """Smoothed per-group mean of ``on``, returned row-aligned with ``df``.

    For every group of ``by`` the group mean is pulled towards the overall
    mean of ``on``:

        (count * group_mean + weight * overall_mean) / (count + weight)

    Parameters
    ----------
    df : pd.DataFrame
        Frame containing both the ``by`` and ``on`` columns.
    by : str
        Column whose values define the groups.
    on : str
        Column being averaged.
    weight : float
        Pseudo-count given to the overall mean; larger values smooth more.

    Returns
    -------
    pd.Series
        One smoothed value per row of ``df``, looked up by the row's ``by`` value.
    """
    overall = df[on].mean()

    grouped = df.groupby(by)[on]
    group_size = grouped.count()
    group_avg = grouped.mean()

    numerator = group_size * group_avg + weight * overall
    denominator = group_size + weight
    return df[by].map(numerator / denominator)

def add_mte(merged_df, fields, weight, target):
    """Add smoothed mean-target-encoding columns (``<field>_m``) for ``fields``.

    Rows are split on ``merged_df.type`` into ``'train'`` and ``'test'``.
    The encoding is computed on the train rows with ``calc_smooth_mean`` and
    copied onto the test rows by a left merge on the field value.

    Parameters
    ----------
    merged_df : pd.DataFrame
        Combined frame with a ``type`` column holding ``'train'`` / ``'test'``.
    fields : iterable of str
        Columns to encode.
    weight : float
        Smoothing weight forwarded to ``calc_smooth_mean``.
    target : str
        Target column; only read from the train rows.

    Returns
    -------
    pd.DataFrame
        Train rows followed by test rows, each with the new ``<field>_m`` columns.
        Test rows whose field value never occurs in train get NaN (left merge).

    Notes
    -----
    The encoding of a train row is computed from all train rows, including
    the row itself (no out-of-fold scheme is applied here).
    """
    # Explicit copy: new columns are assigned below, and writing to a filtered
    # slice of `merged_df` triggers pandas' SettingWithCopyWarning.
    df_train = merged_df[merged_df.type == 'train'].copy()
    df_test = merged_df[merged_df.type == 'test']
    for field in fields:
        encoded = f'{field}_m'
        df_train[encoded] = calc_smooth_mean(df_train, by=field, on=target, weight=weight)
        # Each field value maps to exactly one encoded value, so the
        # de-duplicated pair acts as a lookup table for the test rows.
        lookup = df_train[[field, encoded]].drop_duplicates()
        df_test = pd.merge(df_test, lookup, how='left', on=field)
    return pd.concat([df_train, df_test], axis=0, sort=False)

In [3]:
# Read both competition files and tag every row with the split it came from,
# so the frames can be stacked for feature engineering and separated again.
df_train = pd.read_csv('data/onetwotrip_challenge_train.csv').assign(type='train')
df_test = pd.read_csv('data/onetwotrip_challenge_test.csv').assign(type='test')

print("Shape of train data: ", df_train.shape)
print("Shape of test data: ", df_test.shape)

Shape of train data:  (196056, 44)
Shape of test data:  (455011, 38)


In [4]:
# Every column whose name contains "field" is treated as a predictor.
features = [col for col in df_train.columns if 'field' in col]

# Columns used as group-by keys for the aggregate features; all remaining
# predictors are the ones that get aggregated within those groups.
main_features = [
    'field16', 'field1', 'field12', 'field25', 'field14',
    'field22', 'field17', 'field13', 'field0', 'field8',
]
grp_features = [col for col in features if col not in main_features]

# Stack train and test so group statistics are computed over both splits.
merged_df = pd.concat([df_train, df_test], axis=0, sort=False)
merged_df.shape

(651067, 44)

In [5]:
# Group statistics: for every key column in `main_features`, add the mean,
# median and standard deviation of every column in `grp_features` within the
# key's groups. Resulting columns are named
#   mean_<key>_<col>, med_<key>_<col>, std_<key>_<col>
# and appear in that order for each <col>.
stat_names = ['mean', 'median', 'std']
stat_prefix = {'mean': 'mean', 'median': 'med', 'std': 'std'}

for field in main_features:
    if grp_features:
        # One groupby and one merge per key instead of three of each per
        # (key, column) pair: every merge copies the whole, ever-widening
        # frame, so the per-pair version repeats that copy about
        # 3 * len(grp_features) times for each key.
        stats = merged_df.groupby(field)[grp_features].agg(stat_names)
        # agg() yields (column, statistic) MultiIndex columns, ordered by
        # column first and then by `stat_names`; flatten them to plain names.
        stats.columns = [
            f"{stat_prefix[stat]}_{field}_{sub_field}"
            for sub_field, stat in stats.columns
        ]
        # Undefined statistics (e.g. std of a single-row group) become 0.
        stats = stats.reset_index().fillna(0)
        merged_df = pd.merge(merged_df, stats, how='left', on=field)

    print("Finish ", field)

Finish  field16
Finish  field1
Finish  field12
Finish  field25
Finish  field14
Finish  field22
Finish  field17
Finish  field13
Finish  field0
Finish  field8


In [13]:
# Frequency and log features for every predictor:
#   cnt_<field> - number of rows (train + test) sharing the row's value
#   log_<field> - natural log of the value, with undefined results set to 0
# New columns are collected first and attached with a single concat, instead
# of merging the whole frame once per feature.
new_columns = {}
for field in features:
    values = merged_df[field]

    # value_counts() ignores NaN, so rows with a missing value get NaN here,
    # the same result a left merge against groupby(...).size() gives.
    new_columns[f"cnt_{field}"] = values.map(values.value_counts())

    # log(0) = -inf and log(negative) = NaN are both replaced by 0 below, so
    # the floating-point warnings they raise carry no information.
    with np.errstate(divide='ignore', invalid='ignore'):
        logged = np.log(values)
    new_columns[f'log_{field}'] = logged.replace([np.inf, -np.inf], np.nan).fillna(0)

merged_df = pd.concat(
    [merged_df, pd.DataFrame(new_columns, index=merged_df.index)],
    axis=1,
)

In [None]:
#merged_df = add_mte(merged_df, features, weight=10, target='goal1')

In [14]:
# Refresh the predictor list: it now also picks up every engineered column,
# since their names contain "field" as well.
features = [col for col in merged_df.columns if 'field' in col]

# Separate the combined frame back into its two splits using the tag column.
df_train = merged_df.loc[merged_df["type"] == "train"]
df_test = merged_df.loc[merged_df["type"] == "test"]

In [None]:
# Number of distinct values of field10 among the training rows.
df_train.loc[:, 'field10'].nunique()

In [None]:
# Count of missing values in each column of the training frame.
df_train.isna().sum(axis=0)

In [None]:
# Class balance of the target column used for training.
df_train.loc[:, 'goal1'].value_counts()

In [18]:
# Design matrix and target for the training rows.
X = df_train[features]
y = df_train['goal1']

# Shuffled 5-fold splitter; the fixed seed keeps the folds identical between
# the model cells below.
kfolds = KFold(n_splits=5, shuffle=True, random_state=236)

# Accumulators filled by the training loops:
importances = pd.DataFrame()          # per-fold feature importances
oof_preds = np.zeros(len(X))          # out-of-fold predictions for train rows
getVal = np.zeros(len(X))             # out-of-fold predictions scaled by 1 / n_splits
sub_preds = np.zeros(len(df_test))    # fold-averaged predictions for test rows

In [None]:
# LightGBM, 5-fold CV with augmentation of the training part of each fold.
#
# The accumulators are re-created here so that re-running this cell (or
# running it after another model cell) starts from zero; `getVal` and
# `sub_preds` are built with `+=` and would otherwise double-count.
oof_preds = np.zeros(X.shape[0])
getVal = np.zeros(X.shape[0])
sub_preds = np.zeros(df_test.shape[0])

for n_fold, (trn_idx, val_idx) in enumerate(kfolds.split(X, y)):

    X_train, y_train = X.iloc[trn_idx], y.iloc[trn_idx]
    X_valid, y_valid = X.iloc[val_idx], y.iloc[val_idx]
    # Only the training part is augmented; the validation rows stay untouched
    # so the out-of-fold score is measured on real data.
    X_train, y_train = augment(X_train.values, y_train.values)
    print(f"Fold idx: {n_fold + 1}")

    model = lgb.LGBMClassifier(
        bagging_freq= 5,
        bagging_fraction= 0.335,
        #boost_from_average='false',
        boost= 'gbdt',
        feature_fraction= 0.041,
        learning_rate= 0.01,
        max_depth= -1,
        metric='auc',
        min_data_in_leaf= 80,
        min_sum_hessian_in_leaf= 10.0,
        num_leaves= 13,
        num_threads= 8,
        tree_learner= 'serial',
        objective= 'binary', 
        verbosity= -1,
        n_estimators=10000,
        scale_pos_weight = 10,
        random_state=432013
    )

    # NOTE(review): `verbose` / `early_stopping_rounds` as fit() keyword
    # arguments belong to the older LightGBM sklearn API; newer releases
    # expect callbacks instead - confirm against the installed version.
    model.fit(
        X_train, y_train,
        eval_set=[(X_train, y_train), (X_valid, y_valid)],
        verbose=1000,
        early_stopping_rounds=200
    )

    # Score the validation rows once and reuse the result for both buffers.
    val_pred = model.predict_proba(X_valid)[:, 1]
    oof_preds[val_idx] = val_pred
    getVal[val_idx] += val_pred / kfolds.n_splits

    # Test predictions are averaged over the folds.
    sub_preds += model.predict_proba(df_test[features])[:, 1] / kfolds.n_splits

print("ROC_AUC score: ", roc_auc_score(y, oof_preds))

In [16]:
# CatBoost, 5-fold CV (no augmentation).
#
# Start from clean accumulators. `oof_preds` is overwritten fold by fold, but
# `sub_preds` is built with `+=`: without a reset it would also contain
# whatever a previous run of this cell (or another model cell) added, and the
# submission would no longer correspond to the AUC reported below.
oof_preds = np.zeros(X.shape[0])
sub_preds = np.zeros(df_test.shape[0])
fold_importances = []

for n_fold, (trn_idx, val_idx) in enumerate(kfolds.split(X, y)):
    X_train, y_train = X.iloc[trn_idx], y.iloc[trn_idx]
    X_valid, y_valid = X.iloc[val_idx], y.iloc[val_idx]

    #X_train, y_train = augment(X_train.values, y_train.values)

    print(f"Fold idx: {n_fold + 1}")

    model = cb.CatBoostClassifier(
        #allow_writing_files = False,
        #od_type = 'Iter',
        bagging_temperature = 0.2,
        #depth = 5,
        od_wait = 20,
        #silent = False,
        #verbose = 50
        scale_pos_weight = 44,
        subsample = 0.36, 
        custom_loss='Logloss',
        random_strength = 0,
        max_depth=3,
        eval_metric="AUC",
        learning_rate=0.03,
        iterations=60000,
        #bootstrap_type='Bernoulli',
        l2_leaf_reg=0.3,
        random_seed=432013,
        od_type="Iter",
        border_count=128
    )

    # Fit with early stopping on the evaluation sets; `use_best_model` keeps
    # the iteration with the best evaluation metric.
    model.fit(
        X_train, y_train,
        eval_set=[(X_train, y_train), (X_valid, y_valid)],
        verbose=1500,
        early_stopping_rounds=100,
        use_best_model=True
    )

    # Collect this fold's importances; they are concatenated once after the
    # loop instead of growing a DataFrame inside it.
    fold_importances.append(pd.DataFrame({
        'feature': features,
        'gain': model.feature_importances_,
        'fold': n_fold + 1,
    }))

    oof_preds[val_idx] = model.predict_proba(X_valid)[:, 1]
    # Test predictions are averaged over the folds.
    sub_preds += model.predict_proba(df_test[features])[:, 1] / kfolds.n_splits

importances = pd.concat(fold_importances, axis=0, sort=False)

print("ROC_AUC score: ", roc_auc_score(y, oof_preds))

Fold idx: 1
0:	test: 0.6530477	test1: 0.6536841	best: 0.6536841 (0)	total: 125ms	remaining: 2h 4m 44s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 0.6905496708
bestIteration = 224

Shrink model to first 225 iterations.
Fold idx: 2
0:	test: 0.6541396	test1: 0.6490752	best: 0.6490752 (0)	total: 64.4ms	remaining: 1h 4m 22s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 0.6778209793
bestIteration = 329

Shrink model to first 330 iterations.
Fold idx: 3
0:	test: 0.6556792	test1: 0.6476543	best: 0.6476543 (0)	total: 63.9ms	remaining: 1h 3m 56s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 0.6867937312
bestIteration = 286

Shrink model to first 287 iterations.
Fold idx: 4
0:	test: 0.6506984	test1: 0.6623750	best: 0.6623750 (0)	total: 83.1ms	remaining: 1h 23m 3s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 0.6954769419
bestIteration = 244

Shrink model to first 245 iterations.
Fold idx: 5
0:	test: 0.6565102	te

In [None]:
# The plotted value is the raw per-fold gain; no log transform is applied
# despite the column name.
importances['gain_log'] = importances['gain']

# Average gain of each feature across folds, attached to every row so the
# bars can be ordered by it.
mean_gain = importances.groupby('feature')[['gain']].mean()
importances['mean_gain'] = importances['feature'].map(mean_gain['gain'])

ranked = importances.sort_values('mean_gain', ascending=False)
fig, ax = plt.subplots(figsize=(8, 12))
sns.barplot(x='gain_log', y='feature', data=ranked, ax=ax)

In [None]:
# Sanity check: number of test predictions that will be written out.
sub_preds.shape[0]

In [17]:
# Write the fold-averaged test predictions, indexed by order id, to a CSV whose
# name carries the creation time (to the second) and the out-of-fold ROC AUC.
now = datetime.now()
timestamp = now.isoformat(sep=' ', timespec='seconds')
cv_auc = round(roc_auc_score(y, oof_preds), 4)

submission = pd.DataFrame(sub_preds, columns=['proba'], index=df_test['orderid'])
submission.to_csv(f'sub-{timestamp}-{cv_auc}.csv')