In [1]:
!pip install scikit-uplift

Collecting scikit-uplift
  Downloading scikit_uplift-0.0.3-py2.py3-none-any.whl (12 kB)
Installing collected packages: scikit-uplift
Successfully installed scikit-uplift-0.0.3


In [2]:
import pandas as pd
import numpy as np
import datetime
from sklearn.ensemble import AdaBoostClassifier,RandomForestClassifier
import lightgbm as lgbm
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.metrics import auc
from sklearn.utils.extmath import stable_cumsum
from sklift.models import SoloModel, ClassTransformation, TwoModels
import warnings
warnings.filterwarnings('ignore')

In [3]:
df_clients = pd.read_csv('../input/x5-retail-hero/clients.csv', index_col='client_id',parse_dates=['first_issue_date','first_redeem_date'])
df_train = pd.read_csv('../input/x5-retail-hero/uplift_train.csv', index_col='client_id')
df_test = pd.read_csv('../input/x5-retail-hero/uplift_test.csv', index_col='client_id')
df_products = pd.read_csv('../input/x5-retail-hero/products.csv', index_col='product_id').reset_index()
df_purchases = pd.read_csv('../input/x5-retail-hero/purchases.csv',parse_dates=['transaction_datetime'],index_col='client_id')

In [4]:
#merge df_purchases and df_products
df_purchases.drop(columns=['express_points_received','express_points_spent','trn_sum_from_red'],inplace=True)
df_purchases = df_purchases.merge(df_products[['netto','is_alcohol','is_own_trademark','product_id']], left_on='product_id', right_on='product_id').set_index(df_purchases.index)

In [5]:
#uplift metrics
def uplift_score(prediction, treatment, target, rate=0.3):
    order = np.argsort(-prediction)
    treatment_n = int((treatment == 1).sum() * rate)
    treatment_p = target[order][treatment[order] == 1][:treatment_n].mean()
    control_n = int((treatment == 0).sum() * rate)
    control_p = target[order][treatment[order] == 0][:control_n].mean()
    score = treatment_p - control_p
    return score

def get_score(valid_uplift,indices_valid,concat_train):
    valid_score = uplift_score(
        valid_uplift,
        treatment=concat_train.iloc[indices_valid].treatment_flg.values,
        target=concat_train.iloc[indices_valid].target.values,
    )
    return valid_score

def uplift_curve(valid_uplift,indices_valid,concat_train):
    y_true = concat_train.iloc[indices_valid].target.values
    treatment=concat_train.iloc[indices_valid].treatment_flg.values
    uplift = valid_uplift
    
    y_true, uplift, treatment = np.array(y_true), np.array(uplift), np.array(treatment)
    desc_score_indices = np.argsort(uplift, kind="mergesort")[::-1]
    y_true, uplift, treatment = y_true[desc_score_indices], uplift[desc_score_indices], treatment[desc_score_indices]

    y_true_ctrl, y_true_trmnt = y_true.copy(), y_true.copy()

    y_true_ctrl[treatment == 1] = 0
    y_true_trmnt[treatment == 0] = 0

    distinct_value_indices = np.where(np.diff(uplift))[0]
    threshold_indices = np.r_[distinct_value_indices, uplift.size - 1]

    num_trmnt = stable_cumsum(treatment)[threshold_indices]
    y_trmnt = stable_cumsum(y_true_trmnt)[threshold_indices]

    num_all = threshold_indices + 1

    num_ctrl = num_all - num_trmnt
    y_ctrl = stable_cumsum(y_true_ctrl)[threshold_indices]

    curve_values = (np.divide(y_trmnt, num_trmnt, out=np.zeros_like(y_trmnt), where=num_trmnt != 0) -\
                    np.divide(y_ctrl, num_ctrl, out=np.zeros_like(y_ctrl), where=num_ctrl != 0)) * num_all

    if num_all.size == 0 or curve_values[0] != 0 or num_all[0] != 0:
        num_all = np.r_[0, num_all]
        curve_values = np.r_[0, curve_values]

    return num_all, curve_values

def auс_uplift(valid_uplift,indices_valid,concat_train):
    y_true = concat_train.iloc[indices_valid].target.values
    treatment=concat_train.iloc[indices_valid].treatment_flg.values
    uplift = valid_uplift
    
    return auc(*uplift_curve(valid_uplift,indices_valid,concat_train))

In [6]:
#groupby sum,count,mean,nunique
df_purchases['trans_hour'] = df_purchases.transaction_datetime.dt.hour
df_purchases['dayofweek'] = df_purchases.transaction_datetime.dt.dayofweek

group_by_7weeks = df_purchases[df_purchases['transaction_datetime'] > '2019-02-01'].groupby(['client_id','transaction_id'])
group_by_2weeks = df_purchases[df_purchases['transaction_datetime'] > '2019-03-03'].groupby(['client_id','transaction_id'])

last_cols = ['regular_points_received','regular_points_spent', 'purchase_sum','store_id','dayofweek','trans_hour']
all_hist = group_by_7weeks[last_cols].last()
two_weeks = group_by_2weeks[last_cols].last()

sum_cols = ['netto','is_alcohol','is_own_trademark']
all_hist_sum = group_by_7weeks[sum_cols].sum()
two_weeks_sum = group_by_2weeks[sum_cols].sum()

mean_cols = ['product_quantity','netto','trn_sum_from_iss']
all_hist_mean = group_by_7weeks[mean_cols].mean()
two_weeks_mean = group_by_2weeks[mean_cols].mean()

nu_cols = ['product_quantity','product_id']
all_hist_nu = group_by_7weeks[nu_cols].nunique()
two_weeks_nu = group_by_2weeks[nu_cols].nunique()

In [7]:
names = ['regular_points_received','regular_points_spent', 'purchase_sum','dayofweek','trans_hour']

In [8]:
features =  pd.concat([
                    all_hist.groupby('client_id')['purchase_sum'].count(),
                    two_weeks.groupby('client_id')['purchase_sum'].count(),
                    all_hist.groupby('client_id')['dayofweek'].mean(),
                    two_weeks.groupby('client_id')['dayofweek'].mean(),
                    all_hist.groupby('client_id')['trans_hour'].mean(),
                    two_weeks.groupby('client_id')['trans_hour'].mean(),
                    all_hist.groupby('client_id')[['store_id']].nunique(),
                    two_weeks.groupby('client_id')[['store_id']].nunique(),
                    #mean
                    all_hist_mean.groupby('client_id')['product_quantity'].mean(),
                    two_weeks_mean.groupby('client_id')['product_quantity'].mean(),
                    all_hist_mean.groupby('client_id')['netto'].mean(),
                    two_weeks_mean.groupby('client_id')['netto'].mean(),
                    all_hist_mean.groupby('client_id')['trn_sum_from_iss'].mean(),
                    two_weeks_mean.groupby('client_id')['trn_sum_from_iss'].mean(),
                    #nu
                    all_hist_nu.groupby('client_id')['product_id'].mean(),
                    two_weeks_nu.groupby('client_id')['product_id'].mean(),
                    all_hist_nu.groupby('client_id')['product_id'].sum(),
                    two_weeks_nu.groupby('client_id')['product_id'].sum(),
                    all_hist_nu.groupby('client_id')['product_quantity'].mean(),
                    two_weeks_nu.groupby('client_id')['product_quantity'].mean(),
                    #sum
                    all_hist_sum.groupby('client_id')['is_alcohol'].sum(),
                    two_weeks_sum.groupby('client_id')['is_alcohol'].sum(),
                    all_hist_sum.groupby('client_id')['is_own_trademark'].sum(),
                    two_weeks_sum.groupby('client_id')['is_own_trademark'].sum(),
                    #casual
                    all_hist.groupby('client_id').sum(),
                    two_weeks.groupby('client_id').sum()
                      ],axis = 1)

In [10]:
features.columns = ['total_count_purchase','two_weeks_count_purchase']+['total_mean_dayofweek','two_weeks_mean_dayofweek']+\
    ['total_mean_trans_hour','two_weeks_mean_trans_hour']+['total_count_store_id','two_weeks_count_store_id']+\
    ['total_mean_product_quantity','two_weeks_mean_product_quantity']+['total_mean_netto','two_weeks_mean_netto']+\
    ['total_mean_trn_sum_from_iss','two_weeks_mean_trn_sum_from_iss']+\
    ['total_nu_product_id','two_weeks_nu_product_id']+['total_sum_product_id','two_weeks_sum_product_id']+\
    ['total_nu_product_quantity','two_weeks_nu_product_quantity']+\
    ['total_sum_is_alcohol','two_weeks_sum_is_alcohol']+['total_sum_is_own_trademark','two_weeks_sum_is_own_trademark']+\
    list(c+"_all" for c in names)+list(c+"_two_weeks" for c in names)

features.drop(columns=['dayofweek_two_weeks','dayofweek_all','trans_hour_two_weeks','trans_hour_all'],inplace=True)

In [12]:
merged_train = pd.concat([df_train,df_clients,features],axis = 1,sort = True)
merged_train = merged_train[~merged_train['target'].isnull()].copy()
#features
merged_train['diff_quantity'] = merged_train['total_mean_product_quantity']-merged_train['two_weeks_mean_product_quantity']
merged_train['prop_alc'] = merged_train['total_sum_is_alcohol']/merged_train['total_sum_product_id']
merged_train['diff_total_points'] = merged_train['regular_points_received_all']-merged_train['regular_points_spent_all']
merged_train['diff_two_weeks_points'] = merged_train['regular_points_received_two_weeks']-merged_train['regular_points_spent_two_weeks']
merged_train['first_issue_date'] = merged_train['first_issue_date'].astype(int)/10**9
merged_train['first_redeem_date'] = merged_train['first_redeem_date'].astype(int)/10**9
merged_train['diff_time'] = merged_train['first_redeem_date']-merged_train['first_issue_date']
merged_train['gender'] = list(ord(v[0]) for v in merged_train['gender'].values)
#category features
merged_train = merged_train.fillna(0)
for col in ['total_mean_trans_hour','two_weeks_mean_trans_hour','total_mean_dayofweek','two_weeks_mean_dayofweek']:
    merged_train[col] = round(merged_train.total_mean_dayofweek).astype('int')
    merged_train[col] = merged_train[col].astype('category')

In [13]:
#fill wrong age
age_train = merged_train.loc[(merged_train.age>14) & (merged_train.age<100)]
age_test = merged_train.loc[merged_train.age>100]

age_params = {'learning_rate':0.01,'max_depth':6,'num_leaves':20,
             'min_data_in_leaf':30, 'application':'binary',
             'subsample':1, 'colsample_bytree': 0.8,
             'reg_alpha':0.01,'data_random_seed':42,'metric':'binary_logloss'        
                }

X=merged_train.drop(columns=['age'])
y=merged_train.age

kf = KFold(n_splits=5, random_state=None, shuffle=False)
kf.get_n_splits(X, y)

for train_index, test_index in kf.split(X):
    X_train, X_test = X.iloc[train_index,:], X.iloc[test_index,:]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    age_model = lgbm.LGBMRegressor(**age_params)
    age_model.fit(X_train,y_train)
    age_predict = age_model.predict(X_test)
    
age_model.fit(X,y)
#predict age
predicted_age = age_model.predict(age_test.drop(columns=['age']))
merged_train.loc[age_test.index,'age'] = np.around(predicted_age, decimals=0)

# Validate

In [14]:
def valid(concat_train):
    final_auc_uplift_score=[]
    final_uplift_score=[]
    skf = StratifiedKFold(n_splits=5)
    skf.get_n_splits(concat_train, concat_train.target)
    
    for indices_learn, indices_valid in skf.split(concat_train, concat_train.target):
        params_t = {'learning_rate':0.01,'max_depth':6,'num_leaves':10,
                     'min_data_in_leaf':10, 'application':'binary',
                     'subsample':0.75, 'colsample_bytree': 0.8,
                     'reg_alpha':0.5,'data_random_seed':12,'metric':'binary_logloss',
                     'max_bin':450,'bagging_freq':2,'reg_lambda':0.5         
            }
        #fit
        transformation_model1 = ClassTransformation(AdaBoostClassifier(n_estimators=30,base_estimator=RandomForestClassifier(max_depth=1)))
        transformation_model1.fit(
            concat_train.iloc[indices_learn,:].drop(columns=['treatment_flg','target']),
            concat_train.iloc[indices_learn,:]['treatment_flg'].values,
            concat_train.iloc[indices_learn,:].target)
        transformation_model2 = ClassTransformation(lgbm.LGBMClassifier(**params_t))
        transformation_model2.fit(
            concat_train.iloc[indices_learn,:].drop(columns=['treatment_flg','target']),
            concat_train.iloc[indices_learn,:]['treatment_flg'].values,
            concat_train.iloc[indices_learn,:].target)
        
        #valid
        X_valid = concat_train.iloc[indices_valid, :].drop(columns=['treatment_flg','target'])
        #predict
        predict_valid = (transformation_model1.predict(X_valid)+transformation_model2.predict(X_valid))/2
        
        print('AUC uplift score:', round(auс_uplift(predict_valid,indices_valid,concat_train)/10**7,3))
        print('Right uplift score:', round(get_score(predict_valid,indices_valid,concat_train),4),'\n')
        
        final_auc_uplift_score.append(auс_uplift(predict_valid,indices_valid,concat_train))
        final_uplift_score.append(get_score(predict_valid,indices_valid,concat_train))  

    print('final auc uplift score =',round(sum(final_auc_uplift_score)/len(final_auc_uplift_score)/10**7,3))
    print('final uplift score =',sum(final_uplift_score)/len(final_uplift_score))
    
    return round(sum(final_auc_uplift_score)/len(final_auc_uplift_score)/10**7,3),sum(final_uplift_score)/len(final_uplift_score)

In [None]:
valid(merged_train)

AUC uplift score: 3.734
Right uplift score: 0.0738 

AUC uplift score: 3.878
Right uplift score: 0.0699 

AUC uplift score: 4.633
Right uplift score: 0.0949 



# Predict

In [None]:
def predict(concat_train):
    x_names = list(concat_train.iloc[:,2:].columns)
    diff = []
    
    for rs in range(1,2,1):
        params_t = {'learning_rate':0.01,'max_depth':6,'num_leaves':10,
                     'min_data_in_leaf':10, 'application':'binary',
                     'subsample':0.75, 'colsample_bytree': 0.8,
                     'reg_alpha':0.5,'data_random_seed':12,'metric':'binary_logloss',
                     'max_bin':450,'bagging_freq':2,'reg_lambda':0.5         
            }
        #fit       
        transformation_model1 = ClassTransformation(AdaBoostClassifier(n_estimators=30,base_estimator=RandomForestClassifier(max_depth=1)))
        transformation_model1.fit(
            concat_train.drop(columns=['treatment_flg','target']),
            concat_train['treatment_flg'].values,
            concat_train.target)
        transformation_model2 = ClassTransformation(lgbm.LGBMClassifier(**params_t))
        transformation_model2.fit(
            concat_train.drop(columns=['treatment_flg','target']),
            concat_train['treatment_flg'].values,
            concat_train.target)

        #test features
        df_test['target'] = 1
        merged_test = pd.concat([df_test,df_clients,features],axis = 1,sort = True)
        merged_test = merged_test[~merged_test['target'].isnull()].copy()
        merged_test['diff_quantity'] = merged_test['total_mean_product_quantity']-merged_test['two_weeks_mean_product_quantity']
        merged_test['prop_alc'] = merged_test['total_sum_is_alcohol']/merged_test['total_sum_product_id']
        merged_test['diff_total_points'] = merged_test['regular_points_received_all']-merged_test['regular_points_spent_all']
        merged_test['diff_two_weeks_points'] = merged_test['regular_points_received_two_weeks']-\
                                                    merged_test['regular_points_spent_two_weeks']
        merged_test['first_issue_date'] = merged_test['first_issue_date'].astype(int)/10**9
        merged_test['first_redeem_date'] = merged_test['first_redeem_date'].astype(int)/10**9
        merged_test['diff_time'] = merged_test['first_redeem_date']-merged_test['first_issue_date']
        merged_test['gender'] = list(ord(v[0]) for v in merged_test['gender'].values)
        merged_test = merged_test.fillna(0)
        for col in ['total_mean_trans_hour','two_weeks_mean_trans_hour','total_mean_dayofweek','two_weeks_mean_dayofweek']:
            merged_test[col] = round(merged_test.total_mean_dayofweek).astype('int')
            merged_test[col] = merged_test[col].astype('category')
            
        test_x = merged_test[x_names].fillna(0)
        predict_test = (transformation_model1.predict(test_x)+transformation_model2.predict(test_x))/2
        diff.append(np.array(predict_test))
    return sum(diff)/len(diff),test_x

In [None]:
#submit
uplift_prediction, test_x = predict(merged_train.fillna(0))
df_submission = pd.DataFrame({'client_id':test_x.index.values,'uplift': uplift_prediction})
df_submission.to_csv('submission.csv',index = False)