In [1]:
import pandas as pd
import datetime
import numpy as np
from sklearn.base import clone
from sklearn.model_selection import train_test_split, ShuffleSplit, GridSearchCV, cross_val_score, StratifiedShuffleSplit
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.preprocessing import LabelEncoder, FunctionTransformer
from sklearn.pipeline import Pipeline
from functools import partial
import itertools
from sklift.models import ClassTransformation
import lightgbm as lgbm
import ipywidgets as widgets
import inspect
from datetime import timedelta
from sklearn import preprocessing
import gc

In [2]:
import logging

# create logger
logger = logging.getLogger('lg')
logger.setLevel(logging.DEBUG)

# create console handler and set level to debug
ch = logging.StreamHandler()
ch.setLevel(logging.DEBUG)

# create formatter
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')

# add formatter to ch
ch.setFormatter(formatter)

# add ch to logger
logger.addHandler(ch)

In [3]:
logger.info('info message')

2020-01-29 18:27:48,052 - lg - INFO - info message


# Utils

In [4]:
BASE_TRANSACTION_TEMPLATE = 'base_transaction'
FAVORITES_TEMPLATE = 'favorites'
STEPS_MAPPING = {
    'BASE': (False, 'base_features.csv'),
    'BASE_TRANSACTION': (False, BASE_TRANSACTION_TEMPLATE),
    'FAVORITES': (False, FAVORITES_TEMPLATE),
    'POPULARITY': (True, 'popularity'),
}

In [5]:
def generate_file_name(prefix, offset):
    return '{}_{}.csv'.format(prefix, str(offset))

In [6]:
def get_features_from_files(offsets):
    features = pd.read_csv(STEPS_MAPPING['BASE'][1], index_col='client_id')
    
    base_trans_array = []
    for offset in offsets:
        offset = offset or ''
        base_trans_array.append(pd.read_csv(generate_file_name(STEPS_MAPPING['BASE_TRANSACTION'][1], offset), 
                                 index_col='client_id'))
    
    for df in base_trans_array:
        features = features.merge(df, left_index=True, right_index=True)
        del df

    gc.collect()
    favorites_array = []
    for offset in offsets:
        offset = offset or ''
        favorites_array.append(pd.read_csv(generate_file_name(STEPS_MAPPING['FAVORITES'][1], offset), 
                                           index_col='client_id'))
    
    for df in favorites_array:
        features = features.merge(df, left_index=True, right_index=True)
        del df
    gc.collect()

    popularity_array = []
    for offset in offsets:
        offset = offset or ''
        popularity_array.append(pd.read_csv(generate_file_name(STEPS_MAPPING['POPULARITY'][1], offset), 
                                           index_col='client_id'))
    
    for df in popularity_array:
        features = features.merge(df, left_index=True, right_index=True)
        del df
    gc.collect()

    
    return features

In [7]:
def uplift_score(prediction, treatment, target, rate=0.3):
    """
    Подсчет Uplift Score
    """
    order = np.argsort(-prediction)

    treatment_n = int((treatment == 1).sum() * rate)
    treatment_p = target[order][treatment[order] == 1][:treatment_n].mean()

    control_n = int((treatment == 0).sum() * rate)
    control_p = target[order][treatment[order] == 0][:control_n].mean()

    score = treatment_p - control_p
    return score


def get_train_test(features, df_train, df_test):
    return features.loc[df_train.index, :], features.loc[df_test.index, :]

In [8]:
def balance_learn(X_learn, y_learn):
    _, treatment_counts = np.unique(y_learn.treatment_flg, return_counts=True)
    logger.info("{}, {}, {}".format(X_learn.shape, y_learn.shape, treatment_counts[0] - treatment_counts[1]))
    
    treat_learn = y_learn.treatment_flg
    vc = treat_learn.value_counts()
    treat_learn = pd.concat([treat_learn[treat_learn == i].sample(vc.min()) for i in vc.index])

    X_learn = X_learn.loc[treat_learn.index, :]
    y_learn = y_learn.loc[treat_learn.index, :]
    
    _, treatment_counts = np.unique(y_learn.treatment_flg, return_counts=True)
    logger.info("{}, {}, {}".format(X_learn.shape, y_learn.shape, treatment_counts[0] - treatment_counts[1]))
    return X_learn, y_learn

In [9]:
def uplift_score_func(y_true, y_pred, **kwargs):
    return uplift_score(y_pred, treatment=y_true.treatment_flg, target=y_true.target)

In [10]:
class MyClassTransformation(ClassTransformation):
    def fit(self, X, y, estimator_fit_params=None):
        return  super().fit(X, y=y.target, treatment=y.treatment_flg, estimator_fit_params=estimator_fit_params)

In [11]:
def merge_transactions_and_products(products, transactions):
    columns = ['brand_id', 'vendor_id', 'segment_id', 'product_id']
    transactions_with_products = transactions.merge(products, left_on='product_id', right_index=True)
    logger.info('transactions_with_products')
    for col in columns:
        fg = transactions_with_products.drop_duplicates(subset=['client_id', col]) \
                                       .groupby([col]).size().sort_values(ascending=False)
        fg.name = 'popularity_{}'.format(col)
        
        if col == 'product_id':
            products = products.merge(fg, left_index=True, right_index=True)
        else:
            products = products.join(fg, on=col)
        logger.info('popularity {}'.format(col))
    new_columns = ['popularity_{}'.format(col) for col in columns]
    transactions_with_products = transactions_with_products.merge(products[new_columns], left_on='product_id', right_index=True)
    return transactions_with_products

In [12]:
def get_transactions(df_purchases, offset=None):
    if not offset:
        return df_purchases

    last_date = df_purchases.date.max()    
    sub_df_purchases = df_purchases[df_purchases.date > last_date-timedelta(days=offset)]
    logger.info("sub_df_purchases shape : {}".format(sub_df_purchases.shape))
    return sub_df_purchases

# Чтение данных

In [13]:
# df_products = pd.read_csv('data/products.csv', index_col='product_id')
# logger.info(df_products.shape)

# # df_purchases = pd.read_csv('data/purchases.csv', index_col='transaction_id', parse_dates=['transaction_datetime'], nrows=100000)
# df_purchases = pd.read_csv('data/purchases.csv', index_col='transaction_id', parse_dates=['transaction_datetime'])
# df_purchases['date'] = df_purchases['transaction_datetime'].dt.date

In [14]:
def get_products_features(trans):
    columns = ['brand_id', 'vendor_id', 'segment_id', 'product_id']
    columns2 = ['popularity_{}'.format(col) for col in columns]
    total_pop = trans.groupby('client_id')[columns2].sum()
    logger.info(total_pop.columns[0])
    yield total_pop
    
    avg_trans_pop = trans.groupby(['client_id', 'transaction_id'])[columns2].sum().groupby(['client_id'])[columns2].mean()
    avg_trans_pop.columns = ['avg_trans_{}'.format(c) for c in avg_trans_pop.columns]
    logger.info(avg_trans_pop.columns[0])
    yield avg_trans_pop

    total_unique = trans.groupby(['client_id'])[columns].nunique()
    total_unique.columns = ['total_unique_{}'.format(c) for c in total_unique.columns]
    logger.info(total_unique.columns[0])
    yield total_unique

    avg_trans_unique = trans.groupby(['client_id', 'transaction_id'])[columns].nunique().groupby(['client_id'])[columns].mean()
    avg_trans_unique.columns = ['avg_trans_unique_{}'.format(c) for c in avg_trans_unique.columns]
    logger.info(avg_trans_unique.columns[0])
    yield avg_trans_unique
    
    result = []
    for c in columns[:-1]:
        fc = trans.groupby(['client_id', c])['product_id'].nunique().groupby(['client_id']).mean()
        fc.name = 'avg_nunique_prod_in_{}'.format(c)
        result.append(fc)
    avg_nunique_prod = pd.concat(result, axis=1, sort=False)
    logger.info(avg_nunique_prod.columns[0])
    yield avg_nunique_prod

In [15]:
def pop_features(offset):
    offset = offset or ''
    final = None
    for df in get_products_features(df_purchases):    
        final = final.merge(df, left_index=True, right_index=True) if final is not None else df
    final.index.name = 'client_id'
    if STEPS_MAPPING['POPULARITY'][0]:
        final.to_csv(generate_file_name(STEPS_MAPPING['POPULARITY'][1], offset))
    return final

In [16]:
def base_features(clean=True):
    df_features = pd.read_csv('data/clients.csv', index_col='client_id', parse_dates=['first_issue_date','first_redeem_date'])
    df_features['gender'] = LabelEncoder().fit_transform(df_features.gender)
    df_features['first_issue_time'] = pd.to_datetime(df_features['first_issue_date']).astype(int) / 10 ** 9
    df_features['first_redeem_time'] = pd.to_datetime(df_features['first_redeem_date']).astype(int) / 10 ** 9
    df_features['issue_redeem_delay'] = df_features['first_redeem_time'] - df_features['first_issue_time']
    df_features = df_features.drop(['first_issue_date', 'first_redeem_date'], axis=1)
    if STEPS_MAPPING['BASE'][0]:
        df_features.to_csv(STEPS_MAPPING['BASE'][1])
    
    if clean:
        del df_features
        gc.collect()
        return

    return df_features

In [17]:
def transactions_features(transactions, offset=None, clean=True):
    offset = offset or ''
    last_cols = [
        'regular_points_received', 
        'express_points_received',
        'regular_points_spent',
        'express_points_spent',
        'purchase_sum'
    ]

    logger.info("Create history")
    history = transactions.groupby(['client_id', 'transaction_id'])[last_cols].last()
    
    logger.info("Create _features")
    _features = [
        (history.groupby('client_id')['purchase_sum'].count(), ['total_trans_count']), 
        (history.groupby('client_id').sum(), last_cols)
    ]
    
    _features = list(zip(*_features))
    transactions_features =  pd.concat(_features[0], axis = 1)
    transactions_features.columns = list(itertools.chain.from_iterable(_features[1]))
    transactions_features.columns = ['days_{}_'.format(str(offset)) + c for c in transactions_features.columns]
    if STEPS_MAPPING['BASE_TRANSACTION'][0]:
        transactions_features.to_csv(generate_file_name(STEPS_MAPPING['BASE_TRANSACTION'][1], offset))
    
    if clean:
        del transactions_features
        gc.collect()
        return

    
    return transactions_features

In [18]:
def favorite_products_features(merged_transactions, offset=None, clean=True):

    offset = offset or ''
    # вычисляем любимый продукт/категорию/сегмент для каждого юзера.
    cols = ['product_id', 'brand_id', 'vendor_id', 'segment_id']
    result = []
    for c in cols:
        logger.info("favorite {}".format(c))
        result.append(
            merged_transactions.groupby(['client_id', c]).size().reset_index(name='counts').groupby(['client_id']).max()[c]
        )

    favorites = pd.concat(result, axis=1, sort=False)
    favorites.columns = [str(offset) + '_faivorite_' + c for c in cols]
    favorites.index.name = 'client_id'

    for col in favorites.columns:
        logger.info('LabelEncoder for {}'.format(col))
        favorites[col] = LabelEncoder().fit_transform(favorites[col].astype(str))    

    if STEPS_MAPPING['FAVORITES'][0]:
        favorites.to_csv(generate_file_name(STEPS_MAPPING['FAVORITES'][1], offset))

    if clean:
        del favorites
        gc.collect()
        return        
        
    return favorites

In [19]:
def generate_the_first_part_of_features(offset, df_products, df_purchases):
    sb_df_purchases = get_transactions(df_purchases, offset=offset)
    transactions_features(sb_df_purchases, offset=offset)
    sb_df_purchases = merge_transactions_and_products(df_products, sb_df_purchases)
    favorite_products_features(merged_transactions=sb_df_purchases, offset=offset)

In [20]:
# base_features(df_clients);

In [21]:
# offets = [None]
# for offst in offets:
#     logger.info('{} offst = {}'.format('--'*30, offst))
#     generate_the_first_part_of_features(offst, df_products, df_purchases);

In [22]:
# offset = 14
# df_purchases = merge_transactions_and_products(df_products, get_transactions(df_purchases, offset=offset))
# tpf = pop_features(offset)

In [23]:
offets = [14, 30, None]
features = get_features_from_files(offets)

In [24]:
features.head()

Unnamed: 0_level_0,age,gender,first_issue_time,first_redeem_time,issue_redeem_delay,days_14_total_trans_count,days_14_regular_points_received,days_14_express_points_received,days_14_regular_points_spent,days_14_express_points_spent,...,total_unique_vendor_id,total_unique_segment_id,total_unique_product_id,avg_trans_unique_brand_id,avg_trans_unique_vendor_id,avg_trans_unique_segment_id,avg_trans_unique_product_id,avg_nunique_prod_in_brand_id,avg_nunique_prod_in_vendor_id,avg_nunique_prod_in_segment_id
client_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
000012768d,45,2,1501948000.0,1515094000.0,13146560.0,2,10.0,0.0,0.0,0.0,...,29,23,46,10.5,11.0,11.0,13.0,1.5,1.586207,1.956522
000036f903,72,0,1491832000.0,1492951000.0,1118613.0,3,4.1,0.0,0.0,0.0,...,44,41,96,3.9375,3.96875,4.25,5.0625,1.803922,2.181818,2.268293
000048b7a6,68,0,1544881000.0,-9223372000.0,-10768250000.0,1,1.2,0.0,0.0,0.0,...,21,16,44,4.0,4.375,3.5,7.0,1.904762,2.095238,2.625
000073194a,60,0,1495544000.0,1511522000.0,15978110.0,1,1.3,0.0,0.0,0.0,...,42,27,68,3.823529,3.941176,3.294118,4.823529,1.571429,1.619048,2.37037
00007f9014,45,0,1503409000.0,1550258000.0,46849460.0,2,3.3,0.0,0.0,0.0,...,43,38,71,3.448276,3.413793,3.448276,3.827586,1.38,1.651163,1.763158


In [25]:
features.shape

(346831, 92)

In [26]:
df_train = pd.read_csv('data/uplift_train.csv', index_col='client_id')
logger.info(df_train.shape)

df_test = pd.read_csv('data/uplift_test.csv', index_col='client_id')
logger.info(df_test.shape)

2020-01-29 18:28:01,338 - lg - INFO - (200039, 2)
2020-01-29 18:28:01,483 - lg - INFO - (200123, 0)


In [27]:
x_train, x_test = get_train_test(features, df_train, df_test)

Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  return self._getitem_tuple(key)


In [28]:
x_train.head()

Unnamed: 0_level_0,age,gender,first_issue_time,first_redeem_time,issue_redeem_delay,days_14_total_trans_count,days_14_regular_points_received,days_14_express_points_received,days_14_regular_points_spent,days_14_express_points_spent,...,total_unique_vendor_id,total_unique_segment_id,total_unique_product_id,avg_trans_unique_brand_id,avg_trans_unique_vendor_id,avg_trans_unique_segment_id,avg_trans_unique_product_id,avg_nunique_prod_in_brand_id,avg_nunique_prod_in_vendor_id,avg_nunique_prod_in_segment_id
client_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
000012768d,45.0,2.0,1501948000.0,1515094000.0,13146559.0,2.0,10.0,0.0,0.0,0.0,...,29.0,23.0,46.0,10.5,11.0,11.0,13.0,1.5,1.586207,1.956522
000036f903,72.0,0.0,1491832000.0,1492951000.0,1118613.0,3.0,4.1,0.0,0.0,0.0,...,44.0,41.0,96.0,3.9375,3.96875,4.25,5.0625,1.803922,2.181818,2.268293
00010925a5,83.0,2.0,1532449000.0,1536942000.0,4492280.0,4.0,5.8,0.0,0.0,0.0,...,24.0,31.0,58.0,3.055556,3.222222,3.833333,4.333333,2.037037,2.416667,1.774194
0001f552b0,33.0,0.0,1498850000.0,1535461000.0,36610747.0,4.0,44.7,0.0,0.0,0.0,...,42.0,40.0,79.0,4.466667,4.533333,4.533333,5.733333,1.630435,1.880952,1.925
00020e7b18,73.0,2.0,1511783000.0,1515607000.0,3823700.0,2.0,15.6,0.0,-58.0,-10.0,...,69.0,44.0,175.0,11.5,10.111111,10.388889,15.111111,1.94186,2.536232,3.886364


In [29]:
indices_learn, indices_valid = train_test_split(x_train.index, test_size=0.3)

In [30]:
X_learn = x_train.loc[indices_learn, :]
y_learn = df_train.loc[indices_learn, :]

X_val = x_train.loc[indices_valid, :]
y_val = df_train.loc[indices_valid, :]

In [31]:
params = {'learning_rate':0.01,'max_depth':4,'num_leaves':20,
             'min_data_in_leaf':3, 'application':'binary', 'subsample':0.8, 'colsample_bytree': 0.8,
             'reg_alpha':0.01,'data_random_seed':42,'metric':'binary_logloss',
             'max_bin':416,'bagging_freq':3,'reg_lambda':0.01             
    }

In [None]:
matrix = lgbm.Dataset(X_learn, label=y_learn.target)
cv_result = lgbm.cv(params, matrix, num_boost_round=2000,nfold=5, stratified=True, 
                    shuffle=True, early_stopping_rounds=50, verbose_eval=50)

[50]	cv_agg's binary_logloss: 0.600488 + 0.000389785
[100]	cv_agg's binary_logloss: 0.573186 + 0.00066633
[150]	cv_agg's binary_logloss: 0.560759 + 0.000884604
[200]	cv_agg's binary_logloss: 0.554753 + 0.000996158
[250]	cv_agg's binary_logloss: 0.55151 + 0.00111232
[300]	cv_agg's binary_logloss: 0.549585 + 0.00108661
[350]	cv_agg's binary_logloss: 0.548443 + 0.00101545
[400]	cv_agg's binary_logloss: 0.547586 + 0.00103483
[450]	cv_agg's binary_logloss: 0.546978 + 0.00111082
[500]	cv_agg's binary_logloss: 0.546359 + 0.00111044
[550]	cv_agg's binary_logloss: 0.545984 + 0.00112738
[600]	cv_agg's binary_logloss: 0.545704 + 0.00111188
[650]	cv_agg's binary_logloss: 0.545479 + 0.00108747
[700]	cv_agg's binary_logloss: 0.54528 + 0.00107776
[750]	cv_agg's binary_logloss: 0.545159 + 0.00103656


In [None]:
len(cv_result['binary_logloss-mean'])

In [None]:
params['n_estimators'] = len(cv_result['binary_logloss-mean'])

In [None]:
from sklearn.metrics import make_scorer
parameters = {
    'classifier__estimator__max_depth': [3, 4, 5],
    'classifier__estimator__num_leaves': list(range(20, 100, 15)),
    'classifier__estimator__min_child_samples': list(range(20, 100, 15)),
    'classifier__estimator__class_weight': ['balanced', None]

}
modelcv = GridSearchCV(
    Pipeline(steps=[
        ('classifier', MyClassTransformation(lgbm.LGBMClassifier(**params)))
    ]),
    parameters,
    scoring=make_scorer(uplift_score_func), 
    cv=ShuffleSplit(n_splits=4, test_size=0.3, random_state=12), 
    verbose=3, n_jobs=-1
)
modelcv.fit(X_learn, y_learn);

In [None]:
modelcv.best_estimator_

In [None]:
modelcv.best_score_

In [None]:
final_model = modelcv.best_estimator_

In [None]:
print('Validation score:', uplift_score(final_model.predict(X_val), treatment=y_val.treatment_flg, target=y_val.target))

In [None]:
scores = cross_val_score(
    final_model, x_train, df_train,
    cv=ShuffleSplit(n_splits=10, test_size=0.3), 
    scoring=make_scorer(uplift_score_func)
)
scores

### вычислим доверительный интервал оценки прогноза, чтобы по Public отсеживать overfit 

In [None]:
import scipy.stats as st 

In [None]:
np.mean(scores), st.sem(scores)

In [None]:
st.t.interval(0.95, len(scores)-1, loc=np.mean(scores), scale=st.sem(scores)) 

# Подготовка предсказаний для тестовых клиентов

In [None]:
x_train, df_train = balance_learn(x_train, df_train)

In [None]:
final_model.fit(x_train, df_train)

In [None]:
upl_sc = final_model.predict(x_test)
pd.DataFrame({'client_id':x_test.index.values,'uplift': upl_sc}).to_csv('final_score.csv')

In [None]:
fi = pd.DataFrame({
    'feature_score': final_model.steps[0][1].estimator.feature_importances_
}, index=x_train.columns).sort_values('feature_score')

In [None]:
%matplotlib inline

In [None]:
ax = fi.tail(15).plot.barh()