In [1]:
import pandas as pd
import numpy as np
from catboost import CatBoostClassifier, Pool,CatBoost, CatBoostRegressor
from collections import Counter

In [2]:
# Directory holding the CSV tables, relative to the notebook.
data_path = 'dataset/'

# Read every table up front; timestamp parsing happens in one place below.
customer_train = pd.read_csv(f'{data_path}customer_train.csv')
customer_test = pd.read_csv(f'{data_path}customer_test.csv')
stories_description = pd.read_csv(f'{data_path}stories_description.csv')
stories_reaction_train = pd.read_csv(f'{data_path}stories_reaction_train.csv')
stories_reaction_test = pd.read_csv(f'{data_path}stories_reaction_test.csv')
transactions = pd.read_csv(f'{data_path}transactions.csv')
sample_submit = pd.read_csv(f'{data_path}sample_submit.csv')

# Convert the event timestamp column of both reaction logs to datetimes.
for reaction_log in (stories_reaction_train, stories_reaction_test):
    reaction_log['event_dttm'] = pd.to_datetime(reaction_log['event_dttm'],
                                                infer_datetime_format=True)

In [87]:
# Attach the customer profile columns to every reaction row. A left join keeps
# each reaction even when its customer_id has no matching profile.
train_and_val = pd.merge(stories_reaction_train, customer_train, how='left', on='customer_id')
test = pd.merge(stories_reaction_test, customer_test, how='left', on='customer_id')


In [4]:
# How many transaction rows each customer has.
transactions['customer_id'].value_counts()

514522     804
588639     788
333041     768
588106     700
673392     690
          ... 
542958       1
462666       1
743422       1
1010076      1
195566       1
Name: customer_id, Length: 46948, dtype: int64

In [88]:
# Use the int64 representation of the timestamps so the cutoff is plain integer arithmetic.
train_dates = stories_reaction_train['event_dttm'].astype(np.int64)
test_dates = stories_reaction_test['event_dttm'].astype(np.int64)

# The validation window ends at the last train event and spans as long as the test log does.
test_span = test_dates.max() - test_dates.min()
period = train_dates.max() - test_span

In [89]:
# Time-based split: rows strictly after the cutoff are validation, the rest is train.
event_ints = train_and_val['event_dttm'].astype(np.int64)
val = train_and_val[event_ints > period]
train = train_and_val[event_ints <= period]

In [90]:
# Row counts of the two splits.
train.shape[0], val.shape[0]

(320248, 152893)

In [91]:
# -999 serves as the missing-value sentinel in all three splits.
train, val, test = (frame.fillna(-999) for frame in (train, val, test))

In [92]:
# Timestamp columns are not used as model features.
drop_columns = ['event_dttm', 'first_session_dttm']
train = train.drop(columns=drop_columns)
val = val.drop(columns=drop_columns)
test = test.drop(columns=drop_columns)

In [93]:
# Sample weight per reaction, written as <metric coefficient> * <re-balancing factor>.
# The first factors (10, 0.5, 0.1, 0.1) are the coefficients used by metric() in this
# notebook. NOTE(review): 34.5 looks like the view/dislike count ratio (160027/4628);
# 3.8 and 1.4 are presumably the analogous ratios for 'like' and 'skip' - confirm.
_REACTION_WEIGHTS = {
    'dislike': 10.0*34.5,
    'like': 0.5*3.8,
    'view': 0.1,
    'skip': 0.1*1.4,
}


def reaction_to_weight(reaction):
    """Return the training sample weight for a reaction name.

    Parameters
    ----------
    reaction : str
        One of 'dislike', 'like', 'view', 'skip'.

    Returns
    -------
    float
        The weight assigned to samples with this reaction.

    Raises
    ------
    ValueError
        If ``reaction`` is not one of the four known names. Previously such a
        value silently produced ``None``, which then ended up in the weight list.
    """
    try:
        return _REACTION_WEIGHTS[reaction]
    except KeyError:
        raise ValueError('unknown reaction: %r' % (reaction,)) from None
    
# Binary target per reaction: positive reactions map to 1, negative ones to -1.
_REACTION_CLASSES = {
    'dislike': -1,
    'like': 1,
    'view': 1,
    'skip': -1,
}


def reaction_to_class(reaction):
    """Return the binary class label (1 or -1) for a reaction name.

    Parameters
    ----------
    reaction : str
        One of 'dislike', 'like', 'view', 'skip'.

    Returns
    -------
    int
        1 for 'like' and 'view', -1 for 'dislike' and 'skip'.

    Raises
    ------
    ValueError
        If ``reaction`` is not one of the four known names. Previously such a
        value silently produced ``None``, which then ended up in the label list.
    """
    try:
        return _REACTION_CLASSES[reaction]
    except KeyError:
        raise ValueError('unknown reaction: %r' % (reaction,)) from None

In [94]:
# Pull the raw reaction names out of the feature frames (they are kept for the
# metric), then derive the binary targets and sample weights from them.
train_class = train.pop('event').values
trainY = list(map(reaction_to_class, train_class))
train_weights = list(map(reaction_to_weight, train_class))

val_class = val.pop('event').values
valY = list(map(reaction_to_class, val_class))
val_weights = list(map(reaction_to_weight, val_class))



In [95]:
# Columns CatBoost should treat as categorical rather than numeric.
cat_columns = [
    'customer_id',
    'product_0', 'product_1', 'product_2', 'product_3', 'product_4', 'product_5', 'product_6',
    'gender_cd', 'marital_status_cd', 'job_position_cd', 'job_title',
]

# Weighted pools for validation and training; labels and weights come from the reaction names.
val_data = Pool(data=val, label=valY, cat_features=cat_columns, weight=val_weights)
train_data = Pool(data=train, label=trainY, cat_features=cat_columns, weight=train_weights)

In [105]:
# Hyper-parameters in use; every other setting is left at the CatBoost default.
model_params = dict(
    iterations=300,
    depth=4,
    learning_rate=0.05,
    task_type='GPU',
)
model = CatBoostClassifier(**model_params)

print('model')
# Fit on the training pool and evaluate each iteration on the validation pool.
model.fit(train_data, eval_set=val_data)

model
0:	learn: 0.6309962	test: 0.6300043	best: 0.6300043 (0)	total: 19.2ms	remaining: 5.73s
1:	learn: 0.5758782	test: 0.5753344	best: 0.5753344 (1)	total: 42.6ms	remaining: 6.35s
2:	learn: 0.5280578	test: 0.5274836	best: 0.5274836 (2)	total: 64.1ms	remaining: 6.35s
3:	learn: 0.4863352	test: 0.4855487	best: 0.4855487 (3)	total: 89.5ms	remaining: 6.63s
4:	learn: 0.4518085	test: 0.4502437	best: 0.4502437 (4)	total: 110ms	remaining: 6.47s
5:	learn: 0.4221332	test: 0.4196574	best: 0.4196574 (5)	total: 131ms	remaining: 6.43s
6:	learn: 0.3950892	test: 0.3918979	best: 0.3918979 (6)	total: 149ms	remaining: 6.25s
7:	learn: 0.3717403	test: 0.3677514	best: 0.3677514 (7)	total: 163ms	remaining: 5.94s
8:	learn: 0.3512027	test: 0.3471489	best: 0.3471489 (8)	total: 182ms	remaining: 5.9s
9:	learn: 0.3343085	test: 0.3298725	best: 0.3298725 (9)	total: 198ms	remaining: 5.75s
10:	learn: 0.3195113	test: 0.3146598	best: 0.3146598 (10)	total: 215ms	remaining: 5.66s
11:	learn: 0.3065734	test: 0.3014351	best: 

103:	learn: 0.2017894	test: 0.1943922	best: 0.1943922 (103)	total: 1.89s	remaining: 3.57s
104:	learn: 0.2017752	test: 0.1943771	best: 0.1943771 (104)	total: 1.91s	remaining: 3.55s
105:	learn: 0.2016872	test: 0.1944167	best: 0.1943771 (104)	total: 1.93s	remaining: 3.53s
106:	learn: 0.2016510	test: 0.1944325	best: 0.1943771 (104)	total: 1.95s	remaining: 3.51s
107:	learn: 0.2015826	test: 0.1945446	best: 0.1943771 (104)	total: 1.96s	remaining: 3.49s
108:	learn: 0.2015028	test: 0.1945388	best: 0.1943771 (104)	total: 1.98s	remaining: 3.47s
109:	learn: 0.2014683	test: 0.1946144	best: 0.1943771 (104)	total: 2s	remaining: 3.45s
110:	learn: 0.2014565	test: 0.1946141	best: 0.1943771 (104)	total: 2.02s	remaining: 3.43s
111:	learn: 0.2014123	test: 0.1946394	best: 0.1943771 (104)	total: 2.03s	remaining: 3.42s
112:	learn: 0.2014089	test: 0.1946373	best: 0.1943771 (104)	total: 2.05s	remaining: 3.4s
113:	learn: 0.2013916	test: 0.1946372	best: 0.1943771 (104)	total: 2.07s	remaining: 3.38s
114:	learn: 0.

204:	learn: 0.1985559	test: 0.1966143	best: 0.1943771 (104)	total: 3.8s	remaining: 1.76s
205:	learn: 0.1985546	test: 0.1966158	best: 0.1943771 (104)	total: 3.82s	remaining: 1.74s
206:	learn: 0.1985397	test: 0.1965516	best: 0.1943771 (104)	total: 3.84s	remaining: 1.73s
207:	learn: 0.1985166	test: 0.1966189	best: 0.1943771 (104)	total: 3.86s	remaining: 1.71s
208:	learn: 0.1985018	test: 0.1966900	best: 0.1943771 (104)	total: 3.87s	remaining: 1.69s
209:	learn: 0.1984354	test: 0.1966881	best: 0.1943771 (104)	total: 3.89s	remaining: 1.67s
210:	learn: 0.1984095	test: 0.1966899	best: 0.1943771 (104)	total: 3.91s	remaining: 1.65s
211:	learn: 0.1983786	test: 0.1967344	best: 0.1943771 (104)	total: 3.92s	remaining: 1.63s
212:	learn: 0.1983624	test: 0.1967283	best: 0.1943771 (104)	total: 3.94s	remaining: 1.61s
213:	learn: 0.1983534	test: 0.1967455	best: 0.1943771 (104)	total: 3.96s	remaining: 1.59s
214:	learn: 0.1983435	test: 0.1967508	best: 0.1943771 (104)	total: 3.97s	remaining: 1.57s
215:	learn:

297:	learn: 0.1964886	test: 0.1996413	best: 0.1943771 (104)	total: 5.5s	remaining: 36.9ms
298:	learn: 0.1964565	test: 0.1996256	best: 0.1943771 (104)	total: 5.52s	remaining: 18.5ms
299:	learn: 0.1964467	test: 0.1996237	best: 0.1943771 (104)	total: 5.54s	remaining: 0us
bestTest = 0.1943771369
bestIteration = 104
Shrink model to first 105 iterations.


<catboost.core.CatBoostClassifier at 0x7f9e7051bd68>

In [97]:
# Print each feature name next to its importance, in column order.
columns = train.columns
x = model.get_feature_importance(train_data)
for position, importance in enumerate(x):
    print(columns[position], importance)

customer_id 60.19305629910757
story_id 10.534362647404947
product_0 0.9506436074873177
product_1 0.1746532916654849
product_2 0.6569158776234072
product_3 0.0
product_4 0.5636636615403703
product_5 3.1604962712154423
product_6 1.1419763628845778
gender_cd 7.003588502157351
age 8.641628096987858
marital_status_cd 0.44631169603355364
children_cnt 2.199113115122494
job_position_cd 0.25559610377213704
job_title 4.077994466997489


In [82]:
# Ratio of 'view' to 'dislike' counts; the two numbers match the training-split
# counts shown by Counter(train_class).
160027/4628

34.57800345721694

In [74]:
# Inspect the predicted class probabilities for the training pool
# (one row per sample, one column per class).
model.predict_proba(train_data)

array([[0.51479455, 0.48520545],
       [0.51479455, 0.48520545],
       [0.69756948, 0.30243052],
       ...,
       [0.31696796, 0.68303204],
       [0.39236524, 0.60763476],
       [0.41481388, 0.58518612]])

In [75]:
# Count how often each raw reaction name occurs in the training split.
Counter(train_class)


Counter({'skip': 113598, 'view': 160027, 'like': 41995, 'dislike': 4628})

In [106]:
# Signed contribution of one unit of predicted score, per true reaction:
# positive reactions reward a positive score, negative reactions penalise it.
_METRIC_TERMS = {
    'dislike': -10,
    'like': 0.5,
    'view': 0.1,
    'skip': -0.1,
}


def metric(predicts, Ys):
    """Return the normalised reaction-weighted score of the predictions.

    Each prediction ``p`` contributes ``+0.5*p`` for a true 'like', ``+0.1*p``
    for 'view', ``-0.1*p`` for 'skip' and ``-10*p`` for 'dislike'. The sum is
    divided by the sum of the absolute coefficients over all of ``Ys``.
    Reactions with any other name are ignored.

    Parameters
    ----------
    predicts : iterable of numbers
        Predicted scores, paired positionally with ``Ys``. The notebook passes
        values scaled to [-1, 1].
    Ys : iterable of str
        True reaction names.

    Returns
    -------
    float
        The normalised score, or 0.0 when ``Ys`` contains no known reaction
        (including empty input), where the original code divided by zero.
    """
    counter = Counter(Ys)
    max_value = counter['view']*0.1+counter['skip']*0.1+counter['like']*0.5+counter['dislike']*10
    if max_value == 0:
        # Nothing to score against; avoid ZeroDivisionError.
        return 0.0

    value = 0
    for y, p in zip(Ys, predicts):
        signed_coefficient = _METRIC_TERMS.get(y)
        if signed_coefficient is not None:
            value += signed_coefficient*p
    return value/max_value
            
            

In [116]:
# Probability of class 1 for each split.
train_proba = model.predict_proba(train_data)[:, 1]
val_proba = model.predict_proba(val_data)[:, 1]

# Rescale the probabilities from [0, 1] to [-1, 1] before scoring.
train_predict = (train_proba - 0.5) * 2
val_predict = (val_proba - 0.5) * 2

train_score = metric(train_predict, train_class)
val_score = metric(val_predict, val_class)
print('train metric ' + str(train_score))
print('val metric ' + str(val_score))

train metric 0.22542627766837098
val metric 0.21883847275582882


In [108]:
# Set the answer ids aside and remove them from the test feature frame in one step.
test_id = test.pop('answer_id')

In [109]:
# Inspect the answer ids that were set aside for the submission file.
test_id

array([     0,      1,      2, ..., 172046, 172047, 172048])

In [113]:
# Predict on the feature columns only. Once this cell has run, `test` also holds
# 'score' and 'answer_id'; dropping them here (ignored when absent) keeps the
# cell re-runnable instead of feeding those columns to the model as features.
test_features = test.drop(columns=['score', 'answer_id'], errors='ignore')

# Rescale the class-1 probability from [0, 1] to [-1, 1], as done for train and validation.
test['score'] = (model.predict_proba(test_features)[:,1]-0.5)*2
test['answer_id'] = test_id

In [119]:
# Write the submission file: answer id plus score, with scores formatted to three decimals.
submission = test[['answer_id', 'score']]
submission.to_csv("predict.csv", index=False, float_format='%.3f')