In [117]:
import pandas as pd
import numpy as np
from catboost import CatBoostClassifier, Pool,CatBoost, CatBoostRegressor
from collections import Counter

In [118]:
data_path = 'dataset/'
customer_train = pd.read_csv(f'{data_path}customer_train.csv')
customer_test = pd.read_csv(f'{data_path}customer_test.csv')
stories_description = pd.read_csv(f'{data_path}stories_description.csv')
stories_reaction_test = pd.read_csv(f'{data_path}stories_reaction_test.csv')
stories_reaction_test['event_dttm']=pd.to_datetime(stories_reaction_test['event_dttm'],infer_datetime_format=True)
stories_reaction_train = pd.read_csv(f'{data_path}stories_reaction_train.csv')
stories_reaction_train['event_dttm']=pd.to_datetime(stories_reaction_train['event_dttm'],infer_datetime_format=True)
transactions = pd.read_csv(f'{data_path}transactions.csv')


sample_submit = pd.read_csv(f'{data_path}sample_submit.csv')

In [119]:
train_and_val = stories_reaction_train.merge(customer_train, on='customer_id', how='left')
test = stories_reaction_test.merge(customer_test, on='customer_id', how='left')


In [120]:
train_dates = stories_reaction_train['event_dttm'].astype(np.int64)
test_dates = stories_reaction_test['event_dttm'].astype(np.int64)
period = train_dates.max() - (test_dates.max() - test_dates.min())

In [121]:
val = train_and_val[train_and_val.event_dttm.astype(np.int64) > period]
train = train_and_val[train_and_val.event_dttm.astype(np.int64) <= period]

In [122]:
len(train), len(val) 

(320248, 152893)

In [123]:
train=train.fillna(-999)
val=val.fillna(-999)
test=test.fillna(-999)

In [124]:
drop_columns = ['event_dttm', 'first_session_dttm']
for c in drop_columns:
    del train[c]
    del val[c]
    del test[c]

In [125]:
def reaction_to_weight(reaction):
    if reaction =='dislike':
        return 10.0
    if reaction =='like':
        return 0.5
    if reaction =='view':
        return 0.1
    if reaction =='skip':
        return 0.1
    
def reaction_to_class(reaction):
    if reaction =='dislike':
        return -1
    if reaction =='like':
        return 1
    if reaction =='view':
        return 1
    if reaction =='skip':
        return -1

In [126]:
train_class = train.event.values
trainY = [reaction_to_class(reaction)  for reaction in train.event.values]
train_weights = [reaction_to_weight(reaction) for reaction in train.event.values]
del train['event']
val_class = val.event.values
valY = [reaction_to_class(reaction) for reaction in val.event.values]
val_weights = [reaction_to_weight(reaction) for reaction in val.event.values]
del val['event']

In [143]:
cat_columns = ['customer_id',
               'product_0', 'product_1', 'product_2', 'product_3', 'product_4', 'product_5', 'product_6',
               'gender_cd', 'marital_status_cd', 'job_position_cd', 'job_title']

val_data = Pool(val,
                valY,
                 cat_features=cat_columns,
               weight=val_weights
               )
train_data = Pool(train,
                  trainY,
                  cat_features=cat_columns,
                 weight=train_weights
                 )

In [144]:
model = CatBoostRegressor(iterations=1000,
#                            depth=6, 
#                            scale_pos_weight=1/pds[0].isLiked.mean(),
#                            learning_rate=0.07,
#                            loss_function='Logloss', 
#                            logging_level='Verbose',
#                            eval_metric='AUC',
# #                            has_time=True,
#                            bagging_temperature=0.8,
#                            max_bin=128,
#                            random_strength=0.8,
#                            max_ctr_complexity=2,
#                            l2_leaf_reg=3,
# #                            rsm=0.2
                           task_type='GPU',
# #                            used_ram_limit='11GB'
                 
                          )

print('model')
#train the model
model.fit(train_data,
          eval_set=val_data
         )

model
0:	learn: 0.9738722	test: 0.9716430	best: 0.9716430 (0)	total: 28.6ms	remaining: 28.6s
1:	learn: 0.9719889	test: 0.9701089	best: 0.9701089 (1)	total: 62ms	remaining: 30.9s
2:	learn: 0.9700839	test: 0.9686451	best: 0.9686451 (2)	total: 90.7ms	remaining: 30.1s
3:	learn: 0.9683364	test: 0.9671643	best: 0.9671643 (3)	total: 114ms	remaining: 28.5s
4:	learn: 0.9666641	test: 0.9657715	best: 0.9657715 (4)	total: 133ms	remaining: 26.5s
5:	learn: 0.9650115	test: 0.9645381	best: 0.9645381 (5)	total: 149ms	remaining: 24.6s
6:	learn: 0.9634973	test: 0.9634395	best: 0.9634395 (6)	total: 171ms	remaining: 24.3s
7:	learn: 0.9620950	test: 0.9623569	best: 0.9623569 (7)	total: 193ms	remaining: 23.9s
8:	learn: 0.9606349	test: 0.9612774	best: 0.9612774 (8)	total: 214ms	remaining: 23.6s
9:	learn: 0.9593325	test: 0.9603167	best: 0.9603167 (9)	total: 236ms	remaining: 23.4s
10:	learn: 0.9581342	test: 0.9594145	best: 0.9594145 (10)	total: 257ms	remaining: 23.1s
11:	learn: 0.9569097	test: 0.9585469	best: 0.

99:	learn: 0.9247302	test: 0.9443396	best: 0.9443228 (98)	total: 2.09s	remaining: 18.8s
100:	learn: 0.9246168	test: 0.9443432	best: 0.9443228 (98)	total: 2.11s	remaining: 18.8s
101:	learn: 0.9244843	test: 0.9443240	best: 0.9443228 (98)	total: 2.13s	remaining: 18.8s
102:	learn: 0.9243172	test: 0.9443444	best: 0.9443228 (98)	total: 2.16s	remaining: 18.8s
103:	learn: 0.9241709	test: 0.9443837	best: 0.9443228 (98)	total: 2.19s	remaining: 18.8s
104:	learn: 0.9239669	test: 0.9443758	best: 0.9443228 (98)	total: 2.21s	remaining: 18.9s
105:	learn: 0.9237738	test: 0.9444040	best: 0.9443228 (98)	total: 2.24s	remaining: 18.9s
106:	learn: 0.9236093	test: 0.9443705	best: 0.9443228 (98)	total: 2.26s	remaining: 18.9s
107:	learn: 0.9234622	test: 0.9443372	best: 0.9443228 (98)	total: 2.28s	remaining: 18.9s
108:	learn: 0.9233457	test: 0.9443012	best: 0.9443012 (108)	total: 2.3s	remaining: 18.8s
109:	learn: 0.9231580	test: 0.9443170	best: 0.9443012 (108)	total: 2.33s	remaining: 18.8s
110:	learn: 0.9230387

196:	learn: 0.9142008	test: 0.9434191	best: 0.9434191 (196)	total: 4.22s	remaining: 17.2s
197:	learn: 0.9141481	test: 0.9434052	best: 0.9434052 (197)	total: 4.24s	remaining: 17.2s
198:	learn: 0.9140970	test: 0.9435613	best: 0.9434052 (197)	total: 4.27s	remaining: 17.2s
199:	learn: 0.9140026	test: 0.9436744	best: 0.9434052 (197)	total: 4.29s	remaining: 17.1s
200:	learn: 0.9139516	test: 0.9436609	best: 0.9434052 (197)	total: 4.31s	remaining: 17.1s
201:	learn: 0.9138637	test: 0.9436405	best: 0.9434052 (197)	total: 4.33s	remaining: 17.1s
202:	learn: 0.9137621	test: 0.9437634	best: 0.9434052 (197)	total: 4.35s	remaining: 17.1s
203:	learn: 0.9136601	test: 0.9437804	best: 0.9434052 (197)	total: 4.37s	remaining: 17.1s
204:	learn: 0.9135402	test: 0.9438319	best: 0.9434052 (197)	total: 4.39s	remaining: 17s
205:	learn: 0.9134173	test: 0.9438221	best: 0.9434052 (197)	total: 4.41s	remaining: 17s
206:	learn: 0.9132779	test: 0.9437929	best: 0.9434052 (197)	total: 4.43s	remaining: 17s
207:	learn: 0.91

294:	learn: 0.9065562	test: 0.9453637	best: 0.9434052 (197)	total: 6.3s	remaining: 15.1s
295:	learn: 0.9065039	test: 0.9453189	best: 0.9434052 (197)	total: 6.32s	remaining: 15s
296:	learn: 0.9064323	test: 0.9453459	best: 0.9434052 (197)	total: 6.34s	remaining: 15s
297:	learn: 0.9064061	test: 0.9453574	best: 0.9434052 (197)	total: 6.36s	remaining: 15s
298:	learn: 0.9063572	test: 0.9453482	best: 0.9434052 (197)	total: 6.38s	remaining: 14.9s
299:	learn: 0.9063270	test: 0.9453389	best: 0.9434052 (197)	total: 6.39s	remaining: 14.9s
300:	learn: 0.9062779	test: 0.9453628	best: 0.9434052 (197)	total: 6.41s	remaining: 14.9s
301:	learn: 0.9061860	test: 0.9454722	best: 0.9434052 (197)	total: 6.44s	remaining: 14.9s
302:	learn: 0.9060407	test: 0.9454635	best: 0.9434052 (197)	total: 6.46s	remaining: 14.8s
303:	learn: 0.9059472	test: 0.9454088	best: 0.9434052 (197)	total: 6.48s	remaining: 14.8s
304:	learn: 0.9058175	test: 0.9453904	best: 0.9434052 (197)	total: 6.49s	remaining: 14.8s
305:	learn: 0.905

387:	learn: 0.9009783	test: 0.9470589	best: 0.9434052 (197)	total: 8.18s	remaining: 12.9s
388:	learn: 0.9009206	test: 0.9470247	best: 0.9434052 (197)	total: 8.2s	remaining: 12.9s
389:	learn: 0.9008341	test: 0.9469946	best: 0.9434052 (197)	total: 8.22s	remaining: 12.9s
390:	learn: 0.9007527	test: 0.9469670	best: 0.9434052 (197)	total: 8.25s	remaining: 12.8s
391:	learn: 0.9006989	test: 0.9469713	best: 0.9434052 (197)	total: 8.27s	remaining: 12.8s
392:	learn: 0.9006267	test: 0.9470197	best: 0.9434052 (197)	total: 8.29s	remaining: 12.8s
393:	learn: 0.9005672	test: 0.9470820	best: 0.9434052 (197)	total: 8.31s	remaining: 12.8s
394:	learn: 0.9005086	test: 0.9470462	best: 0.9434052 (197)	total: 8.32s	remaining: 12.8s
395:	learn: 0.9004579	test: 0.9471596	best: 0.9434052 (197)	total: 8.34s	remaining: 12.7s
396:	learn: 0.9003970	test: 0.9471549	best: 0.9434052 (197)	total: 8.36s	remaining: 12.7s
397:	learn: 0.9002978	test: 0.9470841	best: 0.9434052 (197)	total: 8.38s	remaining: 12.7s
398:	learn:

479:	learn: 0.8958203	test: 0.9483401	best: 0.9434052 (197)	total: 10.1s	remaining: 10.9s
480:	learn: 0.8957573	test: 0.9483619	best: 0.9434052 (197)	total: 10.1s	remaining: 10.9s
481:	learn: 0.8956985	test: 0.9483305	best: 0.9434052 (197)	total: 10.1s	remaining: 10.9s
482:	learn: 0.8956648	test: 0.9483202	best: 0.9434052 (197)	total: 10.1s	remaining: 10.8s
483:	learn: 0.8956131	test: 0.9483730	best: 0.9434052 (197)	total: 10.1s	remaining: 10.8s
484:	learn: 0.8955372	test: 0.9483285	best: 0.9434052 (197)	total: 10.2s	remaining: 10.8s
485:	learn: 0.8954802	test: 0.9482732	best: 0.9434052 (197)	total: 10.2s	remaining: 10.8s
486:	learn: 0.8954464	test: 0.9485416	best: 0.9434052 (197)	total: 10.2s	remaining: 10.8s
487:	learn: 0.8953650	test: 0.9485130	best: 0.9434052 (197)	total: 10.2s	remaining: 10.7s
488:	learn: 0.8952951	test: 0.9487547	best: 0.9434052 (197)	total: 10.2s	remaining: 10.7s
489:	learn: 0.8952682	test: 0.9491034	best: 0.9434052 (197)	total: 10.3s	remaining: 10.7s
490:	learn

580:	learn: 0.8904200	test: 0.9507996	best: 0.9434052 (197)	total: 12.2s	remaining: 8.77s
581:	learn: 0.8903861	test: 0.9507756	best: 0.9434052 (197)	total: 12.2s	remaining: 8.75s
582:	learn: 0.8903304	test: 0.9507606	best: 0.9434052 (197)	total: 12.2s	remaining: 8.73s
583:	learn: 0.8902680	test: 0.9507516	best: 0.9434052 (197)	total: 12.2s	remaining: 8.71s
584:	learn: 0.8902022	test: 0.9510987	best: 0.9434052 (197)	total: 12.3s	remaining: 8.69s
585:	learn: 0.8901672	test: 0.9510694	best: 0.9434052 (197)	total: 12.3s	remaining: 8.67s
586:	learn: 0.8900717	test: 0.9510413	best: 0.9434052 (197)	total: 12.3s	remaining: 8.65s
587:	learn: 0.8899832	test: 0.9510092	best: 0.9434052 (197)	total: 12.3s	remaining: 8.63s
588:	learn: 0.8899297	test: 0.9511223	best: 0.9434052 (197)	total: 12.3s	remaining: 8.6s
589:	learn: 0.8898566	test: 0.9512128	best: 0.9434052 (197)	total: 12.3s	remaining: 8.58s
590:	learn: 0.8898164	test: 0.9513940	best: 0.9434052 (197)	total: 12.4s	remaining: 8.56s
591:	learn:

678:	learn: 0.8857895	test: 0.9550155	best: 0.9434052 (197)	total: 14.2s	remaining: 6.73s
679:	learn: 0.8857077	test: 0.9550032	best: 0.9434052 (197)	total: 14.3s	remaining: 6.71s
680:	learn: 0.8856515	test: 0.9550354	best: 0.9434052 (197)	total: 14.3s	remaining: 6.69s
681:	learn: 0.8856089	test: 0.9550058	best: 0.9434052 (197)	total: 14.3s	remaining: 6.67s
682:	learn: 0.8855209	test: 0.9550028	best: 0.9434052 (197)	total: 14.3s	remaining: 6.65s
683:	learn: 0.8854580	test: 0.9550018	best: 0.9434052 (197)	total: 14.3s	remaining: 6.63s
684:	learn: 0.8854362	test: 0.9550144	best: 0.9434052 (197)	total: 14.4s	remaining: 6.61s
685:	learn: 0.8854029	test: 0.9550072	best: 0.9434052 (197)	total: 14.4s	remaining: 6.58s
686:	learn: 0.8853403	test: 0.9549325	best: 0.9434052 (197)	total: 14.4s	remaining: 6.56s
687:	learn: 0.8853000	test: 0.9548919	best: 0.9434052 (197)	total: 14.4s	remaining: 6.54s
688:	learn: 0.8852534	test: 0.9548953	best: 0.9434052 (197)	total: 14.5s	remaining: 6.52s
689:	learn

780:	learn: 0.8811678	test: 0.9579937	best: 0.9434052 (197)	total: 16.4s	remaining: 4.58s
781:	learn: 0.8811412	test: 0.9579532	best: 0.9434052 (197)	total: 16.4s	remaining: 4.56s
782:	learn: 0.8811070	test: 0.9579265	best: 0.9434052 (197)	total: 16.4s	remaining: 4.54s
783:	learn: 0.8810720	test: 0.9579468	best: 0.9434052 (197)	total: 16.4s	remaining: 4.52s
784:	learn: 0.8810250	test: 0.9579577	best: 0.9434052 (197)	total: 16.4s	remaining: 4.5s
785:	learn: 0.8809946	test: 0.9579062	best: 0.9434052 (197)	total: 16.5s	remaining: 4.48s
786:	learn: 0.8809619	test: 0.9579704	best: 0.9434052 (197)	total: 16.5s	remaining: 4.46s
787:	learn: 0.8809233	test: 0.9579464	best: 0.9434052 (197)	total: 16.5s	remaining: 4.44s
788:	learn: 0.8809005	test: 0.9579432	best: 0.9434052 (197)	total: 16.5s	remaining: 4.42s
789:	learn: 0.8808697	test: 0.9579265	best: 0.9434052 (197)	total: 16.5s	remaining: 4.39s
790:	learn: 0.8808467	test: 0.9580363	best: 0.9434052 (197)	total: 16.5s	remaining: 4.37s
791:	learn:

872:	learn: 0.8777623	test: 0.9616751	best: 0.9434052 (197)	total: 18.2s	remaining: 2.65s
873:	learn: 0.8777163	test: 0.9617355	best: 0.9434052 (197)	total: 18.2s	remaining: 2.63s
874:	learn: 0.8776693	test: 0.9617336	best: 0.9434052 (197)	total: 18.3s	remaining: 2.61s
875:	learn: 0.8776567	test: 0.9617459	best: 0.9434052 (197)	total: 18.3s	remaining: 2.59s
876:	learn: 0.8776285	test: 0.9622947	best: 0.9434052 (197)	total: 18.3s	remaining: 2.57s
877:	learn: 0.8775876	test: 0.9622892	best: 0.9434052 (197)	total: 18.3s	remaining: 2.55s
878:	learn: 0.8775648	test: 0.9623045	best: 0.9434052 (197)	total: 18.3s	remaining: 2.52s
879:	learn: 0.8775325	test: 0.9621705	best: 0.9434052 (197)	total: 18.4s	remaining: 2.5s
880:	learn: 0.8774626	test: 0.9621298	best: 0.9434052 (197)	total: 18.4s	remaining: 2.48s
881:	learn: 0.8774391	test: 0.9621365	best: 0.9434052 (197)	total: 18.4s	remaining: 2.46s
882:	learn: 0.8773726	test: 0.9621420	best: 0.9434052 (197)	total: 18.4s	remaining: 2.44s
883:	learn:

973:	learn: 0.8739941	test: 0.9659821	best: 0.9434052 (197)	total: 20.3s	remaining: 542ms
974:	learn: 0.8739683	test: 0.9659788	best: 0.9434052 (197)	total: 20.3s	remaining: 521ms
975:	learn: 0.8738971	test: 0.9659693	best: 0.9434052 (197)	total: 20.3s	remaining: 500ms
976:	learn: 0.8738738	test: 0.9659520	best: 0.9434052 (197)	total: 20.4s	remaining: 479ms
977:	learn: 0.8738432	test: 0.9659541	best: 0.9434052 (197)	total: 20.4s	remaining: 458ms
978:	learn: 0.8737987	test: 0.9660350	best: 0.9434052 (197)	total: 20.4s	remaining: 438ms
979:	learn: 0.8737557	test: 0.9660308	best: 0.9434052 (197)	total: 20.4s	remaining: 417ms
980:	learn: 0.8737278	test: 0.9660268	best: 0.9434052 (197)	total: 20.4s	remaining: 396ms
981:	learn: 0.8736974	test: 0.9660636	best: 0.9434052 (197)	total: 20.5s	remaining: 375ms
982:	learn: 0.8736615	test: 0.9660533	best: 0.9434052 (197)	total: 20.5s	remaining: 354ms
983:	learn: 0.8735726	test: 0.9660550	best: 0.9434052 (197)	total: 20.5s	remaining: 333ms
984:	learn

<catboost.core.CatBoostRegressor at 0x7fc685767630>

In [145]:
columns=train.columns
x = model.get_feature_importance(train_data)
for i in range(len(x)):
    print(columns[i], x[i])

customer_id 34.96149121911679
story_id 16.34472019963895
product_0 1.5922536223909105
product_1 2.604247177144562
product_2 3.8938304375572437
product_3 4.171991664207507
product_4 1.511522753195229
product_5 2.3420099706811697
product_6 2.4357608056281443
gender_cd 7.240652514373329
age 8.943400288748121
marital_status_cd 2.4110080270492893
children_cnt 1.7527085207114186
job_position_cd 4.169808770582543
job_title 5.624594028974637


In [130]:
p = model.predict(train_data)

In [140]:
Counter(train_class)


Counter({'skip': 113598, 'view': 160027, 'like': 41995, 'dislike': 4628})

In [146]:
def metric(predicts, Ys):
    counter = Counter(Ys)
    max_value = counter['view']*0.1+counter['skip']*0.1+counter['like']*0.5+counter['dislike']*10
    value = 0
    for y,p in zip(Ys,predicts):
        if y == "dislike":
            value-=10*p
        if y == "like":
            value+=0.5*p
        if y =='view':
            value+=0.1*p
        if y =='skip':
            value-=0.1*p
    return value/max_value
            
            

In [147]:
train_predict = model.predict(train_data)
val_predict = model.predict(val_data)

print('train metric ' + str(metric(train_predict, train_class)))
print('val metric ' + str(metric(val_predict, val_class)))

train metric 0.1827708255608241
val metric 0.10877442625817568
