## WorkFlow

for better reading please install nb.extension and view table of contents

### Load events and EDA

In [44]:
import pandas as pd
import numpy as np
import datetime
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import roc_auc_score

In [45]:
events = pd.read_csv('events.csv')

In [47]:
# encode user_id
uniq = events['user_id_hash'].unique()
name2idx = {o:i for i,o in enumerate(uniq)}
new_col = np.array([name2idx[x] for x in events['user_id_hash']])
events['user_id_hash'] = new_col

In [50]:
# important time window
oct_7 = datetime.datetime(2018, 10, 7, 23, 59, 59, 0)
oct_14 = datetime.datetime(2018, 10, 14, 23, 59, 59, 0)
oct_21 = datetime.datetime(2018, 10, 21, 23, 59, 59, 0)
oct_28 = datetime.datetime(2018, 10, 28, 23, 59, 59, 0)
nov_4 = datetime.datetime(2018, 11, 4, 23, 59, 59, 0)
nov_11 = datetime.datetime(2018, 11, 11, 23, 59, 59, 0)
nov_18 = datetime.datetime(2018, 11, 18, 23, 59, 59, 0)
nov_25 = datetime.datetime(2018, 11, 25, 23, 59, 59, 0)
dec_1 = datetime.datetime(2018, 12, 1, 23, 59, 59, 0)
dec_7 = datetime.datetime(2018, 12, 7, 23, 59, 59, 0)
dec_14 = datetime.datetime(2018, 12, 14, 23, 59, 59, 0)

In [51]:
# date time to milliseconds
def to_milli(t):
    return int(t.strftime("%s"))*1000

time point Dec 1st late night: 1543737599000
time point Dec 7th late night: 1544255999000
time point Dec 14th late night: 1544860799000


In [52]:
events = events.drop(['app_id'],axis=1)
events.head(3)

Unnamed: 0,session_id,event,event_timestamp,event_value,user_id_hash
0,5558845121177764917,45,1542215397132,0.0,0
1,5558845121177764917,45,1542215484895,0.0,0
2,7689508378645584666,.m5100869650219008,1541124410372,0.0,0


#### filter those who bought

In [55]:
bought = events[events['event']=='8'].reset_index()
bought = bought.drop(['index'],axis=1)

In [56]:
bought.head(3)

Unnamed: 0,session_id,event,event_timestamp,event_value,user_id_hash
0,5159335150551901701,8,1541912600211,3.493,0
1,845986603555615931,8,1543357433771,3.493,14
2,2846996524173831068,8,1543023539172,3.493,14


In [57]:
print(len(events['user_id_hash'].unique()))
print(len(bought['user_id_hash'].unique()))

621001
34200


### Feature engineeringg

####  time window for those who bought

In [58]:
# https://stackoverflow.com/questions/45752601/python-pandas-conditional-count-after-groupby

bought_1001_1007 = bought.groupby('user_id_hash')['event_timestamp']\
                        .apply(lambda x: (x < to_milli(oct_7)).sum()).reset_index(name='count')
print(sum(bought_1001_1007['count']))


bought_1007_1014 = bought.groupby('user_id_hash')['event_timestamp']\
                        .apply(lambda x: ((x > to_milli(oct_7))&(x < to_milli(oct_14))).sum()).reset_index(name='count')
print(sum(bought_1007_1014['count']))


bought_1014_1021 = bought.groupby('user_id_hash')['event_timestamp']\
                        .apply(lambda x: ((x > to_milli(oct_14))&(x < to_milli(oct_21))).sum()).reset_index(name='count')
print(sum(bought_1014_1021['count']))


bought_1021_1028 = bought.groupby('user_id_hash')['event_timestamp']\
                        .apply(lambda x: ((x > to_milli(oct_21))&(x < to_milli(oct_28))).sum()).reset_index(name='count')
print(sum(bought_1021_1028['count']))


bought_1028_1104 = bought.groupby('user_id_hash')['event_timestamp']\
                        .apply(lambda x: ((x > to_milli(oct_28))&(x < to_milli(nov_4))).sum()).reset_index(name='count')
print(sum(bought_1028_1104['count']))


bought_1104_1111 = bought.groupby('user_id_hash')['event_timestamp']\
                        .apply(lambda x: ((x > to_milli(nov_4))&(x < to_milli(nov_11))).sum()).reset_index(name='count')
print(sum(bought_1104_1111['count']))


bought_1111_1118 = bought.groupby('user_id_hash')['event_timestamp']\
                        .apply(lambda x: ((x > to_milli(nov_11))&(x < to_milli(nov_18))).sum()).reset_index(name='count')
print(sum(bought_1111_1118['count']))


bought_1118_1125 = bought.groupby('user_id_hash')['event_timestamp']\
                        .apply(lambda x: ((x > to_milli(nov_18))&(x < to_milli(nov_25))).sum()).reset_index(name='count')
print(sum(bought_1118_1125['count']))


bought_1125_1201 = bought.groupby('user_id_hash')['event_timestamp']\
                        .apply(lambda x: ((x > to_milli(nov_25))&(x < to_milli(dec_1))).sum()).reset_index(name='count')
print(sum(bought_1125_1201['count']))



12213
22946
27145
29510
31233
30290
30767
31141
23719


#### create target y

In [59]:
# bought after 12.1
# already_bought = []
bought_after_12_1 = bought[bought['event_timestamp']>to_milli(dec_1)]['user_id_hash'].unique()
print(bought_after_12_1[:3])
# bought after 12.1 before 12.7
bought_between_12_1_7 = bought[(bought['event_timestamp']>to_milli(dec_1)) & (bought['event_timestamp']<to_milli(dec_7))]['user_id_hash'].unique()
print(bought_between_12_1_7[:3])

[ 86 396 402]
[ 86 396 402]


In [60]:
print(len(bought_after_12_1))
print(len(bought_between_12_1_7))

5398
4001


#### create X and adding features

In [195]:
df = events['user_id_hash'].unique()
df = pd.DataFrame(data = df, columns=['user_id_hash'])

In [196]:
df['dec1_14'] = df['user_id_hash'].apply(lambda x: 1 if x in bought_after_12_1 else 0)
df['dec1_7'] = df['user_id_hash'].apply(lambda x: 1 if x in bought_between_12_1_7 else 0)

In [64]:
df.head(3)

Unnamed: 0,user_id_hash,dec1_14,dec1_7
0,0,0,0
1,1,0,0
2,2,0,0


In [65]:
print(sum(df['dec1_7']))
print(sum(df['dec1_14']))

4001
5398


In [197]:
len(df)

621001

In [194]:
# bought_1125_1201

In [198]:
df = df.merge(bought_1001_1007, left_on='user_id_hash', right_on='user_id_hash', how='left')
df = df.rename(index=str, columns={"count": "count_1001_1007"})
df = df.merge(bought_1007_1014, left_on='user_id_hash', right_on='user_id_hash', how='left')
df = df.rename(index=str, columns={"count": "count_1007_1014"})
df = df.merge(bought_1014_1021, left_on='user_id_hash', right_on='user_id_hash', how='left')
df = df.rename(index=str, columns={"count": "count_1014_1021"})
df = df.merge(bought_1021_1028, left_on='user_id_hash', right_on='user_id_hash', how='left')
df = df.rename(index=str, columns={"count": "count_1021_1028"})
df = df.merge(bought_1028_1104, left_on='user_id_hash', right_on='user_id_hash', how='left')
df = df.rename(index=str, columns={"count": "count_1028_1104"})
df = df.merge(bought_1104_1111, left_on='user_id_hash', right_on='user_id_hash', how='left')
df = df.rename(index=str, columns={"count": "count_1104_1111"})
df = df.merge(bought_1111_1118, left_on='user_id_hash', right_on='user_id_hash', how='left')
df = df.rename(index=str, columns={"count": "count_1111_1118"})
df = df.merge(bought_1118_1125, left_on='user_id_hash', right_on='user_id_hash', how='left')
df = df.rename(index=str, columns={"count": "count_1118_1125"})
df = df.merge(bought_1125_1201, left_on='user_id_hash', right_on='user_id_hash', how='left')
df = df.rename(index=str, columns={"count": "count_1125_1201"})

In [199]:
len(df)

621001

In [69]:
# df.head(50)
df.head(2)

Unnamed: 0,user_id_hash,dec1_14,dec1_7,count_1001_1007,count_1007_1014,count_1014_1021,count_1021_1028,count_1028_1104,count_1104_1111,count_1111_1118,count_1118_1125,count_1125_1201
0,0,0,0,0,0,0,0,0,1,0,0,0
1,14,0,0,0,0,0,0,0,0,0,1,2


#### check performance of first set of features

In [71]:
X = df.drop(['dec1_7','dec1_14'], axis=1)
y = df['dec1_7']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
clf_dec1_7 = GradientBoostingClassifier()
clf_dec1_7.fit(X_train, y_train)
y_scores = clf_dec1_7.predict_proba(X_test)
roc_auc_score(y_test, y_scores[:,1])
# len(y_scores[:,1])

0.9843744038420481

In [72]:
X = df.drop(['dec1_7','dec1_14'], axis=1)
y = df['dec1_14']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
clf_dec1_14 = GradientBoostingClassifier()
clf_dec1_14.fit(X_train, y_train)
y_scores = clf_dec1_14.predict_proba(X_test)[:,1]
y_scores = [1 if i>=0.5 else 0 for i in y_scores]
roc_auc_score(y_test, y_scores)
# len(y_scores[:,1])

0.8734261027764274

#### create second set of features

In [73]:
# https://stackoverflow.com/questions/45752601/python-pandas-conditional-count-after-groupby

events_1001_1007 = events.groupby('user_id_hash')['event_timestamp']\
                        .apply(lambda x: (x < to_milli(oct_7)).sum()).reset_index(name='count')
print(sum(events_1001_1007['count']))


events_1007_1014 = events.groupby('user_id_hash')['event_timestamp']\
                        .apply(lambda x: ((x > to_milli(oct_7))&(x < to_milli(oct_14))).sum()).reset_index(name='count')
print(sum(events_1007_1014['count']))


events_1014_1021 = events.groupby('user_id_hash')['event_timestamp']\
                        .apply(lambda x: ((x > to_milli(oct_14))&(x < to_milli(oct_21))).sum()).reset_index(name='count')
print(sum(events_1014_1021['count']))


events_1021_1028 = events.groupby('user_id_hash')['event_timestamp']\
                        .apply(lambda x: ((x > to_milli(oct_21))&(x < to_milli(oct_28))).sum()).reset_index(name='count')
print(sum(events_1021_1028['count']))


events_1028_1104 = events.groupby('user_id_hash')['event_timestamp']\
                        .apply(lambda x: ((x > to_milli(oct_28))&(x < to_milli(nov_4))).sum()).reset_index(name='count')
print(sum(events_1028_1104['count']))


events_1104_1111 = events.groupby('user_id_hash')['event_timestamp']\
                        .apply(lambda x: ((x > to_milli(nov_4))&(x < to_milli(nov_11))).sum()).reset_index(name='count')
print(sum(events_1104_1111['count']))


events_1111_1118 = bought.groupby('user_id_hash')['event_timestamp']\
                        .apply(lambda x: ((x > to_milli(nov_11))&(x < to_milli(nov_18))).sum()).reset_index(name='count')
print(sum(events_1111_1118['count']))


events_1118_1125 = events.groupby('user_id_hash')['event_timestamp']\
                        .apply(lambda x: ((x > to_milli(nov_18))&(x < to_milli(nov_25))).sum()).reset_index(name='count')
print(sum(events_1118_1125['count']))


events_1125_1201 = events.groupby('user_id_hash')['event_timestamp']\
                        .apply(lambda x: ((x > to_milli(nov_25))&(x < to_milli(dec_1))).sum()).reset_index(name='count')
print(sum(events_1125_1201['count']))



5854801
9080303
10927830
11997705
12515096
12619315
30767
13602230
9891204


In [74]:
events_1125_1201.head(3)

Unnamed: 0,user_id_hash,count
0,0,51
1,1,0
2,2,0


#### add second set of features

In [200]:
df = df.merge(events_1001_1007, left_on='user_id_hash', right_on='user_id_hash', how='left')
df = df.rename(index=str, columns={"count": "events_1001_1007"})
df = df.merge(events_1007_1014, left_on='user_id_hash', right_on='user_id_hash', how='left')
df = df.rename(index=str, columns={"count": "events_1007_1014"})
df = df.merge(events_1014_1021, left_on='user_id_hash', right_on='user_id_hash', how='left')
df = df.rename(index=str, columns={"count": "events_1014_1021"})
df = df.merge(events_1021_1028, left_on='user_id_hash', right_on='user_id_hash', how='left')
df = df.rename(index=str, columns={"count": "events_1021_1028"})
df = df.merge(events_1028_1104, left_on='user_id_hash', right_on='user_id_hash', how='left')
df = df.rename(index=str, columns={"count": "events_1028_1104"})
df = df.merge(events_1104_1111, left_on='user_id_hash', right_on='user_id_hash', how='left')
df = df.rename(index=str, columns={"count": "events_1104_1111"})
df = df.merge(events_1111_1118, left_on='user_id_hash', right_on='user_id_hash', how='left')
df = df.rename(index=str, columns={"count": "events_1111_1118"})
df = df.merge(events_1118_1125, left_on='user_id_hash', right_on='user_id_hash', how='left')
df = df.rename(index=str, columns={"count": "events_1118_1125"})
df = df.merge(events_1125_1201, left_on='user_id_hash', right_on='user_id_hash', how='left')
df = df.rename(index=str, columns={"count": "events_1125_1201"})

In [76]:
df.head(20)

Unnamed: 0,user_id_hash,dec1_14,dec1_7,count_1001_1007,count_1007_1014,count_1014_1021,count_1021_1028,count_1028_1104,count_1104_1111,count_1111_1118,count_1118_1125,count_1125_1201,events_1001_1007,events_1007_1014,events_1014_1021,events_1021_1028,events_1028_1104,events_1104_1111,events_1111_1118,events_1118_1125,events_1125_1201
0,0,0,0,0,0,0,0,0,1,0,0,0,109,124,32,0,4,157,0,28,51
1,14,0,0,0,0,0,0,0,0,0,1,2,0,0,0,0,0,0,0,78,125
2,17,0,0,0,0,0,0,0,2,0,0,0,0,0,0,8,0,162,0,0,0
3,39,0,0,0,0,0,0,7,3,0,0,0,0,0,0,0,489,274,0,0,0
4,41,0,0,0,0,2,4,3,0,0,0,0,0,0,202,288,357,0,0,0,0
5,47,0,0,0,0,0,0,1,0,0,0,0,0,0,0,154,192,0,0,1,0
6,49,0,0,0,0,0,0,0,3,0,0,0,0,0,0,0,0,128,0,0,0
7,62,0,0,0,0,0,0,7,3,4,0,0,0,0,0,0,368,149,4,0,0
8,64,0,0,0,1,1,2,6,2,4,0,6,0,242,1085,860,1374,964,4,529,834
9,86,1,1,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,2,53,0


#### check second set of features performance

In [202]:
df = df.fillna(0)
X = df.drop(['dec1_7','dec1_14'], axis=1)
y = df['dec1_7']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
clf_dec1_7 = GradientBoostingClassifier()
clf_dec1_7.fit(X_train, y_train)
y_scores = clf_dec1_7.predict_proba(X_test)
roc_auc_score(y_test, y_scores[:,1])
# len(y_scores[:,1])

0.9659353910035892

In [203]:
X = df.drop(['dec1_7','dec1_14'], axis=1)
y = df['dec1_14']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
clf_dec1_14 = GradientBoostingClassifier()
clf_dec1_14.fit(X_train, y_train)
y_scores = clf_dec1_14.predict_proba(X_test)[:,1]
# y_scores = [1 if i>=0.99 elif 0 for i in y_scores]
# for i in range(len(y_scores)):
#     if y_scores[i]>0.99:
#         y_scores[i]=1
#     elif y_scores[i]<0.01:
#         y_scores[i]=0
roc_auc_score(y_test, y_scores)
# len(y_scores[:,1])

0.9581662148509528

### Load sessions and EDA

In [3]:
sessions = pd.read_csv("sessions.csv")

In [4]:
pd.set_option('display.max_columns', 500)

In [5]:
sessions['app_id'].unique()

array([4724682771660800])

In [6]:
sessions = sessions.drop(['app_id'], axis=1)

In [7]:
sessions.head(3)

Unnamed: 0,session_id,start_timestamp,timezone,timezone_offset,previous_sessions_duration,user_created_timestamp,is_user_first_session,is_session,is_developer,is_wau,is_mau,country,region,city,latitude,longitude,locale,os_name,session_index,device_id,user_id_hash
0,5558845121177764917,1542215364580,Asia/Manila,28800000.0,25837591,1538874289458,False,True,False,False,False,PH,0,makati,14.554729,121.024445,en_GB,Android OS,30,546a3d98-d540-4e72-ad82-9ebd64e0839b,9943447915df3a45fd6720a026af905b6da6b56a37701b...
1,2201961907282901522,1543712977293,Asia/Manila,28800000.0,35050130,1538874289458,False,True,False,False,False,PH,0,quezon city,14.676041,121.043701,en_GB,Android OS,47,546a3d98-d540-4e72-ad82-9ebd64e0839b,9943447915df3a45fd6720a026af905b6da6b56a37701b...
2,18781111175537580,1539215568666,Asia/Manila,28800000.0,11343848,1538874289458,False,True,False,False,False,PH,0,makati,14.554729,121.024445,en_GB,Android OS,10,546a3d98-d540-4e72-ad82-9ebd64e0839b,9943447915df3a45fd6720a026af905b6da6b56a37701b...


In [8]:
len(sessions['user_id_hash'].unique())

621106

In [9]:
uniq = sessions['user_id_hash'].unique()
name2idx = {o:i for i,o in enumerate(uniq)}
new_col = np.array([name2idx[x] for x in sessions['user_id_hash']])
sessions['user_id_hash'] = new_col

In [10]:
sessions.head(3)

Unnamed: 0,session_id,start_timestamp,timezone,timezone_offset,previous_sessions_duration,user_created_timestamp,is_user_first_session,is_session,is_developer,is_wau,is_mau,country,region,city,latitude,longitude,locale,os_name,session_index,device_id,user_id_hash
0,5558845121177764917,1542215364580,Asia/Manila,28800000.0,25837591,1538874289458,False,True,False,False,False,PH,0,makati,14.554729,121.024445,en_GB,Android OS,30,546a3d98-d540-4e72-ad82-9ebd64e0839b,0
1,2201961907282901522,1543712977293,Asia/Manila,28800000.0,35050130,1538874289458,False,True,False,False,False,PH,0,quezon city,14.676041,121.043701,en_GB,Android OS,47,546a3d98-d540-4e72-ad82-9ebd64e0839b,0
2,18781111175537580,1539215568666,Asia/Manila,28800000.0,11343848,1538874289458,False,True,False,False,False,PH,0,makati,14.554729,121.024445,en_GB,Android OS,10,546a3d98-d540-4e72-ad82-9ebd64e0839b,0


In [11]:
sessions = sessions[(sessions.is_session == True) & (sessions.is_wau == False) & (sessions.is_mau == False) & (sessions.is_developer == False)]
sessions = sessions.drop(['is_session','is_developer','is_wau','is_mau'], axis=1)


In [12]:
sessions.head(3)

Unnamed: 0,session_id,start_timestamp,timezone,timezone_offset,previous_sessions_duration,user_created_timestamp,is_user_first_session,country,region,city,latitude,longitude,locale,os_name,session_index,device_id,user_id_hash
0,5558845121177764917,1542215364580,Asia/Manila,28800000.0,25837591,1538874289458,False,PH,0,makati,14.554729,121.024445,en_GB,Android OS,30,546a3d98-d540-4e72-ad82-9ebd64e0839b,0
1,2201961907282901522,1543712977293,Asia/Manila,28800000.0,35050130,1538874289458,False,PH,0,quezon city,14.676041,121.043701,en_GB,Android OS,47,546a3d98-d540-4e72-ad82-9ebd64e0839b,0
2,18781111175537580,1539215568666,Asia/Manila,28800000.0,11343848,1538874289458,False,PH,0,makati,14.554729,121.024445,en_GB,Android OS,10,546a3d98-d540-4e72-ad82-9ebd64e0839b,0


In [13]:
oct_7 = datetime.datetime(2018, 10, 7, 23, 59, 59, 0)
oct_14 = datetime.datetime(2018, 10, 14, 23, 59, 59, 0)
oct_21 = datetime.datetime(2018, 10, 21, 23, 59, 59, 0)
oct_28 = datetime.datetime(2018, 10, 28, 23, 59, 59, 0)
nov_4 = datetime.datetime(2018, 11, 4, 23, 59, 59, 0)
nov_11 = datetime.datetime(2018, 11, 11, 23, 59, 59, 0)
nov_18 = datetime.datetime(2018, 11, 18, 23, 59, 59, 0)
nov_25 = datetime.datetime(2018, 11, 25, 23, 59, 59, 0)
dec_1 = datetime.datetime(2018, 12, 1, 23, 59, 59, 0)
dec_7 = datetime.datetime(2018, 12, 7, 23, 59, 59, 0)
dec_14 = datetime.datetime(2018, 12, 14, 23, 59, 59, 0)

In [14]:
len(sessions['user_id_hash'].unique())

619519

### Second round of feature engineering

In [81]:
# https://stackoverflow.com/questions/38174155/group-dataframe-and-get-sum-and-count/38174164


# user_num_session_creation = sessions.groupby('user_id_hash').aggregate(['sum','count'])['session_id'].reset_index()
# user_num_session_creation = user_num_session_creation.rename(index=str, columns={"count":"_"})
sessions_before_12_1 = sessions[sessions['start_timestamp']<to_milli(dec_1)]
user_num_ses_cre_and_tot_ses_time = sessions_before_12_1.groupby('user_id_hash')['previous_sessions_duration']\
                                                            .agg(['sum','count']).reset_index()

user_num_ses_cre_and_tot_ses_time['sum'] = user_num_ses_cre_and_tot_ses_time['sum']/60

user_num_ses_cre_and_tot_ses_time.head(5)
# .size().reset_index(name='counts')


Unnamed: 0,user_id_hash,sum,count
0,0,16833960.0,48
1,1,101246.9,3
2,2,0.0,1
3,3,0.0,1
4,4,0.0,1


In [82]:
# user_num_day_cre = sessions[['user_id_hash','user_created_timestamp']].drop_duplicates(sub)
# print(len(user_num_day_cre))
# print(len(user_num_day_cre['user_id_hash'].unique()))

In [83]:
# user_num_day_cre.groupby('user_id_hash').count()\
#                 .rename(index=str, columns={"user_created_timestamp": "count"}).reset_index()\
#                 .sort_values(by=['count'],ascending=False).head(5)

In [317]:
# user_num_day_cre[user_num_day_cre['user_id_hash']==561382]

Unnamed: 0,user_id_hash,user_created_timestamp
5634945,561382,1541769723451
5634946,561382,1541769740176


In [None]:
# user_num_day_cre.pivot(index='user_id_hash', columns='user_created_timestamp')

In [84]:
user_num_day_cre = sessions_before_12_1[['user_id_hash','user_created_timestamp']]\
                    .drop_duplicates(subset='user_id_hash', keep='first')\
                    .reset_index(drop=True)
print(len(user_num_day_cre))
print(len(user_num_day_cre['user_id_hash'].unique()))

619037
619037


In [85]:
user_num_ses_cre_and_tot_ses_time.head(3)

Unnamed: 0,user_id_hash,sum,count
0,0,16833960.0,48
1,1,101246.9,3
2,2,0.0,1


In [86]:
# https://stackoverflow.com/questions/7829571/milliseconds-to-days
# int days = (int) ((milliseconds / (1000*60*60*24)) % 7);

user_num_day_cre['num_day_created'] = ((to_milli(dec_1)-user_num_day_cre['user_created_timestamp'])/(1000*60*60*24)) % 7
user_num_day_cre.head(3)

Unnamed: 0,user_id_hash,user_created_timestamp,num_day_created
0,0,1538874289458,0.288305
1,1,1541886003916,0.430499
2,2,1539830331932,3.222998


In [87]:
user_ave = user_num_ses_cre_and_tot_ses_time.merge(user_num_day_cre, left_on='user_id_hash', right_on='user_id_hash')
# df = df.rename(index=str, columns={"count": "events_1125_1201"})
user_ave = user_ave.drop(['user_created_timestamp'],axis=1)
user_ave.head(3)



Unnamed: 0,user_id_hash,sum,count,num_day_created
0,0,16833960.0,48,0.288305
1,1,101246.9,3,0.430499
2,2,0.0,1,3.222998


In [88]:
user_ave['average_minutes'] = user_ave['sum']/user_ave['num_day_created']
user_ave['average_counts'] = user_ave['count']/user_ave['num_day_created']
user_ave.head(3)

Unnamed: 0,user_id_hash,sum,count,num_day_created,average_minutes,average_counts
0,0,16833960.0,48,0.288305,58389440.0,166.490416
1,1,101246.9,3,0.430499,235185.2,6.968663
2,2,0.0,1,3.222998,0.0,0.31027


In [89]:
user_ave = user_ave.drop(['sum','count'], axis=1)
user_ave.head(3)

Unnamed: 0,user_id_hash,num_day_created,average_minutes,average_counts
0,0,0.288305,58389440.0,166.490416
1,1,0.430499,235185.2,6.968663
2,2,3.222998,0.0,0.31027


#### add third round of features

In [204]:
df = df.merge(user_ave, left_on='user_id_hash', right_on='user_id_hash')
# df = df.rename(index=str, columns={"count": "events_1125_1201"})

In [205]:
df.drop(['dec1_7','dec1_14'], axis=1).iloc[:,1:].head()

Unnamed: 0,count_1001_1007,count_1007_1014,count_1014_1021,count_1021_1028,count_1028_1104,count_1104_1111,count_1111_1118,count_1118_1125,count_1125_1201,events_1001_1007,events_1007_1014,events_1014_1021,events_1021_1028,events_1028_1104,events_1104_1111,events_1111_1118,events_1118_1125,events_1125_1201,num_day_created,average_minutes,average_counts
0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,109,124,32,0,4,157,0.0,28,51,6.288305,2462272.0,5.883939
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,86,0.0,0,0,6.430499,15744.8,0.466527
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,33,0,0,0,0.0,0,0,2.222998,0.0,0.449843
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0.0,1,0,6.057656,0.0,0.16508
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,8,0,0,0.0,0,0,4.672814,0.0,0.214004


#### see third features performance

In [234]:
X = df.drop(['dec1_7','dec1_14'], axis=1).iloc[:,1:]
y = df['dec1_7']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
clf_dec1_7 = GradientBoostingClassifier(max_depth=6, min_samples_leaf=5, loss='exponential')
clf_dec1_7.fit(X_train, y_train)
y_scores = clf_dec1_7.predict_proba(X_test)
roc_auc_score(y_test, y_scores[:,1])

0.9804802346557037

In [235]:
X = df.drop(['dec1_7','dec1_14'], axis=1).iloc[:,1:]
y = df['dec1_14']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
clf_dec1_14 = GradientBoostingClassifier(max_depth=6, min_samples_leaf=5,loss='exponential')
clf_dec1_14.fit(X_train, y_train)
y_scores = clf_dec1_14.predict_proba(X_test)[:,1]
# y_scores = [1 if i>=0.99 elif 0 for i in y_scores]
# for i in range(len(y_scores)):
#     if y_scores[i]>0.99:
#         y_scores[i]=1
#     elif y_scores[i]<0.01:
#         y_scores[i]=0
roc_auc_score(y_test, y_scores)
# len(y_scores[:,1])

0.9758971999663928

In [193]:
len(X)

34173

In [238]:
X.head()

Unnamed: 0,count_1001_1007,count_1007_1014,count_1014_1021,count_1021_1028,count_1028_1104,count_1104_1111,count_1111_1118,count_1118_1125,count_1125_1201,events_1001_1007,events_1007_1014,events_1014_1021,events_1021_1028,events_1028_1104,events_1104_1111,events_1111_1118,events_1118_1125,events_1125_1201,num_day_created,average_minutes,average_counts
0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,109,124,32,0,4,157,0.0,28,51,6.288305,2462272.0,5.883939
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,86,0.0,0,0,6.430499,15744.8,0.466527
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,33,0,0,0,0.0,0,0,2.222998,0.0,0.449843
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0.0,1,0,6.057656,0.0,0.16508
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,8,0,0,0.0,0,0,4.672814,0.0,0.214004


In [236]:
clf_dec1_7.fit(X, y)
clf_dec1_14.fit(X, y)

GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='exponential', max_depth=6,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=5, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              n_iter_no_change=None, presort='auto', random_state=None,
              subsample=1.0, tol=0.0001, validation_fraction=0.1,
              verbose=0, warm_start=False)

### Grid Search this is manually tuned

In [259]:
# from sklearn.model_selection import GridSearchCV

# param_grid = dict(loss = ['deviance', 'exponential'],
#                   learning_rate=[0.1, 0.4, 0.7],
#                   n_estimators = [50,100],
#                   subsample = [0.5, 0.75, 1],
#                   min_samples_split = [2,5,50],
#                   max_depth = [3,5])

# gbc_cv = GridSearchCV(estimator=GradientBoostingClassifier(), 
#                       param_grid=param_grid, 
#                       cv=5)
# gbc_cv.fit(X, y)
# print(gbc_cv.best_params_)
# print(f"{gbc_cv.best_score_:,.4f}")

### Prediction

In [113]:
dec_1 = datetime.datetime(2018, 12, 1, 23, 59, 59, 0)
dec_7 = datetime.datetime(2018, 12, 7, 23, 59, 59, 0)
dec_8 = datetime.datetime(2018, 12, 8, 23, 59, 59, 0)
dec_14 = datetime.datetime(2018, 12, 14, 23, 59, 59, 0)

In [114]:
bought_1201_1208 = bought.groupby('user_id_hash')['event_timestamp']\
                        .apply(lambda x: ((x > to_milli(dec_1))&(x < to_milli(dec_8))).sum()).reset_index(name='count')
print(sum(bought_1118_1125['count']))


bought_1207_1214 = bought.groupby('user_id_hash')['event_timestamp']\
                        .apply(lambda x: ((x > to_milli(dec_7))&(x < to_milli(dec_14))).sum()).reset_index(name='count')
print(sum(bought_1125_1201['count']))

31141
23719


#### build and merging features for testing

In [208]:
df2 = events['user_id_hash'].unique()
df2 = pd.DataFrame(data = df2, columns=['user_id_hash'])

In [209]:
len(df2)

621001

In [210]:
df2 = df2.merge(bought_1014_1021, left_on='user_id_hash', right_on='user_id_hash',how='left')
df2 = df2.rename(index=str, columns={"count": "count_1001_1007"})
df2 = df2.merge(bought_1021_1028, left_on='user_id_hash', right_on='user_id_hash',how='left')
df2 = df2.rename(index=str, columns={"count": "count_1007_1014"})
df2 = df2.merge(bought_1028_1104, left_on='user_id_hash', right_on='user_id_hash',how='left')
df2 = df2.rename(index=str, columns={"count": "count_1014_1021"})
df2 = df2.merge(bought_1104_1111, left_on='user_id_hash', right_on='user_id_hash',how='left')
df2 = df2.rename(index=str, columns={"count": "count_1021_1028"})
df2 = df2.merge(bought_1111_1118, left_on='user_id_hash', right_on='user_id_hash',how='left')
df2 = df2.rename(index=str, columns={"count": "count_1028_1104"})
df2 = df2.merge(bought_1118_1125, left_on='user_id_hash', right_on='user_id_hash',how='left')
df2 = df2.rename(index=str, columns={"count": "count_1104_1111"})
df2 = df2.merge(bought_1125_1201, left_on='user_id_hash', right_on='user_id_hash',how='left')
df2 = df2.rename(index=str, columns={"count": "count_1111_1118"})
df2 = df2.merge(bought_1201_1208, left_on='user_id_hash', right_on='user_id_hash',how='left')
df2 = df2.rename(index=str, columns={"count": "count_1118_1125"})
df2 = df2.merge(bought_1207_1214, left_on='user_id_hash', right_on='user_id_hash',how='left')
df2 = df2.rename(index=str, columns={"count": "count_1125_1201"})

In [211]:
len(df2)

621001

In [118]:
events_1201_1208 = events.groupby('user_id_hash')['event_timestamp']\
                        .apply(lambda x: ((x > to_milli(dec_1))&(x < to_milli(dec_8))).sum()).reset_index(name='count')
print(sum(events_1118_1125['count']))


events_1207_1214 = events.groupby('user_id_hash')['event_timestamp']\
                        .apply(lambda x: ((x > to_milli(dec_7))&(x < to_milli(dec_14))).sum()).reset_index(name='count')
print(sum(events_1125_1201['count']))


13602230
9891204


In [212]:
df2 = df2.merge(events_1014_1021, left_on='user_id_hash', right_on='user_id_hash',how='left')
df2 = df2.rename(index=str, columns={"count": "events_1001_1007"})
df2 = df2.merge(events_1021_1028, left_on='user_id_hash', right_on='user_id_hash',how='left')
df2 = df2.rename(index=str, columns={"count": "events_1007_1014"})
df2 = df2.merge(events_1028_1104, left_on='user_id_hash', right_on='user_id_hash',how='left')
df2 = df2.rename(index=str, columns={"count": "events_1014_1021"})
df2 = df2.merge(events_1104_1111, left_on='user_id_hash', right_on='user_id_hash',how='left')
df2 = df2.rename(index=str, columns={"count": "events_1021_1028"})
df2 = df2.merge(events_1111_1118, left_on='user_id_hash', right_on='user_id_hash',how='left')
df2 = df2.rename(index=str, columns={"count": "events_1028_1104"})
df2 = df2.merge(events_1118_1125, left_on='user_id_hash', right_on='user_id_hash',how='left')
df2 = df2.rename(index=str, columns={"count": "events_1104_1111"})
df2 = df2.merge(events_1125_1201, left_on='user_id_hash', right_on='user_id_hash',how='left')
df2 = df2.rename(index=str, columns={"count": "events_1111_1118"})
df2 = df2.merge(events_1201_1208, left_on='user_id_hash', right_on='user_id_hash',how='left')
df2 = df2.rename(index=str, columns={"count": "events_1118_1125"})
df2 = df2.merge(events_1207_1214, left_on='user_id_hash', right_on='user_id_hash',how='left')
df2 = df2.rename(index=str, columns={"count": "events_1125_1201"})

In [120]:
session_after_10_14 = sessions[sessions['start_timestamp']>to_milli(oct_14)]

In [121]:
user_num_ses_cre_and_tot_ses_time = session_after_10_14.groupby('user_id_hash')['previous_sessions_duration']\
                                                            .agg(['sum','count']).reset_index()

user_num_ses_cre_and_tot_ses_time['sum'] = user_num_ses_cre_and_tot_ses_time['sum']/60

user_num_ses_cre_and_tot_ses_time.head(5)

Unnamed: 0,user_id_hash,sum,count
0,0,15483520.0,37
1,1,101246.9,3
2,2,0.0,1
3,3,0.0,1
4,4,0.0,1


In [122]:
user_num_day_cre = session_after_10_14[['user_id_hash','user_created_timestamp']]\
                    .drop_duplicates(subset='user_id_hash', keep='first')\
                    .reset_index(drop=True)
print(len(user_num_day_cre))
print(len(user_num_day_cre['user_id_hash'].unique()))

517219
517219


In [123]:
user_num_day_cre['num_day_created'] = ((to_milli(dec_14)-user_num_day_cre['user_created_timestamp'])/(1000*60*60*24)) % 7
user_num_day_cre.head(3)

Unnamed: 0,user_id_hash,user_created_timestamp,num_day_created
0,0,1538874289458,6.288305
1,1,1541886003916,6.430499
2,2,1539830331932,2.222998


In [124]:
user_ave = user_num_ses_cre_and_tot_ses_time.merge(user_num_day_cre, left_on='user_id_hash', right_on='user_id_hash')
user_ave = user_ave.drop(['user_created_timestamp'],axis=1)
user_ave.head(3)

Unnamed: 0,user_id_hash,sum,count,num_day_created
0,0,15483520.0,37,6.288305
1,1,101246.9,3,6.430499
2,2,0.0,1,2.222998


In [125]:
user_ave['average_minutes'] = user_ave['sum']/user_ave['num_day_created']
user_ave['average_counts'] = user_ave['count']/user_ave['num_day_created']
user_ave.head(3)

Unnamed: 0,user_id_hash,sum,count,num_day_created,average_minutes,average_counts
0,0,15483520.0,37,6.288305,2462272.0,5.883939
1,1,101246.9,3,6.430499,15744.8,0.466527
2,2,0.0,1,2.222998,0.0,0.449843


In [126]:
user_ave = user_ave.drop(['sum','count'], axis=1)
user_ave.head(3)

Unnamed: 0,user_id_hash,num_day_created,average_minutes,average_counts
0,0,6.288305,2462272.0,5.883939
1,1,6.430499,15744.8,0.466527
2,2,2.222998,0.0,0.449843


In [213]:
df2 = df2.merge(user_ave, left_on='user_id_hash', right_on='user_id_hash',how='left')

In [214]:
df2.head(3)

Unnamed: 0,user_id_hash,count_1001_1007,count_1007_1014,count_1014_1021,count_1021_1028,count_1028_1104,count_1104_1111,count_1111_1118,count_1118_1125,count_1125_1201,events_1001_1007,events_1007_1014,events_1014_1021,events_1021_1028,events_1028_1104,events_1104_1111,events_1111_1118,events_1118_1125,events_1125_1201,num_day_created,average_minutes,average_counts
0,0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,32,0,4,157,0.0,28,51,0,0,6.288305,2462272.0,5.883939
1,1,,,,,,,,,,0,0,0,86,,0,0,0,0,6.430499,15744.8,0.466527
2,2,,,,,,,,,,33,0,0,0,,0,0,0,0,2.222998,0.0,0.449843


In [215]:
len(df2)

621001

In [216]:
df2 = df2.fillna(0)

In [241]:
df2 = df2.drop(['pred_7','pred_14'], axis=1)

In [242]:
df2.iloc[:,1:].head()

Unnamed: 0,count_1001_1007,count_1007_1014,count_1014_1021,count_1021_1028,count_1028_1104,count_1104_1111,count_1111_1118,count_1118_1125,count_1125_1201,events_1001_1007,events_1007_1014,events_1014_1021,events_1021_1028,events_1028_1104,events_1104_1111,events_1111_1118,events_1118_1125,events_1125_1201,num_day_created,average_minutes,average_counts
0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,32,0,4,157,0.0,28,51,0,0,6.288305,2462272.0,5.883939
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,86,0.0,0,0,0,0,6.430499,15744.8,0.466527
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,33,0,0,0,0.0,0,0,0,0,2.222998,0.0,0.449843
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0.0,1,0,0,0,6.057656,0.0,0.16508
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,8,0,0,0.0,0,0,0,0,4.672814,0.0,0.214004


#### predict using previous models

In [243]:
df2['pred_7'] = clf_dec1_7.predict_proba(df2.iloc[:,1:])[:,1]
df2['pred_14'] = clf_dec1_14.predict_proba(df2.iloc[:,1:-1])[:,1]

In [218]:
df2.head(3)

Unnamed: 0,user_id_hash,count_1001_1007,count_1007_1014,count_1014_1021,count_1021_1028,count_1028_1104,count_1104_1111,count_1111_1118,count_1118_1125,count_1125_1201,events_1001_1007,events_1007_1014,events_1014_1021,events_1021_1028,events_1028_1104,events_1104_1111,events_1111_1118,events_1118_1125,events_1125_1201,num_day_created,average_minutes,average_counts,pred_7,pred_14
0,0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,32,0,4,157,0.0,28,51,0,0,6.288305,2462272.0,5.883939,0.003783,0.017206
1,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,86,0.0,0,0,0,0,6.430499,15744.8,0.466527,2.4e-05,0.000103
2,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,33,0,0,0,0.0,0,0,0,0,2.222998,0.0,0.449843,4e-06,2.2e-05


In [244]:
answer = df2[['user_id_hash','pred_7','pred_14']]

In [245]:
answer.head(3)

Unnamed: 0,user_id_hash,pred_7,pred_14
0,0,0.020371,0.014972
1,1,0.000195,0.000195
2,2,1e-05,1e-05


In [246]:
len(answer)

621001

#### join output and ready to submit to kaggle

In [247]:
submission = pd.read_csv('sample_submission_2.csv')
submission.head(3)

Unnamed: 0,user_id_hash,user_purchase_binary_7_days,user_purchase_binary_14_days
0,e469dfaed039ead9110165d9bc457acb11609ca34057dc...,0.01,0.02
1,afcc639a324b6c598ef83d360450afa011cb2dd1358bf9...,0.01,0.02
2,fd5a7cf211d08e3e00f7be6a9df6e6ea3d2e5c22a5d9c3...,0.01,0.02


In [248]:
def tryconvert(value):
    try:
        return name2idx[value]
    except KeyError:
        return value

In [250]:
submission['user'] = submission['user_id_hash'].apply(lambda x: tryconvert(x))
submission.head(3)

Unnamed: 0,user_id_hash,user_purchase_binary_7_days,user_purchase_binary_14_days,user
0,e469dfaed039ead9110165d9bc457acb11609ca34057dc...,0.01,0.02,415160
1,afcc639a324b6c598ef83d360450afa011cb2dd1358bf9...,0.01,0.02,536100
2,fd5a7cf211d08e3e00f7be6a9df6e6ea3d2e5c22a5d9c3...,0.01,0.02,343300


In [251]:
answer = answer.rename(index=str, columns={"user_id_hash": "user"})
answer.head(3)

Unnamed: 0,user,pred_7,pred_14
0,0,0.020371,0.014972
1,1,0.000195,0.000195
2,2,1e-05,1e-05


In [255]:
len(answer)

621001

In [252]:
final = submission.merge(answer, on='user', how='left').fillna(0)
final.head(3)

Unnamed: 0,user_id_hash,user_purchase_binary_7_days,user_purchase_binary_14_days,user,pred_7,pred_14
0,e469dfaed039ead9110165d9bc457acb11609ca34057dc...,0.01,0.02,415160,0.000598,0.000598
1,afcc639a324b6c598ef83d360450afa011cb2dd1358bf9...,0.01,0.02,536100,1.3e-05,1.3e-05
2,fd5a7cf211d08e3e00f7be6a9df6e6ea3d2e5c22a5d9c3...,0.01,0.02,343300,1e-05,1e-05


In [253]:
final = final.drop(['user','user_purchase_binary_7_days','user_purchase_binary_14_days'],axis=1)
final = final.rename(index=str, columns={"pred_7": "user_purchase_binary_7_days", "pred_14":"user_purchase_binary_14_days"})
final.to_csv("final.csv",index=False)

In [254]:
final = pd.read_csv('final.csv')
final.head()

Unnamed: 0,user_id_hash,user_purchase_binary_7_days,user_purchase_binary_14_days
0,e469dfaed039ead9110165d9bc457acb11609ca34057dc...,0.000598,0.000598
1,afcc639a324b6c598ef83d360450afa011cb2dd1358bf9...,1.3e-05,1.3e-05
2,fd5a7cf211d08e3e00f7be6a9df6e6ea3d2e5c22a5d9c3...,1e-05,1e-05
3,00bfff98b9d0329f014c2eeac7ce47cd18b2bc6e10d608...,0.005836,0.005836
4,0d298f3638c43e915c119d4935e1ce8d168f81b5e3e8c1...,3e-06,3e-06
