In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout, Conv1D, TimeDistributed, Flatten, GRU
from tensorflow.keras.optimizers import Adam, SGD

In [89]:
data = pd.read_parquet('full_dataset.parquet', engine='pyarrow')
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [3]:
data.head()

Unnamed: 0,session_id,user_identifier,event_timestamp,event_category_idx_lv1,event_category_idx_lv2,event_category_idx_lv3,event_category_idx_lv4,tobi_timestamp
0,182576,0,2020-09-22 14:38:14,1_5,2,,,2020-09-24 02:03:53
1,182576,0,2020-09-22 14:38:14,1_4,2_9,3_5,,2020-09-24 02:03:53
2,182576,0,2020-09-22 11:46:56,1_4,2_9,3_6,,2020-09-24 02:03:53
3,182576,0,2020-09-22 11:23:50,1_4,2_9,3_12,,2020-09-24 02:03:53
4,182576,0,2020-09-22 11:21:29,1_4,2_9,3_6,,2020-09-24 02:03:53


In [32]:
data['training_sample'] = data['session_id'].isin(train['session_id'])

In [25]:
data[data['training_sample'] == True]['session_id'].nunique()

208486

In [65]:
user0 = data[data['user_identifier'] == 0].sort_values(by=['tobi_timestamp', 'session_id','event_timestamp','training_sample', 'event_category_idx_lv2'],
                                                       ascending=[True, True, True, False, False])

In [66]:
user0['session_id'].isin(train['session_id'])

1941    True
1940    True
1939    True
1938    True
1937    True
        ... 
5       True
4       True
3       True
2       True
1       True
Name: session_id, Length: 1874, dtype: bool

In [116]:
user0

Unnamed: 0,session_id,user_identifier,event_timestamp,event_category_idx_lv1,event_category_idx_lv2,event_category_idx_lv3,event_category_idx_lv4,tobi_timestamp,training_sample
1941,940424,0,2020-06-01 13:54:51,1_4,2_9,3_5,,2020-06-02 12:30:50,True
1940,940424,0,2020-06-01 14:34:00,1_4,2_9,3_5,,2020-06-02 12:30:50,True
1939,940424,0,2020-06-01 14:54:31,1_4,2_9,3_6,,2020-06-02 12:30:50,True
1938,940424,0,2020-06-01 20:40:38,1_4,2_9,3_5,,2020-06-02 12:30:50,True
1937,940424,0,2020-06-01 20:44:58,1_4,2_9,3_6,,2020-06-02 12:30:50,True
...,...,...,...,...,...,...,...,...,...
5,182576,0,2020-09-22 10:58:23,1_4,2_9,3_12,,2020-09-24 02:03:53,True
4,182576,0,2020-09-22 11:21:29,1_4,2_9,3_6,,2020-09-24 02:03:53,True
3,182576,0,2020-09-22 11:23:50,1_4,2_9,3_12,,2020-09-24 02:03:53,True
2,182576,0,2020-09-22 11:46:56,1_4,2_9,3_6,,2020-09-24 02:03:53,True


In [109]:
user0.groupby('session_id').apply(np.array)

session_id
1788      [[1788, 0, 2020-06-14 01:07:33, 1_4, 2_9, 3_12...
2075      [[2075, 0, 2020-06-03 10:22:32, 1_4, 2_9, 3_6,...
6358      [[6358, 0, 2020-07-23 20:40:08, 1_4, 2_9, 3_12...
6398      [[6398, 0, 2020-06-03 23:37:08, 1_4, 2_9, 3_13...
11453     [[11453, 0, 2020-07-23 20:40:08, 1_4, 2_9, 3_1...
                                ...                        
739452    [[739452, 0, 2020-06-01 13:54:51, 1_4, 2_9, 3_...
741630    [[741630, 0, 2020-06-01 13:54:51, 1_4, 2_9, 3_...
756004    [[756004, 0, 2020-06-01 13:54:51, 1_4, 2_9, 3_...
756812    [[756812, 0, 2020-06-01 13:54:51, 1_4, 2_9, 3_...
940424    [[940424, 0, 2020-06-01 13:54:51, 1_4, 2_9, 3_...
Length: 68, dtype: object

In [102]:
def reshape_by_user(dataset):
    n_sequences = dataset['session_id'].nunique()
    max_length = max(dataset['session_id'].value_counts())
    sequences = -np.ones((n_sequences, max_length, 8), dtype=object)
    
    i = 0
    grouped = dataset.groupby('session_id').apply(np.array)
    for seq in grouped:
        l = len(seq)
        sequences[i,:l] = seq[:,:-1]
        i += 1
    return sequences

In [103]:
user_0 = reshape_by_user(user0)

In [115]:
user_0

array([[940424, 0, Timestamp('2020-06-01 13:54:51'), '1_4', '2_9', '3_5',
        None, Timestamp('2020-06-02 12:30:50')],
       [940424, 0, Timestamp('2020-06-01 14:34:00'), '1_4', '2_9', '3_5',
        None, Timestamp('2020-06-02 12:30:50')],
       [940424, 0, Timestamp('2020-06-01 14:54:31'), '1_4', '2_9', '3_6',
        None, Timestamp('2020-06-02 12:30:50')],
       [940424, 0, Timestamp('2020-06-01 20:40:38'), '1_4', '2_9', '3_5',
        None, Timestamp('2020-06-02 12:30:50')],
       [940424, 0, Timestamp('2020-06-01 20:44:58'), '1_4', '2_9', '3_6',
        None, Timestamp('2020-06-02 12:30:50')],
       [940424, 0, Timestamp('2020-06-01 20:47:11'), '1_4', '2_9',
        '3_12', None, Timestamp('2020-06-02 12:30:50')],
       [940424, 0, Timestamp('2020-06-01 20:49:25'), '1_4', '2_9',
        '3_13', None, Timestamp('2020-06-02 12:30:50')],
       [940424, 0, Timestamp('2020-06-01 20:51:40'), '1_4', '2_9', '3_5',
        None, Timestamp('2020-06-02 12:30:50')],
       [940424

In [77]:
int(data.loc[data['session_id'] == 13, ['tobi_timestamp']].nunique())

1

In [52]:
user0[~user0['training_sample']]

Unnamed: 0,session_id,user_identifier,event_timestamp,event_category_idx_lv1,event_category_idx_lv2,event_category_idx_lv3,event_category_idx_lv4,tobi_timestamp,training_sample
1929,756004,0,2020-06-01 13:54:51,1_4,2_9,3_5,,2020-06-03 10:11:34,False
1899,756812,0,2020-06-01 13:54:51,1_4,2_9,3_5,,2020-06-03 10:27:00,False
1928,756004,0,2020-06-01 14:34:00,1_4,2_9,3_5,,2020-06-03 10:11:34,False
1898,756812,0,2020-06-01 14:34:00,1_4,2_9,3_5,,2020-06-03 10:27:00,False
1927,756004,0,2020-06-01 14:54:31,1_4,2_9,3_6,,2020-06-03 10:11:34,False
...,...,...,...,...,...,...,...,...,...
34,167384,0,2020-09-22 11:23:50,1_4,2_9,3_12,,2020-09-24 01:53:58,False
157,107474,0,2020-09-22 11:46:56,1_4,2_9,3_6,,2020-09-24 01:39:54,False
33,167384,0,2020-09-22 11:46:56,1_4,2_9,3_6,,2020-09-24 01:53:58,False
156,107474,0,2020-09-22 14:38:14,1_4,2_9,3_5,,2020-09-24 01:39:54,False


In [5]:
(data['tobi_timestamp'] >= data['event_timestamp']).value_counts()

True    5676992
dtype: int64

In [6]:
data[data['event_category_idx_lv1'] == '1_5']['event_category_idx_lv2'].apply(lambda s: s.isnumeric()).value_counts()

True    347767
Name: event_category_idx_lv2, dtype: int64

In [7]:
data['event_category_idx_lv2'].apply(lambda s: s.isnumeric() if isinstance(s, str) else False).value_counts()

False    5329225
True      347767
Name: event_category_idx_lv2, dtype: int64

In [8]:
data['event_category_idx_lv1'].value_counts()

1_0     1821342
1_1      794503
1_2      746991
1_4      577190
1_3      574892
1_5      347767
1_6      277829
1_7      248115
1_8      185776
1_9       60585
1_10      23189
1_11      18813
Name: event_category_idx_lv1, dtype: int64

In [41]:
data['event_category_idx_lv2'].value_counts()

2_0      761330
2_2      714830
2_3      551242
2_1      549528
2_4      339168
          ...  
2_366         1
2_355         1
2_451         1
2_316         1
2_311         1
Name: event_category_idx_lv2, Length: 370, dtype: int64

In [42]:
data['event_category_idx_lv3'].value_counts()

3_0     486973
3_1     308585
3_2     249488
3_3     246024
3_4     179971
         ...  
3_71         2
3_81         2
3_76         2
3_72         1
3_77         1
Name: event_category_idx_lv3, Length: 78, dtype: int64

In [23]:
data['event_category_idx_lv4'].value_counts()

4_0    251586
4_1    224356
4_2    218525
4_3    181721
4_4    128892
4_5     37389
4_6       149
Name: event_category_idx_lv4, dtype: int64

In [3]:
data[data['event_category_idx_lv2'].apply(lambda s: s != None)]['event_category_idx_lv1'].value_counts()

1_0     1821342
1_1      794503
1_2      746991
1_4      577190
1_3      574892
1_5      347767
1_6      277829
1_7      248115
1_10      23189
1_11      18813
Name: event_category_idx_lv1, dtype: int64

In [36]:
data[data['event_category_idx_lv4'].apply(lambda s: s == None)]['event_category_idx_lv3'].value_counts()

3_2     249488
3_4     179971
3_9     146006
3_5     139900
3_6     132281
         ...  
3_71         2
3_81         2
3_75         2
3_72         1
3_77         1
Name: event_category_idx_lv3, Length: 71, dtype: int64

In [19]:
data[data['event_category_idx_lv4'].apply(lambda s: s == None)]['event_category_idx_lv3'].value_counts()

3_2     249488
3_4     179971
3_9     146006
3_5     139900
3_6     132281
         ...  
3_75         2
3_81         2
3_71         2
3_77         1
3_72         1
Name: event_category_idx_lv3, Length: 71, dtype: int64

In [38]:
data[data['event_category_idx_lv3'] == '3_2']['event_category_idx_lv4'].value_counts()

Series([], Name: event_category_idx_lv4, dtype: int64)

In [18]:
data[data['event_category_idx_lv3'].apply(lambda s: s == None)]['event_category_idx_lv2'].value_counts()

2_2      714830
2_3      551242
2_4      339168
2_7      184244
2_6      182341
          ...  
2_375         1
2_313         1
2_354         1
2_335         1
2_346         1
Name: event_category_idx_lv2, Length: 334, dtype: int64

In [117]:
data[data['event_category_idx_lv1'] == '1_10']['event_category_idx_lv2'].value_counts()

2_30    16628
2_46     6561
Name: event_category_idx_lv2, dtype: int64

In [57]:
data[data['event_category_idx_lv4'].apply(lambda s: s == None)]['event_category_idx_lv1'].value_counts()

1_0     1821342
1_2      746991
1_4      577190
1_3      574892
1_5      347767
1_6      277829
1_8      185776
1_9       60585
1_10      23189
1_11      18813
Name: event_category_idx_lv1, dtype: int64

In [118]:
data[data['event_category_idx_lv2'] == '2_30']['event_category_idx_lv3'].value_counts()

3_16    16590
3_52       38
Name: event_category_idx_lv3, dtype: int64

In [121]:
data[data['event_category_idx_lv3'] == '3_16']['event_category_idx_lv2'].value_counts()

2_30    16590
Name: event_category_idx_lv2, dtype: int64

TypeError: unsupported operand type(s) for +: 'int' and 'str'

In [63]:
data[data['event_category_idx_lv2'] == '2_22']['event_category_idx_lv1'].value_counts()

1_7    38386
1_1      227
Name: event_category_idx_lv1, dtype: int64

In [120]:
data[data['event_category_idx_lv1'] == '1_1']['event_category_idx_lv2'].value_counts()

2_0      761330
2_10      31499
2_94        588
2_143       242
2_22        227
2_145       149
2_38        124
2_98        113
2_65         72
2_28         45
2_53         39
2_78         24
2_41         24
2_59         15
2_101         7
2_55          2
2_105         2
2_210         1
Name: event_category_idx_lv2, dtype: int64

In [14]:
a = []
for i in data['event_category_idx_lv2'].unique():
    if len(data[data['event_category_idx_lv2'] == i]['event_category_idx_lv1'].unique()) != 1:
        a.append(i)

In [17]:
len(a)

16

In [104]:
data['event_category_idx_lv2'].value_counts().index[~a]

Index(['2_10', '2_22', '2_28', '2_38', '2_41', '2_55', '2_53', '2_59', '2_65',
       '2_78', '2_94', '2_105', '2_98', '2_101', '2_210'],
      dtype='object')

In [55]:
data[data['event_category_idx_lv1'] == '1_7']['event_category_idx_lv2'].value_counts()

2_10     80371
2_22     38386
2_28     24511
2_26     20179
2_38     11908
2_37     10518
2_41      9083
2_45      8686
2_48      8020
2_55      6319
2_50      5562
2_53      5123
2_59      4691
2_63      3372
2_65      3110
2_78      3036
2_94      1523
2_105     1288
2_98      1076
2_101     1068
2_133      204
2_197       47
2_210       26
2_299        6
2_294        2
Name: event_category_idx_lv2, dtype: int64

In [56]:
data[data['event_category_idx_lv2'] == '2_10']['event_category_idx_lv3'].value_counts()

3_1    111870
Name: event_category_idx_lv3, dtype: int64

In [58]:
data[data['event_category_idx_lv3'] == '3_1']['event_category_idx_lv2'].value_counts()

2_0      153474
2_10     111870
2_45       8686
2_38       8113
2_26       6328
2_37       5596
2_50       5538
2_63       3372
2_22       2739
2_28       2103
2_78        484
2_133       204
2_94         68
2_48          8
2_294         2
Name: event_category_idx_lv2, dtype: int64

In [None]:
threes = ['1_2', '1_4', '1_10']
data[data['event_category_idx_lv1'].isin(threes)]['event_category_idx_lv2'].value_counts()

In [51]:
l = []
for i in data['event_category_idx_lv3'].value_counts().index:
    if len(data[data['event_category_idx_lv3'] == i]['event_category_idx_lv2'].unique()) != 1:
        l.append(i)

In [52]:
l

['3_0',
 '3_1',
 '3_2',
 '3_3',
 '3_4',
 '3_9',
 '3_5',
 '3_6',
 '3_12',
 '3_8',
 '3_7',
 '3_13',
 '3_10',
 '3_11',
 '3_14',
 '3_15',
 '3_17',
 '3_18',
 '3_22',
 '3_31',
 '3_27',
 '3_25',
 '3_35',
 '3_34',
 '3_36',
 '3_38',
 '3_42',
 '3_40',
 '3_44',
 '3_49',
 '3_45',
 '3_47',
 '3_43',
 '3_48',
 '3_57',
 '3_58',
 '3_56',
 '3_60',
 '3_62',
 '3_70',
 '3_65',
 '3_63',
 '3_71']

In [53]:
m = []
for j in l:
    n = list(data[data['event_category_idx_lv3'] == j]['event_category_idx_lv2'].value_counts().index)
    m.extend(n)
m = pd.Series(m)

In [31]:
h = []
for j in l:
    n = list(data[data['event_category_idx_lv3'] == j]['event_category_idx_lv1'].unique())
    h.extend(n)
h = pd.Series(h)

In [54]:
m.unique()

array(['2_0', '2_22', '2_28', '2_41', '2_26', '2_48', '2_53', '2_59',
       '2_65', '2_37', '2_38', '2_98', '2_105', '2_101', '2_143', '2_78',
       '2_197', '2_210', '2_10', '2_45', '2_50', '2_63', '2_133', '2_94',
       '2_294', '2_1', '2_14', '2_55', '2_299', '2_13', '2_9', '2_5',
       '2_21'], dtype=object)

In [28]:
m.value_counts()

2_1      31
2_13     24
2_14     14
2_9       5
2_5       5
2_0       5
2_78      4
2_22      4
2_38      4
2_48      3
2_37      3
2_28      3
2_94      3
2_26      3
2_41      2
2_65      2
2_21      2
2_105     2
2_50      2
2_59      2
2_210     1
2_53      1
2_55      1
2_197     1
2_143     1
2_133     1
2_299     1
2_98      1
2_10      1
2_45      1
2_101     1
2_63      1
2_294     1
dtype: int64

In [38]:
data[data['event_category_idx_lv1'] == '1_1']['event_category_idx_lv2'].value_counts()

2_0      761330
2_10      31499
2_94        588
2_143       242
2_22        227
2_145       149
2_38        124
2_98        113
2_65         72
2_28         45
2_53         39
2_78         24
2_41         24
2_59         15
2_101         7
2_55          2
2_105         2
2_210         1
Name: event_category_idx_lv2, dtype: int64

In [4]:
def encode_events(dataset):
    d = {1: ['1_8', '1_9'],
         2: ['1_0', '1_3', '1_5', '1_6', '1_11'],
         3: ['1_10'],
         4: ['1_2', '1_4'],
         5: ['1_1', '1_7']}
    
    level1 = dataset['event_category_idx_lv1']
    dataset.loc[level1.isin(d[2]), 'event_category_idx_lv1'] = dataset.loc[level1.isin(d[2]), 'event_category_idx_lv2']
    dataset.loc[level1.isin(d[3]), 'event_category_idx_lv1'] = dataset.loc[level1.isin(d[3]), 'event_category_idx_lv3']
    dataset.loc[level1.isin(d[4]), 'event_category_idx_lv1'] = dataset.loc[level1.isin(d[4]), 'event_category_idx_lv1'] + dataset.loc[level1.isin(d[4]), 'event_category_idx_lv2'] + dataset.loc[level1.isin(d[4]), 'event_category_idx_lv3']
    dataset.loc[level1.isin(d[5]), 'event_category_idx_lv1'] = dataset.loc[level1.isin(d[5]), 'event_category_idx_lv1'] + dataset.loc[level1.isin(d[5]), 'event_category_idx_lv2'] + dataset.loc[level1.isin(d[5]), 'event_category_idx_lv3'] + dataset.loc[level1.isin(d[5]), 'event_category_idx_lv4']                                                         

In [4]:
def encode_events(dataset):
    d = {1: ['1_8', '1_9'],
         2: ['1_0', '1_3', '1_5', '1_6', '1_11'],
         3: ['1_10']}
    
    level1 = dataset['event_category_idx_lv1'].copy()
    dataset.loc[level1.isin(d[2]), 'event_category_idx_lv1'] = dataset.loc[level1.isin(d[2]), 'event_category_idx_lv2']
    dataset.loc[level1.isin(d[2]), 'event_category_idx_lv2'] = None
    dataset.loc[level1.isin(d[3]), 'event_category_idx_lv1'] = dataset.loc[level1.isin(d[3]), 'event_category_idx_lv3']
    dataset.loc[level1.isin(d[3]), 'event_category_idx_lv2'] = None
    dataset.loc[level1.isin(d[3]), 'event_category_idx_lv3'] = None

In [62]:
train['session_id'].is_monotonic

True

In [63]:
test['session_id'].is_monotonic

True

In [7]:
data.sort_values(by=['user_identifier','tobi_timestamp', 'event_timestamp'], inplace=True)

In [82]:
for user in data['user_identifier'].unique():
    baseline_date = min(data.loc[data['user_identifier'] == user, 'tobi_timestamp'])
    try:
        data.loc[data['user_identifier'] == user, 'tobi_timestamp'] = data.loc[data['user_identifier'] == user, 'tobi_timestamp'] - baseline_date
    except:
        print(user)
        break

1


In [93]:
data['tobi_timestamp'] = (data['tobi_timestamp'] - min(data['tobi_timestamp'])).apply(lambda td: td.delta) * 1e-9

In [97]:
minimum_date = data.groupby('user_identifier', sort=False).min(numeric_only=True)

In [87]:
data.loc[data['user_identifier'] == 1, 'tobi_timestamp'] - pd.Timestamp('2020-06-03 23:36:51')

TypeError: unsupported operand type(s) for -: 'numpy.ndarray' and 'Timestamp'

In [14]:
training_data

Unnamed: 0,session_id,user_identifier,event_timestamp,event_category_idx_lv1,event_category_idx_lv2,event_category_idx_lv3,event_category_idx_lv4,tobi_timestamp
1941,940424,0,2020-06-01 13:54:51,1_42_93_5,2_9,3_5,,2020-06-02 12:30:50
1940,940424,0,2020-06-01 14:34:00,1_42_93_5,2_9,3_5,,2020-06-02 12:30:50
1939,940424,0,2020-06-01 14:54:31,1_42_93_6,2_9,3_6,,2020-06-02 12:30:50
1938,940424,0,2020-06-01 20:40:38,1_42_93_5,2_9,3_5,,2020-06-02 12:30:50
1937,940424,0,2020-06-01 20:44:58,1_42_93_6,2_9,3_6,,2020-06-02 12:30:50
...,...,...,...,...,...,...,...,...
5676962,1556478,871002,2020-06-01 00:00:00,1_22_13_2,2_1,3_2,,2020-06-08 18:53:55
5676980,1554939,871090,2020-06-28 10:00:00,1_72_373_34_4,2_37,3_3,4_4,2020-07-04 14:56:13
5676982,1576694,871093,2020-06-02 12:36:27,2_2,2_2,,,2020-06-05 11:43:37
5676984,1549128,871095,2020-06-01 17:22:28,1_42_93_13,2_9,3_13,,2020-06-02 16:26:54


In [112]:
train.set_index(train['session_id'], inplace=True)

In [113]:
train

Unnamed: 0_level_0,label,session_id
session_id,Unnamed: 1_level_1,Unnamed: 2_level_1
6,2,6
13,1,13
21,2,21
22,2,22
32,4,32
...,...,...
1606177,3,1606177
1606199,2,1606199
1606217,2,1606217
1606226,1,1606226


In [None]:
train.iloc[data[filt]['session_id'].to_numpy()]

In [None]:
data['label'] = -1
def assign(row):
    row['label'] = train[train['session_id'] == row['session_id']]
# data[data['session_id'].isin(train['session_id'])]
filt = data['session_id'].isin(train['session_id'])
data.loc[filt, 'label'] = train.iloc[data.loc[filt, 'session_id']]

In [67]:
data['label'].value_counts()

-1    5329225
Name: label, dtype: int64

In [61]:
data['session_id'].isin(train['session_id'])

1           True
2           True
3           True
4           True
5           True
           ...  
5676984     True
5676986    False
5676988     True
5676990    False
5676991    False
Name: session_id, Length: 5329225, dtype: bool

In [33]:
def reshape_by_user(dataset): #funziona
    n_users = dataset['user_identifier'].nunique()
    max_length = max(dataset['session_id'].value_counts())
    sequences = -np.ones((n_users, 69, 32), dtype=object)
    labels = np.zeros(n_users)
    
    i = 0
    unique_users = dataset['user_identifier'].unique()
    grouped = dataset.groupby(['user_identifier', 'session_id'], sort=False).apply(np.array)
    for user in unique_users:
        sequences[i,0,:] = user
        j = 1
        length = len(grouped[user])
        for session in grouped[user]:
            l = session.shape[0]
            session = session.reshape((-1, session.shape[-1]))
            sequences[i,j,0] = session[0,-1] - minimum_date.loc[user, 'tobi_timestamp']
            sequences[i,j,1:l+1] = session[:,3]
            
            if j != length:
                sequences[i,j,-1] = train.loc[session[0,0], 'label'] 
            else:
                labels[i] = train.loc[session[0,0], 'label']
            j += 1
        i += 1
    return sequences, labels

In [115]:
def reshape_by_user_test(dataset):
    n_sessions = dataset['session_id'].nunique()
    max_length = max(dataset['session_id'].value_counts())
    sequences = -np.ones((n_sessions, 69, 32), dtype=object)
    labels = np.zeros(n_sessions)
    
    i = 0
    unique_users = dataset['user_identifier'].unique()
    grouped = dataset.groupby(['user_identifier', 'session_id'], sort=False).apply(np.array)
    for user in unique_users:
        for session in grouped[user]:
            sequences[i,0,:] = user
            l = session.shape[0]
            session = session.reshape((-1, session.shape[-1]))
            sequences[i,1,0] = session[0,-1] - minimum_date.loc[user, 'tobi_timestamp']
            sequences[i,1,1:l+1] = session[:,3]
            labels[i] = train.loc[session[0,0], 'label']
            i += 1
    return sequences, labels

In [34]:
sequences, labels = reshape_by_user(training_data)

In [102]:
y_train, y_test_and_val = train_test_split(train, train_size=0.7, random_state=42)
y_test, y_val = train_test_split(y_test_and_val, train_size=0.5, random_state=42)

In [103]:
train_df = training_data[training_data['session_id'].isin(y_train['session_id'])]
test_df = training_data[training_data['session_id'].isin(y_test['session_id'])]
val_df = training_data[training_data['session_id'].isin(y_val['session_id'])]

In [60]:
train_sequences, train_labels = reshape_by_user(train_df)

In [116]:
test_sequences, test_labels = reshape_by_user_test(test_df)

In [70]:
val_sequences, val_labels = reshape_by_user_test(val_df)

In [72]:
val_sequences[1,:,0]

array([0, Timestamp('2020-06-09 12:41:36'), -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1], dtype=object)

In [64]:
train_sequences[10,:,0]

array([10, Timestamp('2020-06-03 21:51:59'),
       Timestamp('2020-06-03 23:53:05'), Timestamp('2020-06-05 12:09:04'),
       Timestamp('2020-06-06 18:06:58'), Timestamp('2020-06-09 17:47:21'),
       Timestamp('2020-07-03 10:53:44'), Timestamp('2020-07-04 20:09:59'),
       Timestamp('2020-07-24 12:50:23'), -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1], dtype=object)

In [46]:
sequences[0,:,0]

array([0, Timestamp('2020-06-02 12:30:50'),
       Timestamp('2020-06-03 10:24:45'), Timestamp('2020-06-03 23:31:44'),
       Timestamp('2020-06-03 23:37:08'), Timestamp('2020-06-05 08:49:17'),
       Timestamp('2020-06-05 09:02:23'), Timestamp('2020-06-05 12:22:20'),
       Timestamp('2020-06-09 12:41:36'), Timestamp('2020-06-11 17:38:21'),
       Timestamp('2020-06-13 23:37:53'), Timestamp('2020-06-14 00:43:31'),
       Timestamp('2020-06-14 00:56:36'), Timestamp('2020-06-14 01:07:33'),
       Timestamp('2020-06-14 01:18:28'), Timestamp('2020-06-14 01:31:35'),
       Timestamp('2020-06-14 01:44:34'), Timestamp('2020-06-14 01:57:40'),
       Timestamp('2020-06-14 02:24:14'), Timestamp('2020-06-14 02:37:44'),
       Timestamp('2020-06-18 19:13:05'), Timestamp('2020-06-25 12:29:55'),
       Timestamp('2020-06-25 12:34:19'), Timestamp('2020-06-25 12:42:54'),
       Timestamp('2020-06-25 12:45:06'), Timestamp('2020-06-25 12:47:22'),
       Timestamp('2020-06-25 12:55:58'), Timestamp('2020

In [50]:
train

Unnamed: 0_level_0,label,session_id
session_id,Unnamed: 1_level_1,Unnamed: 2_level_1
6,2,6
13,1,13
21,2,21
22,2,22
32,4,32
...,...,...
1606177,3,1606177
1606199,2,1606199
1606217,2,1606217
1606226,1,1606226


In [45]:
labels[0]

1.0

In [42]:
data['tobi_timestamp'] = (data['tobi_timestamp'] - data['event_timestamp']).apply(lambda td: td.days)

In [9]:
filt = data['event_category_idx_lv2'].apply(lambda s: s.isnumeric() if isinstance(s, str) else False)
to_drop = data[filt]

In [10]:
data.drop(index=to_drop.index, inplace=True)

In [11]:
encode_events(data)

In [138]:
d = data.sort_values(by=['event_category_idx_lv1'])['event_category_idx_lv1'].unique()

In [None]:
encoder = LabelEncoder()
encoder.fit(d)

In [145]:
data['event_category_idx_lv1'] = encoder.transform(data['event_category_idx_lv1'])

In [148]:
data.drop(columns=['event_category_idx_lv2', 'event_category_idx_lv3', 'event_category_idx_lv4'], inplace=True)

In [30]:
grouped_by_user = data.groupby(['user_identifier', 'session_id'], sort=False).apply(np.array)

In [77]:
grouped_by_user[0]


session_id
940424    [[940424, 0, 2020-06-01 13:54:51, 1_42_93_5, 2...
756004    [[756004, 0, 2020-06-01 13:54:51, 1_42_93_5, 2...
724114    [[724114, 0, 2020-06-01 13:54:51, 1_42_93_5, 2...
756812    [[756812, 0, 2020-06-01 13:54:51, 1_42_93_5, 2...
739452    [[739452, 0, 2020-06-01 13:54:51, 1_42_93_5, 2...
                                ...                        
90303     [[90303, 0, 2020-07-23 20:40:08, 1_42_93_12, 2...
11453     [[11453, 0, 2020-07-23 20:40:08, 1_42_93_12, 2...
102033    [[102033, 0, 2020-07-23 20:40:08, 1_42_93_12, ...
167384    [[167384, 0, 2020-07-23 20:40:08, 1_42_93_12, ...
182576    [[182576, 0, 2020-07-23 20:40:08, 1_42_93_12, ...
Length: 68, dtype: object

In [33]:
unique_users = data['user_identifier'].unique()
opt = -np.inf
best_user = None
for user in unique_users:
    l = len(grouped_by_user[user])
    if l > opt:
        opt = l
        best_user = user

In [34]:
opt

68

In [32]:
grouped_by_user

user_identifier  session_id
0                182576        [[182576, 0, 2020-09-22 14:38:14, 1_42_93_5, 2...
                 167384        [[167384, 0, 2020-09-22 14:38:14, 1_42_93_5, 2...
                 102033        [[102033, 0, 2020-09-22 14:38:14, 1_42_93_5, 2...
                 11453         [[11453, 0, 2020-09-22 14:38:14, 1_42_93_5, 2_...
                 90303         [[90303, 0, 2020-09-22 14:38:14, 1_42_93_5, 2_...
                                                     ...                        
871095           1549128       [[1549128, 871095, 2020-06-01 17:22:28, 1_42_9...
871099           1590753       [[1590753, 871099, 2020-06-01 00:00:00, 1_22_1...
871110           1559831       [[1559831, 871110, 2020-06-01 00:00:00, 1_22_1...
871111           1586623       [[1586623, 871111, 2020-06-05 17:51:07, 1_8, N...
871126           1606224       [[1606224, 871126, 2020-06-01 00:00:00, 1_22_1...
Length: 347837, dtype: object

In [43]:
min(data['event_timestamp'])

Timestamp('2020-04-14 19:16:49')

In [44]:
baseline_date = pd.Timestamp(year=2020, month=4, day=14)

In [45]:
data['event_timestamp'] = (data['event_timestamp'] - baseline_date).apply(lambda ts: ts.delta) * 1e-9 

In [47]:
event_categories = ['event_category_idx_lv2', 'event_category_idx_lv3', 'event_category_idx_lv4']
for col in event_categories:
    data[col] = data[col].apply(lambda s: int(s[2:]) if isinstance(s, str) else -1)
    

In [46]:
encoder = LabelEncoder()
data['event_category_idx_lv1'] = encoder.fit_transform(data['event_category_idx_lv1'])

In [100]:
training_data = data[data['session_id'].isin(train['session_id'])]

In [101]:
testing_data = data[data['session_id'].isin(test['session_id'])]

In [20]:
seq_train, seq_test_and_val = train_test_split(train, train_size=0.7, random_state=42)
seq_test, seq_val = train_test_split(seq_test_and_val, train_size=0.5, random_state=42)

train_df = training_data[training_data['session_id'].isin(seq_train['session_id'])]
test_df = training_data[training_data['session_id'].isin(seq_test['session_id'])]
val_df = training_data[training_data['session_id'].isin(seq_val['session_id'])]

In [25]:
train_df

Unnamed: 0,session_id,user_identifier,event_timestamp,event_category_idx_lv1,event_category_idx_lv2,event_category_idx_lv3,event_category_idx_lv4,tobi_timestamp
1041402,13,0.017192,0.709845,0.545455,0.219780,-1.000000,-1.000000,0.164835
1041401,13,0.017192,0.711627,0.363636,0.019780,0.168831,-1.000000,0.164835
1041400,13,0.017192,0.711782,0.545455,0.054945,-1.000000,-1.000000,0.164835
1041399,13,0.017192,0.712119,0.545455,0.210989,-1.000000,-1.000000,0.164835
1041398,13,0.017192,0.712531,0.000000,0.008791,-1.000000,-1.000000,0.153846
...,...,...,...,...,...,...,...,...
4644870,1606146,0.385380,0.171370,0.090909,0.000000,0.000000,0.000000,0.175824
5674132,1606153,0.990610,0.199982,0.181818,0.030769,0.025974,-1.000000,0.032967
5648728,1606158,0.933034,0.026270,0.272727,0.013187,-1.000000,-1.000000,0.164835
1154161,1606177,0.020426,0.083601,0.363636,0.019780,0.077922,-1.000000,0.142857


In [26]:
training_data

Unnamed: 0,session_id,user_identifier,event_timestamp,event_category_idx_lv1,event_category_idx_lv2,event_category_idx_lv3,event_category_idx_lv4,tobi_timestamp
4134595,6,237376,7980451.0,0,3.0,,,8
4134594,6,237376,8021332.0,0,2.0,,,7
4134593,6,237376,8021360.0,0,4.0,,,7
4134592,6,237376,8021371.0,0,4.0,,,7
4134591,6,237376,8021415.0,0,3.0,,,7
...,...,...,...,...,...,...,...,...
1154161,1606177,17793,4902655.0,4,9.0,6.0,,13
5670922,1606199,853582,4147200.0,2,1.0,2.0,,5
5532706,1606217,701497,5657100.0,3,6.0,,,1
1431141,1606226,25526,4454640.0,1,0.0,0.0,1.0,1


In [27]:
n_sequences = data['session_id'].nunique()
n_sequences

347837

In [34]:
def reshape_sequences(dataset):
    n_sequences = dataset['session_id'].nunique()
    max_length = max(dataset['session_id'].value_counts())
    sequences = -np.ones((n_sequences, max_length, 7))
    
    i = 0
    grouped = dataset.groupby('session_id').apply(np.array)
    for seq in grouped:
        l = len(seq)
        sequences[i,:l] = seq[:,1:]
        i += 1
    return sequences

In [35]:
X_train = reshape_sequences(train_df)

In [36]:
X_test = reshape_sequences(test_df)

In [37]:
X_val = reshape_sequences(val_df)

In [46]:
# training = reshape_sequences(training_data, train)

In [38]:
testing = reshape_sequences(testing_data)

In [39]:
y_train = (seq_train['label'] - 1).to_numpy()
y_test = (seq_test['label'] - 1).to_numpy()
y_val = (seq_val['label'] - 1).to_numpy()

In [157]:
# labels = (train['label'].to_numpy() - 1)  # REMEMBER TO DO +1 ON PREDICTION WHEN SUBMITTING!

In [51]:
# X_train, X_test_and_val = train_test_split(training, train_size=0.7, random_state=42)
# y_train, y_test_and_val = train_test_split(labels, train_size=0.7, random_state=42)

In [55]:
# X_train, X_test_and_val, y_train, y_test_and_val = train_test_split(training, labels, train_size=0.7, random_state=42)
# X_test, X_val, y_test, y_val = train_test_split(X_test_and_val, y_test_and_val, train_size=0.5, random_state=42)

In [39]:
# scaler = Standa()
# X_train = scaler.fit_transform(X_train.reshape(-1, X_train.shape[-1])).reshape(X_train.shape)
# X_test = scaler.transform(X_test.reshape(-1, X_test.shape[-1])).reshape(X_test.shape)
# X_val = scaler.transform(X_val.reshape(-1, X_val.shape[-1])).reshape(X_val.shape)

In [65]:
model = Sequential()

#model.add(Embedding(input_dim=uniques,
#              output_dim=64,
#              trainable=False,
#              mask_zero=True))

#model.add(Conv1D(10, 2, padding='same', activation='relu', input_shape=(30, 7)))
#model.add(TimeDistributed(Flatten()))
model.add(GRU(120, activation='softsign', return_sequences=True, dropout=0.1))
model.add(GRU(120, activation='softsign', return_sequences=False, dropout=0.1, kernel_initializer='zeros'))
model.add(Dense(72, activation='relu', kernel_initializer='random_normal'))
model.add(Dropout(0.1))
model.add(Dense(4, activation='softmax'))

opt = Adam(learning_rate=1e-1)
model.compile(
    optimizer=opt, loss='sparse_categorical_crossentropy', metrics=['accuracy'])

In [None]:
history = model.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=10)

In [202]:
model.evaluate(X_test, y_test)



[1.19654573027053, 0.467368]

1