# Exact next click-to-click co-visitation matrix and some per aid calculations for clicks
This notebook prepares the click-to-click co-visitation matrix, which is then used to calculate features for the clicks model.
Additionally, some per-aid statistics are computed in this notebook, including:
* median time users view aid
* average per day clicks per aid
* return rate, counting how often users return for a new click or other actions with the same aid.

These counts are used to engineer features for all the models.

## Imports and definitions

In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import gc
from humanize import naturalsize

# functions and classes common for several notebooks of current project
import otto_common

In [2]:
class CalculateCovisitationMatrixExactNext(otto_common.CalculateCovisitationMatrix):
    '''
    Co-visitation matrix of "exact next" events.

    Every event of a session (aid_x) is paired with the event that directly
    follows it in the same session (aid_y). A pair is counted once per session
    and receives a weight that grows linearly with the timestamp of the first
    event, so that later events are more important.
    '''

    def __init__(self, *args, timebase, **kwargs):
        '''
        timebase: keyword-only reference timestamp that is subtracted from `ts`
            when weights are computed.
        All remaining arguments are forwarded unchanged to the parent class.
        '''
        self.timebase = timebase
        super().__init__(*args, **kwargs)

    def calculate_weights(self, df_i):
        '''
        Build weighted (aid_x, aid_y) pairs for one chunk of events.

        df_i: DataFrame with at least `session`, `aid` and `ts` columns.
        Returns whatever the inherited `groupby_reset_and_reduce` produces from
        the (aid_x, aid_y, wgt) pairs (presumably weights aggregated per pair;
        the helper is defined in the parent class and not visible here).
        '''
        df_i = df_i.sort_values(['session', 'ts'], ascending=[True, True]).reset_index(drop=True)
        # Position of every event inside its (time-ordered) session.
        df_i['n'] = df_i.groupby('session').cumcount()

        # In the copy every position is shifted back by one, so joining on
        # (session, n) lines up each event (_x) with its direct successor (_y).
        successors = df_i.copy()
        successors['n'] = successors['n'] - 1
        pairs = df_i.merge(successors, how='inner', on=['session', 'n'])

        # Keep one row per (session, aid_x, aid_y) combination.
        pairs = pairs[['session', 'aid_x', 'aid_y', 'ts_x']]
        pairs = pairs.drop_duplicates(['session', 'aid_x', 'aid_y'])

        # Linear time weight: 1 at `timebase`, 4 when ts is 3024000 past it.
        # 3024000 equals 35 * 86400, i.e. 35 days if ts is in seconds (TODO confirm unit).
        pairs['wgt'] = (1 + 3*(pairs.ts_x - self.timebase)/3024000)
        pairs = pairs[['aid_x', 'aid_y', 'wgt']]

        # The intermediate frames are no longer needed; free them early.
        del df_i, successors
        gc.collect()

        # Only aid_x is restricted to the aids of interest; aid_y stays unfiltered.
        pairs = pairs.loc[pairs['aid_x'].isin(self.aid_list)]
        pairs['wgt'] = pairs['wgt'].astype('float32')
        return self.groupby_reset_and_reduce(pairs)

In [3]:
# This function normalizes matrix before calculating features.
# Normalized means that all weights are divided by sum of weights per aid_x.
def normalize_matrice(df, column_name):
    '''
    Rescale `column_name` so that the values of every `aid_x` group sum to 100.

    df: DataFrame with an `aid_x` column and a numeric `column_name` column.
    column_name: name of the weight column to normalize.

    Returns a new DataFrame with the same rows, index and column order, in which
    `column_name` holds float32 values equal to 100 * value / group total.
    Rows whose group total is zero (0/0 -> NaN) get 0.
    The input DataFrame is not modified.
    '''
    print('start normalizing')
    # Per-row total of the row's aid_x group, kept as a separate Series so that
    # no temporary column is written into the caller's frame.
    group_totals = df.groupby('aid_x')[column_name].transform('sum')
    normalized = (100 * df[column_name] / group_totals).fillna(0).astype(np.float32)
    # assign() returns a copy with the column replaced in its original position.
    return df.assign(**{column_name: normalized})

In [4]:
# Count how often users return to each aid.
def count_returns(train_data):
    '''
    Compute a smoothed per-aid return rate from type-0 events.

    train_data: path of a parquet file (anything `pd.read_parquet` accepts)
        containing at least `session`, `aid` and `type` columns.

    Returns a DataFrame with columns `aid` (int32) and `return_rate` (float32):
        return_rate = (3 + returns) / (10 + total)
    where `total` is the number of (session, aid) combinations with at least one
    type-0 event and `returns` is the number of those with more than one.
    The constants 3 and 10 smooth the ratio: with no observations it equals 0.3.
    '''
    n_splits = 4
    clicks = pd.read_parquet(train_data)
    # Only events of type 0 are counted (presumably clicks - confirm type encoding).
    clicks = clicks.loc[clicks['type'] == 0]

    # Count events per (session, aid) chunk by chunk, then concatenate once.
    # NOTE(review): assumes divide_df_by_column keeps every session inside a
    # single chunk; otherwise a session would be counted in several chunks.
    per_chunk = []
    for i in range(n_splits):
        chunk = otto_common.divide_df_by_column(clicks, n_splits, i, 'session')
        per_chunk.append(chunk.groupby(['session', 'aid']).size().reset_index(name='aid_counts'))
    session_aid = pd.concat(per_chunk, axis=0)
    del clicks, chunk, per_chunk
    gc.collect()

    # Number of sessions that touched the aid at all / more than once.
    total = session_aid.groupby('aid').size().reset_index(name='total')
    returns = (
        session_aid.loc[session_aid['aid_counts'] > 1]
        .groupby('aid')
        .size()
        .reset_index(name='returns')
    )
    del session_aid
    gc.collect()

    rates = pd.merge(total, returns, how='left', on='aid')
    # Aids that were never revisited are missing from `returns`.
    rates['returns'] = rates['returns'].fillna(0)
    rates['return_rate'] = (3 + rates['returns'])/(10 + rates['total'])

    # Cast on the selected columns directly instead of assigning into a slice.
    return rates[['aid', 'return_rate']].astype({'aid': np.int32, 'return_rate': np.float32})

In [5]:
# Count for how long users wait after clicking each item.
def median_time_clicked(train_data):
    '''
    Compute, per aid, the median time between a type-0 event and the next event.

    train_data: path of a parquet file (anything `pd.read_parquet` accepts)
        containing at least `session`, `aid`, `ts` and `type` columns.

    For every event that has a successor in its session, the time to that
    successor is `ts` of the next event minus `ts` of the event, clipped to
    [0, 180]. Only events of type 0 are kept. The median is taken first per
    (session, aid) and then per aid.

    Returns a DataFrame with columns `aid_x` and `time_viewed_clipped` (float32).

    NOTE(review): rows are not sorted here, so the "next" event is the next row
    of the session in file order - assumes the file is already time-ordered
    within each session; confirm against the data preparation step.
    '''
    n_splits = 5
    parts = []
    for i in range(n_splits):
        # The file is re-read for every chunk, so the full table is only alive
        # until the chunk is cut out (presumably a deliberate memory trade-off).
        chunk = pd.read_parquet(train_data)
        chunk = otto_common.divide_df_by_column(chunk, n_splits, i, 'session')
        # Position of the row inside its session; int16 assumes fewer than
        # 32768 events per session.
        chunk['n'] = chunk.groupby('session').cumcount().astype(np.int16)

        # Shift positions back by one in a copy, so that joining on (session, n)
        # pairs every event (_x) with its direct successor (_y).
        following = chunk.copy()
        following['n'] = following['n'] - 1
        pairs = pd.merge(chunk, following, how='inner', on=['session', 'n'])
        del chunk, following
        gc.collect()

        # Trim each chunk to the rows and columns needed later before it is
        # accumulated; the operations are row-wise, so the result is unchanged.
        pairs['time_viewed'] = (pairs['ts_y'] - pairs['ts_x']).clip(0, 180)
        parts.append(pairs.loc[pairs['type_x'] == 0, ['session', 'aid_x', 'time_viewed']])
        del pairs
        gc.collect()

    df_all = pd.concat(parts)
    del parts
    gc.collect()
    print('merge_successfull')

    # Median per (session, aid) first, so that every session weighs equally.
    per_session = df_all.groupby(['session', 'aid_x']).agg({'time_viewed': 'median'}).reset_index()
    del df_all
    gc.collect()

    per_aid = per_session.groupby('aid_x').agg({'time_viewed': 'median'})
    per_aid = per_aid.rename(columns={'time_viewed': 'time_viewed_clipped'})
    per_aid['time_viewed_clipped'] = per_aid['time_viewed_clipped'].astype(np.float32)
    return per_aid.reset_index()

## Co-visitation matrix and other counts for the cross-validation dataset

In [6]:
# Build the list of aids that appear in the cross-validation input sessions.
# Only aid_x is restricted to the aids from this list (see calculate_weights);
# aid_y is not filtered. This reduces the size of the resulting matrix.
# NOTE: "trunked" in the variable names presumably stands for "truncated" sessions.
trunked_sessions_path = '/kaggle/input/otto-prepare-cv/cv_inputs.parquet'
trunked_sessions_path2 = '/kaggle/input/otto-prepare-cv/cv_inputs2.parquet'
cv_list = otto_common.build_aid_list(trunked_sessions_path, trunked_sessions_path2)

In [7]:
'''
Input parameters and matrix calculation itself.

Unlike other co-visitation matrixes, this one is normalized right after it is calculated.
Normalized means that all weights are divided by sum of weights per aid_x.
'''

# Training events for the cross-validation setup.
click_data_cv = '/kaggle/input/otto-prepare-cv/cv_train.parquet'
# n_splits and n_splits_concat are passed positionally to the parent constructor
# (presumably chunking parameters - see otto_common.CalculateCovisitationMatrix).
n_splits = 80
n_splits_concat = 3
# Reference timestamp for the time weights; it is 604800 (7 * 86400) smaller
# than the timebase used for the test dataset further below.
timebase_cv = 1658700000

covisitation_cv = CalculateCovisitationMatrixExactNext(n_splits, n_splits_concat, cv_list, timebase=timebase_cv)
df_matrix_cv = covisitation_cv.generate_covisitation_matrix(click_data_cv)
# Rescale weights so that they sum to 100 per aid_x, then store the result.
df_matrix_cv = normalize_matrice(df_matrix_cv, 'wgt')
df_matrix_cv.to_parquet('matrix_exact_next_counts_cv.parquet')

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
start normalizing


In [8]:
# The CV matrix is already saved to disk; drop it and its inputs to free memory.
del df_matrix_cv, cv_list, covisitation_cv
gc.collect()

47

In [9]:
# Calculate the number of clicks per aid for each day.
# To make days with full sessions comparable with days for which only truncated
# sessions are known, click counts are divided by the total number of clicks of that day.
# NOTE(review): the second argument is False for the full training data and True
# for the truncated sessions; its exact meaning is defined in otto_common.

df_daily_counts_train = otto_common.create_average_daily_counts(click_data_cv, False)
df_daily_counts_train.to_parquet('daily_counts_train.parquet')

df_daily_counts_cv1 = otto_common.create_average_daily_counts(trunked_sessions_path, True)
df_daily_counts_cv1.to_parquet('daily_counts_cv1.parquet')

df_daily_counts_cv2 = otto_common.create_average_daily_counts(trunked_sessions_path2, True)
df_daily_counts_cv2.to_parquet('daily_counts_cv2.parquet')

In [10]:
# Daily counts are saved to disk; release the frames.
del df_daily_counts_train, df_daily_counts_cv1, df_daily_counts_cv2
gc.collect()

21

In [11]:
# Calculate the return rate: how often sessions that viewed an aid view it again.
# Computed from the cross-validation training data and saved for later feature engineering.
df_return_rate = count_returns(click_data_cv)
df_return_rate.to_parquet('return_rate_cv.parquet')

In [12]:
# The return rate is saved to disk; release the frame.
del df_return_rate
gc.collect()

47

In [13]:
# Calculate the median (clipped) time each aid was viewed,
# using the cross-validation training data, and save it.
df_time_viewed_agg = median_time_clicked(click_data_cv)
df_time_viewed_agg.to_parquet('time_viewed_agg_cv.parquet')
print('cv_median_time_ready')

merge_successfull
cv_median_time_ready


In [14]:
# The viewing times are saved to disk; release the frame.
del df_time_viewed_agg
gc.collect()

21

## Co-visitation matrix and other counts for the test dataset
Repeat exactly the same steps, but for the test dataset.

In [15]:
# From here on trunked_sessions_path points to the test sessions (the name is
# reused from the cross-validation part). Build the list of aids found in them.
trunked_sessions_path = '/kaggle/input/otto-prepare-cv/test.parquet'
aid_list =  otto_common.build_aid_list(trunked_sessions_path)

In [16]:
# Full training data, used to build the matrix for the test dataset.
click_data_test = '/kaggle/input/otto-prepare-cv/train_full.parquet'
# More splits than in the cross-validation run (120 / 5 instead of 80 / 3).
n_splits = 120
n_splits_concat = 5
# Reference timestamp for the time weights of the test matrix.
timebase_test = 1659304800

covisitation_test = CalculateCovisitationMatrixExactNext(n_splits, n_splits_concat, aid_list, timebase=timebase_test)
df_matrix_test = covisitation_test.generate_covisitation_matrix(click_data_test)
# Rescale weights so that they sum to 100 per aid_x, then store the result.
df_matrix_test = normalize_matrice(df_matrix_test, 'wgt')
df_matrix_test.to_parquet('matrix_exact_next_counts_full.parquet')

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
start normalizing


In [17]:
# The test matrix is already saved to disk; drop it and its inputs to free memory.
del df_matrix_test, aid_list, covisitation_test
gc.collect()

21

In [18]:
# Per-day click counts per aid for the test setup: once for the full training
# data (flag False) and once for the truncated test sessions (flag True).
df_daily_counts_test_full = otto_common.create_average_daily_counts(click_data_test, False)
df_daily_counts_test_full.to_parquet('daily_counts_test_full.parquet')

df_daily_counts_test_trunked = otto_common.create_average_daily_counts(trunked_sessions_path, True)
df_daily_counts_test_trunked.to_parquet('daily_counts_test_trunked.parquet')

In [19]:
# Daily counts are saved to disk; release the frames.
del df_daily_counts_test_full, df_daily_counts_test_trunked
gc.collect()

21

In [20]:
# Return rate per aid, computed from the full training data, saved for the test setup.
df_return_rate = count_returns(click_data_test)
df_return_rate.to_parquet('return_rate_test.parquet')

In [21]:
# The return rate is saved to disk; release the frame.
del df_return_rate
gc.collect()

21

In [22]:
# Median (clipped) viewing time per aid, computed from the full training data.
df_time_viewed_agg = median_time_clicked(click_data_test)
df_time_viewed_agg.to_parquet('time_viewed_agg_test.parquet')

merge_successfull
