In [1]:
######################################################
# Libraries

import pandas as pd
import numpy as np

from tqdm import tqdm

######################################################

In [2]:
######################################################
# Global variables and paths

# Directory (relative to this notebook) that holds the raw input files;
# the loading cell appends 'train.jsonl' / 'test.jsonl' to it, so keep the trailing slash.
path_data = '../0_Data/'

######################################################

In [3]:
######################################################

def get_cycled_feature_value_sin(col, max_value):
    """Return the sine half of a cyclical encoding.

    `col` (scalar, array or Series) is nudged by 1e-6, divided by the cycle
    length `max_value`, and mapped onto the unit circle; the sine of the
    resulting angle is returned with the same shape as `col`.
    """
    angle = 2*np.pi*((col + 0.000001) / max_value)
    return np.sin(angle)

def get_cycled_feature_value_cos(col, max_value):
    """Return the cosine half of a cyclical encoding.

    `col` (scalar, array or Series) is nudged by 1e-6, divided by the cycle
    length `max_value`, and mapped onto the unit circle; the cosine of the
    resulting angle is returned with the same shape as `col`.
    """
    angle = 2*np.pi*((col + 0.000001) / max_value)
    return np.cos(angle)

    
def remove_duplicates(df, verbose=False):
    """Collapse repeated events of the same item inside a session.

    Rows are ordered by ('session', 'ts'). A running counter is bumped on every
    row whose 'type' differs from the previous 'type' seen for the same
    (session, aid) pair (the first occurrence always counts as a change).
    Rows sharing the same (session, aid, counter) are treated as duplicates and
    only the latest one is kept.

    Parameters
    ----------
    df : pandas.DataFrame
        Event rows with at least the columns 'session', 'aid', 'ts', 'type'.
    verbose : bool
        Print a progress message when True.

    Returns
    -------
    pandas.DataFrame
        De-duplicated events, ordered by session then ts, with a fresh
        0..n-1 index and the same columns as the input.
    """
    if verbose:
        print('Removing duplicates....')

    ordered = df.sort_values(['session', 'ts'], ascending=[True, True]).reset_index(drop=True)

    # Type of the previous event on the same item in the same session (NaN for the first one).
    previous_type = ordered.groupby(['session', 'aid'])['type'].shift(1)
    # Global counter that increases whenever the type changes for an item.
    change_counter = (ordered['type'] != previous_type).cumsum()

    helper_cols = ['session_type_aid_lag_1', 'cumsum']
    ordered = ordered.assign(session_type_aid_lag_1=previous_type, cumsum=change_counter)
    deduped = ordered.drop_duplicates(['session', 'aid', 'cumsum'], keep='last')
    return deduped.reset_index(drop=True).drop(helper_cols, axis=1)

def build_time_encoding(df, verbose=False):
    """Attach a cyclical (sin/cos) encoding of the event timestamp to every row.

    The 'ts' column is interpreted as epoch milliseconds. Five calendar
    components are extracted, each shifted by +1, and encoded with
    `get_cycled_feature_value_sin` / `get_cycled_feature_value_cos` using these
    cycle lengths: month/12, day_of_year/365, hour/24, minute/60, second/60.

    Parameters
    ----------
    df : pandas.DataFrame
        Event rows with a 'ts' column (epoch milliseconds). The frame is not
        modified; no scratch columns are written into it.
    verbose : bool
        Print a progress message when True.

    Returns
    -------
    pandas.DataFrame
        A new frame with the input columns plus 'time_encoding': one list of
        10 floats per row, ordered as
        [month_sin, month_cos, day_of_year_sin, day_of_year_cos,
         hour_sin, hour_cos, minute_sin, minute_cos, second_sin, second_cos].
    """
    if verbose:
        print('Building time encoding....')

    # Parse the timestamps once instead of once per calendar component.
    stamps = pd.to_datetime(df['ts'], unit='ms').dt

    # (component values shifted by +1, cycle length) in output order.
    components = [
        (stamps.month + 1, 12),
        (stamps.day_of_year + 1, 365),
        (stamps.hour + 1, 24),
        (stamps.minute + 1, 60),
        (stamps.second + 1, 60),
    ]

    encoded = []
    for values, period in components:
        encoded.append(np.asarray(get_cycled_feature_value_sin(values, period)))
        encoded.append(np.asarray(get_cycled_feature_value_cos(values, period)))

    # Shape (n_rows, 10) -> one Python list of 10 floats per row.
    # `assign` returns a new frame, so the caller's DataFrame is left untouched.
    return df.assign(time_encoding=np.column_stack(encoded).tolist())


def filter_size_sessions(df, min_size_session, verbose=False):
    """Keep only the sessions that contain at least `min_size_session` rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Event rows with a 'session' column.
    min_size_session : int
        Minimum number of rows a session needs in order to be kept.
    verbose : bool
        Print a progress message when True.

    Returns
    -------
    pandas.DataFrame
        The rows of the surviving sessions with an extra 'size_session' column
        (row count of the session) and a fresh 0..n-1 index.
    """
    if verbose:
        print('Filtering size sessions....')

    session_sizes = df.groupby('session').size().reset_index(name='size_session')
    is_long_enough = session_sizes['size_session'] >= min_size_session
    kept_sessions = session_sizes.loc[is_long_enough].reset_index(drop=True)

    # Inner join drops every row whose session did not pass the size filter.
    return pd.merge(df, kept_sessions, how='inner', on='session')


def build_seqs(df, sort_values=None, verbose=False):
    """Collapse an event-level frame into one row per session holding ordered sequences.

    Parameters
    ----------
    df : pandas.DataFrame
        Event rows with at least the columns 'session', 'aid', 'type',
        'ts' (epoch milliseconds) and 'time_encoding' (one list of floats per
        event). The frame is not modified.
    sort_values : list of str, optional
        Columns the events are ordered by before grouping. ``None`` (the
        default) means ``['ts']``.
    verbose : bool
        Print a progress message when True.

    Returns
    -------
    pandas.DataFrame
        One row per session (ascending session id) with the columns
        'session', 'aid' (list), 'type' (list), 'time_encoding' (1-D numpy
        array: the per-event encodings concatenated in event order),
        'min_datetime', 'max_datetime' (first / last event time) and
        'size_session' (number of events).
    """
    if verbose:
        print('Building seqs....')
    if sort_values is None:
        # Avoids a mutable default argument and lets callers pass None explicitly.
        sort_values = ['ts']

    # A stable sort keeps the incoming relative order of events that share the
    # same sort key; the default quicksort used for single-column sorts does not
    # guarantee that, so ties could otherwise be emitted in arbitrary order.
    df = df.sort_values(sort_values, ascending=True, kind='mergesort').reset_index(drop=True)
    df['datetime'] = pd.to_datetime(df['ts'], unit='ms')

    list_cols = ['aid', 'type', 'time_encoding']
    grouped = df.groupby('session')

    # One list per session and column, aligned on the session index.
    sequences = pd.concat([grouped[col].apply(list) for col in list_cols], axis=1)
    bounds = grouped.agg(min_datetime=('datetime', 'min'),
                         max_datetime=('datetime', 'max'))
    df_new = sequences.join(bounds, how='inner').reset_index()

    df_new['size_session'] = df_new['aid'].map(len)
    # Flatten [[e1...], [e2...], ...] into a single 1-D vector per session.
    df_new['time_encoding'] = df_new['time_encoding'].apply(lambda x: np.array(x).reshape(-1))
    return df_new


def process_chunks(df_chunks, num_chunks=-1, dict_map_type=None, sort_values=None, min_size_session=1, verbose=False):
    """Turn chunks of raw session records into one frame of per-session sequences.

    Every chunk is flattened to one row per event and then passed through
    `remove_duplicates`, `filter_size_sessions`, `build_time_encoding` and
    `build_seqs`; the per-chunk results are concatenated.

    Parameters
    ----------
    df_chunks : iterable of pandas.DataFrame
        Each chunk needs a 'session' column and an 'events' column holding, per
        row, a list of dicts with the keys 'aid', 'ts' and 'type'.
    num_chunks : int
        Number of chunks to process; -1 processes all of them. Chunks beyond
        the limit are not pulled from `df_chunks`.
    dict_map_type : dict
        Maps the raw event 'type' value to the code stored in the output.
        Required in practice: every event type is looked up in it.
    sort_values : list of str, optional
        Event ordering forwarded to `build_seqs`. ``None`` means ``['ts']``.
    min_size_session : int
        Sessions with fewer events (after de-duplication) are dropped.
    verbose : bool
        Forwarded to the pipeline steps.

    Returns
    -------
    pandas.DataFrame
        One row per session with a fresh 0..n-1 index; 'min_datetime' and
        'max_datetime' are converted to strings. When no chunk is processed an
        empty frame with the expected columns is returned.
    """
    from itertools import islice

    if sort_values is None:
        # DataFrame.sort_values(None) would fail inside build_seqs.
        sort_values = ['ts']

    # -1 means "no limit"; any other negative value processes nothing, as before.
    limit = None if num_chunks == -1 else max(num_chunks, 0)

    list_chunk_sessions = []
    # islice stops *before* fetching chunk number `limit`, so no extra chunk is
    # read and parsed just to be thrown away.
    for df_chunk in tqdm(islice(df_chunks, limit)):
        # Flatten to one row per event. aid is shifted by +1
        # (presumably to keep 0 free, e.g. for padding - confirm with the model code).
        rows = [
            (session, event['aid'] + 1, event['ts'], dict_map_type[event['type']])
            for session, events in zip(df_chunk['session'].tolist(), df_chunk['events'].tolist())
            for event in events
        ]
        df_chunk_session = pd.DataFrame(rows, columns=['session', 'aid', 'ts', 'type'])
        df_chunk_session = remove_duplicates(df_chunk_session, verbose=verbose)
        df_chunk_session = filter_size_sessions(df_chunk_session, min_size_session, verbose=verbose)
        df_chunk_session = build_time_encoding(df_chunk_session, verbose=verbose)
        df_chunk_session = build_seqs(df_chunk_session, sort_values=sort_values, verbose=verbose)
        list_chunk_sessions.append(df_chunk_session)

    if not list_chunk_sessions:
        # Nothing processed: return an empty, correctly shaped frame instead of
        # failing on the missing datetime columns below.
        return pd.DataFrame(columns=['session', 'aid', 'type', 'time_encoding',
                                     'min_datetime', 'max_datetime', 'size_session'])

    # Concatenate once at the end; growing a frame inside the loop copies all
    # previously accumulated rows on every iteration.
    df_all_session = pd.concat(list_chunk_sessions, ignore_index=True)
    df_all_session['min_datetime'] = df_all_session['min_datetime'].astype(str)
    df_all_session['max_datetime'] = df_all_session['max_datetime'].astype(str)
    return df_all_session

######################################################


In [4]:
######################################################
# Load Data

# Integer code stored in the 'type' sequences for each raw event type.
dict_map_type = {'clicks': 1, 'carts': 2, 'orders': 3}

# Chunked JSON-lines readers: each chunk is a DataFrame of up to `chunksize` records.
df_train_chunked = pd.read_json(path_data + 'train.jsonl', lines=True, chunksize=50_000)
df_test_chunked = pd.read_json(path_data + 'test.jsonl', lines=True, chunksize=100_000)

# Train: only the first 2 chunks, and only sessions with at least 2 events.
df_train_data = process_chunks(
    df_train_chunked,
    num_chunks=2,
    dict_map_type=dict_map_type,
    sort_values=['ts'],
    min_size_session=2,
    verbose=False,
)
# Test: every chunk, every session.
df_test_data = process_chunks(
    df_test_chunked,
    num_chunks=-1,
    dict_map_type=dict_map_type,
    sort_values=['ts'],
    min_size_session=1,
    verbose=False,
)

print(len(df_train_data), len(df_test_data))
# Reference output from an earlier run:
#   2it [00:30, 15.33s/it]
#   17it [02:01,  7.13s/it]
#   96933 1671803


######################################################

2it [00:35, 17.55s/it]
17it [01:46,  6.25s/it]


96933 1671803


In [8]:
from datetime import datetime

In [10]:
# Sanity check: decode one raw `ts` value (epoch milliseconds) as a UTC wall-clock time.
# Uses the same pandas conversion the pipeline applies to the 'ts' column, instead of
# datetime.utcfromtimestamp (deprecated since Python 3.12) on a float-divided epoch.
ts = 1661724004363

print(pd.to_datetime(ts, unit='ms').strftime('%Y-%m-%d %H:%M:%S'))

2022-08-28 22:00:04


: 

In [5]:
# Inspect the processed training frame: one row per session with its aid / type /
# time_encoding sequences, first and last event time, and session length.
df_train_data

Unnamed: 0,session,aid,type,time_encoding,min_datetime,max_datetime,size_session
0,0,"[1517086, 1563460, 1309447, 16247, 1781823, 11...","[1, 1, 1, 1, 1, 1, 2, 2, 3, 3, 1, 1, 1, 1, 1, ...","[-0.8660256655837076, -0.49999954655009055, -0...",2022-07-31 22:00:00.025,2022-08-28 11:09:43.707,262
1,1,"[424965, 1492294, 1492294, 910863, 910863, 149...","[2, 1, 2, 1, 2, 1, 2, 1, 1, 1, 1, 1, 1, 1, 2, ...","[-0.8660256655837076, -0.49999954655009055, -0...",2022-07-31 22:00:00.025,2022-08-28 19:27:34.992,32
2,2,"[763744, 137493, 504790, 137493, 795864, 37834...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[-0.8660256655837076, -0.49999954655009055, -0...",2022-07-31 22:00:00.038,2022-08-28 19:16:55.659,32
3,3,"[1425968, 1425968, 1343407, 1343407, 1425968, ...","[2, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[-0.8660256655837076, -0.49999954655009055, -0...",2022-07-31 22:00:00.095,2022-08-21 19:21:06.771,203
4,4,"[613620, 298828, 298828, 383829, 255380, 18381...","[1, 1, 3, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 2, 1, ...","[-0.8660256655837076, -0.49999954655009055, -0...",2022-07-31 22:00:00.119,2022-08-27 07:51:21.303,19
...,...,...,...,...,...,...,...
96928,99995,"[1387490, 1783388, 1783388, 87857, 87857, 2342...","[1, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 1, 1]","[-0.9999999999998629, 5.23598774820354e-07, -0...",2022-08-01 04:05:11.310,2022-08-21 08:16:23.198,14
96929,99996,"[1091949, 1684954, 49478, 1123181, 1335723, 34...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, ...","[-0.9999999999998629, 5.23598774820354e-07, -0...",2022-08-02 03:24:10.064,2022-08-27 19:14:15.343,288
96930,99997,"[366640, 1008166, 1008166, 1008166, 1008166, 1...","[1, 1, 2, 1, 2, 1, 2, 1, 1, 2, 3, 1, 1, 1, 1, ...","[-0.9999999999998629, 5.23598774820354e-07, -0...",2022-08-01 04:05:11.431,2022-08-28 13:51:32.219,94
96931,99998,"[845182, 1414399]","[1, 1]","[-0.9999999999998629, 5.23598774820354e-07, -0...",2022-08-01 16:02:33.728,2022-08-04 07:08:06.295,2


In [6]:
# Export both processed splits with pandas_tfrecords.pd2tf, one output directory per split.
# NOTE(review): presumably writes TFRecord files capped at roughly `max_mb` megabytes each —
# confirm against the pandas_tfrecords documentation.
from pandas_tfrecords import pd2tf
pd2tf(df_train_data, '../tfrecords/na_split=train/', max_mb=200)
pd2tf(df_test_data, '../tfrecords/na_split=test/', max_mb=200)