In [1]:
import numpy as np
import pandas as pd
from pathlib import Path
import gc
pd.set_option("display.max_columns", None)
data_path = Path('.')

In [2]:
test_df = pd.DataFrame()
chunks = pd.read_json(data_path / 'test.jsonl', lines=True, chunksize=100_000)

for chunk in chunks:
    event_dict = {'session': [],'aid': [],'ts': [],'type': []}
    for session, events in zip(chunk['session'].tolist(), chunk['events'].tolist()):
        for event in events:
            event_dict['session'].append(session)
            event_dict['aid'].append(event['aid'])
            event_dict['ts'].append(event['ts'])
            event_dict['type'].append(event['type'])
    chunk_session = pd.DataFrame(event_dict)
    test_df = pd.concat([test_df, chunk_session])
            
test_df = test_df.reset_index(drop=True)
test_df

Unnamed: 0,session,aid,ts,type
0,12899779,59625,1661724000278,clicks
1,12899780,1142000,1661724000378,clicks
2,12899780,582732,1661724058352,clicks
3,12899780,973453,1661724109199,clicks
4,12899780,736515,1661724136868,clicks
...,...,...,...,...
6928118,14571577,1141710,1662328774770,clicks
6928119,14571578,519105,1662328775009,clicks
6928120,14571579,739876,1662328775605,clicks
6928121,14571580,202353,1662328781067,clicks


In [3]:
%%time
test_df["minutes"] = test_df[["session", "ts"]].groupby("session").diff(-1)*(-1/1000/60)
test_df = test_df.sort_values(['minutes'],ascending=False)

test_action_df = test_df.copy()
test_action_df.aid = ' ' + test_df.aid.astype('str')
test_action_df = test_action_df.groupby(['session','type'])['aid'].sum().reset_index()
test_action_df

CPU times: user 4min 15s, sys: 2.71 s, total: 4min 17s
Wall time: 4min 17s


Unnamed: 0,session,type,aid
0,12899779,clicks,59625
1,12899780,clicks,1142000 582732 973453 736515 1142000
2,12899781,carts,199008
3,12899781,clicks,199008 194067 199008 199008 199008 199008 573...
4,12899782,carts,1494780 834354 975116 127404 413962 595994 13...
...,...,...,...
1948868,14571577,clicks,1141710
1948869,14571578,clicks,519105
1948870,14571579,clicks,739876
1948871,14571580,clicks,202353


In [15]:
from tqdm import tqdm 
max = 0
for i in tqdm(test_action_df.aid):    
    if max < len(i.split()):
        max = len(i.split())
    

100%|██████████| 1948873/1948873 [00:00<00:00, 2141448.48it/s]


In [16]:
max

433

In [10]:
next_orders_df = pd.DataFrame(test_action_df.loc[(test_action_df["type"] == 'carts'), ])
next_orders_df['type'] = 'orders'
next_orders_df

Unnamed: 0,session,type,aid
2,12899781,orders,199008
4,12899782,orders,1494780 834354 975116 127404 413962 595994 13...
10,12899786,orders,955252
12,12899787,orders,1682750 1682750 1682750
16,12899790,orders,1830166 1219653
...,...,...,...
1948716,14571430,orders,903014
1948730,14571443,orders,942326
1948774,14571486,orders,350578
1948788,14571499,orders,1132907


In [5]:
next_carts_df = pd.DataFrame(test_action_df.loc[(test_action_df["type"] == 'clicks'), ])
next_carts_df['type'] = 'carts'
next_carts_df

Unnamed: 0,session,type,aid
0,12899779,carts,59625
1,12899780,carts,1142000 582732 973453 736515 1142000
3,12899781,carts,199008 194067 199008 199008 199008 199008 573...
5,12899782,carts,603159 779477 1299062 602722 413962 975116 16...
7,12899783,carts,607638 1729553 255297 300127 1754419 1216820 ...
...,...,...,...
1948868,14571577,carts,1141710
1948869,14571578,carts,519105
1948870,14571579,carts,739876
1948871,14571580,carts,202353


In [6]:
next_clicks_df = pd.DataFrame(test_action_df.loc[(test_action_df["type"] == 'clicks'), ]).copy()

In [7]:
recommend_df = pd.concat([next_orders_df, next_carts_df, next_clicks_df], axis =0)
recommend_df["session_type"] = recommend_df["session"].astype('str') + "_" + recommend_df["type"] 
recommend_df

Unnamed: 0,session,type,aid,session_type
2,12899781,orders,199008,12899781_orders
4,12899782,orders,1494780 834354 975116 127404 413962 595994 13...,12899782_orders
10,12899786,orders,955252,12899786_orders
12,12899787,orders,1682750 1682750 1682750,12899787_orders
16,12899790,orders,1830166 1219653,12899790_orders
...,...,...,...,...
1948868,14571577,clicks,1141710,14571577_clicks
1948869,14571578,clicks,519105,14571578_clicks
1948870,14571579,clicks,739876,14571579_clicks
1948871,14571580,clicks,202353,14571580_clicks


In [8]:
sample_sub = pd.read_csv('./sample_submission.csv')
sample_sub

Unnamed: 0,session_type,labels
0,12899779_clicks,129004 126836 118524
1,12899779_carts,129004 126836 118524
2,12899779_orders,129004 126836 118524
3,12899780_clicks,129004 126836 118524
4,12899780_carts,129004 126836 118524
...,...,...
5015404,14571580_carts,129004 126836 118524
5015405,14571580_orders,129004 126836 118524
5015406,14571581_clicks,129004 126836 118524
5015407,14571581_carts,129004 126836 118524


In [18]:
sample_sub = pd.merge(sample_sub, recommend_df[["session_type","aid"]], on = "session_type", how ="left")
sample_sub

Unnamed: 0,session_type,labels,aid_x,aid_y,aid
0,12899779_clicks,129004 126836 118524,59625,59625,59625
1,12899779_carts,129004 126836 118524,59625,59625,59625
2,12899779_orders,129004 126836 118524,,,
3,12899780_clicks,129004 126836 118524,1142000 582732 973453 736515 1142000,1142000 582732 973453 736515 1142000,1142000 582732 973453 736515 1142000
4,12899780_carts,129004 126836 118524,1142000 582732 973453 736515 1142000,1142000 582732 973453 736515 1142000,1142000 582732 973453 736515 1142000
...,...,...,...,...,...
5015404,14571580_carts,129004 126836 118524,202353,202353,202353
5015405,14571580_orders,129004 126836 118524,,,
5015406,14571581_clicks,129004 126836 118524,1100210,1100210,1100210
5015407,14571581_carts,129004 126836 118524,1100210,1100210,1100210


In [19]:


sample_sub['next'] = sample_sub['aid'] + best_sold_list
sample_sub['next'].fillna(best_sold_list, inplace = True)
sample_sub['next'] = sample_sub['next'].str.strip()
sample_sub = sample_sub.drop(["labels", "aid"], axis = 1)
sample_sub.columns = ("session_type", "labels")
sample_sub

NameError: name 'best_sold_list' is not defined