In [1]:
""" Split validation data into 5/10/25/50% pickle files for faster loading """
import pandas as pd
import os

In [2]:
# Load the validation events once. The following cells turn this frame into
# per-session lists and pickle them, so that work is not repeated on each run.
df_val = pd.read_parquet('data/230313_val.pqt')
# Quick sanity check: frame shape and number of distinct sessions.
print(df_val.shape, len(pd.unique(df_val['session'])))
df_val.head(3)

(7684122, 4) 1801251


Unnamed: 0,session,ts,type,aid
0,11098528,1661119200,0,11830
1,11098528,1661119417,0,1679529
2,11098528,1661119474,0,92401


In [3]:
%%time
# Collapse df_val to one entry per session: `aids` holds each session's list
# of aid values and `types` the matching list of type values. Both are
# pickled by a later cell.
grp = df_val.reset_index(drop=True).groupby('session')
aids, types = (grp[column].apply(list) for column in ('aid', 'type'))
# aids:  series indexed by session -> [aids]
# types: series indexed by session -> [aid types]

CPU times: user 1min 3s, sys: 652 ms, total: 1min 3s
Wall time: 1min 3s


In [4]:
fdir = 'data/preload'
# makedirs(exist_ok=True) replaces the exists()/mkdir() pair: it also creates
# missing parent directories and does not fail if the directory already exists.
os.makedirs(fdir, exist_ok=True)

date = 230313
# One pickle per sampled fraction of sessions. The file suffix is the position
# in this list (0 -> 5%, 1 -> 10%, 2 -> 25%, 3 -> 50%, 4 -> 100%).
for i, frac in enumerate([0.05, 0.1, 0.25, 0.5, 1]):
    print(f'Start {i}')
    # Fixed random_state so the same sessions are drawn on every run.
    sess_cv = aids.index.to_series().sample(frac=frac, random_state=42)
    # Pick the sampled sessions from both series, ordered by session id.
    tmp_aids = aids.loc[sess_cv].sort_index()
    tmp_types = types.loc[sess_cv].sort_index()
    d = {'aids': tmp_aids, 'types': tmp_types}
    # Build the path from fdir so the files land in the directory created
    # above instead of a second hard-coded copy of the same path.
    pd.to_pickle(d, f'{fdir}/{date}_val_aids_types_{i}.pkl', protocol=4)

Start 0
Start 1
Start 2
Start 3
Start 4


In [3]:
%%time
# Preprocess the final-submission test sessions into per-session lists of
# aids and types, then save both series in a single pickle.
# NOTE(review): df_train is not used anywhere in this cell; the load is kept
# only in case cells outside this view rely on it -- drop it if they do not.
df_train = pd.read_parquet('data/230313_train_2to4.pqt')
# Use dedicated names rather than overwriting df_val / grp, so the validation
# frame loaded earlier is not silently replaced by test data when cells are
# re-run out of order.
df_test = pd.read_parquet('data/230313_df_test.pqt')
grp_test = df_test.reset_index(drop=True).groupby('session')
tmp_aids = grp_test['aid'].apply(list)  # series: session, [aids]
tmp_types = grp_test['type'].apply(list)  # series: session, [aid types]
d = {'aids': tmp_aids, 'types': tmp_types}
# Create the output directory here as well, so this cell does not depend on
# the validation-split cell having been run first.
os.makedirs('data/preload', exist_ok=True)
pd.to_pickle(d, 'data/preload/test_aids_types.pkl', protocol=4)

CPU times: user 1min 4s, sys: 5.93 s, total: 1min 10s
Wall time: 1min 4s
