In [27]:
"""
23-03-15 : Use the whole script to convert JSON to train & test parquets with memory-reduced dtypes
23-01-04
- Read jsonl files with pd_read_csv because cudf can't fit string
- Create id2type and type2id dict .pkl files
- Optimize some datatypes to uint8 and uint32
- Save as csv and parquet
""";

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Event-type vocabulary; a type's position in this list is its integer id.
types = ['clicks', 'carts', 'orders']
# Forward (name -> id) and reverse (id -> name) lookup tables.
d_type2id = dict(zip(types, range(len(types))))
d_id2type = dict(enumerate(types))
d_type2id, d_id2type

({'clicks': 0, 'carts': 1, 'orders': 2},
 {0: 'clicks', 1: 'carts', 2: 'orders'})

In [2]:
def jsonl_to_df(fn, d_type2id):
    """Flatten a session-per-line JSONL file into one row per event.

    Parameters
    ----------
    fn
        Path of the JSON-lines file. Each record is read for a `session`
        field and an `events` list whose items carry `aid`, `ts` and `type`.
    d_type2id
        Mapping from an event's `type` value to the integer stored in the
        output `type` column.

    Returns
    -------
    pd.DataFrame
        Columns `session`, `ts`, `type`, `aid`, one row per event, in file
        order. The session value is repeated for each of its events.

    Raises
    ------
    KeyError
        If an event's `type` is not a key of `d_type2id`.
    """
    session_col, ts_col, type_col, aid_col = [], [], [], []
    # Stream the file in 100k-record chunks instead of parsing it in one go.
    for batch in pd.read_json(fn, lines=True, chunksize=100_000):
        for record in batch.itertuples():
            events = record.events
            # One copy of the session id per event of that session.
            session_col += [record.session] * len(events)
            for ev in events:
                aid_col.append(ev['aid'])
                ts_col.append(ev['ts'])
                type_col.append(d_type2id[ev['type']])
    columns = {'session': session_col, 'ts': ts_col, 'type': type_col, 'aid': aid_col}
    return pd.DataFrame(columns)

In [5]:
%%time
# Train df preprocessing
fn = 'data/train.jsonl'
df_train = jsonl_to_df(fn, d_type2id)
df_train.type = df_train.type.astype(np.uint8) # 7.4 GB -> 6.4 GB
print(df_train.shape) 
df_train.head(3)

(216716096, 4)
CPU times: user 36min 33s, sys: 25.6 s, total: 36min 59s
Wall time: 36min 55s


Unnamed: 0,session,ts,type,aid
0,0,1659304800025,0,1517085
1,0,1659304904511,0,1563459
2,0,1659367439426,0,1309446


In [5]:
%%time
# Test df preprocessing
fn = 'data/test.jsonl'
df_test = jsonl_to_df(fn, d_type2id)
df_test.type = df_test.type.astype(np.uint8) 
print(df_test.shape) 
df_test.head(3)

(6928123, 4)
CPU times: user 39.8 s, sys: 1.06 s, total: 40.8 s
Wall time: 40.9 s


Unnamed: 0,session,ts,type,aid
0,12899779,1661724000278,0,59625
1,12899780,1661724000378,0,1142000
2,12899780,1661724058352,0,582732


In [7]:
# Display the representable min/max of the signed 8-, 16- and 32-bit integer dtypes.
np.iinfo(np.int8), np.iinfo(np.int16), np.iinfo(np.int32)

(iinfo(min=-128, max=127, dtype=int8),
 iinfo(min=-32768, max=32767, dtype=int16),
 iinfo(min=-2147483648, max=2147483647, dtype=int32))

In [8]:
# Print the min/max of ts for each split before narrowing the column to int32.
print(df_train['ts'].min(), df_train['ts'].max())
# The test split's ts values start right after the train split's end.
print(df_test['ts'].min(), df_test['ts'].max())

1659304800025 1661723999984
1661724000278 1662328791563


In [11]:
# Column dtypes and memory usage of the train frame before ts/aid are downcast.
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 216716096 entries, 0 to 216716095
Data columns (total 4 columns):
 #   Column   Dtype
---  ------   -----
 0   session  int64
 1   ts       int64
 2   type     uint8
 3   aid      int64
dtypes: int64(3), uint8(1)
memory usage: 5.0 GB


In [12]:
# Convert ts to int32 after dividing by 1000 (the values look like epoch
# milliseconds, so this presumably yields seconds - TODO confirm).
# The guard must consider BOTH splits: df_test is cast as well, and its
# timestamps are the larger ones, so checking only df_train could let the
# test column overflow int32 unnoticed.
# NOTE: this cell is not idempotent - re-running it divides ts by 1000 again.
if int(max(df_train.ts.max(), df_test.ts.max())/1000) < np.iinfo(np.int32).max:
    df_train.ts = (df_train.ts/1000).astype(np.int32)
    df_test.ts = (df_test.ts/1000).astype(np.int32)

In [13]:
# Print the min/max of aid for each split before narrowing the column to int32.
print(df_train['aid'].min(), df_train['aid'].max())
print(df_test['aid'].min(), df_test['aid'].max())

0 1855602
0 1855600


In [14]:
# Downcast aid to int32 only when the largest aid across BOTH splits fits.
# The guard compares the aid maxima of train and test; it must not look at
# ts, which is an unrelated column and says nothing about the aid range.
if max(df_train.aid.max(), df_test.aid.max()) < np.iinfo(np.int32).max:
    df_train.aid = df_train.aid.astype(np.int32)
    df_test.aid = df_test.aid.astype(np.int32)

In [15]:
# Spot-check the first train rows after the ts/aid conversion cells.
df_train.head(3)

Unnamed: 0,session,ts,type,aid
0,0,1659304800,0,1517085
1,0,1659304904,0,1563459
2,0,1659367439,0,1309446


In [16]:
# Column dtypes and memory usage of the train frame after the ts/aid conversion cells.
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 216716096 entries, 0 to 216716095
Data columns (total 4 columns):
 #   Column   Dtype
---  ------   -----
 0   session  int64
 1   ts       int32
 2   type     uint8
 3   aid      int32
dtypes: int32(2), int64(1), uint8(1)
memory usage: 3.4 GB


In [17]:
# Spot-check the first test rows after the ts/aid conversion cells.
df_test.head(3)

Unnamed: 0,session,ts,type,aid
0,12899779,1661724000,0,59625
1,12899780,1661724000,0,1142000
2,12899780,1661724058,0,582732


In [18]:
# Column dtypes and memory usage of the test frame after the ts/aid conversion cells.
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6928123 entries, 0 to 6928122
Data columns (total 4 columns):
 #   Column   Dtype
---  ------   -----
 0   session  int64
 1   ts       int32
 2   type     uint8
 3   aid      int32
dtypes: int32(2), int64(1), uint8(1)
memory usage: 112.3 MB


In [19]:
# Persist both frames as parquet, without writing the RangeIndex.
for _frame, _path in (
    (df_test, 'data/230313_df_test.pqt'),
    (df_train, 'data/230313_df_train.pqt'),
):
    _frame.to_parquet(_path, index=False)

# Persist the type <-> id lookup tables (pickle protocol 4).
for _mapping, _path in (
    (d_type2id, 'data/d_type2id.pkl'),
    (d_id2type, 'data/d_id2type.pkl'),
):
    pd.to_pickle(_mapping, _path, protocol=4)