In [1]:
from pathlib import Path

from tqdm.auto import tqdm
import pandas as pd
import polars as pl 
import numpy as np
import pyarrow.parquet as pq
import scipy

import feature_utils

In [36]:
# Development helper: re-import feature_utils so that edits made to the module
# on disk are picked up without restarting the kernel. The bare call is the
# cell's last expression, so the reloaded module object is displayed as output.
from importlib import reload
reload(feature_utils)

<module 'feature_utils' from '/home/andy/mts_2023/feature_utils.py'>

In [3]:
# Data layout, all relative to the notebook's working directory:
#   SRC_DIR - raw parquet parts that the loops below read
#   TGT_DIR - destination for the converted parquet parts
LOCAL_DATA_ROOT = Path('data')
SRC_DIR = LOCAL_DATA_ROOT.joinpath('competition_data_final_pqt')
TGT_DIR = LOCAL_DATA_ROOT.joinpath('data_converted')

In [4]:
# Columns that are encoded as integer category codes. The order matters only
# in that it fixes the iteration order of the dicts built from this list.
CAT_FEATURES = [
    'region_name',
    'city_name',
    'cpe_manufacturer_name',
    'cpe_model_name',
    'url_host',
    'cpe_type_cd',
    'cpe_model_os_type',
    'part_of_day',
]

In [5]:
# One independent, still-unfitted enumerator per categorical column, plus a
# single scaler for the numeric `price` column.
cat_feature_enumerators = dict(
    (column, feature_utils.CatNumerator()) for column in CAT_FEATURES
)
price_scaler = feature_utils.ZeroOneScaler()

In [6]:
# Fit pass: stream over the raw parts once and feed each transformer the
# distinct values it needs to see.
#
# Only the columns that are actually consumed are read from disk; parquet is
# columnar, so this avoids loading the unused columns of every part.
fit_columns = list(cat_feature_enumerators) + ['price']

# glob() yields files in filesystem order, which is not stable across machines.
# Sorting makes the pass reproducible. NOTE(review): this matters if
# CatNumerator assigns codes in first-seen order - confirm in feature_utils.
for part_path in tqdm(sorted(SRC_DIR.glob('*.parquet'))):
    part = pd.read_parquet(part_path, columns=fit_columns)

    for feature_name, feature_transformer in cat_feature_enumerators.items():
        feature_transformer.update(part[feature_name].unique())
    price_scaler.update(part['price'].unique())

    # Drop the reference before the next read so two parts are never held at once.
    del part

  0%|          | 0/10 [00:00<?, ?it/s]

In [8]:
# Report how many categories each fitted enumerator holds, one
# "<column> <count>" line per column.
for n in cat_feature_enumerators:
    t = cat_feature_enumerators[n]
    print(f'{n} {len(t.cats)}')

region_name 81
city_name 985
cpe_manufacturer_name 37
cpe_model_name 599
url_host 199683
cpe_type_cd 4
cpe_model_os_type 3
part_of_day 4


In [20]:
# Directory (relative to the working directory) where the fitted transformers
# are stored as JSON files, one per feature.
features_root = Path('feature_transformers')

In [23]:
# Saving, may overwrite!
# Persist every fitted transformer as <features_root>/<feature_name>.json and
# the price scaler as <features_root>/price.json.

# Make sure the target directory exists: nothing else in the notebook creates
# it, and save() is not shown to create missing parent directories.
features_root.mkdir(parents=True, exist_ok=True)

for feature_name, feature_transformer in cat_feature_enumerators.items():
    feature_transformer.save((features_root / feature_name).with_suffix('.json'))

price_scaler.save(features_root / 'price.json')

In [37]:
# Loading
# Restore the transformers from the JSON files under `features_root`.
# The mapping is built under a temporary name and bound only once every file
# has loaded, so a failed load leaves the previous mapping untouched.
loaded_enumerators = {}
for fn in CAT_FEATURES:
    json_path = (features_root / fn).with_suffix('.json')
    loaded_enumerators[fn] = feature_utils.CatNumerator.load(json_path)
cat_feature_enumerators = loaded_enumerators

price_scaler = feature_utils.ZeroOneScaler.load(features_root / 'price.json')

In [18]:
# Choose the narrowest unsigned integer dtype for each categorical column,
# based on the size reported by len() of its enumerator:
#   fewer than 256    -> uint8
#   fewer than 65536  -> uint16
#   otherwise         -> uint32
feature_dtypes = {}
for feature_name, feature_transformer in cat_feature_enumerators.items():
    n_categories = len(feature_transformer)
    if n_categories >= 2 ** 16:
        code_dtype = np.uint32
    elif n_categories >= 256:
        code_dtype = np.uint16
    else:
        code_dtype = np.uint8
    feature_dtypes[feature_name] = code_dtype

In [19]:
# Display the chosen dtype per column as a sanity check.
feature_dtypes

{'region_name': numpy.uint8,
 'city_name': numpy.uint16,
 'cpe_manufacturer_name': numpy.uint8,
 'cpe_model_name': numpy.uint16,
 'url_host': numpy.uint32,
 'cpe_type_cd': numpy.uint8,
 'cpe_model_os_type': numpy.uint8,
 'part_of_day': numpy.uint8}

In [80]:
# Conversion pass: rewrite every raw part into a compact, all-numeric parquet
# file under TGT_DIR, using the transformers and dtypes prepared above.

# Create the output directory with pathlib rather than the `%mkdir` shell alias:
# this is idempotent (no error when the cell is re-run) and does not depend on
# a `mkdir` command being available in the shell.
TGT_DIR.mkdir(parents=True, exist_ok=True)

for part_path in tqdm(list(SRC_DIR.glob('*.parquet'))):
    part = pd.read_parquet(part_path)

    # Replace each categorical column by its integer codes, stored in the
    # narrow dtype chosen for that column.
    # NOTE(review): astype() does not range-check - confirm the codes fit.
    for feature_name, feature_transformer in cat_feature_enumerators.items():
        part[feature_name] = feature_transformer.transform(part[feature_name].values).astype(feature_dtypes[feature_name])

    part['price'] = price_scaler.transform(part['price']).astype(np.float32)

    # Store dates as integer day ordinals; assumes every element exposes
    # toordinal() (datetime.date / datetime / Timestamp).
    part['date'] = part['date'].apply(lambda d: d.toordinal()).astype(np.uint32)

    # NOTE(review): assumes ids and counts are non-negative and below 2**32.
    part['user_id'] = part['user_id'].astype(np.uint32)
    part['request_cnt'] = part['request_cnt'].astype(np.uint32)

    out_path = (TGT_DIR / part_path.stem).with_suffix('.parquet')
    part.to_parquet(out_path, compression='brotli', index=False)

    # Release the frame before the next iteration reads a new part; otherwise
    # the old part stays alive until `part` is rebound, doubling peak memory.
    del part

  0%|          | 0/10 [00:00<?, ?it/s]