In [11]:
import numpy as np
import pandas as pd
import json
from typing import List, Dict, Callable


In [19]:
# Load the feature specifications and index them by their 'key' entry.
# JSON text is UTF-8 by specification, so decode it explicitly instead of
# relying on the platform's locale-dependent default encoding.
with open('features.json', encoding='utf-8') as f:
    feature_list = json.load(f)

# Built after the file is closed; the comprehension variable is named `spec`
# so it cannot be confused with the file handle `f` above.
features = {spec['key']: spec for spec in feature_list}


In [37]:
# Load the pickled dataset from the working directory.
# NOTE(review): unpickling can execute arbitrary code — only load ./ad.pkl if
# it comes from a trusted source.
raw_dataset = pd.read_pickle("./ad.pkl")
# Work on a copy so the frame bound to `raw_dataset` is not touched by
# operations on `dataset`.
dataset = raw_dataset.copy()


In [38]:
# Drop every row that has a missing value in any column (dropna defaults) and
# rebind `dataset` to the filtered frame.
dataset = dataset.dropna()


In [39]:
def convert_numeric(feature: Dict) -> Callable[[str], int]:
    """Build a converter that parses a numeric feature's raw text into an int.

    Args:
        feature: Feature specification. Its optional 'unit' entry is the unit
            text to strip from each raw value; a missing or null unit means
            nothing is stripped.

    Returns:
        A function mapping a raw string such as "1 200 m" (with unit "m")
        to the integer 1200.

    Raises:
        ValueError: Raised by the returned converter when the text left after
            removing the unit and spaces is not a valid integer literal.
    """
    # Loop-invariant: look the unit up once instead of on every converted value.
    # `or ""` also covers a 'unit' entry that is present but null.
    unit = feature.get('unit') or ""

    def convert(text: str) -> int:
        # Built-in int replaces np.int, which was only an alias for it and has
        # been removed from NumPy (deprecated in 1.20, removed in 1.24).
        return int(text.replace(unit, "").replace(" ", ""))
    return convert


# Parse the raw text of every numeric feature column into integers; columns
# belonging to any other feature type are skipped.
for key, feature in features.items():
    if feature['type'] != 'numeric':
        continue
    dataset[key] = dataset[key].map(convert_numeric(feature))


In [40]:
# Floor-divide every PRICE value by 1,000,000 and write the result back to the
# same column.
# NOTE(review): presumably rescales prices to millions of the currency unit — confirm.
# Re-running this cell divides the already-scaled values again.
dataset['PRICE'] = dataset['PRICE'].map(lambda p: p//1_000_000)


In [41]:
def validate_data(dataset: pd.DataFrame, feature: Dict) -> pd.DataFrame:
    """Return the rows of ``dataset`` whose value for ``feature`` is valid.

    Args:
        dataset: Frame containing a column named ``feature['key']``.
        feature: Feature specification with 'key' and 'type' entries. A
            'numeric' feature also needs 'min' and 'max'; an 'enum' feature
            also needs 'values'.

    Returns:
        A frame holding only the rows that pass the check; ``dataset`` itself
        is not modified.

    Raises:
        ValueError: If ``feature['type']`` is neither 'numeric' nor 'enum'.
    """
    # Named `kind` rather than `type` so the built-in is not shadowed.
    key, kind = feature['key'], feature['type']

    if kind == 'numeric':
        lower, upper = feature['min'], feature['max']

        def is_valid(value) -> bool:
            # Both bounds are exclusive: values equal to min or max are dropped.
            return lower < value < upper
    elif kind == 'enum':
        allowed = feature['values']

        def is_valid(value) -> bool:
            return value in allowed
    else:
        # Fail with a clear message instead of passing None to Series.map.
        raise ValueError(
            f"Unsupported feature type {kind!r} for feature {key!r}")

    # Force a boolean mask: on an empty column `map` keeps the original dtype,
    # and a non-boolean key would be treated as a column selection.
    mask = dataset[key].map(is_valid).astype(bool)
    return dataset[mask]

# Apply each feature's validity filter in turn, so `dataset` ends up holding
# only the rows that pass every feature's check.
for feature in features.values():
    dataset = validate_data(dataset, feature)

In [46]:
# One-hot encode every enum feature column. With an empty prefix and separator
# each indicator column is named after the bare category value.
# NOTE(review): the same value appearing in two enum features would produce
# duplicate column names — confirm the value sets are disjoint.
enum_feature_columns = [
    spec['key']
    for spec in features.values()
    if spec['type'] == 'enum'
]
dataset = pd.get_dummies(
    dataset,
    columns=enum_feature_columns,
    prefix='',
    prefix_sep='',
)