# In [23]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import OrdinalEncoder, QuantileTransformer


class SmartNormalizer:
    """Normalize one column of values, numeric or object-typed.

    Arrays whose dtype is not ``object`` are treated as numeric and mapped
    through a ``QuantileTransformer`` (normal output distribution) fitted on
    their finite values only.  Object arrays are encoded with an
    ``OrdinalEncoder``.

    Args:
        two_col: When true, ``transform`` returns a second column holding
            1.0 where the input value was present (finite for numeric data,
            not None/NaN for object data) and 0.0 elsewhere.
    """

    def __init__(self, two_col=False):
        self.quantile_transformer = None  # created in fit() for numeric data
        self.encoder = OrdinalEncoder()
        self.two_col = two_col
        self.is_numeric = None  # decided in fit() from the array dtype

    def fit(self, data):
        """Fit the transformer matching the dtype of ``data``.

        Args:
            data: ``np.ndarray`` of any shape; it is flattened before use.

        Returns:
            This instance, so calls can be chained.

        Raises:
            TypeError: If ``data`` is not a ``np.ndarray``.
        """
        if not isinstance(data, np.ndarray):
            raise TypeError(
                f"data must be a numpy.ndarray, got {type(data).__name__}"
            )
        flat = data.ravel()
        self.is_numeric = flat.dtype != np.dtype("O")
        if self.is_numeric:
            # Boolean indexing yields a fresh array, so the caller's data is
            # never modified and no explicit copy is needed.
            finite = flat[np.isfinite(flat)]
            if not len(finite):
                # Nothing usable: fit on a single zero so transform() still works.
                finite = np.zeros(1)
            finite = finite.reshape(-1, 1)
            self.quantile_transformer = QuantileTransformer(
                output_distribution="normal", n_quantiles=min(100, len(finite))
            )
            self.quantile_transformer.fit(finite)
        else:
            self.encoder.fit(flat.reshape(-1, 1))
        return self

    def transform(self, data):
        """Transform ``data`` with the transformer chosen in ``fit``.

        Args:
            data: Array of values; it is copied and flattened, so the
                caller's array is left untouched.

        Returns:
            An array of shape ``(n, 2)`` when ``two_col`` is true (transformed
            value, validity flag), otherwise of shape ``(n, 1)``.
        """
        flat = data.copy().ravel()
        if self.is_numeric:
            good_mask = np.isfinite(flat)
            # Non-finite entries are replaced by 0 before transforming; the
            # validity flag records which entries were replaced.
            flat[~good_mask] = 0
            first_col = self.quantile_transformer.transform(flat.reshape(-1, 1))
        else:
            # pd.isna catches NaN (the marker pandas uses for missing strings)
            # as well as None; a plain ``!= None`` comparison misses NaN.
            good_mask = ~pd.isna(flat)
            first_col = self.encoder.transform(flat.reshape(-1, 1))
        if not self.two_col:
            return first_col.reshape(-1, 1)
        second_col = good_mask.astype(np.float32).reshape(-1, 1)
        return np.concatenate([first_col, second_col], axis=1)


class SmartNormalizerDF:
    """Apply one ``SmartNormalizer`` per column of a DataFrame.

    Args:
        two_col: Forwarded to every per-column ``SmartNormalizer``.  When
            true, ``transform`` adds a ``"<column>_ok"`` column for each
            input column.
    """

    def __init__(self, two_col=False):
        self.two_col = two_col
        self.normalizers = {}  # column label -> fitted SmartNormalizer

    def fit(self, df):
        """Fit a separate normalizer on each column of ``df``.

        Returns:
            This instance, so calls can be chained.
        """
        for col in df.columns:
            normalizer = SmartNormalizer(self.two_col)
            normalizer.fit(df[col].values)
            self.normalizers[col] = normalizer
        return self

    def transform(self, df):
        """Return a new DataFrame holding the normalized columns of ``df``.

        The input frame is not modified.  The result keeps ``df``'s index and
        lists the normalized columns in their original order; when
        ``two_col`` is true the ``"<column>_ok"`` columns follow, in the same
        order.

        Raises:
            KeyError: If ``df`` has a column for which no normalizer was fitted.
        """
        value_cols = {}
        flag_cols = {}
        for col in df.columns:
            normalized = self.normalizers[col].transform(df[col].values)
            value_cols[col] = normalized[:, 0]
            if self.two_col:
                # f-string rather than ``col + "_ok"`` so non-string column
                # labels (e.g. integers) do not raise TypeError.
                flag_cols[f"{col}_ok"] = normalized[:, 1]
        # Assemble the frame in one step instead of inserting columns one at a
        # time, which fragments the frame and needed periodic copies.
        return pd.DataFrame({**value_cols, **flag_cols}, index=df.index)


# In [29]:
# Load train.csv and drop the message_timestamp and physical_part_id columns
# before any normalization takes place.
df = pd.read_csv("train.csv")
df = df.drop(columns=["message_timestamp", "physical_part_id"])

# Random 80/20 split: the validation set is every row whose index label was
# not sampled into the training set.  NOTE: the sample is unseeded, so the
# split differs from run to run.
df_train = df.sample(frac=0.8)
df_valid = df.loc[~df.index.isin(df_train.index)]

# Fit the per-column normalizers on the training rows only, then apply the
# same fitted normalizers to both splits.
smart_normalizer = SmartNormalizerDF(two_col=True)
smart_normalizer.fit(df_train)
df_train, df_valid = (
    smart_normalizer.transform(part) for part in (df_train, df_valid)
)
