In [None]:
# default_exp utils.processing

In [None]:
# hide
from ipynb_path import *


In [None]:
# export
from counternet.import_essentials import *
from counternet.utils.functional import *
from counternet.utils.dataset import load_adult_income_dataset

Global seed set to 31


In [None]:
%%time
# Fixture files used by the checks further down in this notebook; the paths
# are resolved relative to the current working directory.
_dummy_data_path = 'assets/data/dummy_data.csv'
_adult_data_path = 'assets/data/adult.data'

dummy_data = pd.read_csv(_dummy_data_path)
adult_data = load_adult_income_dataset(_adult_data_path)

CPU times: user 742 ms, sys: 90.2 ms, total: 832 ms
Wall time: 826 ms


## Data Preprocessing

In [None]:
# export 
class ABCScaler(ABC):
    """Abstract interface shared by the scalers defined in this module.

    A concrete scaler has to provide all four methods below; the class cannot
    be instantiated until every one of them is overridden.
    """

    @abstractmethod
    def fit(self, X):
        """Learn the scaling parameters from `X`."""
        raise NotImplementedError()

    @abstractmethod
    def transform(self, X):
        """Scale `X` using the parameters learned by `fit`."""
        raise NotImplementedError()

    @abstractmethod
    def fit_transform(self, X):
        """Learn the scaling parameters from `X` and return the scaled `X`."""
        raise NotImplementedError()

    @abstractmethod
    def inverse_transform(self, X):
        """Map scaled values in `X` back to the original range."""
        raise NotImplementedError()

In [None]:
# export
class StandardScaler(ABCScaler):
    """Standardize data with a mean and standard deviation computed by PyTorch.

    Offers the `fit` / `transform` / `fit_transform` / `inverse_transform`
    interface of scikit-learn's `StandardScaler` while doing the arithmetic
    with torch ops, to avoid CPU-GPU communication time.

    NOTE(review): `torch.mean(X)` and `torch.std(X)` are called without a
    `dim` argument, so a single mean and a single standard deviation are
    computed over *all* elements of `X` (not one pair per feature column as
    scikit-learn does). There is also no guard against a zero standard
    deviation. Both behaviors are kept as-is because changing them would alter
    the values existing callers receive.

    Every public method is wrapped in `check_object_input_type`, which is
    defined outside this module (presumably it converts the input to a
    tensor -- confirm against `counternet.utils.functional`).

    Attributes:
        mean_: mean learned by `fit` / `fit_transform`; `None` until fitted.
        std_: standard deviation learned by `fit` / `fit_transform`; `None`
            until fitted.
    """
    mean_, std_ = None, None

    def _check_is_fitted(self):
        """Raise `NotImplementedError` when `fit` has not been called yet."""
        if (self.mean_ is None) or (self.std_ is None):
            raise NotImplementedError('The scaler has not been fitted.')

    @check_object_input_type
    def fit(self, X):
        """Store the mean and standard deviation of `X` and return `self`."""
        self.mean_, self.std_ = torch.mean(X), torch.std(X)
        return self

    @check_object_input_type
    def transform(self, X):
        """Return `(X - mean_) / std_`.

        Raises:
            NotImplementedError: if the scaler has not been fitted.
        """
        self._check_is_fitted()
        return (X - self.mean_) / self.std_

    @check_object_input_type
    def fit_transform(self, X):
        """Fit on `X` and return the standardized `X` in one call."""
        self.mean_, self.std_ = torch.mean(X), torch.std(X)
        return (X - self.mean_) / self.std_

    @check_object_input_type
    def inverse_transform(self, X):
        """Undo `transform`, returning `X * std_ + mean_`.

        Raises:
            NotImplementedError: if the scaler has not been fitted. Without
                this check the multiplication by `None` would fail with an
                unrelated `TypeError`.
        """
        self._check_is_fitted()
        return X * self.std_ + self.mean_

In [None]:
# export
class MinMaxScaler(ABCScaler):
    """Rescale data to the [0, 1] range using a min and max computed by PyTorch.

    Offers the `fit` / `transform` / `fit_transform` / `inverse_transform`
    interface of scikit-learn's `MinMaxScaler` while doing the arithmetic with
    torch ops, to avoid CPU-GPU communication time.

    NOTE(review): `torch.min(X)` and `torch.max(X)` are called without a `dim`
    argument, so a single global minimum and maximum over *all* elements of
    `X` are used (not one pair per feature column as scikit-learn does). This
    is kept as-is because changing it would alter the values existing callers
    receive.

    Every public method is wrapped in `check_object_input_type`, which is
    defined outside this module (presumably it converts the input to a
    tensor -- confirm against `counternet.utils.functional`).

    Attributes:
        min_: minimum learned by `fit` / `fit_transform`; `None` until fitted.
        max_: maximum learned by `fit` / `fit_transform`; `None` until fitted.
    """
    min_, max_ = None, None

    def _check_is_fitted(self):
        """Raise `NotImplementedError` when `fit` has not been called yet."""
        if (self.min_ is None) or (self.max_ is None):
            raise NotImplementedError('The scaler has not been fitted.')

    def _learn_range(self, X):
        """Compute the range of `X`, validate it, and only then store it.

        Validating before assigning keeps a failed fit from leaving the scaler
        in a "fitted" state whose range is zero (which would make `transform`
        divide by zero).
        """
        lowest, highest = torch.min(X), torch.max(X)
        assert lowest != highest, "min(X) == max(X) is not allowed."
        self.min_, self.max_ = lowest, highest

    @check_object_input_type
    def fit(self, X):
        """Store the minimum and maximum of `X` and return `self`.

        Raises:
            AssertionError: if all elements of `X` are equal.
        """
        self._learn_range(X)
        return self

    @check_object_input_type
    def transform(self, X):
        """Return `(X - min_) / (max_ - min_)`.

        Raises:
            NotImplementedError: if the scaler has not been fitted.
        """
        self._check_is_fitted()
        return (X - self.min_) / (self.max_ - self.min_)

    @check_object_input_type
    def fit_transform(self, X):
        """Fit on `X` and return the rescaled `X` in one call.

        Raises:
            AssertionError: if all elements of `X` are equal.
        """
        self._learn_range(X)
        return (X - self.min_) / (self.max_ - self.min_)

    @check_object_input_type
    def inverse_transform(self, X):
        """Undo `transform`, returning `X * (max_ - min_) + min_`.

        Raises:
            NotImplementedError: if the scaler has not been fitted. Without
                this check the subtraction on `None` would fail with an
                unrelated `TypeError`.
        """
        self._check_is_fitted()
        return X * (self.max_ - self.min_) + self.min_

In [None]:
# export
# TODO need to check
class OneHotEncoder(object):
    """Wrapper around scikit-learn's `OneHotEncoder` that returns torch tensors.

    Encoding is delegated to the wrapped scikit-learn encoder (`self.enc`);
    this class converts the dense encoded array to a tensor on the way out and
    a tensor back to a NumPy array on the way in.

    Attributes:
        enc: the underlying scikit-learn encoder, configured for dense output.
        categories_: copy of `enc.categories_` after `fit`; empty list before.
        drop_idx_: copy of `enc.drop_idx_` after `fit`; `None` before.
    """
    # Class-level defaults are kept so the attributes can still be read on the
    # class itself; `__init__` gives each instance its own values.
    categories_ = []
    drop_idx_ = None

    def __init__(self):
        # Imported under an alias so the scikit-learn class does not shadow
        # this wrapper's own name inside the method.
        from sklearn.preprocessing import OneHotEncoder as _SkOneHotEncoder
        try:
            # `sparse` was renamed to `sparse_output` in scikit-learn 1.2 and
            # the old keyword was removed in 1.4.
            self.enc = _SkOneHotEncoder(sparse_output=False)
        except TypeError:
            # Older scikit-learn releases only know the `sparse` keyword.
            self.enc = _SkOneHotEncoder(sparse=False)
        # Per-instance state, so instances never share the mutable class-level list.
        self.categories_ = []
        self.drop_idx_ = None

    def fit(self, X):
        """Fit the wrapped encoder on `X`, copy its fitted attributes, return `self`."""
        self.enc.fit(X)
        # copy attributes
        self.categories_ = self.enc.categories_
        self.drop_idx_ = self.enc.drop_idx_
        return self

    def transform(self, X):
        """One-hot encode `X` and return the result as a torch tensor.

        The tensor keeps the dtype of the array produced by the wrapped encoder.
        """
        return torch.from_numpy(self.enc.transform(X))

    def fit_transform(self, X):
        """Fit on `X` and return its one-hot encoding as a torch tensor."""
        self.fit(X)
        return self.transform(X)

    def inverse_transform(self, X):
        """Decode a one-hot tensor back to the original categories.

        Args:
            X: `torch.Tensor` of one-hot encoded rows (any device).

        Returns:
            Whatever the wrapped encoder's `inverse_transform` returns for the
            equivalent NumPy array.

        Raises:
            AssertionError: if `X` is not a `torch.Tensor`.
        """
        assert isinstance(X, torch.Tensor)
        # Detach and convert explicitly: a tensor that requires grad cannot be
        # turned into a NumPy array implicitly by scikit-learn.
        return self.enc.inverse_transform(X.detach().cpu().numpy())

In [None]:
# Round-trip check: inverse_transform(fit_transform(X)) must reproduce X.
_X = adult_data[['age', 'hours_per_week']].to_numpy()
scalar = StandardScaler()
cont = scalar.fit_transform(_X)

_X_expected = torch.from_numpy(_X).float()
_X_restored = scalar.inverse_transform(cont)
assert torch.isclose(_X_expected, _X_restored).all()

In [None]:
_X = adult_data[['age', 'hours_per_week']].to_numpy()
_X_expected = torch.from_numpy(_X).float()

# Path 1: fit and transform in a single call.
scalar = MinMaxScaler()
cont = scalar.fit_transform(_X)
assert torch.isclose(_X_expected, scalar.inverse_transform(cont)).all()
assert ((cont >= 0) & (cont <= 1)).all()

# Path 2: fit first, then transform; results must satisfy the same checks.
scalar = MinMaxScaler()
scalar.fit(_X)
cont = scalar.transform(_X)
assert torch.isclose(_X_expected, scalar.inverse_transform(cont)).all()
assert ((cont >= 0) & (cont <= 1)).all()


In [None]:
# Round-trip check: decoding the one-hot encoding must give back the input.
_categorical_cols = ['workclass', 'education', 'marital_status',
                     'occupation', 'race', 'gender']
_X = adult_data[_categorical_cols]

enc = OneHotEncoder()
cat = enc.fit_transform(_X)
_decoded = enc.inverse_transform(cat)
assert np.array_equal(_decoded, _X.to_numpy())

## PyTorch Dataset

In [None]:
# export
class NumpyDataset(TensorDataset):
    """`TensorDataset` built from array-likes, each converted to a float tensor.

    Arrays whose last dimension has size 0 are dropped (e.g. an empty block of
    continuous or discrete features). The last remaining tensor is treated as
    the target by `features` / `target`.
    """

    def __init__(self, *arrs):
        super().__init__()
        # Convert every non-empty array; empty ones are skipped entirely.
        converted = []
        for arr in arrs:
            if arr.shape[-1] == 0:
                continue
            converted.append(torch.tensor(arr).float())
        self.tensors = converted
        # All tensors must have the same number of rows as the first one.
        for candidate in self.tensors:
            assert self.tensors[0].size(0) == candidate.size(0)

    def data_loader(self, batch_size=128, shuffle=True, num_workers=4):
        """Return a `DataLoader` over this dataset with the given settings."""
        return DataLoader(
            self,
            batch_size=batch_size,
            shuffle=shuffle,
            num_workers=num_workers,
        )

    def features(self, test=False):
        """Return the feature tensors as a tuple.

        With `test=True` every stored tensor counts as a feature; otherwise
        the last tensor (the target) is excluded.
        """
        if test:
            return tuple(self.tensors)
        return tuple(self.tensors[:-1])

    def target(self, test=False):
        """Return the last stored tensor, or `None` when `test=True`."""
        if test:
            return None
        return self.tensors[-1]


class PandasDataset(NumpyDataset):
    """`NumpyDataset` built from a DataFrame whose last column is the target.

    Every column except the last becomes the feature array; the last column
    becomes the target array.
    """

    def __init__(self, df: pd.DataFrame):
        # Select by position rather than by label: with duplicate column
        # labels, label-based selection would repeat feature columns and turn
        # the target into a 2-D frame instead of a 1-D series.
        X = df.iloc[:, :-1].to_numpy()
        y = df.iloc[:, -1].to_numpy()
        super().__init__(X, y)

In [None]:
# Build the same 100 samples twice: once from raw arrays, once from a DataFrame.
x = np.random.normal(50, 15, 100)
y = np.random.normal(50, 15, 100)
df_test = pd.DataFrame({'x': x, 'y': y})
arrs = np.column_stack((x, y))
np_dataset = NumpyDataset(x, y)
pd_dataset = PandasDataset(df_test)

assert (arrs == df_test.to_numpy()).all()
assert len(np_dataset) == len(pd_dataset)

# Compare item by item. The DataFrame path yields features with a trailing
# dimension of size 1 while the array path yields scalars, so both sides are
# flattened before an explicit element-wise comparison.
for i in range(len(np_dataset)):
    np_item, pd_item = np_dataset[i], pd_dataset[i]
    assert len(np_item) == len(pd_item)
    for np_value, pd_value in zip(np_item, pd_item):
        assert torch.equal(np_value.reshape(-1), pd_value.reshape(-1))
