In [1]:
# (Re)load IPython's autoreload extension; `reload_ext` is safe to re-run
# when the extension is already loaded.
%reload_ext autoreload
# Mode 2: reload all imported modules before each cell executes, so edits to
# the library source are picked up without restarting the kernel.
%autoreload 2
# Render matplotlib figures inline in the cell output.
%matplotlib inline

In [2]:
import time
import json
import os
import random
from typing import Dict, Iterator, List, Tuple

import numpy as np
import pandas as pd
import torch
from torch.utils.data import DataLoader
from loguru import logger
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import KFold

from nam.config.default import defaults
from nam.types import Config
from nam.utils.args import parse_args
from nam.data.base import NAMDataset

In [3]:
# Start from the library's default settings.
config = defaults()

# Seed every RNG this notebook has imported, using the seed carried by the
# config, so that any random shuffling/splitting done further down is
# repeatable under "Restart Kernel -> Run All".
random.seed(config.seed)
np.random.seed(config.seed)
torch.manual_seed(config.seed)

# Last expression: display the resolved configuration.
config

namespace(device='cpu',
          logdir='logs',
          lr=0.01,
          batch_size=1024,
          l2_regularization=0.0,
          output_regularization=0.0,
          decay_rate=0.995,
          dropout=0.5,
          feature_dropout=0.0,
          data_split=1,
          seed=1377,
          num_basis_functions=1000,
          units_multiplier=2,
          cross_val=False,
          max_checkpoints_to_keep=1,
          save_checkpoint_every_n_epochs=10,
          n_models=1,
          num_splits=3,
          fold_num=1,
          activation='exu',
          regression=False,
          debug=False,
          shallow=False,
          use_dnn=False,
          early_stopping_epochs=60,
          n_folds=5)

## GALLUP Data

In [4]:
# Column selections for the Gallup CSV; these three lists are handed to
# NAMDataset as its features / targets / weights column arguments.
features_columns = [
    "income_2",
    "WP1219",
    "WP1220",
    "weo_gdpc_con_ppp",
]
targets_column = ["WP16"]
weights_column = ["wgt"]

In [5]:
# Build the dataset from the Gallup CSV using the column lists chosen above
# (features, target, and per-row weights).
dataset = NAMDataset(
    config=config,
    csv_file='data/GALLUP.csv',
    features_columns=features_columns,
    targets_column=targets_column,
    weights_column=weights_column,
)
# Last expression: show the object's repr.
dataset

<nam.data.base.NAMDataset at 0x7fe29a01a4d0>

### K-fold

In [6]:
# Sanity check: for every (train, test) loader pair the dataset yields,
# pull the first batch from each loader and print both.
for train_dl, test_dl in dataset.data_loaders():
    train_batch = next(iter(train_dl))
    test_batch = next(iter(test_dl))
    print(train_batch, test_batch)

Fold((1,)), train: 1072647, test: 268162
[tensor([[2.6335e+03, 1.0000e+00, 2.7000e+01, 1.6592e+03],
        [3.3712e+02, 2.0000e+00, 4.7000e+01, 1.0517e+03],
        [1.0756e+04, 2.0000e+00, 1.7000e+01, 1.9147e+03],
        ...,
        [9.6465e+03, 1.0000e+00, 4.8000e+01, 1.3457e+04],
        [1.0512e+04, 1.0000e+00, 2.5000e+01, 1.7622e+04],
        [6.9918e+03, 2.0000e+00, 6.8000e+01, 2.7189e+04]], dtype=torch.float64), tensor([[0.8280],
        [1.5995],
        [0.8854],
        ...,
        [0.1227],
        [0.4122],
        [0.3668]], dtype=torch.float64), tensor([[4],
        [9],
        [2],
        ...,
        [5],
        [5],
        [5]])] [tensor([[2.4244e+04, 1.0000e+00, 7.6000e+01, 2.1498e+04],
        [7.0813e+03, 2.0000e+00, 3.0000e+01, 1.2651e+04],
        [5.5085e+03, 1.0000e+00, 2.2000e+01, 4.3569e+03],
        ...,
        [2.7084e+04, 1.0000e+00, 2.1000e+01, 6.2535e+04],
        [2.9794e+03, 2.0000e+00, 5.2000e+01, 2.1935e+04],
        [7.6719e+03, 1.0000e+00, 

## Housing Data

In [7]:

# Column selections for the housing CSV; passed to NAMDataset below as its
# features / targets column arguments (no weights column for this dataset).
features_columns = [
    "longitude",
    "latitude",
    "housing_median_age",
    "total_rooms",
    "total_bedrooms",
    "population",
    "households",
    "median_income",
]
targets_column = ["median_house_value"]

In [8]:
# Rebuild `dataset`, this time from the housing CSV and the housing column
# lists; no weights column is supplied here.
dataset = NAMDataset(
    config=config,
    csv_file='data/housing.csv',
    features_columns=features_columns,
    targets_column=targets_column,
)
# Last expression: show the object's repr.
dataset

<nam.data.base.NAMDataset at 0x7fe2b017c250>

In [9]:
# Sanity check on the housing dataset: print the first batch from each
# loader of every (train, test) pair the dataset yields.
for train_dl, test_dl in dataset.data_loaders():
    train_batch = next(iter(train_dl))
    test_batch = next(iter(test_dl))
    print(train_batch, test_batch)

Fold((1,)), train: 16512, test: 4128
[tensor([[-1.2199e+02,  3.7820e+01,  2.2000e+01,  ...,  5.7900e+02,
          2.6900e+02,  3.3750e+00],
        [-1.1765e+02,  3.3600e+01,  1.5000e+01,  ...,  2.5290e+03,
          7.6200e+02,  6.4114e+00],
        [-1.1799e+02,  3.3720e+01,  1.4000e+01,  ...,  1.3380e+03,
          4.7500e+02,  3.6280e+00],
        ...,
        [-1.1822e+02,  3.3980e+01,  1.8000e+01,  ...,  1.9130e+03,
          7.0200e+02,  1.2059e+00],
        [-1.1940e+02,  3.6530e+01,  2.8000e+01,  ...,  1.5240e+03,
          4.1200e+02,  2.7500e+00],
        [-1.2148e+02,  3.8490e+01,  2.6000e+01,  ...,  2.4470e+03,
          7.5200e+02,  1.5908e+00]], dtype=torch.float64), tensor([[200000.],
        [278700.],
        [188500.],
        ...,
        [255000.],
        [ 65000.],
        [ 78600.]], dtype=torch.float64)] [tensor([[-1.1828e+02,  3.4090e+01,  4.9000e+01,  ...,  2.8620e+03,
          1.0090e+03,  2.4677e+00],
        [-1.2272e+02,  3.8460e+01,  3.5000e+01,  ..., 