In [1]:
import numpy as np
import torch
import torch.nn as nn
import torch.nn.init as nn_init
import torch.nn.functional as F
from torch import Tensor

import typing as ty
import math

In [None]:
'''
generate
'''




In [4]:

class Tokenizer(nn.Module):
    """Map numerical and categorical features to a sequence of ``d_token``-wide tokens.

    The output sequence is laid out as ``[CLS] + numerical tokens + categorical
    tokens``. Each numerical feature (and the constant-one [CLS] input) is scaled
    by its own learned row of ``weight``; each categorical feature is looked up
    in one shared embedding table, using per-column offsets so that every column
    addresses its own slice of the table.

    Args:
        d_numerical: number of numerical features.
        categories: list with the cardinality of each categorical feature, or
            ``None`` when there are no categorical features.
        d_token: size of every produced token.
        bias: if truthy, a learned bias is added to every token except [CLS].
    """

    def __init__(self, d_numerical, categories, d_token, bias):
        super().__init__()
        has_categories = categories is not None
        if has_categories:
            d_bias = d_numerical + len(categories)
            # Start index of each categorical column inside the shared table:
            # cumulative sum of the cardinalities of the preceding columns.
            category_offsets = torch.tensor([0] + categories[:-1]).cumsum(0)
            self.register_buffer('category_offsets', category_offsets)
            self.category_embeddings = nn.Embedding(sum(categories), d_token)
            nn_init.kaiming_uniform_(self.category_embeddings.weight, a=math.sqrt(5))
            print(f'{self.category_embeddings.weight.shape=}')
        else:
            d_bias = d_numerical
            self.category_offsets = None
            self.category_embeddings = None

        # One extra weight row for the [CLS] token.
        self.weight = nn.Parameter(Tensor(d_numerical + 1, d_token))
        # Same initialization scheme as nn.Linear.
        nn_init.kaiming_uniform_(self.weight, a=math.sqrt(5))
        if bias:
            self.bias = nn.Parameter(Tensor(d_bias, d_token))
            nn_init.kaiming_uniform_(self.bias, a=math.sqrt(5))
        else:
            self.bias = None

    @property
    def n_tokens(self):
        """Tokens produced per row: [CLS] + numerical features + categorical features."""
        if self.category_offsets is None:
            n_categorical = 0
        else:
            n_categorical = len(self.category_offsets)
        return len(self.weight) + n_categorical

    def forward(self, x_num, x_cat):
        """Tokenize one batch.

        Args:
            x_num: ``(batch, d_numerical)`` tensor of numerical features, or ``None``.
            x_cat: ``(batch, n_categorical)`` integer tensor of category indices,
                or ``None``. At least one of the two inputs must be given.

        Returns:
            Tensor of shape ``(batch, n_tokens, d_token)``.
        """
        # Whichever input is present supplies the batch size and the device.
        reference = x_num if x_cat is None else x_cat
        assert reference is not None

        cls_column = torch.ones(len(reference), 1, device=reference.device)  # [CLS]
        numeric_parts = [cls_column]
        if x_num is not None:
            numeric_parts.append(x_num)
        x_num = torch.cat(numeric_parts, dim=1)

        # (1, d_numerical + 1, d_token) * (batch, d_numerical + 1, 1)
        tokens = self.weight[None] * x_num[:, :, None]

        if x_cat is not None:
            shifted_indices = x_cat + self.category_offsets[None]
            categorical_tokens = self.category_embeddings(shifted_indices)
            tokens = torch.cat([tokens, categorical_tokens], dim=1)

        if self.bias is not None:
            # The [CLS] token gets a zero bias row in front of the learned ones.
            cls_bias = torch.zeros(1, self.bias.shape[1], device=tokens.device)
            full_bias = torch.cat([cls_bias, self.bias])
            tokens = tokens + full_bias[None]

        return tokens



In [16]:
# Directory holding the pre-processed `adult_cond` arrays. Kept in one place so
# the notebook can be pointed at another copy of the data by editing one line.
DATA_DIR = '/mnt/nas/swethamagesh/tabsyn-fresh/tabsyn/data/adult_cond'

# NOTE(security): allow_pickle=True unpickles the file contents, which can run
# arbitrary code -- only load files from a trusted location. It is presumably
# needed here because these arrays are stored with object dtype.
cat_train = np.load(f'{DATA_DIR}/X_cat_train.npy', allow_pickle=True)
num_train = np.load(f'{DATA_DIR}/X_num_train.npy')
target = np.load(f'{DATA_DIR}/y_train.npy', allow_pickle=True)

In [17]:
# Column-wise join: the target column(s) come first, followed by the
# categorical features.
final_cat_train = np.concatenate((target, cat_train), axis=1)

In [13]:
# Freshly initialised (untrained) tokenizer: 5 numerical features, 9 categorical
# columns with the listed cardinalities, 4-dimensional tokens, bias enabled.
tokenizer = Tokenizer(
    d_numerical=5,
    categories=[3, 7, 16, 5, 7, 5, 3, 2, 2],
    d_token=4,
    bias=True,
)

self.category_embeddings.weight.shape=torch.Size([50, 4])


In [19]:
# Display the combined target + categorical array (NumPy elides the middle of
# large arrays in its repr).
final_cat_train

array([[0, 4, 10, ..., 0, 1, 1],
       [0, 4, 9, ..., 0, 0, 1],
       [0, 1, 9, ..., 0, 0, 1],
       ...,
       [0, 5, 9, ..., 0, 0, 1],
       [0, 4, 10, ..., 0, 0, 1],
       [0, 4, 10, ..., 0, 0, 1]], dtype=object)

In [20]:
# Preview only: cast the object-dtype array to float and view it as a tensor.
# The result is displayed, not stored; the tokenizer calls in this notebook
# use `.astype(int)` instead, since category indices must be integers.
torch.from_numpy(final_cat_train.astype(float))

tensor([[ 0.,  4., 10.,  ...,  0.,  1.,  1.],
        [ 0.,  4.,  9.,  ...,  0.,  0.,  1.],
        [ 0.,  1.,  9.,  ...,  0.,  0.,  1.],
        ...,
        [ 0.,  5.,  9.,  ...,  0.,  0.,  1.],
        [ 0.,  4., 10.,  ...,  0.,  0.,  1.],
        [ 0.,  4., 10.,  ...,  0.,  0.,  1.]], dtype=torch.float64)

In [24]:
# Tokenize the whole training set with the tokenizer built above.
x_num_tensor = torch.from_numpy(num_train)
# Category indices must be integers to be usable as embedding indices.
x_cat_tensor = torch.from_numpy(final_cat_train.astype(int))
tokenized_out = tokenizer(x_num_tensor, x_cat_tensor)
tokenized_out.shape

torch.Size([31062, 15, 4])

In [None]:
# Plan (~30k training rows):
#  - get the tokenized output
#  - generate the constraint mask for the entire set at once, e.g. [111100011] (length c)
#  - per-row constraint: entries are 0 & 1, e.g. with prob 0.4; masked from the tokenized output (c x d)


'''
Model load from f'{ckpt_dir}/model.pt'

'''



In [5]:
import sys

# Make the project checkout importable (the next cell imports `vae.model`,
# presumably from this directory). Guarded so that re-running the cell does not
# keep appending duplicate entries to sys.path.
TABSYN_ROOT = '/mnt/nas/swethamagesh/ORD/restsyn/tabsyn/'
if TABSYN_ROOT not in sys.path:
    sys.path.append(TABSYN_ROOT)

In [8]:

from vae.model import Model_VAE, VAE



In [9]:
# Optimiser settings (not used by any cell visible in this notebook).
LR = 1e-3
WD = 0

# Architecture settings; they have to match the checkpoint loaded below,
# otherwise load_state_dict reports missing/unexpected keys or shape mismatches.
D_TOKEN = 4
TOKEN_BIAS = True

N_HEAD = 1
FACTOR = 32
NUM_LAYERS = 2
d_numerical = 5
categories = [3, 7, 16, 5, 7, 5, 3, 2, 2]

VAE_CKPT_PATH = '/mnt/nas/swethamagesh/tabsyn-fresh/tabsyn/tabsyn/vae/ckpt/adult_cond/model.pt'

# Use the TOKEN_BIAS constant rather than a second hard-coded `True`.
model_vae = Model_VAE(NUM_LAYERS, d_numerical, categories, D_TOKEN, n_head = N_HEAD, factor = FACTOR, bias = TOKEN_BIAS)

# map_location='cpu' lets a checkpoint saved on a GPU load on a CPU-only
# machine; the model stays on CPU, which the later `.numpy()` calls require.
vae_state_dict = torch.load(VAE_CKPT_PATH, map_location='cpu', weights_only=True)
model_vae.load_state_dict(vae_state_dict)


self.category_embeddings.weight.shape=torch.Size([50, 4])


In [49]:
# Tokenize the full training set with the tokenizer of the loaded VAE.
# The result is only consumed as plain data (`.detach().numpy()` later on), so
# run under no_grad to avoid building an autograd graph over every row.
with torch.no_grad():
    tokenized_out = model_vae.VAE.Tokenizer(
        torch.from_numpy(num_train),
        torch.from_numpy(final_cat_train.astype(int)),
    )

In [35]:
# Build the constraint tensor by randomly zeroing tokens of the tokenized output.
#
# NOTE: `mask_2d` is a *keep* mask -- 1 keeps a token, 0 zeroes it. Every row
# draws its own keep-probability uniformly from [p, q), so with p=0.5 and q=1
# each token of a row is kept with probability 0.5..1 (i.e. zeroed with
# probability 0..0.5). The [CLS] token (column 0) is masked like any other.
p = 0.5
q = 1

# Fixed seed so the constraints written to disk can be regenerated exactly.
MASK_SEED = 0
rng = np.random.default_rng(MASK_SEED)

# Take the sizes from the data instead of hard-coding them, so the cell keeps
# working when the dataset or the token width changes.
N, cols, d_token = tokenized_out.shape

# One keep-probability per row.
mask = rng.uniform(p, q, N)

# (N, cols) binary mask: each token of row i is kept with probability mask[i].
mask_2d = rng.binomial(1, mask[:, None], (N, cols))
# Repeat the decision along the token dimension so a token is kept or zeroed
# as a whole: (N, cols, d_token).
mask_3d = np.stack([mask_2d] * d_token, axis=2)
constraint_train = tokenized_out.detach().numpy() * mask_3d



array([[1, 0, 1, ..., 0, 1, 0],
       [1, 0, 0, ..., 1, 0, 1],
       [1, 1, 1, ..., 1, 1, 1],
       ...,
       [1, 1, 1, ..., 1, 1, 1],
       [1, 0, 1, ..., 0, 1, 1],
       [1, 1, 1, ..., 1, 1, 1]])

In [67]:
# Flatten every row's (tokens, token_dim) block into a single vector so the
# constraints form a 2-D array with one row per sample.
n_rows = constraint_train.shape[0]
final_file = constraint_train.reshape(n_rows, -1)

In [69]:
# Inspect the first sample's constraint; all-zero rows are the tokens that
# were zeroed out by the mask.
constraint_train[0]

array([[-1.80650026e-01,  4.39514816e-01, -3.10437918e-01,
        -2.80084968e-01],
       [-0.00000000e+00, -0.00000000e+00, -0.00000000e+00,
        -0.00000000e+00],
       [-6.26748535e+03,  4.29589492e+04, -8.94396250e+04,
         2.23192422e+04],
       [ 3.01283717e-01,  3.67014587e-01, -5.44309579e-02,
         3.80284965e-01],
       [-2.42611572e-01, -1.42419353e-01, -1.85007319e-01,
         3.61240894e-01],
       [ 2.78375769e+00,  3.80989718e+00, -1.02589111e+01,
         4.47270346e+00],
       [ 6.11785054e-01, -1.37224212e-01,  2.17554674e-01,
         2.40170389e-01],
       [-0.00000000e+00,  0.00000000e+00, -0.00000000e+00,
        -0.00000000e+00],
       [-0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00],
       [ 0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
        -0.00000000e+00],
       [ 3.45189750e-01, -3.02160472e-01,  4.57613111e-01,
        -4.28387493e-01],
       [-2.29718685e-01, -2.84911990e-01, -4.06368732e-01,
      

In [68]:
# Same sample after flattening: the token rows of constraint_train[0] laid out
# back to back in one vector.
final_file[0]

array([-1.80650026e-01,  4.39514816e-01, -3.10437918e-01, -2.80084968e-01,
       -0.00000000e+00, -0.00000000e+00, -0.00000000e+00, -0.00000000e+00,
       -6.26748535e+03,  4.29589492e+04, -8.94396250e+04,  2.23192422e+04,
        3.01283717e-01,  3.67014587e-01, -5.44309579e-02,  3.80284965e-01,
       -2.42611572e-01, -1.42419353e-01, -1.85007319e-01,  3.61240894e-01,
        2.78375769e+00,  3.80989718e+00, -1.02589111e+01,  4.47270346e+00,
        6.11785054e-01, -1.37224212e-01,  2.17554674e-01,  2.40170389e-01,
       -0.00000000e+00,  0.00000000e+00, -0.00000000e+00, -0.00000000e+00,
       -0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
        0.00000000e+00,  0.00000000e+00,  0.00000000e+00, -0.00000000e+00,
        3.45189750e-01, -3.02160472e-01,  4.57613111e-01, -4.28387493e-01,
       -2.29718685e-01, -2.84911990e-01, -4.06368732e-01, -2.96661705e-01,
       -0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
       -7.05517471e-01,  

In [70]:
# Persist the flattened constraints as a .npy file.
CONSTRAINTS_PATH = '/mnt/nas/swethamagesh/ORD/restsyn/data/adult_cond/constraints.npy'
np.save(CONSTRAINTS_PATH, final_file)