# Handling binary data columns

The CISS-VAE model can handle binary and categorical variables in addition to continuous ones. Categorical variables must be represented with binary dummy variables. 

When imputing binary data, a sigmoid activation function is applied to the output so that each imputed value is a probability. Because some datasets contain both binary and continuous variables, you can pass a binary variable mask (a boolean vector) that tells the model which variables are binary so it can handle them accordingly.

## Example dataset

The example dataset below has both binary and continuous variables in it:

In [1]:
import pandas as pd
import numpy as np

# Fix the seed so the simulated values and the masked positions are reproducible.
np.random.seed(42)

n_rows = 100  # number of simulated observations
prop_mask = 0.3  # fraction of entries set to missing in each masked column

# NOTE: under a fixed seed the values depend on the order of the random draws,
# so the columns are generated in exactly this order.
X = pd.DataFrame({
    "feat1": np.random.choice(np.arange(1, n_rows + 1), size=n_rows, replace=True),
    "feat2": np.random.choice(np.arange(1, 6), size=n_rows, replace=True),
    "feat3": np.random.choice(np.arange(1, 8), size=n_rows, replace=True),
    "feat4": np.random.choice(np.arange(1, 8), size=n_rows, replace=True),
    "feat5": np.random.choice(np.arange(1, 8), size=n_rows, replace=True),
    # now add some binary features
    "bf1": np.random.binomial(1, 0.25, size=n_rows),
    "bf2": np.random.binomial(1, 0.5, size=n_rows),
    "bf3": np.random.binomial(1, 0.75, size=n_rows),
    "bf4": np.random.binomial(1, 0.33, size=n_rows),
    "bf5": np.random.binomial(1, 0.66, size=n_rows),
})

# Keep a fully observed copy of the data from before any values are masked.
X_raw = X.copy()

# Every column except feat1 receives missing values. NaN cannot be stored in an
# integer column, and assigning it into one relies on implicit upcasting that
# pandas has deprecated, so cast those columns to float explicitly first.
X = X.astype({col: "float64" for col in X.columns[1:]})

for col in X.columns[1:]:  # skip feat1
    idx = np.where(~X[col].isna())[0]  # positions of non-NA entries
    n_mask = int(np.ceil(len(idx) * prop_mask))
    if n_mask > 0:
        mask_idx = np.random.choice(idx, size=n_mask, replace=False)
        # X has the default 0..n_rows-1 index, so positions double as labels.
        X.loc[mask_idx, col] = np.nan

print(f"X matrix:\n{X}")


# choosing random clusters for now
clusters = np.random.choice([1, 2, 3], size=n_rows, replace=True)



X matrix:
    feat1  feat2  feat3  feat4  feat5  bf1  bf2  bf3  bf4  bf5
0      52    2.0    6.0    4.0    2.0  NaN  NaN  NaN  NaN  0.0
1      93    1.0    7.0    NaN    2.0  NaN  1.0  1.0  1.0  1.0
2      15    4.0    NaN    4.0    2.0  0.0  0.0  1.0  0.0  NaN
3      72    4.0    3.0    NaN    1.0  NaN  1.0  NaN  1.0  NaN
4      61    NaN    1.0    NaN    1.0  0.0  NaN  NaN  NaN  0.0
..    ...    ...    ...    ...    ...  ...  ...  ...  ...  ...
95     85    NaN    NaN    6.0    2.0  0.0  0.0  1.0  NaN  1.0
96     80    1.0    NaN    7.0    2.0  0.0  NaN  1.0  NaN  1.0
97     82    2.0    4.0    NaN    1.0  0.0  NaN  1.0  NaN  NaN
98     53    4.0    3.0    7.0    7.0  NaN  0.0  0.0  0.0  1.0
99     24    4.0    1.0    3.0    NaN  0.0  1.0  1.0  1.0  0.0

[100 rows x 10 columns]


### Preparing Binary Vector

The binary vector `binary_feature_mask` is of length p for an n x p data matrix and is `True` for binary columns and `False` for continuous columns. 

In [2]:
# One flag per column of X, in column order: the five continuous features
# (feat1-feat5) are False and the five binary features (bf1-bf5) are True.
binary_vector = [False] * 5 + [True] * 5



## Using `run_cissvae()` with a binary feature mask

Pass the binary vector to the `run_cissvae()` function using the `binary_feature_mask` argument. Note: even if columns are ignored via `columns_ignore`, those columns must be included in the `binary_feature_mask`. 

In [3]:
import ciss_vae

from ciss_vae.training.run_cissvae import run_cissvae
from ciss_vae.utils.helpers import plot_vae_architecture
print(ciss_vae.__file__)  # show which copy of the ciss_vae package was imported

imputed_data, vae, ds, history = run_cissvae(
    data=X,
    ## Dataset params
    # Columns to ignore when selecting the validation dataset (and when
    # clustering, if you do not provide clusters). For example, demographic
    # columns with no missingness.
    columns_ignore=X.columns[0],
    clusters=clusters,
    print_dataset=False,
    # Boolean flag per column of X marking which columns are binary.
    binary_feature_mask=binary_vector,
    ## VAE model params
    hidden_dims=[150, 120, 60],  # dimensions of hidden layers, in order; one number per layer
    latent_dim=15,  # dimensions of latent embedding
    # Order of shared vs unshared layers for encode (can use "u" or "s"
    # instead of "unshared", "shared").
    layer_order_enc=["unshared", "unshared", "unshared"],
    layer_order_dec=["shared", "shared", "shared"],  # order of shared vs unshared layers for decode
    latent_shared=False,
    output_shared=False,
    batch_size=4000,  # batch size for data loader
    # If true, outputs imputed dataset and model, otherwise just outputs the
    # imputed dataset. Set to true to return the model for `plot_vae_architecture`.
    return_model=True,
    ## Initial Training params
    epochs=5,  # default
    ## Other params
    return_history=True,  # if true, will return training MSE history as pandas dataframe
    return_dataset=True,
    debug=False,
)

# Call head() so the first rows are shown; the bare attribute `imputed_data.head`
# would only print the bound method's repr.
print(f"The successfully imputed dataset:\n{imputed_data.head()}\n\n")

/home/nfs/vaithid1/CISS-VAE/CISS-VAE/src/ciss_vae/__init__.py


IndexError: too many indices for tensor of dimension 1

In [None]:
# Show the training history returned by run_cissvae (requested with return_history=True).
print(f"History \n{history}")

As always, the VAE architecture can be plotted.

In [None]:
# Draw the layer layout of the trained model returned by run_cissvae.
plot_vae_architecture(
    model=vae,
    title=None,  # set title of plot
    # The colors below are the defaults.
    color_shared="skyblue",
    color_unshared="lightcoral",
    color_latent="gold",  # TODO(review): original note here read "xx fix" - confirm this color
    color_input="lightgreen",
    color_output="lightgreen",
    figsize=(16, 8),
    return_fig=False,
)

## Using Binary Feature Mask with Autotune

To use a `binary_feature_mask` with `autotune()`, pass the `binary_feature_mask` parameter when initializing the `ClusterDataset` object.

In [None]:
from ciss_vae.classes.cluster_dataset import ClusterDataset
from ciss_vae.training.autotune import autotune, SearchSpace
# Bundle the data with its cluster labels and the binary-column flags; the
# mask is supplied here, on the dataset, rather than to autotune() itself.
cd = ClusterDataset(
    X,
    cluster_labels=clusters,
    binary_feature_mask=binary_vector,
)

# Hyperparameter settings handed to autotune().
# NOTE(review): list values are presumably candidate sets to search over and
# scalars fixed values - confirm against the SearchSpace documentation.
ss = SearchSpace(
    num_hidden_layers=[1, 2],
    hidden_dims=[6, 16, 32],
    latent_dim=10,
    latent_shared=True,
    output_shared=True,
    lr=0.01,
    decay_factor=0.999,
    num_epochs=100,
    num_shared_encode=1,
    num_shared_decode=1,
    epochs_per_loop=100,
    reset_lr_refit=False,
)

# Run the tuning on the dataset built above, pointing the Optuna dashboard
# database at a local SQLite file.
autotune(
    search_space=ss,
    train_dataset=cd,
    optuna_dashboard_db="sqlite:///optuna_study_test_binary.db",
    debug=True,
)