# Katabatic Demo Usage

#### Importing Dependenciencies

In [None]:
!pip install -U scikit-learn

In [None]:
!pip install pyitlib

In [None]:
!pip install tensorflow

In [None]:
!pip install pgmpy

In [None]:
!pip install sdv

#### Importing Katabatic

In [3]:
from katabatic.katabatic import Katabatic
import numpy as np


Sample Training Data

In [4]:
from katabatic.models.ganblrpp.utils import get_demo_data
real_data = get_demo_data('adult-raw')
real_data.head()

from sklearn.model_selection import train_test_split
x, y = real_data.values[:,:-1], real_data.values[:,-1]
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.5)

import numpy as np
def is_numerical(dtype):
    '''
    if the type is one of ['signed-integer', 'unsigned-integer', 'floating point'], we reconginze it as a numerical one.
    
    Reference: https://numpy.org/doc/stable/reference/generated/numpy.dtype.kind.html#numpy.dtype.kind
    '''
    return dtype.kind in 'iuf'

column_is_numerical = real_data.dtypes.apply(is_numerical).values
numerical = np.argwhere(column_is_numerical).ravel()
numerical

array([ 0,  2,  4, 10, 11, 12], dtype=int64)

Load Model and Train it on Sample Training Data

In [5]:
from katabatic.models.ganblrpp.ganblrpp_adapter import GanblrppAdapter
adapter = GanblrppAdapter(numerical_columns=numerical)
adapter.load_model()
adapter.fit(X_train, y_train, epochs=10)


  from .autonotebook import tqdm as notebook_tqdm


[INFO] Initializing GANBLR++ Model
[INFO] Training GANBLR++ model


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  ls = np.mean(-np.log(np.subtract(1, prob_fake)))
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  ls = np.mean(-np.log(np.subtract(1, prob_fake)))
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[SUCCESS] Model training completed


Generate New Data from trained model

In [6]:
import pandas as pd
syn_data = adapter.generate(size= 50000)
pd.DataFrame(syn_data, columns=real_data.columns).head(10)

[INFO] Generating data using GANBLR++ model


sampling: 100%|██████████| 6/6 [00:00<00:00, 41.19it/s]

[SUCCESS] Data generation completed





Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,class
0,,,,,,,,,,,,,,,
1,,,,,,,,,,,,,,,
2,,,,,,,,,,,,,,,
3,,,,,,,,,,,,,,,
4,,,,,,,,,,,,,,,
5,,,,,,,,,,,,,,,
6,,,,,,,,,,,,,,,
7,,,,,,,,,,,,,,,
8,,,,,,,,,,,,,,,
9,,,,,,,,,,,,,,,


In [None]:
from katabatic.metrics import tstr_logreg, tstr_mlp, tstr_rf, tstr_xgbt, trtr_jsd, trtr_wd
import pandas as pd

# Ensure syn_data and real_data are numpy arrays
if isinstance(syn_data, pd.DataFrame):
    X_synthetic, y_synthetic = syn_data.iloc[:, :-1].values, syn_data.iloc[:, -1].values
else:
    X_synthetic, y_synthetic = syn_data[:, :-1], syn_data[:, -1]

if isinstance(real_data, pd.DataFrame):
    X_real, y_real = real_data.iloc[:, :-1].values, real_data.iloc[:, -1].values
else:
    X_real, y_real = real_data[:, :-1], real_data[:, -1]

# Convert numpy arrays back to DataFrames and Series
X_synthetic_df = pd.DataFrame(X_synthetic)
y_synthetic_df = pd.Series(y_synthetic)
X_real_df = pd.DataFrame(X_real)
y_real_df = pd.Series(y_real)

# Evaluate using the different models
acc_score_lr  = tstr_logreg.evaluate(X_synthetic_df, y_synthetic_df, X_real_df, y_real_df)
acc_score_mlp = tstr_mlp.evaluate(X_synthetic_df, y_synthetic_df, X_real_df, y_real_df)
acc_score_rf  = tstr_rf.evaluate(X_synthetic_df, y_synthetic_df, X_real_df, y_real_df)
acc_score_xgbt  = tstr_xgbt.evaluate(X_synthetic_df, y_synthetic_df, X_real_df, y_real_df)
jsd_value = trtr_jsd.evaluate(X_synthetic_df, y_synthetic_df, X_real_df, y_real_df)
wd_value = trtr_wd.evaluate(X_synthetic_df, y_synthetic_df, X_real_df, y_real_df)

# Print the results with 4 decimal places
print(f"Accuracy with Logistic Regression: {acc_score_lr:.4f}")
print(f"Accuracy with MLP: {acc_score_mlp:.4f}")
print(f"Accuracy with Random Forest: {acc_score_rf}")
print(f"Accuracy with XgboostTree: {acc_score_xgbt:.4f}")
print(f"Jensen-Shannon Divergence: {jsd_value:.4f}")
print(f"Wasserstein Distance: {wd_value:.4f}")

In [None]:
import numpy as np
import pandas as pd
from katabatic.models.ganblrpp.utils import get_demo_data
from sklearn.model_selection import train_test_split, KFold
from katabatic.models.ganblrpp.ganblrpp_adapter import GanblrppAdapter
from katabatic.metrics import tstr_logreg, tstr_mlp, tstr_rf, tstr_xgbt, trtr_jsd, trtr_wd

# Load and prepare the data
real_data = get_demo_data('adult-raw')
x, y = real_data.values[:, :-1], real_data.values[:, -1]

# Define a function to check if a dtype is numerical
def is_numerical(dtype):
    return dtype.kind in 'iuf'

# Get numerical columns
column_is_numerical = real_data.dtypes.apply(is_numerical).values
numerical = np.argwhere(column_is_numerical).ravel()

# Initialize metrics accumulators
acc_scores_lr = []
acc_scores_mlp = []
acc_scores_rf = []
acc_scores_xgbt = []
jsd_values = []
wd_values = []

# Set up 2-fold cross-validation
kf = KFold(n_splits=2, shuffle=True)

# Repeat the experiment 3 times
for repeat in range(3):
    print(f"Repeat {repeat + 1}")
    
    for train_index, test_index in kf.split(x):
        X_train, X_test = x[train_index], x[test_index]
        y_train, y_test = y[train_index], y[test_index]
        
        # Initialize and fit the model
        adapter = GanblrppAdapter(numerical_columns=numerical)
        adapter.load_model()
        adapter.fit(X_train, y_train, epochs=100)
        
        # Generate synthetic data
        syn_data = adapter.generate(size=50000)
        
        # Prepare the synthetic and real datasets for evaluation
        X_synthetic, y_synthetic = syn_data[:, :-1], syn_data[:, -1]
        X_real, y_real = x, y
        
        # Convert numpy arrays to DataFrames and Series
        X_synthetic_df = pd.DataFrame(X_synthetic)
        y_synthetic_df = pd.Series(y_synthetic)
        X_real_df = pd.DataFrame(X_real)
        y_real_df = pd.Series(y_real)
        
        # Evaluate using the different models
        acc_scores_lr.append(tstr_logreg.evaluate(X_synthetic_df, y_synthetic_df, X_real_df, y_real_df))
        acc_scores_mlp.append(tstr_mlp.evaluate(X_synthetic_df, y_synthetic_df, X_real_df, y_real_df))
        acc_scores_rf.append(tstr_rf.evaluate(X_synthetic_df, y_synthetic_df, X_real_df, y_real_df))
        acc_scores_xgbt.append(tstr_xgbt.evaluate(X_synthetic_df, y_synthetic_df, X_real_df, y_real_df))
        jsd_values.append(trtr_jsd.evaluate(X_synthetic_df, y_synthetic_df, X_real_df, y_real_df))
        wd_values.append(trtr_wd.evaluate(X_synthetic_df, y_synthetic_df, X_real_df, y_real_df))

# Calculate average results
avg_acc_score_lr = np.mean(acc_scores_lr)
avg_acc_score_mlp = np.mean(acc_scores_mlp)
avg_acc_score_rf = np.mean(acc_scores_rf)
avg_acc_score_xgbt = np.mean(acc_scores_xgbt)
avg_jsd_value = np.mean(jsd_values)
avg_wd_value = np.mean(wd_values)

# Print the averaged results with 4 decimal places
print(f"Average Accuracy with Logistic Regression: {avg_acc_score_lr:.4f}")
print(f"Average Accuracy with MLP: {avg_acc_score_mlp:.4f}")
print(f"Average Accuracy with Random Forest: {avg_acc_score_rf:.4f}")
print(f"Average Accuracy with XgboostTree: {avg_acc_score_xgbt:.4f}")
print(f"Average Jensen-Shannon Divergence: {avg_jsd_value:.4f}")
print(f"Average Wasserstein Distance: {avg_wd_value:.4f}")


In [None]:
# Future import statement ideas

import katabatic as kb
from katabatic.models import meg
from katabatic.models import ganblr
from katabatic.evaluate import eval_method1
from katabatic.utils.preprocessing import data_processing_method1  # good place to store preprocessing utilities