In [4]:
from katabatic.models.TableGAN import TableGANAdapter, TableGAN, preprocess_data, postprocess_data
import pandas as pd

# Initialize the adapter with a specific privacy setting
tablegan_adapter = TableGANAdapter(type='continuous', privacy_setting='high')

# Load data
data_path = 'data/Adult/train_Adult_cleaned.csv'
labels_path = 'data/Adult/train_Adult_labels.csv'

# Load features
X_train = tablegan_adapter.load_data(data_path)

# Load labels
y_train = pd.read_csv(labels_path, header=None)

# If y_train has multiple columns, assume the first column is the target
if y_train.shape[1] > 1:
    y_train = y_train.iloc[:, 0]

# Print shapes to verify
print("Shape of X_train:", X_train.shape)
print("Shape of y_train:", y_train.shape)

# Fit the model
tablegan_adapter.fit(X_train, y_train, epochs=200, batch_size=64)

# Generate synthetic data
synthetic_data = tablegan_adapter.generate(size=1000)

# Print shape of synthetic data
print("Shape of synthetic data:", synthetic_data.shape)

Loading Data...
Shape of X_train: (2000, 14)
Shape of y_train: (2000, 1)
---FIT TableGAN Model
---Initialise TableGAN Model


  y = column_or_1d(y, warn=True)


Epoch 10/200: [D loss: -1.9100] [G loss: -0.0554] [C loss: 0.3172]
Epoch 20/200: [D loss: -1.3383] [G loss: -0.1641] [C loss: 0.0942]
Epoch 30/200: [D loss: -0.8126] [G loss: -0.3121] [C loss: 0.0424]
Epoch 40/200: [D loss: -0.7984] [G loss: -0.3318] [C loss: 0.0285]
Epoch 50/200: [D loss: -0.6670] [G loss: -0.2688] [C loss: 0.0190]
Epoch 60/200: [D loss: -0.6348] [G loss: 0.0377] [C loss: 0.0142]
Epoch 70/200: [D loss: -0.6111] [G loss: -0.0788] [C loss: 0.0101]
Epoch 80/200: [D loss: -0.5998] [G loss: -0.0851] [C loss: 0.0088]
Epoch 90/200: [D loss: -0.5438] [G loss: -0.0609] [C loss: 0.0061]
Epoch 100/200: [D loss: -0.5389] [G loss: -0.1377] [C loss: 0.0046]
Epoch 110/200: [D loss: -0.5233] [G loss: -0.2638] [C loss: 0.0039]
Epoch 120/200: [D loss: -0.5171] [G loss: -0.2334] [C loss: 0.0036]
Epoch 130/200: [D loss: -0.5011] [G loss: -0.2618] [C loss: 0.0028]
Epoch 140/200: [D loss: -0.4998] [G loss: -0.2296] [C loss: 0.0025]
Epoch 150/200: [D loss: -0.4928] [G loss: -0.1326] [C loss

In [6]:
pd.DataFrame(synthetic_data).head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,39.121387,1.427392,359870.28125,1.907677,13.62277,2.191186,3.899609,2.159438,1.168387,1.987353,2386.218262,50.113449,41.915833,0.12491,1.0
1,63.041443,0.646185,81062.609375,1.756954,11.216262,1.069428,1.834284,4.149312,1.100003,1.999473,1550.688721,260.517151,41.451347,0.67141,1.0
2,43.268234,0.392172,438285.8125,2.473482,9.074927,3.181431,6.842545,3.132291,2.918462,1.994595,2721.22583,69.844307,41.633911,0.052318,1.0
3,61.228054,1.683767,310968.90625,10.798241,8.841745,1.08594,1.86341,2.587733,1.139823,1.991432,2739.261963,33.719238,41.647537,0.099785,1.0
4,32.039837,2.333866,207680.5,2.283855,10.997952,3.137402,5.96736,3.813838,1.209155,1.999511,1500.981934,127.846962,41.785858,0.326328,1.0


In [7]:
X_train.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country
0,39,6,77516,1,13,3,9,4,1,2,2174,0,40,1
1,50,2,83311,1,13,1,5,3,1,2,0,0,13,1
2,38,1,215646,4,9,2,7,4,1,2,0,0,40,1
3,53,1,234721,3,7,1,7,3,5,2,0,0,40,1
4,28,1,338409,1,13,1,6,1,5,1,0,0,40,13


In [7]:
import os
import katabatic

katabatic_path = os.path.dirname(katabatic.__file__)
print(os.listdir(katabatic_path))

models_path = os.path.join(katabatic_path, 'models')
if os.path.exists(models_path):
    print(os.listdir(models_path))
else:
    print("models directory not found")

tablegan_path = os.path.join(models_path, 'tablegan')
if os.path.exists(tablegan_path):
    print(os.listdir(tablegan_path))
else:
    print("tablegan directory not found")

['cities_demo.csv', 'importer.py', 'Iris.csv', 'katabatic.py', 'katabatic_config.json', 'katabatic_spi.py', 'metrics', 'models', 'test2.py', 'utils', '__init__.py', '__pycache__']
['ctgan', 'ganblr', 'meg', 'model_template', 'TableGAN', 'tvae', '__init__.py', '__pycache__']
['tablegan.py', 'tablegan_adapter.py', 'tablegan_utils.py', '__init__.py']


In [2]:
from katabatic.models.TableGAN import TableGANAdapter, TableGAN, preprocess_data, postprocess_data
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

# Load data
data_path = 'data/Adult/train_Adult_cleaned.csv'
labels_path = 'data/Adult/train_Adult_labels.csv'

# Load features and labels
X = pd.read_csv(data_path)
y = pd.read_csv(labels_path, header=None)

# If y has multiple columns, assume the first column is the target
if y.shape[1] > 1:
    y = y.iloc[:, 0]

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=42)

# Function to identify numerical columns
def is_numerical(dtype):
    return dtype.kind in 'iuf'

# Identify numerical columns
column_is_numerical = X_train.dtypes.apply(is_numerical).values
numerical_columns = np.argwhere(column_is_numerical).ravel()

# Initialize the adapter with a specific privacy setting
tablegan_adapter = TableGANAdapter(type='mixed', privacy_setting='high')

# Fit the model
tablegan_adapter.fit(X_train, y_train, epochs=200, batch_size=64)

# Generate synthetic data
synthetic_data = tablegan_adapter.generate(size=1000)

# Print shapes to verify
print("Shape of X_train:", X_train.shape)
print("Shape of y_train:", y_train.shape)
print("Shape of synthetic data:", synthetic_data.shape)

# Display the first few rows of the synthetic data
pd.DataFrame(synthetic_data).head()

---FIT TableGAN Model with high privacy setting
---Initialise TableGAN Model


  y = column_or_1d(y, warn=True)


Epoch 10/200: [D loss: -6.0872] [G loss: 12.9314] [C loss: 0.5011]
Epoch 20/200: [D loss: -4.4862] [G loss: 9.4993] [C loss: 0.2545]
Epoch 30/200: [D loss: -1.8858] [G loss: 0.9433] [C loss: 0.1350]
Epoch 40/200: [D loss: -1.5708] [G loss: 0.2090] [C loss: 0.0813]
Epoch 50/200: [D loss: -1.4518] [G loss: 0.1704] [C loss: 0.0526]
Epoch 60/200: [D loss: -1.0858] [G loss: 0.1276] [C loss: 0.0413]
Epoch 70/200: [D loss: -0.9020] [G loss: 0.2798] [C loss: 0.0304]
Epoch 80/200: [D loss: -0.9490] [G loss: 0.4647] [C loss: 0.0251]
Epoch 90/200: [D loss: -0.9015] [G loss: 0.5510] [C loss: 0.0207]
Epoch 100/200: [D loss: -0.8175] [G loss: 0.4248] [C loss: 0.0197]
Epoch 110/200: [D loss: -0.7717] [G loss: 0.2299] [C loss: 0.0159]
Epoch 120/200: [D loss: -0.7385] [G loss: 0.4194] [C loss: 0.0109]
Epoch 130/200: [D loss: -0.7586] [G loss: 0.3148] [C loss: 0.0092]
Epoch 140/200: [D loss: -0.7270] [G loss: 0.5133] [C loss: 0.0083]
Epoch 150/200: [D loss: -0.7013] [G loss: 0.4182] [C loss: 0.0074]
Epo

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,25.68671,5.114521,210898.953125,6.770069,7.573856,3.380289,9.369243,3.609228,1.699542,1.995605,3653.46582,154.865189,40.637207,0.879652,0.0
1,50.886093,0.960232,260874.59375,5.175142,8.424698,4.416758,7.411941,4.56381,1.837688,1.008307,2824.746094,18.07225,40.245911,0.66215,0.0
2,29.726336,5.242323,164565.84375,2.602961,9.732572,2.883189,9.551721,4.521248,2.629245,1.933455,11014.456055,271.148468,40.806549,2.036694,0.0
3,57.656387,2.477122,139024.328125,3.79808,13.337752,1.021181,2.971313,1.834893,1.00522,1.99512,2998.986572,106.892059,40.507381,4.283048,0.0
4,27.148123,1.131275,135529.015625,7.359543,12.470115,1.887117,3.401529,3.546858,1.012024,1.998767,1768.729248,147.952225,40.721436,1.834941,0.0


In [3]:
X_train.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country
440,29,1,363425,1,13,3,6,4,1,2,0,0,40,1
573,37,1,22463,7,11,1,2,3,1,2,0,1977,40,1
946,40,1,184682,2,10,2,9,6,1,1,0,0,40,1
997,48,4,33109,1,13,2,5,6,1,2,0,0,58,1
503,58,1,180980,2,10,2,3,6,1,1,0,0,42,23


In [11]:
from katabatic.models.TableGAN import TableGANAdapter, TableGAN, preprocess_data, postprocess_data
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler



# Initialize the adapter with a specific privacy setting
tablegan_adapter = TableGANAdapter(type='mixed', privacy_setting='high')

# Fit the model
tablegan_adapter.fit(X_train, y_train, epochs=200, batch_size=64)

# Define discrete columns
discrete_columns = ['age', 'workclass', 'education', 'education-num', 'marital-status']  # Add all discrete columns

# Initialize scaler
scaler = MinMaxScaler()
scaled_data = scaler.fit_transform(X_train)

# Generate synthetic data
n_samples = 1000
synthetic_data = tablegan_adapter.generate(size=n_samples)

# Post-process discrete columns
# Check the shape of synthetic data
print("Shape of synthetic data:", synthetic_data.shape)
print("Shape of X_train:", X_train.shape)

pd.DataFrame(synthetic_data).head()

# Ensure synthetic_data has the same number of features as X_train
if synthetic_data.shape[1] != X_train.shape[1]:
    print("Warning: Mismatch in number of features. Adjusting synthetic data.")
    synthetic_data = synthetic_data[:, :X_train.shape[1]]

# Convert synthetic data to a DataFrame
synthetic_df = pd.DataFrame(synthetic_data, columns=X_train.columns)

# Post-process discrete columns
for col_name in discrete_columns:
    if col_name == 'age':
        synthetic_df[col_name] = np.round(synthetic_df[col_name]).astype(int)
    else:
        synthetic_df[col_name] = np.round(synthetic_df[col_name])

# Clamp values within the original range for discrete columns
for col_name in discrete_columns:
    min_val = X_train[col_name].min()
    max_val = X_train[col_name].max()
    synthetic_df[col_name] = np.clip(synthetic_df[col_name], min_val, max_val)

# Print age statistics
print("Age range in synthetic data:", synthetic_df['age'].min(), "-", synthetic_df['age'].max())
print("Age distribution in synthetic data:")
print(synthetic_df['age'].value_counts().sort_index().head())

# Display the first few rows of the synthetic data
print(synthetic_df.head())

Shape of synthetic data: (1000, 15)
Shape of X_train: (1000, 14)
Age range in synthetic data: 18 - 70
Age distribution in synthetic data:
age
18    18
19    20
20    17
21    34
22    29
Name: count, dtype: int64
   age  workclass         fnlwgt  education  education-num  marital-status  \
0   36        1.0  148436.421875        3.0            5.0             3.0   
1   32        0.0  258493.687500        5.0           13.0             2.0   
2   25        0.0  111331.015625        1.0           13.0             1.0   
3   49        0.0  112539.156250        3.0            7.0             2.0   
4   31        0.0  280701.781250        3.0           12.0             2.0   

   occupation  relationship      race       sex  capital-gain  capital-loss  \
0    1.668044      2.665293  1.222549  1.727480   4754.707520     90.608620   
1    1.958121      4.081008  1.006382  1.999623    777.928223     90.198013   
2    5.090883      3.625424  1.005751  1.999899    409.163910     60.240929   
3 