In [1]:
%load_ext autoreload
%autoreload 2
%reload_ext autoreload

In [41]:
import numpy as np
import pandas as pd
from collections import namedtuple
from rdt.transformers import ClusterBasedNormalizer, OneHotEncoder

# DataTransformer is provided by the `transformer` module
from transformer import DataTransformer

# Synthetic test dataset: two numeric columns (age, income) plus three columns
# that are later declared discrete (education, marital_status, num_children).
SEED = 42         # fixed seed so every run draws the same sample
N_SAMPLES = 1000  # row count, shared by every column so their lengths cannot drift apart

np.random.seed(SEED)  # seeds NumPy's global (legacy) RNG

# NOTE: the draws below consume the global RNG in this exact order; reordering
# the columns would change the generated values.
data = pd.DataFrame({
    # Normal(35, 10), clipped to [18, 80] and truncated to whole years
    'age': np.random.normal(35, 10, N_SAMPLES).clip(18, 80).astype(int),
    # Log-normal with underlying mean 10 and sigma 1 (right-skewed, strictly positive)
    'income': np.random.lognormal(10, 1, N_SAMPLES),
    'education': np.random.choice(['High School', 'Bachelor', 'Master', 'PhD'], N_SAMPLES),
    'marital_status': np.random.choice(['Single', 'Married', 'Divorced', 'Widowed'], N_SAMPLES),
    # Poisson counts with rate 1.5
    'num_children': np.random.poisson(1.5, N_SAMPLES)
})

# Quick look at the generated sample and the dtypes pandas inferred
print("Original data sample:")
print(data.head())
print("\nData types:")
print(data.dtypes)

# Columns handed to the transformer as discrete
discrete_columns = ['education', 'marital_status', 'num_children']

# Build the transformer and fit it on the DataFrame
transformer = DataTransformer(weight_threshold=0.01, max_clusters=5)
transformer.fit(data, discrete_columns=discrete_columns)

# Report, for each fitted column, its name, its type and how many output
# dimensions it occupies in the transformed representation
print("\nColumn transformation info:")
for col_info in transformer._column_transform_info_list:
    print(
        f"Column: {col_info.column_name}, "
        f"Type: {col_info.column_type}, "
        f"Output dimensions: {col_info.output_dimensions}"
    )

# Forward pass: run the fitted transformer over the DataFrame
transformed_data = transformer.transform(data)
print("\nTransformed data shape:", transformed_data.shape)
print("First row of transformed data:", transformed_data[0])

# Backward pass: map the transformed representation back
recovered_data = transformer.inverse_transform(transformed_data)
print("\nRecovered data sample:", recovered_data.head(), sep="\n")

# Same backward pass, this time supplying `sigmas`: a constant 0.1 for every
# output dimension of the transformer
sigmas = np.full(transformer.output_dimensions, 0.1)
noisy_recovered_data = transformer.inverse_transform(transformed_data, sigmas=sigmas)
print("\nNoisy recovered data sample:", noisy_recovered_data.head(), sep="\n")

# Exercise convert_column_name_value_to_id for one (column, value) pair;
# a ValueError from the lookup is reported instead of aborting the cell
try:
    column_info = transformer.convert_column_name_value_to_id('education', 'Bachelor')
    print("\nColumn ID info for 'education' with value 'Bachelor':", column_info, sep="\n")
except ValueError as err:
    print(f"Error: {err}")

# Repeat the fit / transform / inverse_transform round trip on a plain NumPy
# array instead of a DataFrame.
# NOTE(review): `data` mixes numeric and string columns, so to_numpy() is
# expected to return an object-dtype array — confirm DataTransformer copes with that.
numpy_data = data.to_numpy()
numpy_transformer = DataTransformer(max_clusters=5, weight_threshold=0.01)

# The array carries no column names, so discrete columns are identified by
# position. Derive the positions from `discrete_columns` rather than
# hard-coding them, so the two can never drift apart
# (evaluates to [2, 3, 4] for the current dataset).
discrete_column_indices = [data.columns.get_loc(name) for name in discrete_columns]
numpy_transformer.fit(numpy_data, discrete_columns=discrete_column_indices)

numpy_transformed = numpy_transformer.transform(numpy_data)
print("\nNumPy transformed data shape:", numpy_transformed.shape)

numpy_recovered = numpy_transformer.inverse_transform(numpy_transformed)
print("NumPy recovered data shape:", numpy_recovered.shape)


Original data sample:
   age        income education marital_status  num_children
0   39  89264.168397  Bachelor        Widowed             1
1   33  55527.500577    Master       Divorced             1
2   41  23379.862874  Bachelor       Divorced             2
3   50  11534.100901    Master       Divorced             1
4   32  44277.118652    Master        Married             1

Data types:
age                 int64
income            float64
education          object
marital_status     object
num_children        int32
dtype: object

Column transformation info:
Column: age, Type: continuous, Output dimensions: 6
Column: income, Type: continuous, Output dimensions: 5
Column: education, Type: discrete, Output dimensions: 4
Column: marital_status, Type: discrete, Output dimensions: 4
Column: num_children, Type: discrete, Output dimensions: 7

Transformed data shape: (1000, 26)
First row of transformed data: [0.08977495 0.         1.         0.         0.         0.
 0.06426067 0.         