# DataSynthesizer Usage (independent attribute mode)

> This is a quick demo of how to use DataSynthesizer in independent attribute mode.

### Step 1 import packages

In [None]:
pip install DataSynthesizer

Collecting DataSynthesizer
  Downloading DataSynthesizer-0.1.13-py2.py3-none-any.whl (24 kB)
Installing collected packages: DataSynthesizer
Successfully installed DataSynthesizer-0.1.13


In [None]:
from DataSynthesizer.DataDescriber import DataDescriber
from DataSynthesizer.DataGenerator import DataGenerator
from DataSynthesizer.ModelInspector import ModelInspector
from DataSynthesizer.lib.utils import read_json_file, display_bayesian_network
import os
import pandas as pd

In [None]:
# Mount Google Drive at /content/drive (Colab only); the dataset and output
# paths configured in this notebook live under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


### Step 2 user-defined parameters

In [None]:
# Input dataset (CSV file on the mounted Google Drive).
input_data_path = '/content/drive/MyDrive/Proj_Data/Dtlz/dtlz5.csv'

# Directory that receives the output files, with one sub-directory per mode.
mode = 'independent_attribute_mode'
output_directory = f'/content/drive/MyDrive/Proj_Data/Dtlz/dtlz5/out/{mode}'
# exist_ok=True creates the directory tree when it is missing and is a no-op when it
# already exists, avoiding the gap between a separate existence check and the creation.
os.makedirs(output_directory, exist_ok=True)

# Paths of the two output files inside the output directory.
description_file = os.path.join(output_directory, 'description.json')
synthetic_data = os.path.join(output_directory, 'synthetic_data.csv')

In [None]:
# Load the input dataset so its size and per-column cardinalities can be inspected.
input_data = pd.read_csv(input_data_path)

# Basic characteristics: row count, column count, distinct values per column.
num_tuples, num_columns = len(input_data), len(input_data.columns)
num_unique_values = input_data.nunique()
max_unique_values = num_unique_values.max()

# Category threshold: the largest per-column cardinality, but never less than 10.
threshold_value = max(10, max_unique_values)

# Flag each column as categorical when it has object dtype or fewer distinct
# values than the threshold.
# NOTE(review): the threshold is at least the largest cardinality, so a non-object
# column is flagged non-categorical only when its cardinality equals that maximum
# (and is >= 10) -- confirm this is the intended rule.
categorical_attributes = {
    column: bool(input_data[column].dtype == 'object'
                 or num_unique_values[column] < threshold_value)
    for column in input_data.columns
}

# Flag candidate keys purely by column name: 'id' or anything ending in '_id'
# (case-insensitive). Adjust the rule if the dataset names its keys differently.
candidate_keys = {
    column: column.lower() == 'id' or column.lower().endswith('_id')
    for column in input_data.columns
}

# Each synthetic dataset gets as many rows as the input dataset.
num_tuples_to_generate = num_tuples

In [None]:
# Generate and save 20 synthetic datasets from a single dataset description.
NUM_SYNTHETIC_DATASETS = 20

# The description is built from the input dataset and the flags computed above, none of
# which change between iterations, so it is computed and written once up front.
describer = DataDescriber(category_threshold=threshold_value)
describer.describe_dataset_in_independent_attribute_mode(dataset_file=input_data_path,
                                                         attribute_to_is_categorical=categorical_attributes,
                                                         attribute_to_is_candidate_key=candidate_keys)
describer.save_dataset_description_to_file(description_file)

for i in range(1, NUM_SYNTHETIC_DATASETS + 1):
    generator = DataGenerator()
    synthetic_data_filename = f'synthetic_data_mutated_{i}.csv'
    synthetic_data_path = os.path.join(output_directory, synthetic_data_filename)
    # NOTE(review): generate_dataset_in_independent_mode is understood to accept a `seed`
    # argument defaulting to 0 (confirm against the installed DataSynthesizer version).
    # With the default, every iteration would sample with the same seed and write the
    # same rows; seeding with the loop index makes each file a distinct, reproducible draw.
    generator.generate_dataset_in_independent_mode(num_tuples_to_generate, description_file, seed=i)
    generator.save_synthetic_data(synthetic_data_path)