In [1]:
import pandas as pd
from sdv.metadata import Metadata
from sdv.single_table import GaussianCopulaSynthesizer

In [2]:
# Load the real penguin table and discard the "ID" column so it is not
# passed on to metadata detection or the synthesizer.
penguins_raw = pd.read_csv("penguins.csv")
df = penguins_raw.drop(columns=["ID"])

In [3]:
# Let SDV auto-detect the table metadata from the loaded frame.
md = Metadata.detect_from_dataframe(data=df)

In [4]:
# Create the Gaussian-copula synthesizer from the auto-detected metadata.
# NOTE: when this cell runs, SDV prints a recommendation to persist the
# metadata with 'save_to_json' for replicability in future SDV versions.
synthesizer = GaussianCopulaSynthesizer(metadata=md)


We strongly recommend saving the metadata using 'save_to_json' for replicability in future SDV versions.



In [5]:
# Constraint specification in SDV's dict form: a FixedIncrements constraint
# on the "Body Mass (g)" column with an increment of 5.
mass_increment_params = dict(column_name="Body Mass (g)", increment_value=5)
mass_constraint = dict(
    constraint_class="FixedIncrements",
    constraint_parameters=mass_increment_params,
)

In [6]:
# Register the body-mass constraint on the synthesizer; this happens
# before the synthesizer is fitted further down the notebook.
synthesizer.add_constraints(constraints=[mass_constraint])

In [7]:
# Display the constraints currently registered on the synthesizer to
# confirm the FixedIncrements entry was added.
synthesizer.get_constraints()

[{'constraint_class': 'FixedIncrements',
  'constraint_parameters': {'column_name': 'Body Mass (g)',
   'increment_value': 5}}]

In [8]:
# Fit the synthesizer on the real penguin table (ID column already dropped).
synthesizer.fit(df)

In [9]:
# Display the transformer the synthesizer holds for each column.
synthesizer.get_transformers()

{'Species': UniformEncoder(),
 'Island': UniformEncoder(),
 'Sex': UniformEncoder(),
 'Culmen Length (mm)': FloatFormatter(learn_rounding_scheme=True, enforce_min_max_values=True),
 'Culmen Depth (mm)': FloatFormatter(learn_rounding_scheme=True, enforce_min_max_values=True),
 'Flipper Length (mm)': FloatFormatter(learn_rounding_scheme=True, enforce_min_max_values=True),
 'Body Mass (g)': FloatFormatter(learn_rounding_scheme=True, enforce_min_max_values=True)}

In [10]:
# Display the synthesizer's configuration parameters.
synthesizer.get_parameters()

{'enforce_min_max_values': True,
 'enforce_rounding': True,
 'locales': ['en_US'],
 'numerical_distributions': {},
 'default_distribution': 'beta'}

In [12]:
# Draw a synthetic table with as many rows as the real one.
n_real_rows = len(df)
syn_data = synthesizer.sample(num_rows=n_real_rows)

  0%|          | 0/333 [00:00<?, ?it/s]

Sampling rows: 100%|██████████| 333/333 [00:00<00:00, 6986.56it/s]


In [13]:
# Write the synthetic rows to CSV; index=False leaves the DataFrame index
# out of the file.
syn_data.to_csv("penguins_synthetic.csv", index=False)