In [32]:
import os

In [33]:
def request_paths():
    """Prompt for the input file location and build the input/output paths.

    Asks interactively for the origin folder, the file name (with
    extension) and the output folder. Surrounding whitespace is removed
    from each answer and a leading ``~`` is expanded to the home folder.

    Returns:
        tuple: ``(full_file_path, full_path_output)``, where the output file
        is named ``'synthetic_' + file_name`` inside the output folder, or
        ``(None, None)`` when the input path is not an existing file.
    """
    # Pasted paths often carry stray spaces/newlines that would make a
    # valid file look missing, so trim every answer before using it.
    path_origin = os.path.expanduser(input("Enter the origin path of the file: ").strip())
    file_name = input("Enter the file name (with extension): ").strip()
    path_output = os.path.expanduser(input("Enter the output path: ").strip())

    full_file_path = os.path.join(path_origin, file_name)
    full_path_output = os.path.join(path_output, 'synthetic_' + file_name)

    # isfile (not exists): an empty file name makes the joined path point at
    # the origin folder itself, which exists but cannot be read as a file.
    if not os.path.isfile(full_file_path):
        print(f"Error: The file '{full_file_path}' does not exist or is not a file.")
        return None, None

    return full_file_path, full_path_output

full_file_path, full_path_output = request_paths()

# request_paths() signals a missing input file with (None, None). Stop here
# with a clear message instead of letting later cells fail on a None path.
if full_file_path is None:
    raise FileNotFoundError(
        "No valid input file was provided; re-run this cell with an existing file."
    )


### Load the data

In [34]:
import pandas as pd

# Read the CSV selected above into a DataFrame; this is the "real" data the
# synthesizer is fitted on and the evaluation reports compare against.
data = pd.read_csv(full_file_path)

In [None]:
# Preview the first rows of the loaded data.
data.head()

### Metadata

In [36]:
from sdv.metadata import SingleTableMetadata

# Start from an empty single-table metadata object, then let SDV infer the
# column descriptions (sdtypes) from the loaded DataFrame.
metadata = SingleTableMetadata()
metadata.detect_from_dataframe(data)

In [None]:
# Display the auto-detected metadata so the inferred sdtypes can be reviewed.
metadata

In [None]:
# Show the metadata using SDV's built-in visualization.
metadata.visualize()

It is important to fix the sdtypes!
- *boolean*: describes columns that contain True or False values.
- *categorical*: describes columns that contain distinct categories.
- *datetime*: describes columns that indicate a point of time. It requires a *datetime_format* specification.
- *numerical*: describes data with numbers. There are different *computer_representation* options: *Float*, *Int64*, ... 
- *id* : describes columns that are used to identify rows. There is an option that allows you to describe the format of the ID, *regex_format*. 

> NOTE: Columns that hold Personally Identifiable Information (PII) can be flagged as such. For those columns, the synthetic data will contain entirely new values that do not appear in the original data. A PII column is marked in the metadata with "pii": true. 

#### Fixing sdtypes

In [75]:
# Example: declare 'EGA_id' as an ID column whose values look like "EGA_"
# followed by five digits. Adapt the column name and pattern to your data.
metadata.update_column(
    column_name='EGA_id',
    sdtype='id',
    # Raw string: '\d' is not a valid Python string escape, so a plain
    # literal triggers an invalid-escape warning on recent Python versions.
    regex_format=r"^EGA_\d{5}$"
)

In [76]:
# Re-check the metadata after the manual sdtype edits
# (NOTE(review): SDV is expected to raise here if the specification is invalid — confirm).
metadata.validate()

### SDV model

In [None]:
from sdv.single_table import GaussianCopulaSynthesizer

# Build a Gaussian copula synthesizer from the reviewed metadata and fit it
# on the real data.
sdv_model = GaussianCopulaSynthesizer(metadata=metadata)
sdv_model.fit(data=data)

> NOTE: There is an option to add constraints (i.e. logical rules) to the synthesizer.

In [None]:
# Number of synthetic rows to draw from the fitted model.
number_rows = 500

synthetic_data = sdv_model.sample(num_rows=number_rows)

# Preview the first synthetic rows.
synthetic_data.head()

### Evaluation: Real vs. Synthetic data

#### A. Diagnostic

In [None]:
from sdv.evaluation.single_table import run_diagnostic

# Run SDV's diagnostic on the synthetic data against the real data and the
# metadata; the report object is queried per property in the cells below.
diagnostic_report = run_diagnostic(
    real_data=data,
    synthetic_data=synthetic_data,
    metadata=metadata,
)

In [None]:
# Detailed breakdown of the 'Data Validity' property of the diagnostic report.
diagnostic_report.get_details(property_name='Data Validity')

In [None]:
# Detailed breakdown of the 'Data Structure' property of the diagnostic report.
diagnostic_report.get_details(property_name='Data Structure')

#### B. Data Quality

In [None]:
from sdv.evaluation.single_table import evaluate_quality

# Compute SDV's quality report for the synthetic data, using the real data
# and the metadata as the reference.
quality_report = evaluate_quality(
    real_data=data,
    synthetic_data=synthetic_data,
    metadata=metadata,
)

### Store synthetic data

In [66]:
# Make sure the destination folder exists, so the export does not fail at the
# very end of the notebook. An empty dirname means "current folder".
output_dir = os.path.dirname(full_path_output)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)

# Write the synthetic rows without the DataFrame index column.
synthetic_data.to_csv(full_path_output, index=False)