In [11]:
import pandas as pd

df = pd.read_csv("/Users/camilodavid/Library/CloudStorage/OneDrive-Personal/Tohoku U/PhD/Papers/4rd Paper/Code/Seed1/mineros/AI sampling/discrete.csv")

from sdv.metadata import SingleTableMetadata

metadata = SingleTableMetadata()
metadata.detect_from_dataframe(data=df)
print('Auto detected data:\n')
metadata

Auto detected data:



{
    "METADATA_SPEC_VERSION": "SINGLE_TABLE_V1",
    "columns": {
        "Budget_category": {
            "sdtype": "categorical"
        },
        "Type": {
            "sdtype": "categorical"
        },
        "Labor_Risk": {
            "sdtype": "numerical"
        },
        "Analysis_Risk": {
            "sdtype": "numerical"
        },
        "Decision_Risk": {
            "sdtype": "numerical"
        },
        "Engineering_Risk": {
            "sdtype": "numerical"
        },
        "Provider_Risk": {
            "sdtype": "numerical"
        },
        "Weather_Risk": {
            "sdtype": "numerical"
        },
        "Fabrication_Risk": {
            "sdtype": "numerical"
        },
        "Schedule_category": {
            "sdtype": "categorical"
        },
        "Time_Overrun_category": {
            "sdtype": "categorical"
        }
    }
}

First let's create a simple synthesizer with the default values of the TVAE algorithm

In [12]:
from sdv.single_table import TVAESynthesizer
from rdt.transformers.categorical import LabelEncoder, OrderedUniformEncoder
from sdv.single_table import TVAESynthesizer
from sdv.evaluation.single_table import evaluate_quality

synthesizer = TVAESynthesizer(
    metadata,
    batch_size=8,  # smaller batch size, should be evenly divisible by pac
    #pac=2, # larger means each individual's data in the training set has less influence on the output, which makes it harder to reverse-engineer the initial training data from the synthetic data.
    compress_dims=(64, 64),
    decompress_dims=(64, 64),
    embedding_dim=64,  # smaller for a smaller dataset
    epochs=3000,   # More epochs for small datasets
    loss_factor=6.0,  # Can be increased if synthetic data is not similar to original
)

synthesizer.load_custom_constraint_classes(
    filepath='/Users/camilodavid/Library/CloudStorage/OneDrive-Personal/Tohoku U/PhD/Papers/4rd Paper/Code/Seed1/mineros/AI sampling/custom_constraint_template.py',
    class_names=['RiskOverrunConstraint']
)

my_constraint = {
    'constraint_class': 'RiskOverrunConstraint',
    'constraint_parameters': {
        'column_names': ['Labor_Risk', 'Analysis_Risk', 'Decision_Risk', 'Engineering_Risk', 'Provider_Risk', 'Weather_Risk', 'Fabrication_Risk'],
        'extra_parameter': None
    }
}

synthesizer.add_constraints([my_constraint])

synthesizer.fit(df)

cattime_transformer = OrderedUniformEncoder(order=['[0,0.16]', '(0.16,0.26]','(0.26,1]'])
catcost_transformer = OrderedUniformEncoder(order=['Small', 'Medium', 'Large'])
catoverrun_transformer = OrderedUniformEncoder(order=['[0,0.12]','(0.12,0.43]','(0.43,1]'])
type_transformer = LabelEncoder()

synthesizer.update_transformers(column_name_to_transformer={
'Schedule_category': cattime_transformer,
'Time_Overrun_category': catoverrun_transformer,
'Budget_category': catcost_transformer,
'Type': type_transformer,
'Labor_Risk': None,
'Analysis_Risk': None,
'Decision_Risk': None,
'Engineering_Risk': None,
'Provider_Risk': None,
'Weather_Risk': None,
'Fabrication_Risk': None
})

synthesizer.fit(df)




In [13]:
syn = synthesizer.sample(num_rows=405)
aug = syn.append(df)
aug.to_csv("/Users/camilodavid/Library/CloudStorage/OneDrive-Personal/Tohoku U/PhD/Papers/4rd Paper/Code/Seed1/mineros/AI sampling/aug.csv", index=False)

quality_report = evaluate_quality(real_data=df, synthetic_data=aug, metadata=metadata)

Sampling rows: 100%|██████████| 405/405 [00:00<00:00, 7245.22it/s]
  aug = syn.append(df)
Creating report: 100%|██████████| 4/4 [00:00<00:00, 52.10it/s]



Overall Quality Score: 95.85%

Properties:
Column Shapes: 96.69%
Column Pair Trends: 95.01%


Let us do conditional sampling with the conditions of the two projects that we will use to illustrate the usefulness of this approach

In [26]:
from sdv.sampling import Condition

project_182_183 = Condition(
    num_rows=405,
    column_values={'Budget_category': 'Medium', 'Schedule_category': '[0,0.16]', 'Type': 'Sustainment'}
)

conditional_data = synthesizer.sample_from_conditions(
    conditions=[project_182_183],
    output_file_path="/Users/camilodavid/Library/CloudStorage/OneDrive-Personal/Tohoku U/PhD/Papers/4rd Paper/Code/Seed1/mineros/AI sampling/conditional_data.csv"
)

Sampling conditions: 100%|██████████| 405/405 [00:00<00:00, 2148.78it/s]


In [25]:
project_183 = Condition(
    num_rows=10000,
    column_values={'Budget_category': 'Medium', 'Schedule_category': '[0,0.16]', 'Type': 'Sustainment', 'Provider_Risk' : 1}
)

conditional_data2 = synthesizer.sample_from_conditions(
    conditions=[project_183],
    output_file_path="/Users/camilodavid/Library/CloudStorage/OneDrive-Personal/Tohoku U/PhD/Papers/4rd Paper/Code/Seed1/mineros/AI sampling/conditional_data2.csv"
)

Sampling conditions: 100%|██████████| 10000/10000 [00:03<00:00, 2794.62it/s]
