In [22]:
import pandas as pd

# Load the raw records, drop the rows whose ID is 182 or 183, and keep only
# the eleven columns that the rest of the notebook works with.
df = (
    pd.read_csv("/Users/camilodavid/Library/CloudStorage/OneDrive-Personal/Tohoku U/PhD/Papers/4rd Paper/Code/Seed1/mineros/AI sampling/data.csv")
    .loc[
        lambda frame: ~frame['ID'].isin([182, 183]),
        [
            'Schedule_category',
            'Time_Overrun_category',
            'Budget_category',
            'Type',
            'Labor_Risk',
            'Analysis_Risk',
            'Decision_Risk',
            'Engineering_Risk',
            'Provider_Risk',
            'Weather_Risk',
            'Fabrication_Risk',
        ],
    ]
)

# Summarise dtypes and non-null counts of the filtered frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 81 entries, 0 to 82
Data columns (total 11 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   Schedule_category      81 non-null     object
 1   Time_Overrun_category  81 non-null     int64 
 2   Budget_category        81 non-null     object
 3   Type                   81 non-null     object
 4   Labor_Risk             81 non-null     int64 
 5   Analysis_Risk          81 non-null     int64 
 6   Decision_Risk          81 non-null     int64 
 7   Engineering_Risk       81 non-null     int64 
 8   Provider_Risk          81 non-null     int64 
 9   Weather_Risk           81 non-null     int64 
 10  Fabrication_Risk       81 non-null     int64 
dtypes: int64(8), object(3)
memory usage: 7.6+ KB


Detect the metadata

In [23]:
from sdv.metadata import SingleTableMetadata

# Start from an empty metadata object and let SDV infer an sdtype for every
# column of df.
metadata = SingleTableMetadata()
metadata.detect_from_dataframe(df)

# NOTE(review): every int64 column (Time_Overrun_category and the *_Risk
# columns) is detected as "numerical" -- confirm that this is intended and that
# none of them is really an integer-coded category.
print('Auto detected data:\n')
metadata

Auto detected data:



{
    "columns": {
        "Schedule_category": {
            "sdtype": "categorical"
        },
        "Time_Overrun_category": {
            "sdtype": "numerical"
        },
        "Budget_category": {
            "sdtype": "categorical"
        },
        "Type": {
            "sdtype": "categorical"
        },
        "Labor_Risk": {
            "sdtype": "numerical"
        },
        "Analysis_Risk": {
            "sdtype": "numerical"
        },
        "Decision_Risk": {
            "sdtype": "numerical"
        },
        "Engineering_Risk": {
            "sdtype": "numerical"
        },
        "Provider_Risk": {
            "sdtype": "numerical"
        },
        "Weather_Risk": {
            "sdtype": "numerical"
        },
        "Fabrication_Risk": {
            "sdtype": "numerical"
        }
    },
    "METADATA_SPEC_VERSION": "SINGLE_TABLE_V1"
}

First, let's create a simple synthesizer with the default values of the TVAE algorithm.

In [24]:
from sdv.single_table import TVAESynthesizer

# Baseline synthesizer: the metadata is the only required argument; the other
# settings are spelled out explicitly for reference.
synthesizer_1 = TVAESynthesizer(
    metadata,
    epochs=300,  # default value, also used for the data augmentation with the Benchmark BNs
    enforce_rounding=True,  # considered irrelevant in this case
    enforce_min_max_values=True,  # considered irrelevant in this case
)

# Train the baseline model on the filtered real data.
synthesizer_1.fit(df)



Create the synthetic data, build the augmented dataset, and calculate the similarity score.

In [25]:
from sdv.evaluation.single_table import evaluate_quality
from sdv.metadata import SingleTableMetadata

# Reset the sampler before drawing 300 synthetic rows from the baseline model.
synthesizer_1.reset_sampling()
syn_1 = synthesizer_1.sample(num_rows=300)

# Augmented dataset = synthetic rows followed by the real rows.
# DataFrame.append is deprecated (it raised the FutureWarning seen in this
# cell's output) and was removed in pandas 2.0; pd.concat with its default
# arguments produces the same frame, with the original index labels preserved.
aug_1 = pd.concat([syn_1, df])

# Compare the augmented dataset against the real data.
quality_report_TVAE_1 = evaluate_quality(
    real_data=df,
    synthetic_data=aug_1,
    metadata=metadata)
quality_report_TVAE_1

  aug_1 = syn_1.append(df)
Creating report: 100%|██████████| 4/4 [00:00<00:00, 47.96it/s]



Overall Quality Score: 82.49%

Properties:
Column Shapes: 84.61%
Column Pair Trends: 80.37%


<sdmetrics.reports.single_table.quality_report.QualityReport at 0x28011bd60>

We see that the similarity of the augmented dataset is a bit low, which might introduce noise when learning the BN parameters.
So we make some adjustments.

In [26]:
from sdv.single_table import TVAESynthesizer

# Second attempt: a smaller network trained for longer, tuned for the small
# training set.
synthesizer_2 = TVAESynthesizer(
    metadata,
    epochs=2000,  # more epochs for small datasets
    batch_size=8,  # smaller batch size
    embedding_dim=64,  # smaller for a smaller dataset
    compress_dims=(64, 64),
    decompress_dims=(64, 64),
    loss_factor=4.0,  # can be increased if synthetic data is not similar to original
)

# Train the tuned model on the filtered real data.
synthesizer_2.fit(df)



Create the synthetic data, build the augmented dataset, and calculate the similarity score.

In [27]:
# Reset the sampler before drawing 300 synthetic rows from the tuned model.
synthesizer_2.reset_sampling()
syn_2 = synthesizer_2.sample(num_rows=300)

# Augmented dataset = synthetic rows followed by the real rows.
# DataFrame.append is deprecated (it raised the FutureWarning seen in this
# cell's output) and was removed in pandas 2.0; pd.concat with its default
# arguments produces the same frame, with the original index labels preserved.
aug_2 = pd.concat([syn_2, df])

# Compare the augmented dataset against the real data.
quality_report_TVAE_2 = evaluate_quality(
    real_data=df,
    synthetic_data=aug_2,
    metadata=metadata)
quality_report_TVAE_2

  aug_2 = syn_2.append(df)
Creating report: 100%|██████████| 4/4 [00:00<00:00, 47.66it/s]



Overall Quality Score: 95.71%

Properties:
Column Shapes: 96.46%
Column Pair Trends: 94.96%


<sdmetrics.reports.single_table.quality_report.QualityReport at 0x280035cd0>

The results are better. Based on the results obtained with the Benchmark BNs, this could be enough to not degrade the learning of the parameters for our BN.
However, we can try to get better results in terms of the similarity score.
We will adjust the transformers used to better fit our data, especially those for the categorical variables, which should be treated as ordered categories.
But first we need to check which transformations are being applied by default.

In [28]:
# Display the transformer currently assigned to each column of synthesizer_2
# (None means no transformer is assigned to that column).
synthesizer_2.get_transformers()

{'Schedule_category': None,
 'Time_Overrun_category': FloatFormatter(learn_rounding_scheme=True, enforce_min_max_values=True),
 'Budget_category': None,
 'Type': None,
 'Labor_Risk': FloatFormatter(learn_rounding_scheme=True, enforce_min_max_values=True),
 'Analysis_Risk': FloatFormatter(learn_rounding_scheme=True, enforce_min_max_values=True),
 'Decision_Risk': FloatFormatter(learn_rounding_scheme=True, enforce_min_max_values=True),
 'Engineering_Risk': FloatFormatter(learn_rounding_scheme=True, enforce_min_max_values=True),
 'Provider_Risk': FloatFormatter(learn_rounding_scheme=True, enforce_min_max_values=True),
 'Weather_Risk': FloatFormatter(learn_rounding_scheme=True, enforce_min_max_values=True),
 'Fabrication_Risk': FloatFormatter(learn_rounding_scheme=True, enforce_min_max_values=True)}

In [32]:
from rdt.transformers.categorical import OrderedLabelEncoder, LabelEncoder
from sdv.single_table import TVAESynthesizer

# Third attempt: same TVAE hyper-parameters as synthesizer_2, but with custom
# transformers for the categorical columns.
synthesizer_3 = TVAESynthesizer(
    metadata,
    batch_size=8,  # smaller batch size
    compress_dims=(64, 64),
    decompress_dims=(64, 64),
    embedding_dim=64,  # smaller for a smaller dataset
    epochs=2000,   # More epochs for small datasets
    loss_factor=4.0,  # Can be increased if synthetic data is not similar to original
)

# The transformers must be initialised before they can be updated. A full
# fit() was previously used for that, which trained for 2000 epochs only to be
# discarded by the refit below; auto_assign_transformers() prepares the
# transformers without training the model.
synthesizer_3.auto_assign_transformers(df)

# create new transformer objects (one instance per column, since each
# transformer is fitted to its own column)
cattime_transformer = OrderedLabelEncoder(order=['Small', 'Medium', 'Large'], add_noise=True)
catcost_transformer = OrderedLabelEncoder(order=['Small', 'Medium', 'Large'], add_noise=True)
type_transformer = LabelEncoder(add_noise=True)

# Ordered encoders for the two Small/Medium/Large columns, a plain label
# encoder for Type, and no transformer (None) for every remaining column.
synthesizer_3.update_transformers(column_name_to_transformer={
  'Schedule_category': cattime_transformer,
  'Time_Overrun_category': None,
  'Budget_category': catcost_transformer,
  'Type': type_transformer,
  'Labor_Risk': None,
  'Analysis_Risk': None,
  'Decision_Risk': None,
  'Engineering_Risk': None,
  'Provider_Risk': None,
  'Weather_Risk': None,
  'Fabrication_Risk': None
})

# Single training run, using the customised transformers.
synthesizer_3.fit(df)



With the synthesizer refitted using the updated transformers, we can sample from it again. We should see a small improvement in the similarity score.

In [33]:
# Reset the sampler before drawing 300 synthetic rows from the model with
# custom transformers.
synthesizer_3.reset_sampling()
syn_3 = synthesizer_3.sample(num_rows=300)

# Augmented dataset = synthetic rows followed by the real rows.
# DataFrame.append is deprecated (it raised the FutureWarning seen in this
# cell's output) and was removed in pandas 2.0; pd.concat with its default
# arguments produces the same frame, with the original index labels preserved.
aug_3 = pd.concat([syn_3, df])

# Compare the augmented dataset against the real data.
quality_report_TVAE_3 = evaluate_quality(
    real_data=df,
    synthetic_data=aug_3,
    metadata=metadata)
quality_report_TVAE_3

  aug_3 = syn_3.append(df)
Creating report: 100%|██████████| 4/4 [00:00<00:00, 46.00it/s]



Overall Quality Score: 96.46%

Properties:
Column Shapes: 96.89%
Column Pair Trends: 96.03%


<sdmetrics.reports.single_table.quality_report.QualityReport at 0x2943be100>

In [36]:
from rdt.transformers.categorical import OrderedLabelEncoder
from sdv.single_table import TVAESynthesizer

# Fourth attempt: same TVAE hyper-parameters as before, plus a custom
# constraint over the seven risk columns.
synthesizer_4 = TVAESynthesizer(
    metadata,
    batch_size=8,  # smaller batch size
    compress_dims=(64, 64),
    decompress_dims=(64, 64),
    embedding_dim=64,  # smaller for a smaller dataset
    epochs=2000,   # More epochs for small datasets
    loss_factor=4.0,  # Can be increased if synthetic data is not similar to original
)

# Register the custom constraint class defined in the external template file.
synthesizer_4.load_custom_constraint_classes(
    filepath='/Users/camilodavid/Library/CloudStorage/OneDrive-Personal/Tohoku U/PhD/Papers/4rd Paper/Code/Seed1/mineros/AI sampling/custom_constraint_template.py',
    class_names=['RiskOverrunConstraint']
)

# Apply the constraint to the seven risk columns.
my_constraint = {
    'constraint_class': 'RiskOverrunConstraint',
    'constraint_parameters': {
        'column_names': ['Labor_Risk', 'Analysis_Risk', 'Decision_Risk', 'Engineering_Risk', 'Provider_Risk', 'Weather_Risk', 'Fabrication_Risk'],
        'extra_parameter': None
    }
}
synthesizer_4.add_constraints([my_constraint])

# The following cell overrides the transformers and then calls fit(), so a
# full 2000-epoch fit() here would be thrown away. auto_assign_transformers()
# initialises the transformers (which update_transformers requires) without
# training the model.
synthesizer_4.auto_assign_transformers(df)



In [37]:
# Encoders for the three categorical columns: a plain label encoder for Type
# and one ordered encoder each (Small < Medium < Large) for the schedule and
# budget categories.
type_transformer = LabelEncoder(add_noise=True)
catcost_transformer = OrderedLabelEncoder(add_noise=True, order=['Small', 'Medium', 'Large'])
cattime_transformer = OrderedLabelEncoder(add_noise=True, order=['Small', 'Medium', 'Large'])

categorical_encoders = {
    'Schedule_category': cattime_transformer,
    'Budget_category': catcost_transformer,
    'Type': type_transformer,
}

# df holds exactly the eleven modelled columns, so walking df.columns yields
# the full mapping in column order; every column without an encoder above is
# mapped to None.
synthesizer_4.update_transformers(
    column_name_to_transformer={
        column: categorical_encoders.get(column) for column in df.columns
    }
)

# Train with the constraint and the customised transformers in place.
synthesizer_4.fit(df)



In [38]:
from sdv.evaluation.single_table import evaluate_quality
from sdv.metadata import SingleTableMetadata

# Reset the sampler before drawing 300 synthetic rows from the constrained model.
synthesizer_4.reset_sampling()
syn_4 = synthesizer_4.sample(num_rows=300)

# Augmented dataset = synthetic rows followed by the real rows.
# DataFrame.append is deprecated (it raised the FutureWarning seen in this
# cell's output) and was removed in pandas 2.0; pd.concat with its default
# arguments produces the same frame, with the original index labels preserved.
aug_4 = pd.concat([syn_4, df])

# Compare the augmented dataset against the real data.
quality_report_TVAE_4 = evaluate_quality(
    real_data=df,
    synthetic_data=aug_4,
    metadata=metadata)
quality_report_TVAE_4

Sampling rows: 100%|██████████| 300/300 [00:00<00:00, 2608.94it/s]
  aug_4 = syn_4.append(df)
Creating report: 100%|██████████| 4/4 [00:00<00:00, 50.65it/s]



Overall Quality Score: 95.67%

Properties:
Column Shapes: 95.95%
Column Pair Trends: 95.4%


<sdmetrics.reports.single_table.quality_report.QualityReport at 0x289746490>

In [39]:
# Persist the augmented dataset (synthetic + real rows) as CSV. The index is
# written as the first column, as with the default to_csv settings.
aug_4.to_csv(
    path_or_buf="/Users/camilodavid/Library/CloudStorage/OneDrive-Personal/Tohoku U/PhD/Papers/4rd Paper/Code/Seed1/mineros/AI sampling/aug.csv"
)

Let us plot the DGMs

In [1]:
# NOTE(review): the recorded run of this cell raised NameError because
# synthesizer_1 was not defined in the kernel; the cells that create and fit
# synthesizer_1 must be executed first.
# TODO confirm that the synthesizer object actually provides a plot() method.
synthesizer_1.plot()


NameError: name 'synthesizer_1' is not defined