Detect the metadata

In [3]:
import pandas as pd

df = pd.read_csv("/Users/camilodavid/Library/CloudStorage/OneDrive-Personal/Tohoku U/PhD/Papers/4rd Paper/Code/Seed1/mineros/AI sampling/Folds/train_data1.csv")
df = df[['Schedule_category', 'Time_Overrun_category', 'Budget_category', 'Type', 'Labor_Risk', 
         'Analysis_Risk', 'Decision_Risk', 'Engineering_Risk', 'Provider_Risk', 'Weather_Risk', 'Fabrication_Risk']]

# categorical_columns = ['Schedule_category', 'Time_Overrun_category', 'Budget_category', 'Type']
# for col in categorical_columns:
#     df[col] = df[col].astype('category')

from sdv.metadata import SingleTableMetadata

metadata = SingleTableMetadata()
metadata.detect_from_dataframe(data=df)
print('Auto detected data:\n')
metadata

Auto detected data:



{
    "columns": {
        "Schedule_category": {
            "sdtype": "categorical"
        },
        "Time_Overrun_category": {
            "sdtype": "categorical"
        },
        "Budget_category": {
            "sdtype": "categorical"
        },
        "Type": {
            "sdtype": "categorical"
        },
        "Labor_Risk": {
            "sdtype": "numerical"
        },
        "Analysis_Risk": {
            "sdtype": "numerical"
        },
        "Decision_Risk": {
            "sdtype": "numerical"
        },
        "Engineering_Risk": {
            "sdtype": "numerical"
        },
        "Provider_Risk": {
            "sdtype": "numerical"
        },
        "Weather_Risk": {
            "sdtype": "numerical"
        },
        "Fabrication_Risk": {
            "sdtype": "numerical"
        }
    },
    "METADATA_SPEC_VERSION": "SINGLE_TABLE_V1"
}

In [4]:
from sdv.single_table import TVAESynthesizer
from rdt.transformers.categorical import OrderedLabelEncoder, LabelEncoder, OrderedUniformEncoder
from sdv.single_table import TVAESynthesizer
from sdv.evaluation.single_table import evaluate_quality
from sdv.metadata import SingleTableMetadata
from pathlib import Path
import copy
import re

synthesizer = TVAESynthesizer(
    metadata,
    batch_size=8,  # smaller batch size, should be evenly divisible by pac
    #pac=2, # larger means each individual's data in the training set has less influence on the output, which makes it harder to reverse-engineer the initial training data from the synthetic data.
    compress_dims=(64, 64),
    decompress_dims=(64, 64),
    embedding_dim=64,  # smaller for a smaller dataset
    epochs=3000,   # More epochs for small datasets
    loss_factor=6.0,  # Can be increased if synthetic data is not similar to original
)

synthesizer.load_custom_constraint_classes(
    filepath='/Users/camilodavid/Library/CloudStorage/OneDrive-Personal/Tohoku U/PhD/Papers/4rd Paper/Code/Seed1/mineros/AI sampling/custom_constraint_template.py',
    class_names=['RiskOverrunConstraint']
)

my_constraint = {
    'constraint_class': 'RiskOverrunConstraint',
    'constraint_parameters': {
        'column_names': ['Labor_Risk', 'Analysis_Risk', 'Decision_Risk', 'Engineering_Risk', 'Provider_Risk', 'Weather_Risk', 'Fabrication_Risk'],
        'extra_parameter': None
    }
}
synthesizer.add_constraints([my_constraint])

base_dir = Path("/Users/camilodavid/Library/CloudStorage/OneDrive-Personal/Tohoku U/PhD/Papers/4rd Paper/Code/Seed1/mineros/AI sampling/Folds")
output_dir = Path("/Users/camilodavid/Library/CloudStorage/OneDrive-Personal/Tohoku U/PhD/Papers/4rd Paper/Code/Seed1/mineros/AI sampling/Augmentation")

# Define training dataset pattern
train_pattern = re.compile(r'train_data\d+\.csv')

# Initialize list to store quality reports
quality_scores = []

# Loop over each dataset that matches the pattern and excludes non-training data
for dataset_path in base_dir.glob('*.csv'):
    if train_pattern.match(dataset_path.name):
        # Dataset number
        dataset_number = dataset_path.stem.replace('train_data', '')
        df = pd.read_csv(dataset_path)

        synth = copy.deepcopy(synthesizer)

        synth.fit(df)

        # create new transformer objects
        cattime_transformer = OrderedUniformEncoder(order=['[0,0.16]', '(0.16,0.26]','(0.26,1]'])
        catcost_transformer = OrderedUniformEncoder(order=['Small', 'Medium', 'Large'])
        catoverrun_transformer = OrderedUniformEncoder(order=['[0,0.12]','(0.12,0.43]','(0.43,1]'])
        type_transformer = LabelEncoder()

        synth.update_transformers(column_name_to_transformer={
        'Schedule_category': cattime_transformer,
        'Time_Overrun_category': catoverrun_transformer,
        'Budget_category': catcost_transformer,
        'Type': type_transformer,
        'Labor_Risk': None,
        'Analysis_Risk': None,
        'Decision_Risk': None,
        'Engineering_Risk': None,
        'Provider_Risk': None,
        'Weather_Risk': None,
        'Fabrication_Risk': None
        })

        synth.fit(df)
        
        syn = synth.sample(num_rows=365)
        quality_report = evaluate_quality(real_data=df, synthetic_data=syn, metadata=metadata)
        quality_scores.append(quality_report)

        aug = syn.append(df)

        output_filename = f"augmented_data{dataset_number}.csv"
        output_filepath = output_dir / output_filename
        aug.to_csv(output_filepath, index=False)
        print(f"Augmented dataset saved to {output_filepath}")



Sampling rows: 100%|██████████| 365/365 [00:00<00:00, 5112.70it/s]
Creating report: 100%|██████████| 4/4 [00:00<00:00, 50.60it/s]
  aug = syn.append(df)



Overall Quality Score: 95.57%

Properties:
Column Shapes: 96.27%
Column Pair Trends: 94.87%
Augmented dataset saved to /Users/camilodavid/Library/CloudStorage/OneDrive-Personal/Tohoku U/PhD/Papers/4rd Paper/Code/Seed1/mineros/AI sampling/Augmentation/augmented_data1.csv


Sampling rows: 100%|██████████| 365/365 [00:00<00:00, 5267.12it/s]
Creating report: 100%|██████████| 4/4 [00:00<00:00, 49.76it/s]
  aug = syn.append(df)



Overall Quality Score: 94.81%

Properties:
Column Shapes: 94.99%
Column Pair Trends: 94.62%
Augmented dataset saved to /Users/camilodavid/Library/CloudStorage/OneDrive-Personal/Tohoku U/PhD/Papers/4rd Paper/Code/Seed1/mineros/AI sampling/Augmentation/augmented_data3.csv


Sampling rows: 100%|██████████| 365/365 [00:00<00:00, 7297.12it/s]
Creating report: 100%|██████████| 4/4 [00:00<00:00, 53.15it/s]
  aug = syn.append(df)



Overall Quality Score: 95.11%

Properties:
Column Shapes: 96.01%
Column Pair Trends: 94.2%
Augmented dataset saved to /Users/camilodavid/Library/CloudStorage/OneDrive-Personal/Tohoku U/PhD/Papers/4rd Paper/Code/Seed1/mineros/AI sampling/Augmentation/augmented_data2.csv


Sampling rows: 100%|██████████| 365/365 [00:00<00:00, 7368.84it/s]
Creating report: 100%|██████████| 4/4 [00:00<00:00, 53.13it/s]
  aug = syn.append(df)



Overall Quality Score: 94.58%

Properties:
Column Shapes: 95.67%
Column Pair Trends: 93.5%
Augmented dataset saved to /Users/camilodavid/Library/CloudStorage/OneDrive-Personal/Tohoku U/PhD/Papers/4rd Paper/Code/Seed1/mineros/AI sampling/Augmentation/augmented_data6.csv


Sampling rows: 100%|██████████| 365/365 [00:00<00:00, 7196.07it/s]
Creating report: 100%|██████████| 4/4 [00:00<00:00, 51.23it/s]
  aug = syn.append(df)



Overall Quality Score: 95.14%

Properties:
Column Shapes: 95.74%
Column Pair Trends: 94.55%
Augmented dataset saved to /Users/camilodavid/Library/CloudStorage/OneDrive-Personal/Tohoku U/PhD/Papers/4rd Paper/Code/Seed1/mineros/AI sampling/Augmentation/augmented_data7.csv


Sampling rows: 100%|██████████| 365/365 [00:00<00:00, 7230.02it/s]
Creating report: 100%|██████████| 4/4 [00:00<00:00, 52.11it/s]
  aug = syn.append(df)



Overall Quality Score: 93.18%

Properties:
Column Shapes: 94.07%
Column Pair Trends: 92.28%
Augmented dataset saved to /Users/camilodavid/Library/CloudStorage/OneDrive-Personal/Tohoku U/PhD/Papers/4rd Paper/Code/Seed1/mineros/AI sampling/Augmentation/augmented_data5.csv


Sampling rows: 100%|██████████| 365/365 [00:00<00:00, 7278.59it/s]
Creating report: 100%|██████████| 4/4 [00:00<00:00, 52.94it/s]
  aug = syn.append(df)



Overall Quality Score: 94.48%

Properties:
Column Shapes: 94.94%
Column Pair Trends: 94.01%
Augmented dataset saved to /Users/camilodavid/Library/CloudStorage/OneDrive-Personal/Tohoku U/PhD/Papers/4rd Paper/Code/Seed1/mineros/AI sampling/Augmentation/augmented_data4.csv


Sampling rows: 100%|██████████| 365/365 [00:00<00:00, 7232.58it/s]
Creating report: 100%|██████████| 4/4 [00:00<00:00, 51.58it/s]
  aug = syn.append(df)



Overall Quality Score: 95.24%

Properties:
Column Shapes: 96.06%
Column Pair Trends: 94.41%
Augmented dataset saved to /Users/camilodavid/Library/CloudStorage/OneDrive-Personal/Tohoku U/PhD/Papers/4rd Paper/Code/Seed1/mineros/AI sampling/Augmentation/augmented_data9.csv


Sampling rows: 100%|██████████| 365/365 [00:00<00:00, 5104.62it/s]
Creating report: 100%|██████████| 4/4 [00:00<00:00, 52.06it/s]
  aug = syn.append(df)



Overall Quality Score: 94.85%

Properties:
Column Shapes: 95.52%
Column Pair Trends: 94.18%
Augmented dataset saved to /Users/camilodavid/Library/CloudStorage/OneDrive-Personal/Tohoku U/PhD/Papers/4rd Paper/Code/Seed1/mineros/AI sampling/Augmentation/augmented_data8.csv


Sampling rows: 100%|██████████| 365/365 [00:00<00:00, 7065.84it/s]
Creating report: 100%|██████████| 4/4 [00:00<00:00, 50.93it/s]



Overall Quality Score: 94.89%

Properties:
Column Shapes: 95.94%
Column Pair Trends: 93.84%
Augmented dataset saved to /Users/camilodavid/Library/CloudStorage/OneDrive-Personal/Tohoku U/PhD/Papers/4rd Paper/Code/Seed1/mineros/AI sampling/Augmentation/augmented_data10.csv


  aug = syn.append(df)
