In [None]:
from sklearn.model_selection import train_test_split
import tensorflow as tf
import os
import pandas as pd
import numpy as np
import networkx as nx
import datgan
import json

# Select the allocator TensorFlow reads from the TF_GPU_ALLOCATOR environment variable
os.environ.update({'TF_GPU_ALLOCATOR': 'cuda_malloc_async'})

### To make every physical GPU visible again (reset to default), use instead:
# physical_devices = tf.config.list_physical_devices('GPU')
# tf.config.set_visible_devices(physical_devices, 'GPU')

# Hide every GPU from TensorFlow by passing an empty device list
visible_gpus = []
tf.config.set_visible_devices(visible_gpus, 'GPU')

# Report the TensorFlow version in use
tf_version = tf.__version__
print(f"Tensorflow Version {tf_version}")

# DEFINE DATGAN VERSION

### Set the folder name variable

In [None]:
# Root directory that contains the data folder selected below. The location differs
# per machine, so it can be overridden with the CIDATGAN_DATA_DIR environment variable
# instead of editing this cell. Locations used so far:
#   /home/s212574/snap/snapd-desktop-integration/83/Documents/Thesis/MSc_PopSyn/Sigga_Luis/Data/
#   /Users/luis/MasterThesis/MSc_PopSyn/Sigga_Luis/Data/   (default)
ciDatGan_path = os.environ.get(
    'CIDATGAN_DATA_DIR', '/Users/luis/MasterThesis/MSc_PopSyn/Sigga_Luis/Data/')
# Make sure the path ends with a separator: a later cell builds the run folder with
# plain string concatenation (ciDatGan_path + folder_name).
ciDatGan_path = os.path.join(ciDatGan_path, '')
os.chdir(ciDatGan_path)

# folder name
folder_name = 'FolderName' # Locate folder where all the data is stored and results can be saved.

In [None]:
# Load the data

# Work from <ciDatGan_path>/<folder_name> so the relative file names below resolve.
# os.path.join is used so the result does not depend on ciDatGan_path ending with a
# path separator.
os.chdir(os.path.join(ciDatGan_path, folder_name))
print(os.getcwd())


def _load_json(json_path):
    """Parse the JSON file at `json_path` and return the resulting object."""
    with open(json_path, 'r') as json_file:
        return json.load(json_file)


def _read_dataset(csv_path, dtypes):
    """Read `csv_path` with the given column dtypes, without its 'unnamed' columns.

    Columns whose name contains 'unnamed' (case-insensitive) are dropped from the
    returned DataFrame.
    """
    dataset = pd.read_csv(csv_path, sep=',', dtype=dtypes)
    unnamed_columns = dataset.columns[
        dataset.columns.str.contains('unnamed', case=False)]
    return dataset.drop(columns=unnamed_columns)


# Load the data dtypes dictionary (passed as `dtype` when reading the CSV files below)
loaded_df_types = _load_json('df_dtypes.json')
print(loaded_df_types)

# Load the relation data; JSON stores each relation as a list, so convert back to tuples
loaded_relations = _load_json('relations.json')
relations = [tuple(rel) for rel in loaded_relations]

# Load the meta data dictionary
loaded_data_info = _load_json('data_info.json')
print(loaded_data_info)

# Load the graph as a directed graph
graph = nx.read_adjlist('graph.adjlist', create_using=nx.DiGraph)

# Load the train and test data
train_set = _read_dataset('trainData.csv', loaded_df_types)
test_set = _read_dataset('testData.csv', loaded_df_types)

# Check the shapes of your sets
print("Training set shape:", train_set.shape)
print("Testing set shape:", test_set.shape)

# Training


In [None]:
### ciDatGan
from datgan import DATGAN

# Switch into the 'ciDatGan' sub-folder of the run folder; the relative paths used
# for this model resolve against it
ciDatGan_dir = os.path.join(ciDatGan_path, folder_name, 'ciDatGan')
os.chdir(ciDatGan_dir)

# Define the conditional inputs
conditional_inputs = [
    'RespAgeCorrect',
    'Gender',
    'PopSocio',
    'MunicipalityOrigin',
]

batch_size = 1116

# Build the model object; the settings are passed by keyword
ciDatGan = DATGAN(
    output='./output/',
    batch_size=batch_size,
    num_epochs=1000,
    conditional_inputs=conditional_inputs,
    verbose=True,
)

In [None]:
# Train the ciDATGAN Model
# Keyword settings of the fit call, gathered in one place
fit_options = {
    'metadata': loaded_data_info,
    'dag': graph,
    'preprocessed_data_path': './encoded_data/',
}
new_datgan = ciDatGan.fit(train_set, **fit_options)

In [None]:
# Load conditional marginal data
# Every column of the CSV is read with the 'category' dtype
file = pd.read_csv(
    'File.csv',
    sep=',',
    dtype='category',
)

In [None]:
# Remove every column whose name contains 'unnamed' (case-insensitive), then
# renumber the rows from 0
unnamed_columns = file.columns[file.columns.str.contains('unnamed', case=False)]
file = file.drop(columns=unnamed_columns).reset_index(drop=True)

In [None]:
# Bare expression: the notebook renders the conditional-input table as this cell's output
file

In [None]:
# Function that samples the ciDATGAN model in multiple batches and saves the results to a CSV file

from IPython.display import clear_output

def multiple_samples(data, batch_size,
                     file_path='ciDatGan_synthetic_9_PoP_2050.csv', sampler=None):
    """Sample the model batch by batch and append every batch to a CSV file.

    The rows of `data` are passed to the sampler as conditional inputs, in order
    and at most `batch_size` rows at a time. If `file_path` already contains rows
    (for example from an interrupted run), sampling resumes at the row index of
    `data` equal to the number of rows already in the file, so earlier rows are
    not sampled a second time.

    Parameters
    ----------
    data : pandas.DataFrame
        Conditional inputs; one output row is requested per input row.
    batch_size : int
        Maximum number of rows handed to the sampler per call. The last batch
        may be smaller.
    file_path : str, optional
        CSV file that accumulates the sampled rows.
    sampler : callable, optional
        Called as ``sampler(n, inputs=batch, randomize=True)``. Defaults to
        ``ciDatGan.sample``.

    Returns
    -------
    bool
        Always True.

    Raises
    ------
    ValueError
        If `batch_size` is not a positive number.
    """
    if batch_size <= 0:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

    if sampler is None:
        # Looked up at call time so the model can be (re)created after this definition
        sampler = ciDatGan.sample

    try:
        from IPython.display import clear_output
    except ImportError:
        # Outside IPython/Jupyter there is no cell output to clear
        def clear_output(wait=False):
            pass

    total_rows = len(data)

    # Count rows written by a previous run; parsing one column is enough for that
    file_has_content = os.path.isfile(file_path) and os.path.getsize(file_path) > 0
    rows_done = 0
    if file_has_content:
        rows_done = len(pd.read_csv(file_path, usecols=[0]))
        print(f"The existing file '{file_path}' has {rows_done} rows.")

    if rows_done >= total_rows:
        # Covers empty input as well as an already complete output file
        print(f"Nothing to sample: '{file_path}' already covers all {total_rows} input rows.")
        return True

    # Ceiling division: a final partial batch counts as a batch
    num_batches = -(-(total_rows - rows_done) // batch_size)

    # Start at the first row without output so a resumed run continues where it stopped
    batch_starts = range(rows_done, total_rows, batch_size)
    for batch_no, start_idx in enumerate(batch_starts, start=1):
        if (batch_no - 1) % 10 == 0:
            clear_output(wait=True)

        print(f"Batch {batch_no} of {num_batches}")
        # .iloc + reset_index returns a fresh frame instead of mutating a slice of `data`
        batch_data = data.iloc[start_idx:start_idx + batch_size].reset_index(drop=True)

        sample_result = sampler(len(batch_data), inputs=batch_data, randomize=True)
        sample_result = pd.DataFrame(sample_result)

        # Append to the file rather than re-reading and rewriting it for every batch;
        # the header is written only when the file is created
        sample_result.to_csv(file_path, mode='a', header=not file_has_content, index=False)
        if file_has_content:
            print(f"New data has been appended to '{file_path}'.")
        else:
            print(f"File '{file_path}' did not exist. Data has been written to a new file.")
            file_has_content = True

    return True

In [None]:
# CALL the method to sample the data

# Single-call alternative, kept for reference:
# samples = ciDatGan.sample(len(file), inputs=file, randomize=True)
# samples.to_csv('ciDatGan_synthetic.csv', index=False)

# Sample using `file` as conditional inputs, 100,000 rows per batch
multiple_samples(data=file, batch_size=100_000)