In [1]:
from sklearn.model_selection import train_test_split
import tensorflow as tf
import os
import pandas as pd
import numpy as np
import networkx as nx
import datgan
import json

# Set the TF_GPU_ALLOCATOR environment variable
os.environ['TF_GPU_ALLOCATOR'] = 'cuda_malloc_async'

### Enable all GPUs (reset to default)
# physical_devices = tf.config.list_physical_devices('GPU')
# tf.config.set_visible_devices(physical_devices, 'GPU')

# Disable all GPUs
tf.config.set_visible_devices([], 'GPU')

#TF version
print(f"Tensorflow Version {tf.__version__}")

Tensorflow Version 2.15.0


# DEFINE DATGAN VERSION

### Set Foldername variable

In [2]:
DatGan_path = '/Users/luis/MScPoPSyn/PopSyn/Data/'
os.chdir(DatGan_path)

# folder name
folder_name = 'ModelVersion4' # Locate folder where all the data is stored and results can be saved.

In [3]:
# Load the data

os.chdir(DatGan_path + folder_name)
print(os.getcwd())
# Load the data dtypes dictionary 
file_path = 'df_dtypes.json'
with open(file_path, 'r') as file:
    loaded_df_types = json.load(file)

# Now, 'loaded_dict' contains the dictionary loaded from the JSON file
print(loaded_df_types)

# Load the relation data 
file_path = 'relations.json'  # Define the file path
with open(file_path, 'r') as file:
    loaded_relations = json.load(file)

# Convert the loaded data back to tuples
relations = [tuple(rel) for rel in loaded_relations]


# Load the meta data
file_path = 'data_info.json'  # Replace with your file path
with open(file_path, 'r') as file:
    loaded_data_info = json.load(file)

# Now, 'loaded_data_info' contains the dictionary loaded from the JSON file
print(loaded_data_info)


# Load the graph
graph = nx.read_adjlist('graph.adjlist', create_using=nx.DiGraph)
# graph.add_node("MissIncome")


train_set = pd.read_csv('simulationData.csv', sep=',', dtype=loaded_df_types)
train_set.drop(train_set.columns[train_set.columns.str.contains(
    'unnamed', case=False)], axis=1, inplace=True)

train_set.drop(columns=['SessionId'], inplace=True)

# # Load the train and test data
# # Load train data
# train_set = pd.read_csv('trainData.csv',sep=',',dtype=loaded_df_types)
# train_set.drop(train_set.columns[train_set.columns.str.contains('unnamed', case=False)], axis=1, inplace=True)
# # Load test data
# test_set= pd.read_csv('testData.csv',sep=',',dtype=loaded_df_types)
# test_set.drop(test_set.columns[test_set.columns.str.contains('unnamed', case=False)], axis=1, inplace=True)

# # Check the shapes of your sets
print("Training set shape:", train_set.shape)
# print("Testing set shape:", test_set.shape)

/Users/luis/MScPoPSyn/PopSyn/Data/ModelVersion4
{'CarModelYear': 'category', 'Education': 'category', 'FamNumAdults': 'category', 'FamNumPers': 'category', 'FuelType': 'category', 'Gender': 'category', 'Handicap': 'category', 'HomeAdrZone': 'category', 'HousehCarOwnership': 'category', 'HousehNumAdults': 'category', 'HousehNumPers': 'category', 'HousehNumcars': 'category', 'HwDayspW': 'category', 'IncFamily2000': 'int64', 'IncRespondent2000': 'int64', 'KidsBetween0and4': 'category', 'KidsBetween0and15': 'category', 'MunicipalityDest': 'category', 'MunicipalityOrigin': 'category', 'PopSocio': 'category', 'PrimOccZone': 'category', 'RespAgeCorrect': 'category', 'RespHasBicycle': 'category', 'RespHasRejsekort': 'category', 'RespIsmemCarshare': 'category', 'RespPrimOcc': 'category', 'ResphasDrivlic': 'category', 'Sector': 'category', 'WorkHourType': 'category', 'WorkHoursPw': 'category', 'Year': 'category', 'HomeParkPoss': 'category', 'HousehAccomodation': 'category', 'HousehAccOwnorRent':

In [4]:
train_set.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 139619 entries, 0 to 139618
Data columns (total 39 columns):
 #   Column               Non-Null Count   Dtype   
---  ------               --------------   -----   
 0   FamNumPers           139619 non-null  category
 1   HousehNumAdults      139619 non-null  category
 2   IncFamily2000        139619 non-null  int64   
 3   WorkHoursPw          139619 non-null  category
 4   FamNumAdults         139619 non-null  category
 5   HousehNumPers        139619 non-null  category
 6   HousehCarOwnership   139619 non-null  category
 7   IncRespondent2000    139619 non-null  int64   
 8   HousehNumcars        139619 non-null  category
 9   Year                 139619 non-null  category
 10  Gender               139619 non-null  category
 11  RespAgeCorrect       139619 non-null  category
 12  Education            139619 non-null  category
 13  Handicap             139619 non-null  category
 14  PopSocio             139619 non-null  category
 15  

# Training


In [5]:
### DatGan
from datgan import DATGAN

# Change the directory to the DatGan folder
os.chdir(os.path.join(DatGan_path, folder_name, 'DatGan'))

batch_size = 1116

datgan = DATGAN(output='./output/',
                batch_size=batch_size,
                num_epochs=1000)

In [6]:
# Training of DATGAN
datgan.preprocess(data=train_set,metadata=loaded_data_info,preprocessed_data_path='./encoded_data/')

Preprocessed data have been loaded!


In [7]:
train_set.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 139619 entries, 0 to 139618
Data columns (total 39 columns):
 #   Column               Non-Null Count   Dtype   
---  ------               --------------   -----   
 0   FamNumPers           139619 non-null  category
 1   HousehNumAdults      139619 non-null  category
 2   IncFamily2000        139619 non-null  int64   
 3   WorkHoursPw          139619 non-null  category
 4   FamNumAdults         139619 non-null  category
 5   HousehNumPers        139619 non-null  category
 6   HousehCarOwnership   139619 non-null  category
 7   IncRespondent2000    139619 non-null  int64   
 8   HousehNumcars        139619 non-null  category
 9   Year                 139619 non-null  category
 10  Gender               139619 non-null  category
 11  RespAgeCorrect       139619 non-null  category
 12  Education            139619 non-null  category
 13  Handicap             139619 non-null  category
 14  PopSocio             139619 non-null  category
 15  

In [8]:
# Load Model to sample from DATGAN
new_datgan = datgan.fit(train_set, metadata=loaded_data_info, dag=graph, preprocessed_data_path='./encoded_data')



Preprocessed data have been loaded!
Start training DATGAN with the WGAN loss (27/02/2024 21:10:26).


Training DATGAN: 100%|██████████| 1000/1000 [23:28:04<00:00, 84.48s/it]  

DATGAN has finished training (28/02/2024 20:38:31) - Training time: 23 hours, 28 minutes, and 04 seconds





In [17]:
# Sample from DATGAN
samples = datgan.sample(1000000)
samples.to_csv('DatGan_synthetic5.csv', index=False)

Sampling from DATGAN:  12%|█▏        | 122771/1000000 [1:19:51<9:30:39, 25.62it/s]
Sampling from DATGAN: 100%|██████████| 1000000/1000000 [1:19:03<00:00, 210.83it/s]


In [18]:
a = pd.read_csv('DatGan_synthetic.csv')
b = pd.read_csv('DatGan_synthetic1.csv')
c = pd.read_csv('DatGan_synthetic2.csv')
d = pd.read_csv('DatGan_synthetic3.csv')
e = pd.read_csv('DatGan_synthetic4.csv')
f = pd.read_csv('DatGan_synthetic5.csv')

  a = pd.read_csv('DatGan_synthetic.csv')
  b = pd.read_csv('DatGan_synthetic1.csv')
  c = pd.read_csv('DatGan_synthetic2.csv')
  d = pd.read_csv('DatGan_synthetic3.csv')
  e = pd.read_csv('DatGan_synthetic4.csv')
  f = pd.read_csv('DatGan_synthetic5.csv')


In [19]:
combined_df = pd.concat([a, b, c, d, e, f], ignore_index=True)
combined_df = combined_df.reset_index(drop=True)


In [33]:
combined_df.to_csv('DatGan_Population.csv', index=False)