In [None]:
import json
import os
import pandas as pd
import networkx as nx
import numpy as np
import datgan


# DEFINE DATGAN VERSION

### Set Foldername variable
### Write ReadMe of run parameters

In [None]:
# Root directory that holds the input data; the run folder is created beneath it.
# NOTE(review): machine-specific absolute path - adjust before running elsewhere.
DatGan_path = '/home/s212574/snap/snapd-desktop-integration/83/Documents/Thesis/MSc_PopSyn/Sigga_Luis/Data/'
os.chdir(DatGan_path)

# folder name
folder_name = 'FolderName'          # Change this to the name of the folder you want to create
ciDatGan_folder = 'ciDatGan'        # Create a folder for the ciDatGan data
datGan_folder = 'DatGan'            # Create a folder for the DatGan data

# Create the run folder first, then its two model sub-folders.
# Each entry is (path relative to DatGan_path, name reported in the message),
# so the "already exists" message names the folder that was actually checked.
for _dir_path, _dir_label in (
    (folder_name, folder_name),
    (os.path.join(folder_name, ciDatGan_folder), ciDatGan_folder),
    (os.path.join(folder_name, datGan_folder), datGan_folder),
):
    if not os.path.exists(_dir_path):
        os.makedirs(_dir_path)
        print(f"folder '{_dir_label}' created successfully.")
    else:
        print(f"folder '{_dir_label}' already exists.")



In [None]:
# Column -> dtype mapping handed to pd.read_csv for every CSV loaded below.
# Most columns are read as pandas 'category'; the few numeric ones are spelled
# out individually. The groups are laid out so the key order stays stable,
# because this dict is also dumped to df_dtypes.json.
df_dtypes = {
    **dict.fromkeys(
        [
            'CarModelYear', 'Education', 'FamNumAdults', 'FamNumPers',
            'FuelType', 'Gender', 'Handicap', 'HomeAdrZone',
            'HousehCarOwnership', 'HousehNumAdults', 'HousehNumPers',
            'HousehNumcars', 'HwDayspW',
        ],
        'category',
    ),
    'IncFamily2000': 'int64',
    'IncRespondent2000': 'int64',
    **dict.fromkeys(
        [
            'KidsBetween0and4', 'KidsBetween0and15', 'MunicipalityDest',
            'MunicipalityOrigin', 'PopSocio', 'PrimOccZone',
        ],
        'category',
    ),
    'RespAgeCorrect': 'int64',
    **dict.fromkeys(
        [
            'RespHasBicycle', 'RespHasRejsekort', 'RespIsmemCarshare',
            'RespPrimOcc', 'ResphasDrivlic', 'Sector', 'WorkHourType',
        ],
        'category',
    ),
    'WorkHoursPw': 'float64',
    **dict.fromkeys(
        [
            'Year', 'HomeParkPoss', 'HousehAccomodation', 'HousehAccOwnorRent',
            'PosInFamily', 'PrimModeDay', 'ModeChainTypeDay',
            'RespHasSeasonticket',
        ],
        'category',
    ),
}

In [None]:
# Switch into the run folder. os.path.join is used (as in the preprocessing
# cells) so the result does not depend on DatGan_path ending with a slash.
os.chdir(os.path.join(DatGan_path, folder_name))
# File path where you want to save the dictionary as a JSON file
file_path = 'df_dtypes.json'

# Save the dictionary as a JSON file
with open(file_path, 'w') as file:
    json.dump(df_dtypes, file)

In [None]:
# Load the full data set with the declared dtypes and discard the SessionId column.
df = pd.read_csv('simulationData_withNewCat.csv', sep=',', dtype=df_dtypes).drop(columns=['SessionId'])

# Every column that is not of 'category' dtype is treated as numerical here.
numerical_columns = df.select_dtypes(exclude='category').columns.tolist()

# Per-column summary: minimum, maximum and peak-to-peak spread (max - min).
ranges = {
    name: {
        'min': df[name].min(),
        'max': df[name].max(),
        'range': np.ptp(df[name]),
    }
    for name in numerical_columns
}

# Report the summary, one column per block of output.
for name, stats in ranges.items():
    print(f"Column: {name}")
    print(f"Min: {stats['min']} | Max: {stats['max']} | Range: {stats['range']}\n")

In [None]:
# Metadata for the continuous columns; it is later passed to DATGAN.preprocess
# as `metadata`. The trailing comments record the min/max observed in the data.
# NOTE(review): for both income columns the upper bound (10000) is below the
# recorded maximum - confirm this truncation is intended.
data_info = dict(
    WorkHoursPw=dict(
        type='continuous', bounds=[0.0, 168.0], discrete=False,
    ),  # Min: 0.0 | Max: 168.0 | Range: 168.0
    IncRespondent2000=dict(
        type='continuous', bounds=[0, 10000], discrete=True,
    ),  # Min: 0 | Max: 70640 | Range: 70640
    IncFamily2000=dict(
        type='continuous', bounds=[0, 10000], discrete=True,
    ),  # Min: 0 | Max: 41358 | Range: 41358
    RespAgeCorrect=dict(
        type='continuous', bounds=[5, 107], discrete=True,
    ),  # Min: 5 | Max: 107 | Range: 102
)

In [None]:
# Any column without an explicit entry in data_info is registered as categorical.
for name in df.columns:
    data_info.setdefault(name, {'type': 'categorical'})

# Cast every column that data_info marks as categorical to the pandas
# 'category' dtype; continuous columns are left untouched.
for name, meta in data_info.items():
    if meta['type'] != 'categorical':
        continue
    df[name] = df[name].astype('category')

# Show the resulting dtypes as a sanity check.
print(df.dtypes)

In [None]:
# Switch into the run folder. os.path.join is used (as in the preprocessing
# cells) so the result does not depend on DatGan_path ending with a slash.
os.chdir(os.path.join(DatGan_path, folder_name))

# File path where you want to save the dictionary as a JSON file
file_path = 'data_info.json'

# Save the dictionary as a JSON file
with open(file_path, 'w') as file:
    json.dump(data_info, file)

In [None]:
# Directed edges (parent, child) between columns; they are written to
# relations.json and used to build the networkx DiGraph below.
# The edges are expanded from a parent -> children mapping. Dict and list
# order are preserved, so the resulting list is in the same order as an
# explicit edge list grouped by parent.
relations = [
    (parent, child)
    for parent, children in {
        "CarModelYear": ["FuelType"],
        "Education": ["IncRespondent2000", "Sector"],
        "FamNumPers": ["FamNumAdults", "KidsBetween0and15"],
        "Gender": ["Education", "IncRespondent2000"],
        "HomeAdrZone": ["HousehAccomodation"],
        "HousehAccomodation": ["HomeParkPoss", "HousehAccOwnorRent", "HousehNumPers"],
        "HousehCarOwnership": ["CarModelYear"],
        "HousehNumAdults": ["FamNumAdults"],
        "HousehNumPers": ["FamNumPers", "HousehNumAdults", "HousehNumcars"],
        "HousehNumcars": ["CarModelYear", "HousehCarOwnership"],
        "HwDayspW": ["PrimModeDay"],
        "IncFamily2000": ["HousehAccOwnorRent", "HousehCarOwnership", "IncRespondent2000"],
        "IncRespondent2000": ["WorkHourType"],
        "KidsBetween0and15": ["KidsBetween0and4"],
        "MunicipalityDest": ["PrimModeDay", "PrimOccZone"],
        "MunicipalityOrigin": ["HomeAdrZone", "Sector"],
        "PopSocio": ["Education", "MunicipalityDest", "RespPrimOcc"],
        "PrimModeDay": ["ModeChainTypeDay", "RespHasBicycle", "RespHasRejsekort", "ResphasDrivlic"],
        "RespAgeCorrect": ["Education", "PopSocio", "PosInFamily", "ResphasDrivlic"],
        "RespHasRejsekort": ["RespHasSeasonticket"],
        "RespPrimOcc": ["IncRespondent2000"],
        "ResphasDrivlic": ["RespIsmemCarshare"],
        "WorkHourType": ["HwDayspW", "WorkHoursPw"],
    }.items()
    for child in children
]


In [None]:
# Switch into the run folder. os.path.join is used (as in the preprocessing
# cells) so the result does not depend on DatGan_path ending with a slash.
os.chdir(os.path.join(DatGan_path, folder_name))

# Convert the list of tuples to a JSON-serializable format
serialized_relations = [list(rel) for rel in relations]

# Save the data to a JSON file
file_path = 'relations.json'  # Define the file path
with open(file_path, 'w') as file:
    json.dump(serialized_relations, file)

In [None]:
# Switch into the run folder. os.path.join is used (as in the preprocessing
# cells) so the result does not depend on DatGan_path ending with a slash.
os.chdir(os.path.join(DatGan_path, folder_name))

# Build the directed graph from the (parent, child) relations.
graph = nx.DiGraph()
graph.add_edges_from(relations)
# "Year" and "Handicap" appear in no relation, so they are added explicitly
# as isolated nodes.
graph.add_node("Year")
graph.add_node("Handicap")
# Persist the graph in adjacency-list format inside the run folder.
nx.write_adjlist(graph, "graph.adjlist")

In [None]:
# Move into the run folder (root first, then the run sub-folder).
os.chdir(DatGan_path)
os.chdir(folder_name)


def _read_split(csv_name):
    """Read one data split with the shared dtypes.

    Columns whose name contains 'unnamed' (case-insensitive) are removed
    before the frame is returned.
    """
    frame = pd.read_csv(csv_name, sep=',', dtype=df_dtypes)
    leftover = frame.columns[frame.columns.str.contains('unnamed', case=False)]
    return frame.drop(columns=leftover)


train_set = _read_split('trainData.csv')  # Read the training set
test_set = _read_split('testData.csv')    # Read the testing set

# Check the shapes of your sets
print("Training set shape:", train_set.shape)
print("Testing set shape:", test_set.shape)

In [None]:
# Any training-set column without an explicit entry in data_info is
# registered as categorical.
for name in train_set.columns:
    data_info.setdefault(name, {'type': 'categorical'})

# Cast every column that data_info marks as categorical to the pandas
# 'category' dtype; continuous columns are left untouched.
# NOTE(review): this loop indexes train_set by every categorical key in
# data_info, so a key that is not a train_set column raises KeyError.
for name, meta in data_info.items():
    if meta['type'] != 'categorical':
        continue
    train_set[name] = train_set[name].astype('category')

# Show the resulting dtypes as a sanity check.
print(train_set.dtypes)

In [None]:
### DatGan Preprocessing of DATA input
from datgan import DATGAN

# Work inside the DatGan sub-folder of the run. The folder name comes from
# `datGan_folder` (the same variable used to create it) rather than a repeated
# string literal, so the two cannot drift apart.
os.chdir(os.path.join(DatGan_path, folder_name, datGan_folder))

batch_size = 1116

# NOTE(review): this instance is named `datgan`, which shadows the `datgan`
# module imported at the top of the notebook. The name is kept in case later
# cells rely on it.
datgan = DATGAN(output='./output/',
                batch_size=batch_size,
                num_epochs=1000)

# Encode the training set using the column metadata; the call is given
# './encoded_data/' (relative to the current directory) as the path for the
# preprocessed data.
datgan.preprocess(data=train_set, metadata=data_info, preprocessed_data_path='./encoded_data/')

In [None]:
### ciDatGan Preprocessing of DATA input
from datgan import DATGAN

# Work inside the ciDatGan sub-folder of the run. The folder name comes from
# `ciDatGan_folder` (the same variable used to create it) rather than a
# repeated string literal, so the two cannot drift apart.
os.chdir(os.path.join(DatGan_path, folder_name, ciDatGan_folder))

batch_size = 1116

# Columns passed to DATGAN as conditional inputs.
conditional_inputs = ['Gender', 'Education', 'MunicipalityOrigin']

ciDatGan = DATGAN(output='./output/',
                batch_size=batch_size,
                num_epochs=1000,
                conditional_inputs=conditional_inputs)

# Encode the training set using the column metadata; the call is given
# './encoded_data/' (relative to the current directory) as the path for the
# preprocessed data.
ciDatGan.preprocess(train_set, data_info, preprocessed_data_path='./encoded_data/')
