In [None]:
import pandas as pd;
import lightgbm as lgb

from sklearn.preprocessing import StandardScaler

from datetime import datetime
from sklearn.model_selection import cross_val_score

import numpy as np

import pathlib
import os

from sdv.metadata import Metadata

from sdv.evaluation.single_table import run_diagnostic
from sdv.evaluation.single_table import evaluate_quality
from sdv.evaluation.single_table import get_column_plot
from sdv.evaluation.single_table import get_column_pair_plot

import warnings
warnings.filterwarnings("ignore", category=FutureWarning, module="sdmetrics.column_pairs.statistical.contingency_similarity")


In [None]:
#Reading in simulated data
def readSimulatedData(folder,dependence,path):
    """Load the simulated CSV files and publish each one as a module global.

    For every combination of the module-level ``rows`` and ``missings``
    lists, the file ``<row>_obs_<missing>_percent_missing.csv`` is read from
    ``<path>/<folder>/`` and stored in ``globals()`` under the name
    ``data_<row>_<missing>``.

    Parameters
    ----------
    folder : str
        Sub-folder holding the CSV files. Nothing is read unless this is
        exactly ``"Simulated Data"``.
    dependence : str
        Not used by this function.
    path : str or path-like
        Directory that contains ``folder``.

    Returns
    -------
    None
        The frames are exposed only through module globals.
    """
    path = f"{path}/{folder}/"
    dtypes = {'countryCode': 'category', 'Language': 'category', 'gender': 'category', 'pilStatus': 'category'}

    # Any other folder name is a silent no-op.
    if folder != "Simulated Data":
        return

    namespace = globals()
    # `rows` and `missings` are looked up as module globals at call time.
    for row, missing in ((r, m) for r in rows for m in missings):
        file_name = f"{row}_obs_{missing}_percent_missing.csv"
        namespace[f"data_{row}_{missing}"] = pd.read_csv(path + file_name, dtype=dtypes)



In [None]:
#Reading in synthetic data
def readSyntheticData(dependence,folder_path):
    """Load the synthetic CSV files and publish each one as a module global.

    For every combination of the module-level ``rows`` and ``missings``
    lists and each synthesizer (``GC``, ``CTGAN``, ``TVAE``), the file
    ``<row> rows_synthetic_<synthesizer lower-cased>.csv`` is read from
    ``<folder_path>/<dependence>/Synthetic Datasets/<row> row <missing> missing/<synthesizer>/``
    and stored in ``globals()`` as ``<synthesizer>_data_<row>_<missing>``.

    Parameters
    ----------
    dependence : str
        Sub-folder under ``folder_path``. Nothing is read unless this is
        exactly ``"Synthetic Data"``.
    folder_path : str or path-like
        Directory that contains the ``dependence`` folder.

    Returns
    -------
    None
        The frames are exposed only through module globals.
    """
    synthetic_types=['GC','CTGAN','TVAE']

    path = f"{folder_path}/{dependence}/Synthetic Datasets/"
    dtypes = {'countryCode': 'category', 'Language': 'category', 'gender': 'category', 'pilStatus': 'category'}

    # Any other value is a silent no-op.
    if dependence != "Synthetic Data":
        return

    namespace = globals()
    # `rows` and `missings` are looked up as module globals at call time.
    combinations = (
        (r, m, s)
        for r in rows
        for m in missings
        for s in synthetic_types
    )
    for row, missing, synthetic_type in combinations:
        # File names use the lower-cased synthesizer tag; folders use it as-is.
        file_synthetic_type = synthetic_type.lower()
        file_name = f"{row} rows_synthetic_{file_synthetic_type}.csv"
        folder_name = f"{row} row {missing} missing/{synthetic_type}/"
        namespace[f"{synthetic_type}_data_{row}_{missing}"] = pd.read_csv(
            path + folder_name + file_name, dtype=dtypes
        )


In [None]:
#Creating lists for 0%, 10%, 20% missing data (Simulated Non synthetic data)
def simulated_lists(dependences,rows,missings):
    """Group already-loaded global frames into one global list per row count.

    For each ``dependence`` and ``row`` a list named ``<dependence>_<row>``
    is created in ``globals()`` and filled, in ``missings`` order, with the
    globals ``data_<row>_<missing>`` (when ``dependence == "data"``) or
    ``<dependence>_data_<row>_<missing>`` (otherwise).

    Parameters
    ----------
    dependences : iterable of str
        Prefixes selecting which family of globals to collect.
    rows : iterable
        Row-count tags used in the global names.
    missings : iterable
        Missing-level tags used in the global names.

    Raises
    ------
    KeyError
        If one of the expected source globals does not exist.
    """
    namespace = globals()
    for dependence in dependences:
        # The plain "data" family has no extra prefix in its variable names.
        source_prefix = "data" if dependence == "data" else f"{dependence}_data"
        for row in rows:
            collected = []
            # Register the list first so a failed lookup leaves the partial list behind.
            namespace[f"{dependence}_{row}"] = collected
            for missing in missings:
                collected.append(namespace[f"{source_prefix}_{row}_{missing}"])

In [None]:
#Lists of synthetic data sets, stored as module globals (one list per synthesizer, row count and dependence). Each list follows the order of `missings`: 0 = 0%, 10 = 10%, 20 = 20%
def synthesizer_lists(synthesizers,dependences,rows,missings):
    """Group already-loaded synthetic frames into global lists.

    For each synthesizer, dependence and row a list named
    ``<synthesizer>_<row>_<dependence>`` is created in ``globals()`` and
    filled, in ``missings`` order, with the globals
    ``<synthesizer>_data_<row>_<missing>`` (when ``dependence == "data"``) or
    ``<synthesizer>_<dependence>_data_<row>_<missing>`` (otherwise).

    Parameters
    ----------
    synthesizers : iterable of str
        Synthesizer tags used as the leading part of the global names.
    dependences : iterable of str
        Dependence tags; ``"data"`` selects the un-prefixed variable family.
    rows : iterable
        Row-count tags used in the global names.
    missings : iterable
        Missing-level tags used in the global names.

    Raises
    ------
    KeyError
        If one of the expected source globals does not exist.
    """
    namespace = globals()
    for synthesizer in synthesizers:
        for dependence in dependences:
            # Only non-"data" dependences add their own tag to the source names.
            infix = "data" if dependence == "data" else f"{dependence}_data"
            for row in rows:
                bucket = []
                # Register the list first so a failed lookup leaves the partial list behind.
                namespace[f"{synthesizer}_{row}_{dependence}"] = bucket
                for missing in missings:
                    bucket.append(namespace[f"{synthesizer}_{infix}_{row}_{missing}"])

In [None]:
#Merging the lists of data frames for each synthesizer/simulated data type. All synthesizer dataframes in one list
def mergeList(list1,list2,list3):
    """Return the three sequences concatenated with ``+``, in argument order.

    The inputs are not modified; a new object is returned.
    """
    return list1 + list2 + list3

In [None]:
# Creating metadata for synthesizers
def createMetadata(df_list):
    """Build one metadata object per dataframe, with fixed column overrides.

    Each frame's metadata is auto-detected with
    ``Metadata.detect_from_dataframe`` and then four columns are overridden:
    ``arcsId`` and ``hashedId`` become ``id``; ``countryCode`` and
    ``Language`` become ``categorical``.

    Parameters
    ----------
    df_list : iterable of DataFrame
        Frames to describe.

    Returns
    -------
    list
        Metadata objects in the same order as ``df_list``.
    """
    # (column, sdtype) overrides, applied in this order to every frame.
    overrides = (
        ("arcsId", "id"),
        ("hashedId", "id"),
        ("countryCode", "categorical"),
        ("Language", "categorical"),
    )

    def _describe(frame):
        detected = Metadata.detect_from_dataframe(data=frame)
        for column, sdtype in overrides:
            detected.update_column(column_name=column, sdtype=sdtype)
        return detected

    return [_describe(frame) for frame in df_list]

In [None]:
# Creating age feature from date of birth
def age(df, reference_date=None):
    """Add an ``age`` column (completed years) derived from ``dateOfBirth``.

    The frame is modified in place: ``dateOfBirth`` is converted with
    ``pd.to_datetime`` and ``age`` is written as a new column.

    Parameters
    ----------
    df : DataFrame
        Must contain a ``dateOfBirth`` column parseable by ``pd.to_datetime``.
    reference_date : datetime-like or str, optional
        Date on which the age is evaluated. Defaults to the current date, as
        before; pass a fixed value to make the result reproducible across
        runs.

    Returns
    -------
    None
    """
    # Normalise to a Timestamp so strings, dates and datetimes are all accepted.
    today = pd.Timestamp(datetime.today() if reference_date is None else reference_date)
    df['dateOfBirth'] = pd.to_datetime(df['dateOfBirth'])
    # Subtract one year when this year's birthday has not been reached yet:
    # the tuple comparison is True (== 1) exactly in that case.
    df['age'] = df['dateOfBirth'].apply(
        lambda born: today.year - born.year - ((today.month, today.day) < (born.month, born.day))
    )

In [None]:
# Preprocessing datatypes
def preprocessing(df_list):
    """Drop incomplete rows and cast the categorical columns, in place.

    Every frame in ``df_list`` is mutated: rows containing any missing value
    are removed, then ``countryCode``, ``gender``, ``pilStatus`` and
    ``Language`` are cast to the ``category`` dtype.

    Parameters
    ----------
    df_list : iterable of DataFrame
        Frames to clean; each must contain the four categorical columns.

    Returns
    -------
    None
    """
    category_dtypes = dict.fromkeys(['countryCode','gender','pilStatus','Language'], 'category')
    selected = list(category_dtypes)
    for frame in df_list:
        frame.dropna(axis=0, inplace=True)
        frame[selected] = frame[selected].astype(category_dtypes)


In [None]:
# Creating reports for KSComplement and TVComplement
def make_report(sim,syn,meta):
    """Run the diagnostic and the quality evaluation for one dataset pair.

    Parameters
    ----------
    sim : DataFrame
        Passed as ``real_data``.
    syn : DataFrame
        Passed as ``synthetic_data``.
    meta : Metadata
        Passed as ``metadata``.

    Returns
    -------
    tuple
        ``(diagnostic, quality_report)`` — the results of ``run_diagnostic``
        and ``evaluate_quality``, in that order.
    """
    # Both evaluators receive exactly the same keyword arguments.
    shared_inputs = dict(real_data=sim, synthetic_data=syn, metadata=meta)
    diagnostic_result = run_diagnostic(**shared_inputs)
    quality_result = evaluate_quality(**shared_inputs)
    return diagnostic_result, quality_result

In [None]:
# Looping for reports for all datasets
def report_loop(sim,syn,meta):
    """Call ``make_report`` for each position of three parallel sequences.

    Parameters
    ----------
    sim, syn, meta : sequence
        Indexable sequences; element ``k`` of each is passed to
        ``make_report``. The number of reports equals the number of items
        in ``sim``.

    Returns
    -------
    list
        One ``make_report`` result per element of ``sim``, in order.

    Raises
    ------
    IndexError
        If ``syn`` or ``meta`` is shorter than ``sim``.
    """
    # Index explicitly (rather than zip) so a shorter syn/meta still raises.
    return [
        make_report(sim[position], syn[position], meta[position])
        for position, _ in enumerate(sim)
    ]



In [None]:
# Extracting column metrics
def get_column_metrics(report_list,Synthesizer_type,df_type):
    """Stack the ``get_properties()`` tables of every report into one frame.

    Parameters
    ----------
    report_list : sequence
        Items whose element ``[1]`` exposes ``get_properties()`` returning a
        DataFrame (the second member of each ``make_report`` tuple).
    Synthesizer_type : str
        Value written to the ``synthesizer_type`` column of every row.
    df_type : sequence
        Labels parallel to ``report_list``; label ``i`` is written to the
        ``df_type`` column of the rows coming from report ``i``.

    Returns
    -------
    DataFrame
        All per-report tables concatenated row-wise with a fresh index.
    """
    def _labelled(position, report_pair):
        scores = report_pair[1].get_properties()
        scores['df_type'] = df_type[position]
        return scores

    combined = pd.concat(
        [_labelled(position, report_pair) for position, report_pair in enumerate(report_list)],
        axis=0,
        ignore_index=True,
    )
    combined['synthesizer_type'] = Synthesizer_type
    return combined
        
    

In [None]:
# Extracting marginal metrics
def get_marginal_metrics(report_list,Synthesizer_type,df_type):
    """Stack the ``'Column Shapes'`` detail tables of every report.

    Parameters
    ----------
    report_list : sequence
        Items whose element ``[1]`` exposes ``get_details(name)`` returning a
        DataFrame (the second member of each ``make_report`` tuple).
    Synthesizer_type : str
        Value written to the ``synthesizer_type`` column of every row.
    df_type : sequence
        Labels parallel to ``report_list``; label ``i`` is written to the
        ``df_type`` column of the rows coming from report ``i``.

    Returns
    -------
    DataFrame
        All per-report tables concatenated row-wise with a fresh index.
    """
    def _labelled(position, report_pair):
        details = report_pair[1].get_details('Column Shapes')
        details['df_type'] = df_type[position]
        return details

    combined = pd.concat(
        [_labelled(position, report_pair) for position, report_pair in enumerate(report_list)],
        axis=0,
        ignore_index=True,
    )
    combined['synthesizer_type'] = Synthesizer_type
    return combined

In [None]:
# Setting paths
script_dir = pathlib.Path().resolve()
os.chdir(script_dir)
parent_dir=script_dir.parent

# Lists for dataframe sample sizes and missing data
df_type = ['10000 row 0 missing','10000 row 10 missing','10000 row 20 missing',
             '25000 row 0 missing','25000 row 10 missing','25000 row 20 missing',
             '50000 row 0 missing','50000 row 10 missing','50000 row 20 missing']
rows = [10000, 25000, 50000]
missings = [0, 10, 20]

#Reading in data
readSimulatedData("Simulated Data","simulated_data",parent_dir)
readSyntheticData("Synthetic Data",parent_dir)

#Putting all simulated and synthetic datasets into lists for easier processing
simulated_lists(["data"],["10000","25000","50000"],["0","10","20"])
synthesizer_lists(["GC","CTGAN","TVAE"],["data"],["10000","25000","50000"],["0","10","20"])
Simulated_data=mergeList(data_10000, data_25000, data_50000)
GC_data=mergeList(GC_10000_data,GC_25000_data,GC_50000_data)
CTGAN_data=mergeList(CTGAN_10000_data,CTGAN_25000_data,CTGAN_50000_data)
TVAE_data=mergeList(TVAE_10000_data,TVAE_25000_data,TVAE_50000_data)

# Creating metadata for column metrics
metadata=createMetadata(Simulated_data)

#Preprocessing lists of dataframes
preprocessing(Simulated_data)
preprocessing(GC_data)
preprocessing(CTGAN_data)
preprocessing(TVAE_data)

#Dictionary for easier processing of column metric and marginal metric reports
all_dataset={
    'Simulated':Simulated_data,
    'GC':GC_data,
    'CTGAN':CTGAN_data,
    'TVAE':TVAE_data,
    'Metadata':metadata,
}

# Running all column and marginal metrics for each dataset
gc_report_list=report_loop(all_dataset['Simulated'],all_dataset['GC'],all_dataset['Metadata'])
ctgan_report_list=report_loop(all_dataset['Simulated'],all_dataset['CTGAN'],all_dataset['Metadata'])
tvae_report_list=report_loop(all_dataset['Simulated'],all_dataset['TVAE'],all_dataset['Metadata'])

# Extracting column scores
gc_columnsScores=get_column_metrics(gc_report_list,"GC",df_type)
ctgan_columnsScores=get_column_metrics(ctgan_report_list,"CTGAN",df_type)
tvae_columnsScores=get_column_metrics(tvae_report_list,"TVAE",df_type)
columnScores=pd.concat([gc_columnsScores,ctgan_columnsScores,tvae_columnsScores],axis=0,ignore_index=True)

# Extracting marginal scores
gc_marginalScores=get_marginal_metrics(gc_report_list,"GC",df_type)
ctgan_marginalScores=get_marginal_metrics(ctgan_report_list,"CTGAN",df_type)
tvae_marginalScores=get_marginal_metrics(tvae_report_list,"TVAE",df_type)
marginalScores=pd.concat([gc_marginalScores,ctgan_marginalScores,tvae_marginalScores],axis=0,ignore_index=True)

#Saving data
marginalScores.to_csv('marginalScores1.csv',index=False)
columnScores.to_csv('columnScores1.csv',index=False)