In [None]:
#Importing libraries
import pandas as pd;
import numpy as np
from datetime import datetime
import pathlib
import os
#For random forests and Cramer's V
import lightgbm as lgb
from sklearn.preprocessing import StandardScaler
from scipy.stats import chi2_contingency
from sklearn.model_selection import cross_val_score

# For columns shape and marginal shape scores
from sdv.metadata import Metadata

from sdv.evaluation.single_table import run_diagnostic
from sdv.evaluation.single_table import evaluate_quality

import warnings
warnings.filterwarnings("ignore", category=FutureWarning, module="sdmetrics.column_pairs.statistical.contingency_similarity")

In [None]:
#Reading in simulated data for different association strengths
def readSimulatedData(folder,dependence,path):
    """Load the simulated CSV files for one association strength into module globals.

    One file is read for every combination of row count (10000, 25000, 50000)
    and missing percentage (0, 10, 20). Each DataFrame is stored in the module
    namespace under the name ``data_{rows}_{missing}``.

    Args:
        folder: Sub-folder of ``path`` that holds the CSV files.
        dependence: Association strength. "simulated" uses no file prefix,
            "high" and "moderate" use "high_" / "moderate_"; any other value
            falls back to the "low_" prefix.
        path: Base directory that contains ``folder``.

    Returns:
        ``dependence``, unchanged.
    """
    column_dtypes = {'countryCode': 'category', 'Language': 'category', 'gender': 'category', 'pilStatus': 'category'}

    # File-name prefix per association strength; anything unrecognised is treated as "low".
    known_prefixes = {"simulated": "", "high": "high_", "moderate": "moderate_"}
    prefix = known_prefixes.get(dependence, "low_")

    base_dir = f"{path}/{folder}/"
    namespace = globals()
    for n_rows in (10000, 25000, 50000):
        for pct_missing in (0, 10, 20):
            csv_path = base_dir + f"{prefix}{n_rows}_obs_{pct_missing}_percent_missing.csv"
            # Publish the frame as a module-level variable so later cells can look it up by name.
            namespace[f"data_{n_rows}_{pct_missing}"] = pd.read_csv(csv_path, dtype=column_dtypes)

    return dependence

In [None]:
#Reading in synthetic data for different association strengths
def readSyntheticData(dependence,folder_path):
    """Load every synthetic CSV for one association strength into module globals.

    For each row count (10000, 25000, 50000), missing percentage (0, 10, 20)
    and synthesizer (GC, CTGAN, TVAE) a file is read from
    ``<base>/{rows} row {missing} missing/{synthesizer}/{rows} rows_synthetic_{synthesizer lower-cased}.csv``
    and stored as the module-level variable ``{synthesizer}_data_{rows}_{missing}``.
    The variable name is printed before each file is read.

    Args:
        dependence: "Synthetic Data" selects ``{folder_path}/Synthetic Data/Synthetic Datasets/``;
            "high" and "moderate" select the matching dependency folder; any
            other value falls back to the "Low" dependency folder.
        folder_path: Project base directory.
    """
    column_dtypes = {'countryCode': 'category', 'Language': 'category', 'gender': 'category', 'pilStatus': 'category'}

    # Resolve the base directory for the requested association strength.
    if dependence == "Synthetic Data":
        base_dir = f"{folder_path}/{dependence}/Synthetic Datasets/"
    else:
        level_folder = {"high": "High", "moderate": "Moderate"}.get(dependence, "Low")
        base_dir = f"{folder_path}/Simulating Data/Dependency/Dependency Synthetic Data/{level_folder}/"

    namespace = globals()
    for n_rows in (10000, 25000, 50000):
        for pct_missing in (0, 10, 20):
            for synthesizer in ('GC', 'CTGAN', 'TVAE'):
                target_name = f"{synthesizer}_data_{n_rows}_{pct_missing}"
                print(target_name)
                csv_path = (
                    base_dir
                    + f"{n_rows} row {pct_missing} missing/{synthesizer}/"
                    + f"{n_rows} rows_synthetic_{synthesizer.lower()}.csv"
                )
                namespace[target_name] = pd.read_csv(csv_path, dtype=column_dtypes)

In [None]:
#Creating lists for 0%, 10%, 20% missing data simulated datasets
def simulated_lists(rows,missings):
    """Group the simulated frames by row count into module-level lists.

    For every ``row`` in ``rows`` a list named ``data_{row}`` is created in the
    module namespace and filled, in the order of ``missings``, with the
    existing module-level objects ``data_{row}_{missing}``.

    Args:
        rows: Row-count labels used in the variable names.
        missings: Missing-percentage labels used in the variable names.
    """
    namespace = globals()
    for n_rows in rows:
        # Register the (initially empty) list first, then fill it in place.
        grouped = namespace[f"data_{n_rows}"] = []
        grouped.extend(namespace[f'data_{n_rows}_{pct}'] for pct in missings)
            

In [None]:
#List of synthetic data sets
def synthesizer_lists(synthesizers,rows,missings):
    """Group the synthetic frames by synthesizer and row count into module-level lists.

    For every synthesizer/row pair a list named ``{synthesizer}_{row}_data`` is
    created in the module namespace and filled, in the order of ``missings``,
    with the existing module-level objects ``{synthesizer}_data_{row}_{missing}``.

    Args:
        synthesizers: Synthesizer labels used in the variable names.
        rows: Row-count labels used in the variable names.
        missings: Missing-percentage labels used in the variable names.
    """
    namespace = globals()
    for synth_name in synthesizers:
        for n_rows in rows:
            # Register the (initially empty) list first, then fill it in place.
            grouped = namespace[f"{synth_name}_{n_rows}_data"] = []
            grouped.extend(
                namespace[f'{synth_name}_data_{n_rows}_{pct}'] for pct in missings
            )
     

In [None]:
#Merging the lists of data frames for each synthesizer/simulated data type. All synthesizer dataframes in one list
def mergeList(list1,list2,list3):
    """Return a new sequence holding the items of the three inputs, in order.

    The inputs are combined with ``+`` and are not modified.
    """
    first_two = list1 + list2
    return first_two + list3

In [None]:
#Creating meta data for column shape scores
def createMetadata(df_list):
    """Build one metadata object per DataFrame.

    Each metadata object is auto-detected from its DataFrame and then four
    columns are overridden explicitly: ``arcsId`` and ``hashedId`` become
    ``id`` columns, ``countryCode`` and ``Language`` become ``categorical``.

    Args:
        df_list: DataFrames to describe.

    Returns:
        List of metadata objects, in the same order as ``df_list``.
    """
    # (column, sdtype) overrides applied after auto-detection, in this order.
    forced_sdtypes = (
        ("arcsId", "id"),
        ("hashedId", "id"),
        ("countryCode", "categorical"),
        ("Language", "categorical"),
    )

    def _describe(frame):
        detected = Metadata.detect_from_dataframe(data=frame)
        for column, sdtype in forced_sdtypes:
            detected.update_column(column_name=column, sdtype=sdtype)
        return detected

    return [_describe(frame) for frame in df_list]

In [None]:
# Creating age feature for random forests and correlation
def age(df):
    """Add an ``age`` column (completed years as of today) to ``df`` in place.

    ``dateOfBirth`` is first converted to datetimes and written back to the
    frame. The age is the year difference to the current date, reduced by one
    when this year's birthday has not been reached yet.

    Args:
        df: DataFrame with a ``dateOfBirth`` column; it is modified in place.
    """
    now = datetime.today()
    month_day_now = (now.month, now.day)

    def _completed_years(born):
        # True (counted as 1) while this year's birthday is still ahead.
        birthday_pending = month_day_now < (born.month, born.day)
        return now.year - born.year - birthday_pending

    df['dateOfBirth'] = pd.to_datetime(df['dateOfBirth'])
    df['age'] = df['dateOfBirth'].apply(_completed_years)

In [None]:
#Preprocessing to categorical
def preprocessing(df_list):
    """Drop incomplete rows and cast the categorical columns, in place.

    For every DataFrame in ``df_list`` all rows containing a missing value are
    removed and ``countryCode``, ``gender``, ``pilStatus`` and ``Language``
    are converted to the ``category`` dtype.

    Args:
        df_list: DataFrames to clean; each one is modified in place.
    """
    category_columns = ['countryCode', 'gender', 'pilStatus', 'Language']
    category_cast = dict.fromkeys(category_columns, 'category')
    for frame in df_list:
        frame.dropna(axis=0, inplace=True)
        frame[category_columns] = frame[category_columns].astype(category_cast)

In [None]:
# Correlation between age and rating
def correlation(data):
    """Return the correlation between the ``rating`` and ``age`` columns of ``data``.

    Uses ``Series.corr`` with its default settings.
    """
    ratings = data['rating']
    ages = data['age']
    return ratings.corr(ages)

In [None]:
# Creating custom Cramer's V calculation
def cramers_v(df):
    """Compute Cramer's V between ``countryCode`` and ``Language``.

    The statistic is ``sqrt(chi2 / (n * (k - 1)))`` where ``chi2`` comes from
    ``scipy.stats.chi2_contingency`` on the cross-tabulation, ``n`` is the
    total count and ``k`` is the smaller dimension of the table.

    Args:
        df: DataFrame with ``countryCode`` and ``Language`` columns.

    Returns:
        Cramer's V, or ``None`` when the table has fewer than two rows or
        fewer than two columns.
    """
    contingency = pd.crosstab(df['countryCode'], df['Language'])
    # Only the chi-square statistic (first returned value) is needed.
    chi2_stat = chi2_contingency(contingency)[0]

    smaller_dim = min(contingency.shape)
    if smaller_dim <= 1:
        return None

    total = contingency.sum().sum()
    return np.sqrt(chi2_stat / (total * (smaller_dim - 1)))


In [None]:
#Random forest predicting ratings with gender, age, language, countryCode, Language. Returns MAE.

def rf(data):
    """Cross-validated MAE of a LightGBM random-forest regressor predicting ``rating``.

    Identifier-like columns (``id``, ``hashedId``, ``arcsId``, ``dateOfBirth``,
    ``emailAddress``, ``generation``) and the target are removed from the
    feature matrix when present. Numeric features are standardised and any
    remaining object-dtype features are cast to ``category`` before fitting.

    Args:
        data: DataFrame containing a ``rating`` column plus feature columns.
            It is not modified; the features are taken from a dropped copy.

    Returns:
        Mean absolute error averaged over 5 cross-validation folds.
    """
    X = data.drop(['rating', 'id', 'hashedId', 'arcsId', 'dateOfBirth', 'emailAddress', 'generation'], axis=1, errors='ignore')
    y = data['rating']

    numeric_cols = X.select_dtypes(include=['number']).columns
    object_cols = X.select_dtypes(include=['object']).columns

    # LightGBM's pandas support handles numeric, bool and category columns but
    # raises on object dtype, so plain string columns are converted up front.
    if len(object_cols) > 0:
        X[object_cols] = X[object_cols].astype('category')

    # StandardScaler refuses an input with zero features, so skip it when
    # there is nothing numeric to scale.
    if len(numeric_cols) > 0:
        scaler = StandardScaler()
        X[numeric_cols] = scaler.fit_transform(X[numeric_cols])

    model = lgb.LGBMRegressor(
        boosting_type='rf',
        n_estimators=300,
        max_depth=15,
        subsample=0.8,
        subsample_freq=1,
        colsample_bytree=0.8,
        min_child_samples=5,
        device='cpu',
        random_state=123,
        n_jobs=-1
    )

    cv_scores = cross_val_score(model, X, y, cv=5, scoring='neg_mean_absolute_error', n_jobs=-1)

    # The scorer returns negated MAE values; flip the sign back.
    return -cv_scores.mean()

In [None]:
#Creating reports for marginal shape scores and column shape scores
def make_report(sim,syn,meta):
    """Run the diagnostic and the quality evaluation for one real/synthetic pair.

    Args:
        sim: Simulated DataFrame, passed as ``real_data``.
        syn: Synthetic DataFrame, passed as ``synthetic_data``.
        meta: Metadata object, passed as ``metadata``.

    Returns:
        Tuple ``(diagnostic, quality_report)``.
    """
    # Both evaluations take exactly the same keyword arguments.
    shared_inputs = {"real_data": sim, "synthetic_data": syn, "metadata": meta}
    diagnostic_report = run_diagnostic(**shared_inputs)
    quality = evaluate_quality(**shared_inputs)
    return diagnostic_report, quality

In [None]:
# Loop for getting column shape and marginal shape scores for all datasets
def report_loop(sim,syn,meta):
    """Create one report per dataset position.

    ``make_report`` is called with the items of ``sim``, ``syn`` and ``meta``
    that share the same index, for every index of ``sim``.

    Returns:
        List of ``make_report`` results, ordered like ``sim``.
    """
    return [
        make_report(sim[position], syn[position], meta[position])
        for position, _ in enumerate(sim)
    ]

In [None]:
# Extracting column shape scores
def get_column_metrics(report_list,Synthesizer_type,df_type):
    """Collect the property scores of every quality report into one DataFrame.

    Args:
        report_list: Sequence of report pairs; the second element of each pair
            must provide ``get_properties()``.
        Synthesizer_type: Label written to the ``synthesizer_type`` column.
        df_type: Dataset labels, indexed like ``report_list``, written to the
            ``df_type`` column.

    Returns:
        Concatenated DataFrame with a fresh integer index.
    """
    def _labelled_scores(position, report_pair):
        scores = report_pair[1].get_properties()
        scores['df_type'] = df_type[position]
        return scores

    combined = pd.concat(
        [_labelled_scores(position, pair) for position, pair in enumerate(report_list)],
        axis=0,
        ignore_index=True,
    )
    combined['synthesizer_type'] = Synthesizer_type

    return combined

In [None]:
# Extracting marginal shape scores

def get_marginal_metrics(report_list,Synthesizer_type,df_type):
    """Collect the 'Column Shapes' details of every quality report into one DataFrame.

    Args:
        report_list: Sequence of report pairs; the second element of each pair
            must provide ``get_details(property_name)``.
        Synthesizer_type: Label written to the ``synthesizer_type`` column.
        df_type: Dataset labels, indexed like ``report_list``, written to the
            ``df_type`` column.

    Returns:
        Concatenated DataFrame with a fresh integer index.
    """
    def _labelled_details(position, report_pair):
        details = report_pair[1].get_details('Column Shapes')
        details['df_type'] = df_type[position]
        return details

    combined = pd.concat(
        [_labelled_details(position, pair) for position, pair in enumerate(report_list)],
        axis=0,
        ignore_index=True,
    )
    combined['synthesizer_type'] = Synthesizer_type

    return combined

In [None]:
# Creating age feature
def preprocessing_age(df_list):
    """Add the age feature, drop incomplete rows and cast categoricals, in place.

    For every DataFrame in ``df_list``: ``age`` is called on it, rows with any
    missing value are removed, and ``countryCode``, ``gender``, ``pilStatus``
    and ``Language`` are converted to the ``category`` dtype.

    Args:
        df_list: DataFrames to process; each one is modified in place.
    """
    category_columns = ['countryCode', 'gender', 'pilStatus', 'Language']
    category_cast = dict.fromkeys(category_columns, 'category')
    for frame in df_list:
        age(frame)
        frame.dropna(axis=0, inplace=True)
        frame[category_columns] = frame[category_columns].astype(category_cast)

In [None]:
# Loop for getting correlation out of all datasets
def correlationLoop(simulatedList,gcList,ctganList,tvaeList,row_names):
    """Tabulate ``correlation`` for every dataset of every data source.

    Args:
        simulatedList, gcList, ctganList, tvaeList: Sequences of DataFrames,
            each indexed in the same order as ``row_names``.
        row_names: Row labels of the result table.

    Returns:
        DataFrame indexed by ``row_names`` with the columns "Simulated", "GC",
        "CTGAN" and "TVAE".
    """
    # Column label -> datasets; insertion order fixes the column order.
    sources = {
        "Simulated": simulatedList,
        "GC": gcList,
        "CTGAN": ctganList,
        "TVAE": tvaeList,
    }
    table = pd.DataFrame(index=row_names, columns=list(sources))
    for position, label in enumerate(row_names):
        for column, frames in sources.items():
            table.at[label, column] = correlation(frames[position])

    return table

In [None]:
# Loop for getting Cramer's V out of all datasets
def cramersLoop(simulatedList,gcList,ctganList,tvaeList,row_names):
    """Tabulate ``cramers_v`` for every dataset of every data source.

    Args:
        simulatedList, gcList, ctganList, tvaeList: Sequences of DataFrames,
            each indexed in the same order as ``row_names``.
        row_names: Row labels of the result table.

    Returns:
        DataFrame indexed by ``row_names`` with the columns "Simulated", "GC",
        "CTGAN" and "TVAE".
    """
    # Column label -> datasets; insertion order fixes the column order.
    sources = {
        "Simulated": simulatedList,
        "GC": gcList,
        "CTGAN": ctganList,
        "TVAE": tvaeList,
    }
    table = pd.DataFrame(index=row_names, columns=list(sources))
    for position, label in enumerate(row_names):
        for column, frames in sources.items():
            table.at[label, column] = cramers_v(frames[position])

    return table

In [None]:
# Random forest loop on every dataset
def rfLoop(simulatedList,gcList,ctganList,tvaeList,row_names):
    """Tabulate the ``rf`` result (MAE) for every dataset of every data source.

    Args:
        simulatedList, gcList, ctganList, tvaeList: Sequences of DataFrames,
            each indexed in the same order as ``row_names``.
        row_names: Row labels of the result table.

    Returns:
        DataFrame indexed by ``row_names`` with the columns "Simulated", "GC",
        "CTGAN" and "TVAE".
    """
    # Column label -> datasets; insertion order fixes the column order.
    sources = {
        "Simulated": simulatedList,
        "GC": gcList,
        "CTGAN": ctganList,
        "TVAE": tvaeList,
    }
    table = pd.DataFrame(index=row_names, columns=list(sources))
    for position, label in enumerate(row_names):
        for column, frames in sources.items():
            table.at[label, column] = rf(frames[position])

    return table

In [None]:
# Saving metrics to specified path
def saveMetrics(dependence,marginalScores,columnScores,cramers,corr,mae,simulated_cramers,simulated_corr,simulated_mae):
    """Write all metric tables to CSV files for one association strength.

    The target directory depends on ``dependence``: "simulated" writes into the
    current working directory; "high", "moderate" and any other value write
    into the High / Moderate / Low dependency folder under the module-level
    ``parent_dir``. ``marginalScores`` and ``columnScores`` are written without
    their index, every other table with it.

    Args:
        dependence: Association strength selecting the output directory.
        marginalScores, columnScores: Score tables saved with ``index=False``.
        cramers, corr, mae: Metric tables for all data sources.
        simulated_cramers, simulated_corr, simulated_mae: Metric tables
            restricted to the simulated data.
    """
    # Resolve the output location; parent_dir is only read for the dependency folders.
    if dependence == "simulated":
        prefix_path = ""
    else:
        level_folder = {"high": "High", "moderate": "Moderate"}.get(dependence, "Low")
        prefix_path = f'{parent_dir}/Simulating Data/Dependency/Dependency Synthetic Data/{level_folder}/'

    # Score tables: the index carries no information, so it is left out.
    for file_name, frame in (
        ("marginalScores1.csv", marginalScores),
        ("columnScores1.csv", columnScores),
    ):
        frame.to_csv(f'{prefix_path}{file_name}', index=False)

    # Metric tables: the index holds the dataset labels, so it is kept.
    for file_name, frame in (
        ("cramers1.csv", cramers),
        ("corr1.csv", corr),
        ("mae1.csv", mae),
        ("simulated_cramers1.csv", simulated_cramers),
        ("simulated_corr1.csv", simulated_corr),
        ("simulated_mae1.csv", simulated_mae),
    ):
        frame.to_csv(f'{prefix_path}{file_name}')

In [None]:
#Main function for each association strength type
def main(simulatedDataPath,dependency,syntheticDataDependency):
    """Run the full evaluation pipeline for one association strength and save the results.

    Steps, in order: load the simulated and synthetic CSVs into module globals,
    group them into lists, build metadata from the simulated data, clean the
    frames, compute the report-based column/marginal scores, add the age
    feature, compute correlation / Cramer's V / random-forest MAE tables and
    write everything to CSV via ``saveMetrics``.

    Relies on the module-level ``parent_dir`` and on module-level variables
    (``data_10000``, ``GC_10000_data``, ...) that the helper functions create
    through ``globals()``.

    Args:
        simulatedDataPath: Folder (relative to ``parent_dir``) holding the
            simulated CSVs; forwarded to ``readSimulatedData`` as ``folder``.
        dependency: Association strength forwarded to ``readSimulatedData``;
            its return value later selects the output folder in ``saveMetrics``.
        syntheticDataDependency: Selector forwarded to ``readSyntheticData``.
    """
    #Lists used in functions
    # NOTE(review): `rows` and `missings` are not referenced again in this
    # function; the helper calls below pass literal string lists instead.
    rows = [10000, 25000, 50000]
    missings = [0, 10, 20]
    # Dataset labels written to the `df_type` column of the score tables.
    df_type = ['10000 row 0 missing','10000 row 10 missing','10000 row 20 missing',
             '25000 row 0 missing','25000 row 10 missing','25000 row 20 missing',
             '50000 row 0 missing','50000 row 10 missing','50000 row 20 missing']
    
    
    # Row labels of the correlation / Cramer's V / MAE tables.
    row_names = ["10000 rows - 0% Missing", "10000 rows - 10% Missing", "10000 rows - 20% Missing",
                "25000 rows - 0% Missing", "25000 rows - 10% Missing", "25000 rows - 20% Missing",
                "50000 rows - 0% Missing", "50000 rows - 10% Missing", "50000 rows - 20% Missing"]
    
    #Reading in simulated data and synthetic data
    # Both readers store the loaded frames as module-level variables.
    dependence=readSimulatedData(simulatedDataPath,dependency,parent_dir)
    readSyntheticData(syntheticDataDependency,parent_dir)
    
    #Creating lists for synthetic and simulated data
    # These calls create the module-level lists (data_10000, GC_10000_data, ...) used just below.
    simulated_lists(["10000","25000","50000"],["0","10","20"])
    synthesizer_lists(["GC","CTGAN","TVAE"],["10000","25000","50000"],["0","10","20"])

    # Flatten to one list per data source, ordered by row count and then by missing percentage.
    Simulated_data=mergeList(data_10000, data_25000, data_50000)
    GC_data=mergeList(GC_10000_data,GC_25000_data,GC_50000_data)
    CTGAN_data=mergeList(CTGAN_10000_data,CTGAN_25000_data,CTGAN_50000_data)
    TVAE_data=mergeList(TVAE_10000_data,TVAE_25000_data,TVAE_50000_data)

    #Creating metadata for column shape and marginal shape scores
    # Metadata is detected before preprocessing drops incomplete rows.
    metadata=createMetadata(Simulated_data)
    
    #Preprocessing data to categorical
    # In place: removes rows with missing values and casts the categorical columns.
    preprocessing(Simulated_data)
    preprocessing(GC_data)
    preprocessing(CTGAN_data)
    preprocessing(TVAE_data)
    
    all_dataset={
        'Simulated':Simulated_data,
        'GC':GC_data,
        'CTGAN':CTGAN_data,
        'TVAE':TVAE_data,
        'Metadata':metadata,
    }

    #Running column score and marginal score methods
    # Each synthesizer is compared against the simulated data at the same list position.
    gc_report_list=report_loop(all_dataset['Simulated'],all_dataset['GC'],all_dataset['Metadata'])
    ctgan_report_list=report_loop(all_dataset['Simulated'],all_dataset['CTGAN'],all_dataset['Metadata'])
    tvae_report_list=report_loop(all_dataset['Simulated'],all_dataset['TVAE'],all_dataset['Metadata'])

    #Extracting column metrics
    gc_columnsScores=get_column_metrics(gc_report_list,"GC",df_type)
    ctgan_columnsScores=get_column_metrics(ctgan_report_list,"CTGAN",df_type)
    tvae_columnsScores=get_column_metrics(tvae_report_list,"TVAE",df_type)
    columnScores=pd.concat([gc_columnsScores,ctgan_columnsScores,tvae_columnsScores],axis=0,ignore_index=True)

    #Extracting marginal distribution metrics
    gc_marginalScores=get_marginal_metrics(gc_report_list,"GC",df_type)
    ctgan_marginalScores=get_marginal_metrics(ctgan_report_list,"CTGAN",df_type)
    tvae_marginalScores=get_marginal_metrics(tvae_report_list,"TVAE",df_type)
    marginalScores=pd.concat([gc_marginalScores,ctgan_marginalScores,tvae_marginalScores],axis=0,ignore_index=True)

    #Preprocessing age feature
    # Runs after the reports, so the `age` column is not part of the frames evaluated above.
    preprocessing_age(Simulated_data) 
    preprocessing_age(GC_data)
    preprocessing_age(CTGAN_data)
    preprocessing_age(TVAE_data)

    # Running loops for correlation, cramer's v and random forests
    corr=correlationLoop(Simulated_data,GC_data,CTGAN_data,TVAE_data,row_names)
    cramers=cramersLoop(Simulated_data,GC_data,CTGAN_data,TVAE_data,row_names)
    mae=rfLoop(Simulated_data,GC_data,CTGAN_data,TVAE_data,row_names)

    #Extracting results for the simulated dataset
    # Double brackets keep each selection a one-column DataFrame.
    simulated_corr=corr[['Simulated']]
    simulated_cramers=cramers[['Simulated']]
    simulated_mae=mae[['Simulated']]
    
    #Saving results
    saveMetrics(dependence,marginalScores,columnScores,cramers,corr,mae,simulated_cramers,simulated_corr,simulated_mae)


In [None]:
# Resolve the current working directory; the project data lives under its parent.
script_dir = pathlib.Path().resolve()
os.chdir(script_dir)
parent_dir=script_dir.parent

#Running main with different association strengths
# Each entry: (simulated-data folder, association strength, synthetic-data selector).
run_configurations = (
    ("Simulated Data","simulated","Synthetic Data"),
    ("Simulating Data/Dependency/DataWithRelations","high","high"),
    ("Simulating Data/Dependency/DataModerateRelations","moderate","moderate"),
    ("Simulating Data/Dependency/DataNoRelations","low","low"),
)
for run_arguments in run_configurations:
    main(*run_arguments)
