In [1]:
import pandas as pd
import re

In [2]:
# Reading in the metric CSVs for one dependency type (each association strength has its own folders)
def readData(dependency):
    """Load the multi-iteration and simulated metric tables for one dependency setting.

    Parameters
    ----------
    dependency : str
        "simulated", "high" or "moderate" select their own folders; any other
        value (including "low") falls back to the Low folders.

    Returns
    -------
    tuple
        ``(cramers_df, mae_df, corr_df, cramers_simulated, mae_simulated,
        corr_simulated, corr_simulated, dependency)``.  ``corr_simulated``
        appears twice on purpose: ``main`` unpacks eight values, so the shape of
        the tuple is kept as is.  ``dependency`` is passed through unchanged.

    Raises
    ------
    Whatever ``pd.read_csv`` raises when one of the six files is missing or
    unreadable.
    """
    # Both folder prefixes end with '/', so file names are appended directly.
    if dependency == "simulated":
        multiple_iters_path = ''
        simulated_path = '../'
    else:
        if dependency == "high":
            level = 'High'
        elif dependency == "moderate":
            level = 'Moderate'
        else:
            # Fallback: every unrecognised value is treated as the low strength.
            level = 'Low'
        multiple_iters_path = f'../../Simulating Data/Dependency/Synthetic Data Multiple Iterations/{level}/'
        simulated_path = f'../../Simulating Data/Dependency/Dependency Synthetic Data/{level}/'

    # Metrics gathered over multiple synthetic datasets: Cramers, MAE, correlation
    cramers_df = pd.read_csv(f'{multiple_iters_path}cramers_multiple_iters.csv')
    mae_df = pd.read_csv(f'{multiple_iters_path}mae_multiple_iters.csv')
    corr_df = pd.read_csv(f'{multiple_iters_path}corr_multiple_iters.csv')

    # The same three metrics for the simulated dataset
    cramers_simulated = pd.read_csv(f'{simulated_path}simulated_cramers.csv')
    mae_simulated = pd.read_csv(f'{simulated_path}simulated_MAE.csv')
    corr_simulated = pd.read_csv(f'{simulated_path}simulated_corr.csv')

    return (
        cramers_df,
        mae_df,
        corr_df,
        cramers_simulated,
        mae_simulated,
        corr_simulated,
        corr_simulated,
        dependency,
    )

In [3]:
# Writing the joined metric tables to disk so they can be visualized in Tableau
def save_metrics(dependency,cramers,mae,corr):
    """Write the three joined metric tables as CSV files for one dependency type.

    ``dependency`` picks the output folder: "simulated" writes relative to the
    current directory, "high" and "moderate" use their own folders, and any
    other value uses the Low folder.  Each table is written with
    ``index=False`` to ``<folder><metric>_joined1.csv``.  Returns ``None``.
    """
    # Ordered (label, folder) pairs; the first label equal to `dependency` wins.
    known_folders = (
        ("simulated", ''),
        ("high", '../../Simulating Data/Dependency/Synthetic Data Multiple Iterations/High/'),
        ("moderate", '../../Simulating Data/Dependency/Synthetic Data Multiple Iterations/Moderate/'),
    )
    fallback_folder = '../../Simulating Data/Dependency/Synthetic Data Multiple Iterations/Low/'
    save_path = next(
        (folder for label, folder in known_folders if dependency == label),
        fallback_folder,
    )

    # Same write order as before: Cramers, then MAE, then correlation.
    outputs = (
        (cramers, 'cramers_joined1.csv'),
        (mae, 'mae_joined1.csv'),
        (corr, 'corr_joined1.csv'),
    )
    for table, file_name in outputs:
        table.to_csv(f'{save_path}{file_name}', index=False)

In [4]:
# Helpers used by main(): one small step each, shared by the three metrics
def _standardise_simulated(simulated):
    """Return a new two-column frame with columns named ['variable', 'Simulated'].

    The frame passed in is left untouched.  Raises ``ValueError`` when it does
    not have exactly two columns.
    """
    return simulated.set_axis(['variable', 'Simulated'], axis=1)


def _simulated_merge_keys(labels):
    """Turn '<rows part>-<missing part>' labels into '<rows>_<missing>' keys.

    The first run of digits is taken from each of the first two '-'-separated
    parts.  A part without digits gives a missing key (NaN), which can never
    match in the later inner merge.
    """
    parts = labels.str.split('-')
    row_digits = parts.str[0].str.extract(r'(\d+)', expand=False)
    missing_digits = parts.str[1].str.extract(r'(\d+)', expand=False)
    return (row_digits + '_' + missing_digits).rename('Variable')


def _join_with_simulated(iterations, keys, simulated):
    """Melt one metric's multi-iteration table and attach its simulated value.

    ``keys`` and ``simulated['Simulated']`` are paired by row index.  The
    melted rows are matched on the part of the column name after the first
    '_'; rows without a matching key are dropped (inner merge).
    """
    reference = pd.concat([keys, simulated['Simulated']], axis=1)

    long_form = pd.melt(iterations)
    # Everything after the first '_' of the column name, e.g. 'a_100_10' -> '100_10'
    long_form['type'] = long_form['variable'].map(lambda name: name.partition('_')[2])

    joined = long_form.merge(reference, how='inner', left_on='type', right_on='Variable')
    return joined[['variable', 'value', 'Simulated']]


# Main function for merging files based off different association strengths
def main(dependence):
    """Join the multi-iteration metrics with the simulated metrics and save them.

    Reads the six tables through ``readData(dependence)``, builds a
    '<rows>_<missing>' key from the simulated correlation table's labels, joins
    each melted multi-iteration table with its simulated counterpart on that
    key and hands the three results to ``save_metrics``.  Returns ``None``.

    NOTE(review): the key is derived from the correlation table only and paired
    with the Cramers and MAE tables by row position, so the three simulated
    files are assumed to list their variables in the same order -- confirm.
    """
    # readData returns the simulated correlation table in two consecutive slots;
    # as before, the later slot is the one that is used.
    (cramers_df, mae_df, corr_df,
     cramers_simulated, mae_simulated, _, corr_simulated,
     dependency) = readData(dependence)

    # Renaming columns on new frames so the tables from readData are not modified
    cramers_simulated = _standardise_simulated(cramers_simulated)
    mae_simulated = _standardise_simulated(mae_simulated)
    corr_simulated = _standardise_simulated(corr_simulated)

    # Getting the '<rows>_<missing>' key from the variable feature
    keys = _simulated_merge_keys(corr_simulated['variable'])

    # Melting, merging on the key and keeping only the relevant columns
    cramers = _join_with_simulated(cramers_df, keys, cramers_simulated)
    mae = _join_with_simulated(mae_df, keys, mae_simulated)
    corr = _join_with_simulated(corr_df, keys, corr_simulated)

    save_metrics(dependency, cramers, mae, corr)

In [None]:
# Run the merge for every dependency type, in the same order as before.
for dependency_level in ("simulated", "high", "moderate", "low"):
    main(dependency_level)