In [3]:
import pandas as pd
import os

In [4]:
# Input/output locations, resolved relative to the notebook's working directory
base_dir = os.getcwd()
# NOTE(review): 'Data_analysis' (input) and 'Data_Analysis' (output) differ only in
# letter case; on a case-sensitive filesystem they are two different folders.
# Confirm which spelling is intended.
dir_path = os.path.join(base_dir, '..', 'Data_analysis', 'generatedData-CFA')
outdir = os.path.join(base_dir, '..', 'Data_Analysis', 'CFA')

# exist_ok=True makes this a no-op when the folder already exists and avoids the
# check-then-create race of a separate os.path.exists() test.
os.makedirs(outdir, exist_ok=True)

# List of fit indices we want to report in the paper
most_important_indices = [
    'chisq',
    'pvalue',
    'cfi',
    'tli',
    'srmr',
    'rmsea',
]

# Model fit indices

In [5]:
file_pattern = 'fit_indices-Model'  # filename prefix of the per-model fit-index CSVs

# One DataFrame per matching CSV file
dfs = []

# os.listdir() returns entries in arbitrary order; sorting keeps the column (model)
# order of result_df reproducible from run to run.
for file in sorted(os.listdir(dir_path)):
    if file.startswith(file_pattern) and file.endswith('.csv'):
        # The first CSV column is read as "Unnamed: 0"; rename it and use it as
        # the row index.
        df = (
            pd.read_csv(os.path.join(dir_path, file))
            .rename(columns={'Unnamed: 0': 'fit indices'})
            .set_index('fit indices')
        )
        dfs.append(df)

# pd.concat raises an opaque "No objects to concatenate" on an empty list; fail with
# a message that names the folder and the pattern instead.
if not dfs:
    raise ValueError(
        f"No files matching '{file_pattern}*.csv' were found in {dir_path}"
    )

# Concatenate all DataFrames side by side (one column block per file)
result_df = pd.concat(dfs, axis=1)

# Set display options for float formatting
pd.set_option('display.float_format', '{:.6f}'.format)

# Display the concatenated DataFrame
result_df


Unnamed: 0_level_0,Model_1
fit indices,Unnamed: 1_level_1
npar,234.0
fmin,0.135925
chisq,477.911645
df,228.0
pvalue,0.0
baseline.chisq,11601.827257
baseline.df,330.0
baseline.pvalue,0.0
cfi,0.977829
tli,0.96791


In [6]:

# Direction of each fit index: True means a larger value ranks better, False means a
# smaller value ranks better. rank_fit_indices() reads this with
# `.get(fit_index, True)`, so any index missing here is treated as higher-is-better.
# NOTE(review): several entries (e.g. 'ntotal', 'rmsea.ci.level', the '*.h0' values,
# the 'baseline.*' statistics) look like quantities that would not differ between
# models fitted to the same data; their direction is presumably arbitrary — confirm.
higher_is_better = {
    'npar': False,  # Lower is better (number of parameters)
    'fmin': False,  # Lower is better (minimum function value)
    'chisq': False,  # Lower is better (chi-square statistic)
    'df': True,  # Higher is better (degrees of freedom)
    'pvalue': True,  # Higher is better (p-value)
    'baseline.chisq': False,  # Lower is better (baseline chi-square)
    'baseline.df': True,  # Higher is better (baseline degrees of freedom)
    'baseline.pvalue': True,  # Higher is better (baseline p-value)
    'cfi': True,  # Higher is better (comparative fit index)
    'tli': True,  # Higher is better (Tucker-Lewis index)
    'cfi.robust': True,  # Higher is better (robust comparative fit index)
    'tli.robust': True,  # Higher is better (robust Tucker-Lewis index)
    'nnfi': True,  # Higher is better (non-normed fit index)
    'rfi': True,  # Higher is better (relative fit index)
    'nfi': True,  # Higher is better (normed fit index)
    'pnfi': True,  # Higher is better (parsimony normed fit index)
    'ifi': True,  # Higher is better (incremental fit index)
    'rni': True,  # Higher is better (relative noncentrality index)
    'nnfi.robust': True,  # Higher is better (robust non-normed fit index)
    'rni.robust': True,  # Higher is better (robust relative noncentrality index)
    'logl': True,  # Higher is better (log-likelihood)
    'unrestricted.logl': True,  # Higher is better (unrestricted log-likelihood)
    'aic': False,  # Lower is better (Akaike Information Criterion)
    'bic': False,  # Lower is better (Bayesian Information Criterion)
    'ntotal': True,  # Higher is better (total number of observations)
    'bic2': False,  # Lower is better (alternative Bayesian Information Criterion)
    'rmsea': False,  # Lower is better (Root Mean Square Error of Approximation)
    'rmsea.ci.lower': False,  # Lower is better (RMSEA lower confidence interval)
    'rmsea.ci.upper': False,  # Lower is better (RMSEA upper confidence interval)
    'rmsea.ci.level': True,  # Higher is better (RMSEA confidence interval level)
    'rmsea.pvalue': True,  # Higher is better (RMSEA p-value)
    'rmsea.close.h0': True,  # Higher is better (RMSEA close fit null hypothesis)
    'rmsea.notclose.pvalue': True,  # Higher is better (RMSEA not close fit p-value) — NOTE(review): for a not-close-fit test a small p-value may be the favourable outcome; confirm direction
    'rmsea.notclose.h0': True,  # Higher is better (RMSEA not close fit null hypothesis)
    'rmsea.robust': False,  # Lower is better (robust RMSEA)
    'rmsea.ci.lower.robust': False,  # Lower is better (robust RMSEA lower confidence interval)
    'rmsea.ci.upper.robust': False,  # Lower is better (robust RMSEA upper confidence interval)
    'rmsea.pvalue.robust': True,  # Higher is better (robust RMSEA p-value)
    'rmsea.notclose.pvalue.robust': True,  # Higher is better (robust RMSEA not close fit p-value) — NOTE(review): same direction question as 'rmsea.notclose.pvalue'
    'rmr': False,  # Lower is better (Root Mean Square Residual)
    'rmr_nomean': False,  # Lower is better (Root Mean Square Residual without mean)
    'srmr': False,  # Lower is better (Standardized Root Mean Square Residual)
    'srmr_bentler': False,  # Lower is better (Bentler variant of SRMR)
    'srmr_bentler_nomean': False,  # Lower is better (Bentler variant of SRMR without mean)
    'crmr': False,  # Lower is better (CRMR; NOTE(review): presumably the correlation root mean square residual rather than "categorical" — confirm)
    'crmr_nomean': False,  # Lower is better (CRMR without mean)
    'srmr_mplus': False,  # Lower is better (Mplus version of SRMR)
    'srmr_mplus_nomean': False,  # Lower is better (Mplus version of SRMR without mean)
    'cn_05': True,  # Higher is better (critical N (p < 0.05))
    'cn_01': True,  # Higher is better (critical N (p < 0.01))
    'gfi': True,  # Higher is better (Goodness of Fit Index)
    'agfi': True,  # Higher is better (Adjusted Goodness of Fit Index)
    'pgfi': True,  # Higher is better (Parsimony Goodness of Fit Index)
    'mfi': True,  # Higher is better (McDonald's Fit Index)
    'ecvi': False  # Lower is better (Expected Cross-Validation Index)
}

def rank_fit_indices(df, higher_is_better, most_important_indices):
    """Rank the models on every fit index and extract the indices to report.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per fit index and one column per model.
    higher_is_better : dict
        Maps a fit-index name to True when larger values are better and to
        False when smaller values are better. Indices missing from the mapping
        are treated as higher-is-better.
    most_important_indices : list
        Row labels of ``df`` to keep in the three ``summary_*`` outputs.

    Returns
    -------
    tuple
        ``(summary_df, rank_df, summary_rank_df, best_df, summary_best_df)``:

        * ``summary_df`` -- rows of ``df`` listed in ``most_important_indices``.
        * ``rank_df`` -- integer rank of every model on every fit index
          (1 = best); same shape as ``df``.
        * ``summary_rank_df`` -- rows of ``rank_df`` for the important indices.
        * ``best_df`` -- single column ``'Most_Important_Ranking'`` holding the
          name of the best-ranked model for each fit index (the first column
          wins when several models share rank 1).
        * ``summary_best_df`` -- rows of ``best_df`` for the important indices.

    Raises
    ------
    KeyError
        If a label in ``most_important_indices`` is not a row of ``df``.
    """
    missing = [name for name in most_important_indices if name not in df.index]
    if missing:
        raise KeyError(f'Fit indices not found in df: {missing}')

    # Create an empty DataFrame to store the rankings
    rank_df = pd.DataFrame(index=df.index, columns=df.columns)

    # One row per fit index: the name of the best-ranked model
    best_df = pd.DataFrame(index=df.index, columns=['Most_Important_Ranking'])

    for fit_index in df.index:
        # method='min' gives tied models the same (best) whole-number rank; the
        # default 'average' yields fractional ranks that int() would truncate
        # (three models tied for first would all become rank 2).
        # na_option='bottom' ranks missing values last instead of producing NaN,
        # which cannot be cast to int.
        ranks = df.loc[fit_index].rank(
            ascending=not higher_is_better.get(fit_index, True),
            method='min',
            na_option='bottom',
        ).astype(int)

        rank_df.loc[fit_index] = ranks
        best_df.at[fit_index, 'Most_Important_Ranking'] = ranks.idxmin()

    # The summaries do not depend on the loop variable; building them once after
    # the loop also keeps them defined when df has no rows.
    summary_df = df.loc[most_important_indices]
    summary_rank_df = rank_df.loc[most_important_indices]
    summary_best_df = best_df.loc[most_important_indices]

    return summary_df, rank_df, summary_rank_df, best_df, summary_best_df

# Rank every model in result_df on each fit index, then unpack the full tables and
# the subsets restricted to most_important_indices.
(
    summary_df,
    rank_df,
    summary_rank_df,
    best_df,
    summary_best_df,
) = rank_fit_indices(
    result_df,
    higher_is_better,
    most_important_indices,
)

summary_rank_df

Unnamed: 0_level_0,Model_1
fit indices,Unnamed: 1_level_1
chisq,1
pvalue,1
cfi,1
tli,1
srmr,1
rmsea,1


In [7]:
# Write the reported fit indices (three decimals) as CSV and as a LaTeX table,
# then display them.
summary_df.to_csv(
    f'{outdir}/models_fit_comparison.csv',
    float_format='%.3f',
)
summary_df.to_latex(
    f'{outdir}/models_fit_comparison.tex',
    float_format='%.3f',
)
summary_df

Unnamed: 0_level_0,Model_1
fit indices,Unnamed: 1_level_1
chisq,477.911645
pvalue,0.0
cfi,0.977829
tli,0.96791
srmr,0.034637
rmsea,0.061163


In [8]:
# Write the per-model ranks of the reported fit indices as CSV, then display them.
summary_rank_df.to_csv(f'{outdir}/models_fit_ranking_comparison.csv')
summary_rank_df

Unnamed: 0_level_0,Model_1
fit indices,Unnamed: 1_level_1
chisq,1
pvalue,1
cfi,1
tli,1
srmr,1
rmsea,1


# Reliability indicators

In [9]:
file_pattern = '-reliability.csv'  # filename suffix of the per-scale reliability CSVs
reliab_dir = f'{dir_path}/reliability'
input_csv_dir = f'{reliab_dir}/to aggregate/'

# Tokens that may appear, separated by '-', in a reliability file name
conditions = ['A','B','C','D','E','F','Full_survey']

scales = ['understand','layout','dataRead','dataFeat']

models = ['Model_1','Model_2','Model_3','Model_final', 'full_data']

# Nested dict holding the loaded tables: dfs[model][scale][condition] -> DataFrame
dfs = {model: {scale: {} for scale in scales} for model in models}

# Sorted so that files are processed in a reproducible order
for file in sorted(os.listdir(input_csv_dir)):
    if not file.endswith(file_pattern):
        continue

    # Split the name on '-', drop the trailing "reliability.csv" and strip spaces
    parts = [part.strip() for part in file.split('-')[:-1]]

    # Reset for every file: otherwise a name without a scale or model token would
    # silently reuse the value left over from the previously processed file.
    this_condition = 'Full_survey'  # if no condition is found, it will default to this
    this_scale = None
    this_model = None
    for part in parts:
        if part in conditions:
            this_condition = part
        elif part in scales:
            this_scale = part
        elif part in models:
            this_model = part
        else:
            print('Found a part that does not fit: ', part)

    if this_scale is None or this_model is None:
        # Without both tokens the table cannot be filed; report it and move on
        print(f'Skipping {file}: no known scale and/or model in the file name')
        continue

    # Load the CSV and clean it: drop the unnamed first column, transpose so each
    # reliability statistic becomes a row, and name the value column after the
    # condition.
    df = (
        pd.read_csv(os.path.join(input_csv_dir, file))
        .drop(columns=['Unnamed: 0'])
        .T
        .reset_index()
        .rename(columns={
            0: this_condition,
            'index': 'reliability_values'})
    )
    df['Subscale'] = this_scale

    # Add the df at the correct location in the dict
    dfs[this_model][this_scale][this_condition] = df

# Concatenate the tables of each model and export one full and one short table
model_dfs_dict = {}

# Reliability statistics kept in the short (LaTeX/CSV) export
reliability_values_to_keep = ['raw_alpha', 'std.alpha', 'omega_tot']

for model in dfs.keys():
    # Flatten dfs[model][scale][condition] into a flat list of tables
    model_dfs = [
        condition_df
        for scale_dfs in dfs[model].values()
        for condition_df in scale_dfs.values()
    ]

    if not model_dfs:
        # pd.concat raises on an empty list; a model without any file is skipped
        print(f'No reliability files found for {model}; skipping')
        continue

    # Stack all tables, then collapse them to one row per (statistic, subscale)
    # with one column per condition; 'first' takes the first value of each group.
    this_model_df = pd.concat(model_dfs, axis=0)
    this_model_df = this_model_df.pivot_table(index=['reliability_values', 'Subscale'], aggfunc='first')
    model_dfs_dict.update({model:this_model_df})
    this_model_df.to_csv(f'{reliab_dir}/{model}-reliabilities.csv')

    # keep only 'raw_alpha', 'std.alpha' and 'omega_tot' for the short export
    filtered_df = this_model_df[this_model_df.index.get_level_values('reliability_values').isin(reliability_values_to_keep)]
    # The statistic names are labels of the 'reliability_values' index level, not
    # column names, so they must be renamed on the index; rename(columns=...)
    # leaves them unchanged.
    filtered_df = filtered_df.rename(
        index={
            'raw_alpha':'raw alpha',
            'std.alpha':'std alpha',
            'omega_tot':'omega'
        },
        level='reliability_values',
    )
    filtered_df.to_latex(f'{reliab_dir}/{model}-reliabilities_short.tex', float_format='%.3f')
    filtered_df.to_csv(f'{reliab_dir}/{model}-reliabilities_short.csv', float_format='%.3f')
    

In [10]:
# filtered_df is reassigned on every pass of the per-model export loop, so this
# displays the short reliability table of the last model that loop processed.
filtered_df

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B,C,D,E,F,Full_survey
reliability_values,Subscale,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
omega_tot,dataFeat,0.870751,0.825889,0.843155,0.857878,0.811365,0.821392,0.879137
omega_tot,dataRead,0.86361,0.840205,0.85707,0.871988,0.889985,0.904232,0.925277
omega_tot,layout,0.825502,0.871456,0.858891,0.810676,0.832087,0.854564,0.908348
omega_tot,understand,0.919526,0.890903,0.913925,0.909551,0.933888,0.917588,0.951582
raw_alpha,dataFeat,0.868022,0.824137,0.835244,0.85863,0.81547,0.821552,0.878679
raw_alpha,dataRead,0.845049,0.836954,0.849643,0.859784,0.887835,0.902645,0.923072
raw_alpha,layout,0.816905,0.865409,0.852168,0.809268,0.829327,0.84016,0.907326
raw_alpha,understand,0.917422,0.882783,0.911682,0.909207,0.932787,0.916573,0.95018
std.alpha,dataFeat,0.868271,0.824821,0.835264,0.858766,0.81594,0.822566,0.878833
std.alpha,dataRead,0.86232,0.837913,0.854198,0.864872,0.886993,0.900176,0.922292


In [11]:
# Display the full (unfiltered) reliability table stored for 'Model_3'
model_dfs_dict['Model_3']

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B,C,D,E,F,Full_survey
reliability_values,Subscale,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
G6(smc),dataFeat,0.767208,0.701868,0.717127,0.752488,0.689103,0.69861,0.783855
G6(smc),dataRead,0.766777,0.729556,0.75087,0.817293,0.822413,0.86314,0.886897
G6(smc),layout,0.725136,0.792476,0.793379,0.716023,0.757041,0.821216,0.855835
G6(smc),understand,0.817933,0.810138,0.784006,0.775674,0.879591,0.790905,0.888693
S/N,dataFeat,6.591358,4.708442,5.070313,6.080427,4.433001,4.635911,7.253056
S/N,dataRead,4.731427,3.845678,4.465238,6.678062,6.551389,9.390447,11.491548
S/N,layout,3.424663,5.67187,5.125963,3.741193,4.379473,6.124619,8.223021
S/N,understand,6.579137,5.800045,4.894248,5.164509,10.655853,5.576338,11.841059
ase,dataFeat,0.015535,0.020668,0.019449,0.016074,0.021521,0.020765,0.005782
ase,dataRead,0.018417,0.021552,0.018723,0.012848,0.013298,0.009783,0.003325
