In [1]:
import pandas as pd
from collections import Counter
import math
import copy
import os

# Set variables

In [2]:
# Root output folder, and the sub-folder the EFA loading files are read from / written to
main_dir = '../Results/Data_Analysis/'
dir = main_dir + 'generatedData-EFA/'

# Condition labels; one EFA file is loaded per condition
conditions = ['A', 'B', 'C', 'D', 'E', 'F']

# Correlation threshold
corr_threshold = 0.5

# Factor loadings threshold: loadings must be strictly above it to be kept
loading_threshold = 0.32

# Item labels, in alphabetical order
items = [
    'answer', 'clearData', 'clearRepresent', 'complex', 'confid',
    'confus', 'crowd', 'deciph', 'distinguish', 'distract',
    'effect', 'find', 'identifi', 'inform', 'lost',
    'meanElem', 'meanOveral', 'messi', 'obvious', 'organiz',
    'read', 'readabl', 'represent', 'see', 'simpl',
    'understandEasi', 'understandQuick', 'valu', 'visibl',
]

# Utils

In [3]:
# utils

#to load efa files - example of filename to match: dir/A/PA1 Stimuli 1 factors - A.csv
def import_EFA_file(stimuli, factors = 1, efa_method = "PA" ):
    """Load one EFA loadings CSV and return it indexed by item.

    The file is read from
    ``<dir><stimuli>/<efa_method>1 Stimuli <factors> factors - <stimuli>.csv``
    where ``dir`` is the module-level data folder.

    Parameters
    ----------
    stimuli : str
        Condition label. Used as sub-folder name, as file-name suffix, and
        appended to every loading column name.
    factors : int or str, default 1
        Number of factors that appears in the file name. Both ``1`` and
        ``'1'`` are accepted.
    efa_method : str, default "PA"
        Method prefix that appears in the file name.

    Returns
    -------
    pandas.DataFrame
        The file content. If it has an ``items`` or ``terms`` column, that
        column becomes the index under the name ``item``. Every remaining
        column is renamed to ``'<column> <stimuli>'`` and the rows are sorted
        in ascending order of the first column.
    """
    # An f-string (rather than ''.join) so that a numeric `factors` works too
    filename = f'{dir}{stimuli}/{efa_method}1 Stimuli {factors} factors - {stimuli}.csv'
    this_df = pd.read_csv(filename)

    # The label column is called either 'items' or 'terms' depending on the file
    for label_col in ('items', 'terms'):
        if label_col in this_df.columns:
            this_df = this_df.rename(columns={label_col: 'item'}).set_index('item')
            break

    # Tag every column with the condition so tables can be concatenated later
    this_df = this_df.rename(columns={col: f'{col} {stimuli}' for col in this_df.columns})

    # Sort once, by the first column (nothing to sort by if there are no columns)
    if len(this_df.columns) > 0:
        this_df = this_df.sort_values(by=this_df.columns[0], axis=0)
    return this_df

In [4]:
#This function will take the factor loadings
## remove loadings below the threshold
## for multiple factors, it will check cross loadings
## rank items within each factor

def set_of_observation(this_df, condition='Agg', efa_method='PA', fa_str='1'):
    """Threshold, de-duplicate and rank the factor loadings of one EFA solution.

    Steps:
      1. For every column of ``this_df``, keep the items whose loading is
         strictly greater than the module-level ``loading_threshold`` (the raw
         value is compared; no absolute value is taken) and record each kept
         item's 0-based position in descending order of loading.
      2. When more than one factor was extracted, count for each item on how
         many additional factors it is above the threshold (cross-loadings)
         and keep its rank only in the factor where its loading is highest.
         If several factors share that exact maximum, a message is printed
         and the item's positional ranks are left as they are.
      3. Re-rank the flagged items within each factor (1-based, ties get the
         lowest rank) and build a single running order across factors.

    Parameters
    ----------
    this_df : pandas.DataFrame
        Loadings with an index named ``item`` and one column per factor named
        ``'<efa_method><k> <condition>'``. It is not modified.
    condition : str, default 'Agg'
        Condition label used in the loading column names and in the names of
        the generated columns.
    efa_method : str, default 'PA'
        Method prefix used in the loading column names.
    fa_str : str, default '1'
        Number of factors as text; only the first character is read, so
        ``'4'`` and ``'4ML'`` both mean four factors.

    Returns
    -------
    tuple
        ``(condition, table)`` where ``table`` is indexed by item and holds,
        for each factor, a ``'factor rank in <column>'`` column and the
        loading column, plus ``'Order in <condition>'`` and, for more than one
        factor, a leading ``'Cross-loadings in <condition>'`` column.
    """
    # One small table per factor: items above the threshold and their position
    per_factor_tables = []
    for col in this_df.columns:
        # Sort a copy so the caller's frame keeps its own row order
        by_loading = this_df.sort_values(by=col, axis=0, ascending=False)
        kept = by_loading.loc[by_loading[col] > loading_threshold, col]
        this_factor = (
            kept.reset_index()      # columns: item, loading
                .reset_index()      # adds 'index' = 0-based position after sorting
                .set_index('item')
                .rename(columns={'index': f'factor rank in {col}'})
        )
        per_factor_tables.append(this_factor)

    # Outer-aligned on item: NaN where an item is not above the threshold for a factor
    this_c_factor = pd.concat(per_factor_tables, axis=1)

    fa_nb = int(fa_str[0])

    # With more than one factor, count cross-loadings (item above the threshold
    # more than once) and keep each item only in its strongest factor
    if fa_nb > 1:
        loading_cols = [col for col in this_c_factor if 'rank' not in col]
        rank_cols = [col for col in this_c_factor if 'rank' in col]

        # Rank columns temporarily hold True / pd.NA markers next to numbers,
        # so make them object columns up front instead of relying on implicit upcasting
        this_c_factor = this_c_factor.astype({col: 'object' for col in rank_cols})

        crossloadings_dict = {}
        for item, row_slice in this_c_factor[loading_cols].iterrows():
            # NaN marks "not above the threshold", so non-NaN count minus one = cross-loadings
            crossloadings_dict[item] = len(row_slice.dropna()) - 1

            # Only loading columns are searched, so a rank value can never be
            # mistaken for the maximum loading
            max_value = row_slice.max()
            columns_with_value = row_slice.index[row_slice == max_value]

            if len(columns_with_value) == 1:
                item_factor = columns_with_value[0]
                # True flags the item for re-ranking in its strongest factor
                this_c_factor.at[item, f'factor rank in {item_factor}'] = True
                for col in rank_cols:
                    if item_factor not in col:
                        this_c_factor.at[item, col] = pd.NA
            elif len(columns_with_value) > 1:
                print(f"It's a tie between factors {columns_with_value} for item {item}")
            else:
                print(f'Something is wrong with item {item}')

        # Cross-loading counts go in as the first column
        this_c_factor.insert(0,
                             f'Cross-loadings in {condition}',
                             this_c_factor.index.map(crossloadings_dict))

    # Re-rank, within each factor, the rows flagged above.
    # NOTE(review): with a single factor nothing is flagged, so the ranks stay
    # 0-based positions, while multi-factor ranks come out 1-based. The row
    # whose position is 1 also compares equal to True, which is harmless
    # because it is re-ranked alone to 1. Confirm whether the differing bases
    # are intended before comparing ranks across solutions.
    cols_Factors = [f'{efa_method}{n} {condition}' for n in range(1, fa_nb + 1)]
    for fa_col in cols_Factors:
        rank_col = f'factor rank in {fa_col}'
        factor_rows = this_c_factor[this_c_factor[rank_col] == True]
        this_c_factor.loc[factor_rows.index, rank_col] = factor_rows[fa_col].rank(ascending=False, method='min')

    # Running order across factors: factor 1 items first, then factor 2 items, ...
    global_rank_col = f'Order in {condition}'
    this_c_factor[global_rank_col] = pd.NA
    global_rank_counter = 1
    for fa_col in cols_Factors:
        rank_col = f'factor rank in {fa_col}'

        # Items ranked in this factor get offset + rank - 1; others keep their value
        this_c_factor[global_rank_col] = this_c_factor.apply(
            lambda row: global_rank_counter + row[rank_col] - 1
            if not pd.isnull(row[rank_col]) else row[global_rank_col],
            axis=1)

        # The next factor starts after all items ranked in this one
        global_rank_counter += len(this_c_factor) - this_c_factor[rank_col].isnull().sum()

    # Return the label that was passed in rather than a notebook-level loop variable
    return (condition, this_c_factor)

# just to make certain columns look prettier (otherwise int were converted to float because of the presence of NaNs)
def cols_to_int(df, list_of_cols, nan_value = 'NaN'):
    """Show the given columns as integers, with a placeholder for missing values.

    Each listed column is converted to the nullable ``Int64`` dtype, then to
    ``object`` so that missing entries can be replaced by ``nan_value``.
    The frame is modified in place and also returned.
    """
    for name in list_of_cols:
        nullable_ints = df[name].astype('Int64')   # Int64 can hold missing values
        as_objects = nullable_ints.astype('object')  # object lets ints and text share a column
        df[name] = as_objects.fillna(nan_value)
    return df

# Individual stimulus EFA

In [5]:
# Load the EFA loadings of every condition, for each number of extracted factors
EFA_df_dict = {}
factors_numbers = ['1','2','3','4']
for n in factors_numbers:
    EFA_df_dict[n] = {}
    for c in conditions:
        EFA_df_dict[n][c] = import_EFA_file(c, factors = n)

# One combined table per number of factors
fa_loadings_dict = {}

# Process and concatenate the EFA results of each stimulus, for each number of factors
for fa in factors_numbers:
    this_fa_factor = {}
    fa_nb = int(fa)

    for c in EFA_df_dict[fa].keys(): # one EFA per condition (= stimulus) with this number of factors
        this_df = EFA_df_dict[fa][c]
        processed_c_df = set_of_observation(this_df, condition=c, fa_str=fa) # (label, table), see utils
        this_fa_factor[processed_c_df[0]] = processed_c_df[1]

    # Put the per-condition tables side by side, aligned on item
    this_factor_loadings = pd.concat(list(this_fa_factor.values()), axis=1)

    # Show counts and orders as integers, with a text placeholder where missing
    crossloadings_cols = [col for col in this_factor_loadings if 'Cross' in col]
    this_factor_loadings = cols_to_int(this_factor_loadings, crossloadings_cols, nan_value='Never above threshold')
    order_cols = [col for col in this_factor_loadings if 'Order' in col]
    this_factor_loadings = cols_to_int(this_factor_loadings, order_cols, nan_value='Missing')

    # Organize columns by alphabetical order = order of factors
    all_cols = sorted(list(this_factor_loadings))
    this_factor_loadings = this_factor_loadings[all_cols]

    # Names of the per-factor summary columns created below
    average_cols = []         # average rank across conditions
    presence_count_cols = []  # number of conditions where the item is in the factor

    rank_cols = [col for col in all_cols if 'rank' in col]

    for PA_nb in range(1, fa_nb+1, 1): # iterate through the PA numbers

        this_PA_rank_cols = [col for col in rank_cols if f'PA{PA_nb}' in col] # ranks for this PA nb only

        # Information is stored in dicts and then transformed into columns
        this_PA_rank_averages = {}
        this_PA_presence_count = {}

        for item, row in this_factor_loadings.iterrows():
            relevant_row_slice = row[this_PA_rank_cols]
            if relevant_row_slice.isna().any(): # the item is missing from this factor in at least one condition
                average_rank = '' # then no average rank
            else: # the item is ranked in every condition: average its ranks
                average_rank = relevant_row_slice.mean()
            this_PA_rank_averages[item] = average_rank
            this_PA_presence_count[item] = len(relevant_row_slice.dropna())

        # Create the new columns from the dicts
        av_col_name = f'average rank in PA{PA_nb}'
        count_col_name = f'presence count PA{PA_nb}'
        average_cols.append(av_col_name)
        this_factor_loadings[av_col_name] = this_factor_loadings.index.map(this_PA_rank_averages)
        presence_count_cols.append(count_col_name)
        this_factor_loadings[count_col_name] = this_factor_loadings.index.map(this_PA_presence_count)

    # Final column layout: presence counts, global orders, cross-loadings and
    # loadings, average ranks, then per-condition ranks.
    # The order columns are named 'Order in <condition>' (capital O); they are
    # excluded from FL_cols so that no column is selected twice.
    order_cols = [col for col in all_cols if 'Order' in col]
    FL_cols = [col for col in all_cols if col not in rank_cols and col not in order_cols]
    new_cols = presence_count_cols + order_cols + FL_cols + average_cols + rank_cols
    this_factor_loadings = this_factor_loadings[new_cols]

    # Add that df to the dict of dfs
    fa_loadings_dict[fa] = this_factor_loadings

In [6]:
fa_loadings_dict['1']


Unnamed: 0_level_0,presence count PA1,Order in A,Order in B,Order in C,Order in D,Order in E,Order in F,PA1 A,PA1 B,PA1 C,PA1 D,PA1 E,PA1 F,average rank in PA1,factor rank in PA1 A,factor rank in PA1 B,factor rank in PA1 C,factor rank in PA1 D,factor rank in PA1 E,factor rank in PA1 F
item,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
represent,6,0,5,3,16,3,19,0.833902,0.788137,0.807013,0.7106,0.863986,0.701921,7.666667,0,5,3,16,3,19
understandEasi,6,1,0,4,14,0,9,0.832711,0.846734,0.80698,0.755662,0.892737,0.82691,4.666667,1,0,4,14,0,9
clearData,6,2,4,2,2,7,0,0.825475,0.807608,0.807661,0.826589,0.846426,0.866263,2.833333,2,4,2,2,7,0
identifi,6,3,12,11,6,9,3,0.818611,0.734023,0.755787,0.797509,0.826603,0.846472,7.333333,3,12,11,6,9,3
simpl,6,4,1,0,1,8,2,0.795004,0.83259,0.853366,0.830315,0.838049,0.851089,2.666667,4,1,0,1,8,2
readabl,6,5,10,10,8,10,8,0.793015,0.743467,0.760003,0.78729,0.825323,0.8292,8.5,5,10,10,8,10,8
effect,6,6,13,8,12,4,14,0.779712,0.729781,0.763479,0.765007,0.855466,0.777053,9.5,6,13,8,12,4,14
clearRepresent,6,7,3,6,3,1,4,0.77576,0.816268,0.786726,0.825861,0.891319,0.839215,4.0,7,3,6,3,1,4
read,6,8,2,1,0,13,1,0.769035,0.819623,0.827508,0.860136,0.819592,0.851841,4.166667,8,2,1,0,13,1
obvious,6,9,20,23,18,11,23,0.768906,0.649622,0.624845,0.67635,0.82502,0.645469,17.333333,9,20,23,18,11,23


In [7]:
# Write one combined loadings table per number of factors
for fa_nb, loadings_table in fa_loadings_dict.items():
    out_path = dir + 'All_stimuli_EFA_loadings-' + fa_nb + '_factors.csv'
    loadings_table.to_csv(out_path)

# Analysis for aggregated data EFA results

In [8]:
# Aggregated data: PA solutions for 1 to 5 factors, plus ML solutions for 3 to 5 factors
factors_numbers = ['1', '2', '3', '4', '5']
fa_nbs = range(1, 6)
agg_efa_dfs = {}
for nb in fa_nbs:
    n = str(nb)
    this_df = import_EFA_file('Agg', factors=n)
    agg_efa_dfs[n] = this_df
    if nb <= 2:
        continue
    # ML solutions are stored under '<n>ML'
    this_df = import_EFA_file('Agg', factors=n, efa_method='ML')
    agg_efa_dfs[n + 'ML'] = this_df


In [9]:
# Process every aggregated solution and export it next to the input files
agg_efa_dfs2 = {}
for n, this_df in agg_efa_dfs.items():
    # Keys are either '<k>' (PA) or '<k>ML' (ML)
    key_length = len(n)
    if key_length == 1:
        method = 'PA'
    elif key_length == 3:
        method = 'ML'
    df2 = set_of_observation(this_df, condition='Agg', fa_str=n, efa_method=method)[1]
    agg_efa_dfs2[n] = df2
    df2.to_csv(dir + 'Agg/Agg_EFA_loadings-' + n + '_factors.csv')

# Display the processed 4-factor PA solution
agg_efa_dfs2['4']


Unnamed: 0_level_0,Cross-loadings in Agg,factor rank in PA1 Agg,PA1 Agg,factor rank in PA2 Agg,PA2 Agg,factor rank in PA4 Agg,PA4 Agg,factor rank in PA3 Agg,PA3 Agg,Order in Agg
item,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
obvious,0,1.0,1.065821,,,,,,,1.0
meanOveral,0,2.0,0.897276,,,,,,,2.0
confid,0,3.0,0.889801,,,,,,,3.0
represent,0,4.0,0.885327,,,,,,,4.0
understandEasi,0,5.0,0.845155,,,,,,,5.0
understandQuick,0,6.0,0.769944,,,,,,,6.0
meanElem,0,7.0,0.586231,,,,,,,7.0
answer,1,8.0,0.564585,,,,0.381155,,,8.0
lost,1,9.0,0.533415,,0.361014,,,,,9.0
confus,1,10.0,0.517438,,0.406965,,,,,10.0


In [10]:
# Display the processed table stored under key '4ML' (aggregated data, 4 factors, ML files)
agg_efa_dfs2['4ML']

Unnamed: 0_level_0,Cross-loadings in Agg,factor rank in ML1 Agg,ML1 Agg,factor rank in ML4 Agg,ML4 Agg,factor rank in ML2 Agg,ML2 Agg,factor rank in ML3 Agg,ML3 Agg,Order in Agg
item,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
obvious,0,1.0,1.052303,,,,,,,1.0
meanOveral,0,2.0,0.887793,,,,,,,2.0
confid,0,3.0,0.886485,,,,,,,3.0
represent,0,4.0,0.880514,,,,,,,4.0
understandEasi,0,5.0,0.836848,,,,,,,5.0
understandQuick,0,6.0,0.767394,,,,,,,6.0
meanElem,0,7.0,0.579935,,,,,,,7.0
answer,1,8.0,0.540929,,0.40517,,,,,8.0
lost,1,9.0,0.533875,,,,0.363952,,,9.0
confus,1,10.0,0.519528,,,,0.411718,,,10.0
