In [1]:
import pandas as pd
import numpy as np
import copy
import os
from os.path import exists

## Variables

In [2]:
# Declare our variables
dir = '../Results'

#per stimuli out directory
outdir = f'{dir}/Data_Analysis'

if not os.path.exists(outdir):
    os.makedirs(outdir)

conditions = [
    'A',
    'B',
    'C',
]

seq_order_cols = [
    'seqGA',
    'seqGB',
    'seqGC'
    ]

greadability_metrics = {
    'file':f'{dir}/compare_greadability.csv',
    'stimuli_names':{
        'network1':'A',
        'network2':'B',
        'network3':'C'
    }
}

correct_answers = "OK"

#different groups of column names

demographics = [
    'Age',
    'Gender',
    'Education',
    'English_primary',
    'English_fluent',
    'colorDeficiency',
    'colorDeficiency_comment_'
]
        
scales = {
    "Understand":[
        "obvious",
        "represent",
        "understandEasi"
        ],
    "Layout":[
        "crowd",
        "distract",
        "messi"
        ],
    "DataFeat":[
        "visibl",
        "see"
        ],
    "DataRead":[
        "find",
        "identifi",
        "inform"
        ]
}

sub_items = [
    "crowd",
    "distract",
    "find",
    "identifi",
    "inform",
    "messi",
    "obvious",
    "represent",
    "see",
    "understandEasi",
    "visibl",
]

tasks_list = ["TaskFEN","TaskFAN"]

other_items = [f'{c}RatingsComment' for c in conditions]

always_ignore_cols = [
    'startlanguage',
    'startdate',
    'datestamp'
]

BFI_items = {
    "BFIScale":[
        "extravert",
        "reserved",
        "average"
    ]
}

greadability_items = [
    'crossing',
    'crossingAngle',
    'angularResolutionMin',
    'angularResolutionDev',
    'average'
]

## Functions

In [3]:
def find_col_names(my_df,
                  my_string,
                  range_max=30): #the range defines how far in the col name string we will look for the string to match
    col_names = []
    for col in list(my_df):
        if my_string in col[0:range_max+1]:
            col_names.append(col)
    return col_names


def combine_questions_codes(items_list, combine_with, comb_position='after_item', prefix='', suffix='', sep =''):
    combined_list = []
    for comb in combine_with:
        for item in items_list:
            if comb_position == 'after_item':
                combination = prefix+item+sep+comb+suffix
                combined_list.append(combination)
            elif comb_position == 'before_item':
                combination = prefix+comb+sep+item+suffix
                combined_list.append(combination)
    return combined_list

def drop_times(my_df):
    len_before = len(list(my_df))
    for col_name in list(my_df):
        if 'Time' in col_name:
            my_df = my_df.drop(col_name, axis=1)
    print(f'Dropped {len_before-len(list(my_df))} Question time columns')
    return my_df

def drop_full_NaN_cols(my_df):
    list_before = list(my_df)
    my_df = my_df.dropna(axis=1, how='all')
    print(f'Dropped {len(list_before)-len(list(my_df))} fully empty columns')
    dropped_cols_list = [col for col in list_before if col not in list(my_df)]
    print(dropped_cols_list)
    return my_df

def drop_lines_by_NaN_in_col(my_df, my_col, na_param = "na"):
    excluded_df = pd.DataFrame(columns = list(my_df))
    len_before = len(my_df)
    for i, row in my_df.iterrows():
        if na_param == "na":
            if pd.isna(my_df.at[i, my_col]):
                excluded_df.loc[i]= row
                my_df.drop(index=[i], inplace=True)
        elif na_param == "not_na":
            if not pd.isna(my_df.at[i, my_col]):
                excluded_df.loc[i]= row
                my_df.drop(index=[i], inplace=True)
        else:
            print("Improper value for na_param: either 'na' or 'not_na' (default='na')")
    print(f'Dropped {len_before-len(my_df)} participants')
    return (my_df, excluded_df)

### Rating items

In [4]:
rating_items_by_condition = {}
all_rating_items = []
initial_rating_items_cols = [] #before we clean the names (then we will clean them)

for c in conditions:
    rating_items_by_condition.update({
        c:{}
    })
    for scale in scales.keys():
        items_scale_condition = combine_questions_codes(scales[scale], [f'{scale}_'], comb_position='before_item', prefix=c)
        initial_cols = combine_questions_codes(scales[scale], [f'{scale}_'], comb_position='before_item', prefix=c, suffix=f'{c}_')
        rating_items_by_condition[c].update({
            scale: items_scale_condition
        })
        all_rating_items += items_scale_condition
        initial_rating_items_cols += initial_cols


rating_items_by_condition

{'A': {'Understand': ['AUnderstand_obvious',
   'AUnderstand_represent',
   'AUnderstand_understandEasi'],
  'Layout': ['ALayout_crowd', 'ALayout_distract', 'ALayout_messi'],
  'DataFeat': ['ADataFeat_visibl', 'ADataFeat_see'],
  'DataRead': ['ADataRead_find', 'ADataRead_identifi', 'ADataRead_inform']},
 'B': {'Understand': ['BUnderstand_obvious',
   'BUnderstand_represent',
   'BUnderstand_understandEasi'],
  'Layout': ['BLayout_crowd', 'BLayout_distract', 'BLayout_messi'],
  'DataFeat': ['BDataFeat_visibl', 'BDataFeat_see'],
  'DataRead': ['BDataRead_find', 'BDataRead_identifi', 'BDataRead_inform']},
 'C': {'Understand': ['CUnderstand_obvious',
   'CUnderstand_represent',
   'CUnderstand_understandEasi'],
  'Layout': ['CLayout_crowd', 'CLayout_distract', 'CLayout_messi'],
  'DataFeat': ['CDataFeat_visibl', 'CDataFeat_see'],
  'DataRead': ['CDataRead_find', 'CDataRead_identifi', 'CDataRead_inform']}}

### Task questions

In [5]:
tasks_questions = combine_questions_codes(tasks_list, conditions, comb_position='before_item', prefix='', suffix='')
tasks_questions

['ATaskFEN', 'ATaskFAN', 'BTaskFEN', 'BTaskFAN', 'CTaskFEN', 'CTaskFAN']

### BFI subquestions

In [6]:
BFI_subitems = combine_questions_codes(BFI_items['BFIScale'], ['BFIScale_'], comb_position='before_item')
BFI_subitems

['BFIScale_extravert', 'BFIScale_reserved', 'BFIScale_average']

# Load data and remove non necessary columns

In [7]:
# df = pd.read_csv(f'{dir}/results_cleaned.csv', dtype={'seed': object}).set_index('seed')
df = pd.read_csv(f'{dir}/results_cleaned.csv', converters = {'seed': str}).set_index('seed')

# clean the rating items names: AUnderstand_obvious_A to AUnderstand_obvious
df = df.rename(columns=lambda x: x[:-2] if x in initial_rating_items_cols else x)

# clean col names with '_' as the last character
df = df.rename(columns=lambda x: x[:-1] if x[-1]=='_' else x)

In [8]:
all_seeds = list(df.index)
if len(set(all_seeds)) < len(all_seeds):
    print(f'{len(all_seeds)-len(set(all_seeds))}')
    raise IndexError('Not all seeds are unique !!!')

In [9]:
list(df)

['Answers_count',
 'submitdate',
 'lastpage',
 'Age',
 'Gender',
 'Education',
 'English_primary',
 'English_fluent',
 'refurl',
 'colorDeficiency',
 'colorDeficiency_comment',
 'Intro',
 'seqGA',
 'seqGB',
 'seqGC',
 'ATaskFEN',
 'ATaskFAN',
 'ATaskTopic',
 'AUnderstand_obvious',
 'AUnderstand_represent',
 'AUnderstand_understandEasi',
 'AUnderstand_attentionCheckA',
 'ALayout_crowd',
 'ALayout_messi',
 'ALayout_distract',
 'ADataFeat_visibl',
 'ADataFeat_see',
 'ADataRead_find',
 'ADataRead_identifi',
 'ADataRead_inform',
 'ARatingsComment',
 'BTaskFEN',
 'BTaskFAN',
 'BTaskTopic',
 'BUnderstand_obvious',
 'BUnderstand_represent',
 'BUnderstand_understandEasi',
 'BLayout_crowd',
 'BLayout_messi',
 'BLayout_distract',
 'BDataRead_find',
 'BDataRead_identifi',
 'BDataRead_inform',
 'BDataRead_attentionCheckB',
 'BDataFeat_visibl',
 'BDataFeat_see',
 'BRatingsComment',
 'CTaskFEN',
 'CTaskFAN',
 'CTaskTopic',
 'CTopicError',
 'CTaskTopicTryAgain',
 'CUnderstand_obvious',
 'CUnderstand_r

## BFI items processing
1. We reverse score for "reserved" question from short Big Five Inventory because "reserved" and "extravert" are reverted from each other:
- reserved = "I see myself as reserved, quiet"
- extravert = "I see myself as extraverted, enthusiastic"

2. We calculate a mean


In [10]:
df['BFIScale_reserved'] = 8 - df['BFIScale_reserved']
df['BFIScale_average'] = df[['BFIScale_reserved','BFIScale_extravert']].mean(axis=1)

## Keep only useful columns for data analysis

In [11]:
used_columns = [d for d in demographics if d in df.columns] #this is conditional because the colorDeficiency_comment_ might have been dropped during data cleaning if it was completely empty (0 comment from participants)
used_columns += seq_order_cols # order of stimuli display
used_columns += tasks_questions # reading tasks
used_columns += all_rating_items # rating items for all stimuli and all subscales
used_columns += other_items # comment for each stimulus
used_columns += BFI_subitems # Extraversion subscale items
df = df[used_columns]

# Add greadability metrics
Greadability metrics all range from 0 to 1, but our scale items range from 1 to 7.

Therefore, we will scale the metrics by a factor of 7 to allow better comparison of variances.

In [12]:
greadability_metrics

{'file': '../Results/compare_greadability.csv',
 'stimuli_names': {'network1': 'A', 'network2': 'B', 'network3': 'C'}}

In [13]:
#get he greadability values
gr_df = pd.read_csv(greadability_metrics['file']).rename(columns = {'Unnamed: 0':'stimulus'})
gr_df['stimulus'] = gr_df['stimulus'].apply(lambda x: greadability_metrics['stimuli_names'][x[:-13]]) #remove "_greadability" (last 13 characters) at the end of each stimulus name, and replace by the stimulus letter in our study
gr_df = gr_df.set_index('stimulus')

# inflate the values to allow better co-variance comparisons with scale items (from 1 to 7)
# gr_df = gr_df*7

greadability_metrics_types = list(gr_df)

# generate names of cols to add in the answer df to integrate the greadability metric: for each stimulus and each metric
greadability_cols = combine_questions_codes(greadability_metrics_types, conditions, comb_position='before_item', sep='_gr_') # as a result col[0] = stimulus letter and col[5:] = metric name
greadability_cols

for col in greadability_cols:
    df[col] = gr_df.at[col[0],col[5:]] #create each col in answer df and fill it with the values from greadability

In [14]:
gr_df

Unnamed: 0_level_0,crossing,crossingAngle,angularResolutionMin,angularResolutionDev,average
stimulus,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
A,0.973,0.715,0.893,0.908,0.87225
B,0.879,0.846,0.641,0.776,0.7855
C,0.643,0.784,0.575,0.713,0.67875


In [15]:
greadability_cols

['A_gr_crossing',
 'A_gr_crossingAngle',
 'A_gr_angularResolutionMin',
 'A_gr_angularResolutionDev',
 'A_gr_average',
 'B_gr_crossing',
 'B_gr_crossingAngle',
 'B_gr_angularResolutionMin',
 'B_gr_angularResolutionDev',
 'B_gr_average',
 'C_gr_crossing',
 'C_gr_crossingAngle',
 'C_gr_angularResolutionMin',
 'C_gr_angularResolutionDev',
 'C_gr_average']

## Random order analysis

In [16]:
df_order = df[seq_order_cols]
df_order.value_counts()

seqGA  seqGB  seqGC
4.0    2.0    3.0      42
3.0    4.0    2.0      30
       2.0    4.0      23
2.0    3.0    4.0      19
4.0    3.0    2.0      19
2.0    4.0    3.0      15
Name: count, dtype: int64

In [17]:
df_order = df[seq_order_cols].astype(int)
mapper = {
    2 : '1st',
    3 : '2d',
    4 : '3d',
}
df_order.replace(to_replace=mapper, inplace=True)
df_order.rename(columns=lambda x: f'stimulus {x[-1]}', inplace=True)

df_order.value_counts().to_csv(f'{outdir}/stimuli_randomization_report2.csv')
df_order.value_counts().to_latex(f'{outdir}/stimuli_randomization_report2.tex')

c_order_counts_dfs_list = []
for c in conditions:
    c_order_counts_df = pd.DataFrame(df_order.groupby([f'stimulus {c}'])[f'stimulus {c}'].value_counts())
    print(c_order_counts_df.columns)
    c_order_counts_df = c_order_counts_df.rename(columns={'count':f'Count in {c}'})
    c_order_counts_df = c_order_counts_df.rename(index=lambda x: 'Appeared '+x)
    c_order_counts_dfs_list += [c_order_counts_df]
df_order_counts = pd.concat(c_order_counts_dfs_list, axis=1)
df_order_counts.reset_index()

df_order_counts.to_csv(f'{outdir}/stimuli_randomization_report.csv')
df_order_counts.to_latex(f'{outdir}/stimuli_randomization_report.tex', index=True)


#to do: latin square https://en.wikipedia.org/wiki/Latin_square


Index(['count'], dtype='object')
Index(['count'], dtype='object')
Index(['count'], dtype='object')


In [18]:
df_order_counts

Unnamed: 0,Count in A,Count in B,Count in C
Appeared 1st,34,65,49
Appeared 2d,53,38,57
Appeared 3d,61,45,42


## Subscales central values

In [19]:
for c in conditions:
    for scale in rating_items_by_condition[c].keys():
        df[f'{c}{scale}_average'] = df[rating_items_by_condition[c][scale]].mean(axis=1)
        df[f'{c}{scale}_std'] = df[rating_items_by_condition[c][scale]].std(axis=1)
        df[f'{c}{scale}_median'] = df[rating_items_by_condition[c][scale]].median(axis=1)
df.to_csv(f'{outdir}/results_cleaned_with_central_values.csv')

# Create one line for answers on each stimulus

## df with individual rows for answer on each stimulus

In [20]:
#columns not related to stimuli
invariable_cols = [d for d in demographics if d in df.columns] #this is conditional because the colorDeficiency_comment_ might have been dropped during data cleaning if it was completely empty (0 comment from participants)
invariable_cols += BFI_subitems # Extraversion subscale items

# we take each stimulus answers
dfs_list = []
for c in conditions:
    this_stimulus_comments_col = [f'{c}RatingsComment']
    this_stimulus_tasks_cols = [f'{c}{task}' for task in tasks_list]
    this_greadability_cols = [col for col in greadability_cols if col[0]==c] #Greadability metrics for this stimulus
    this_stimulus_rating_cols = []
    this_stimulus_sbuscales_central_values = []
    for scale in rating_items_by_condition[c].keys():
        #take the ratings
        this_subscale_cols = rating_items_by_condition[c][scale]
        this_stimulus_rating_cols += this_subscale_cols
        
        #take the central values
        this_subscale_central_cols = [f'{c}{scale}_average',f'{c}{scale}_std',f'{c}{scale}_median']
        this_stimulus_sbuscales_central_values += this_subscale_central_cols
    
    cols_to_keep = invariable_cols + this_greadability_cols + this_stimulus_tasks_cols + this_stimulus_rating_cols + this_stimulus_sbuscales_central_values + this_stimulus_comments_col
    
    # take this slice of the main df
    this_df = copy.deepcopy(df[cols_to_keep])
    
    # we clean the names
    this_mapper = {}
    
    # clean items names:
    for gr_col in this_greadability_cols:
        if 'average' in gr_col:
            this_mapper.update(
                {gr_col : 'Greadability_average'} #here we drop the first letter (= stimulus letter)
            )
        else:
            this_mapper.update(
                {gr_col : gr_col[5:]} #here we drop the first letter (= stimulus letter)
            )
    for item in this_stimulus_tasks_cols+this_stimulus_comments_col+this_stimulus_sbuscales_central_values:
        this_mapper.update(
            {item : item[1:]} #here we drop the first letter (= stimulus letter)
        )
    for scale_item in this_stimulus_rating_cols+BFI_subitems:
        if 'BFIScale_average' in scale_item:
            this_mapper.update(
                {scale_item:'Extraversion_average'}
            )
        else:
            this_mapper.update(
                {scale_item: scale_item.split('_')[-1]} #here we drop anything before the '_' characther (for example 'AUnderstand' or 'CLayout')
            )

    this_df = this_df.rename(columns=this_mapper)
    
    # add a col with the stimulus' letter
    this_df.insert(0, 'stimulus', c)
    this_df.reset_index(inplace=True)
    dfs_list += [this_df]

# we reset index and concat all dfs
df_stimuli = pd.concat(dfs_list, axis=0, ignore_index=True)

In [21]:
df_stimuli.columns

Index(['seed', 'stimulus', 'Age', 'Gender', 'Education', 'English_primary',
       'English_fluent', 'colorDeficiency', 'extravert', 'reserved',
       'BFI_average', 'crossing', 'crossingAngle', 'angularResolutionMin',
       'angularResolutionDev', 'Greadability_average', 'TaskFEN', 'TaskFAN',
       'obvious', 'represent', 'understandEasi', 'crowd', 'distract', 'messi',
       'visibl', 'see', 'find', 'identifi', 'inform', 'Understand_average',
       'Understand_std', 'Understand_median', 'Layout_average', 'Layout_std',
       'Layout_median', 'DataFeat_average', 'DataFeat_std', 'DataFeat_median',
       'DataRead_average', 'DataRead_std', 'DataRead_median',
       'RatingsComment'],
      dtype='object')

In [22]:
df_stimuli.to_csv(f'{outdir}/full_answers_by_stimuli.csv')

# Data for multi-group CFA (one group = answers for one stimulus)

In [23]:
#for CFA we will only use items ratings and stimulus
cols_for_CFA = ['stimulus'] + sub_items
df_stimuli[cols_for_CFA].to_csv(f'{outdir}/multigroup_ratings.csv')

# not used in final analysis notebooks
# for c in conditions:
#     df_stimuli[df_stimuli['stimulus']==c][cols_for_CFA].to_csv(f'{outdir}/{c}_ratings.csv')

# Comments extraction

In [24]:
import re
pattern = r'\n'

all_comments = df[[col for col in df if 'Comment'in col]]
print(len(all_comments))

all_comments_cleaned = all_comments.replace(to_replace=' ', value=pd.NA)
all_comments.replace(to_replace=pattern, value=pd.NA, inplace=True)
all_comments.replace(to_replace='	', value=pd.NA, inplace=True)
all_comments_cleaned = all_comments_cleaned.dropna(how='all')
all_comments_cleaned = all_comments_cleaned.rename(columns=lambda x: x.replace('Ratings', '_') if 'Ratings' in x else x)
print(len(all_comments_cleaned))

148
35


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  all_comments.replace(to_replace=pattern, value=pd.NA, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  all_comments.replace(to_replace='	', value=pd.NA, inplace=True)


In [25]:
all_comments_cleaned['count'] = all_comments_cleaned.isnull().sum(axis=1).tolist()
all_comments_cleaned

Unnamed: 0_level_0,A_Comment,B_Comment,C_Comment,count
seed,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
168101042569,That's a clear visualization.,"The left part is fine and clear, the right par...",I cannot understand if some lines continue aft...,0
177126480936,,Too many connections between Leonard's friends...,,2
138381221571,,This one has been the easiest for me to see th...,While the data is there if you spend some time...,1
45247857963,"It's easy to understand, though not necessaril...",This makes my brain scream. It's crowded and i...,This one basically made me tap out. This is th...,0
26379028454,Finally!,The lines are too densely packed to easily mak...,The lines need to be unambiguously bbypassing ...,0
69253389710,,"Wright up says first season, questions refer t...","You refer to first season, in the wright up bu...",1
136216531257,,"Some parts are confusing: Alice, for example, ...","Some connections are unclear, e.g. there is a ...",1
125631530338,,,The visualization is too crowded and the names...,2
71484869893,"With the extra space, it could be helpful to a...",,This diagram could be greatly improved by rear...,1
84480827166,"This time, the visualization was so much more ...","I think the visualization seams fine to read, ...","This one was very messy, compared to the others.",0


In [26]:
if not os.path.exists(f'{outdir}/comments'):
    os.makedirs(f'{outdir}/comments')

all_comments = []
for c in conditions:
    comments_df = df_stimuli[df_stimuli['stimulus']==c][['seed','RatingsComment']]
    comments_df = comments_df.dropna(axis=0).set_index('seed')
    comments_df.to_csv(f'{outdir}/comments/{c}_comments.csv')
    comments = comments_df.rename(columns={'RatingsComment':f'{c}_comments'})
    all_comments += [comments]

all_comments = pd.concat(all_comments, axis=1)
all_comments.to_csv(f'{outdir}/comments/all_comments.csv')

In [27]:
all_comments

Unnamed: 0_level_0,A_comments,B_comments,C_comments
seed,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
168101042569,That's a clear visualization.,"The left part is fine and clear, the right par...",I cannot understand if some lines continue aft...
45247857963,"It's easy to understand, though not necessaril...",This makes my brain scream. It's crowded and i...,This one basically made me tap out. This is th...
26379028454,Finally!,The lines are too densely packed to easily mak...,The lines need to be unambiguously bbypassing ...
71484869893,"With the extra space, it could be helpful to a...",,This diagram could be greatly improved by rear...
84480827166,"This time, the visualization was so much more ...","I think the visualization seams fine to read, ...","This one was very messy, compared to the others."
196822301272,the vis is most comprehensive and clear in who...,this vis/graph was much much better than the p...,"this vis is a bit messy, dots having different..."
160952659796,The layout is very clear and well formatted. T...,"The visualization is quite clear, but can get ...","This forest is pretty hard to read, with sever..."
55753433481,"Season 1 is less crowded than season 2 and 3, ...",this is slightly better and easier to read tha...,"The graph is a bit messy, but the dots give yo..."
110352545132,This visualisation is much better than the pre...,"Once again, using more colors would help.","The visualization is unreadable, because all c..."
165850282516,I do not understand how Diana and Charlie rela...,"Lines are overlapping, still overcrowded but m...","Very crowded, I understand the representation ..."


In [28]:
df_stimuli.columns

Index(['seed', 'stimulus', 'Age', 'Gender', 'Education', 'English_primary',
       'English_fluent', 'colorDeficiency', 'extravert', 'reserved',
       'BFI_average', 'crossing', 'crossingAngle', 'angularResolutionMin',
       'angularResolutionDev', 'Greadability_average', 'TaskFEN', 'TaskFAN',
       'obvious', 'represent', 'understandEasi', 'crowd', 'distract', 'messi',
       'visibl', 'see', 'find', 'identifi', 'inform', 'Understand_average',
       'Understand_std', 'Understand_median', 'Layout_average', 'Layout_std',
       'Layout_median', 'DataFeat_average', 'DataFeat_std', 'DataFeat_median',
       'DataRead_average', 'DataRead_std', 'DataRead_median',
       'RatingsComment'],
      dtype='object')

# Demographic data

In [39]:
def retrieve_demographics(this_df, my_demographics):
    #check we have demographics
    missing_d = [d for d in my_demographics if d not in this_df.columns]
    if len(missing_d) > 0:
        print(f'Demographics are missing: {missing_d}')
        updated_demographics = [demo for demo in my_demographics if demo not in missing_d]
        my_demographics = updated_demographics
        # return ''
    
    nb_p = len(this_df)

    this_data = {
        'Number of participants': nb_p
    }
    
    for d in my_demographics:
        if d == 'Age':
            this_df[d] = this_df[d].astype('Int64')
            this_data.update({
                f'{d} - average':this_df[d].mean().astype(float).round(3),
                f'{d} - std':int(this_df[d].std()),
                f'{d} - min':int(this_df[d].min()),
                f'{d} - max':int(this_df[d].max()),
            })
        else:
            grouped_df = this_df.groupby([d])[d].count()
            for value in grouped_df.index:
                this_data.update({
                    f'{d} - {value}':grouped_df.loc[value]
                })

    out_df = pd.DataFrame.from_dict(this_data, orient='index', columns=['N'])

    #frequencies
    out_df['freq'] = ''
    for i, row in out_df.iterrows():
        if not "Age" in i and not "Number" in i:
            out_df.at[i, 'freq'] = round(row['N']/nb_p, 2)
    return out_df

this_demographics = retrieve_demographics(df, demographics)
this_demographics.to_csv(f'{outdir}/all_demographics.csv')

Demographics are missing: ['colorDeficiency_comment_']
