In [31]:
import pandas as pd
import numpy as np
import copy
import os
import statistics

## Variables

In [32]:
# Base folder holding the survey results.
# NOTE(review): `dir` shadows the Python builtin of the same name; it is kept
# because other cells of this notebook read it.
dir = '../Results'

# Output directory for everything produced by this notebook
outdir = f'{dir}/Data_Analysis'

# Create the output folders that do not exist yet
for folder in (outdir, f'{outdir}/others', f'{outdir}/data'):
    if not os.path.exists(folder):
        os.makedirs(folder)

# One letter per stimulus / condition
conditions = list('ABCDEF')

# Different groups of column names
Prolific_demographics = [
    'Age', 'Gender', 'Highest education level completed', 'Language', 'Fluent languages',
]

demographics = [
    'Age', 'Gender', 'Education', 'English_primary', 'English_fluent',
]

# Codes of the rating items; a stimulus letter is appended to obtain the column name
items = [
    'answer', 'attentionCheck', 'clearData', 'clearRepresent', 'complex',
    'confid', 'confus', 'crowd', 'deciph', 'distinguish',
    'distract', 'effect', 'find', 'identifi', 'inform',
    'lost', 'meanElem', 'meanOveral', 'messi', 'obvious',
    'organiz', 'read', 'readabl', 'represent', 'see',
    'simpl', 'understandEasi', 'understandQuick', 'valu', 'visibl',
]

# Groups of answers (we downloaded the answers' values, not their codes).
# How we code ratings: the agreement labels map to 1..7 in this order.
ratings = {
    label: score
    for score, label in enumerate(
        [
            'Strongly disagree',
            'Disagree',
            'Slightly disagree',
            'Neutral',
            'Slightly agree',
            'Agree',
            'Strongly agree',
        ],
        start=1,
    )
}
ratings['Other'] = ''  # NA for "Don't know"

# Expected answer strings, keyed first by stimulus letter and then by question code.
# exclude_participants() compares a participant's '<letter>TaskTopic' answer with the
# matching entry of this dictionary, so the strings must equal the exported answer
# labels exactly (including punctuation and currency symbols).
correct_answers = {
    'A': {
        'ATaskRV':'20.76 Mbps',
        'ATaskFE':'Papua New Guinea',
        'ATaskTopic':'Broadband downloading speed in Oceania',
        'ATaskTopicTryAgain':'Broadband downloading speed in Oceania'
    },
    'B': {
        'BTaskRV':'3.55$',
        'BTaskFCT':'Increasing',
        'BTaskTopic':'Evolution of fruits prices between January and March',
        'BTaskTopicTryAgain':'Evolution of fruits prices between January and March'
    },
    'C': {
        'CTaskFE':'USA',
        'CTaskMC':'more from China',
        'CTaskTopic':'Distribution of students enrolled in an online program',
        'CTaskTopicTryAgain':'Distribution of students enrolled in an online program'
    },
    'D': {
        'DTaskMC':'True',
        'DTaskFE':'Housing',
        'DTaskTopic':'Average distribution of European households’ spendings',
        'DTaskTopicTryAgain':'Average distribution of European households’ spendings'
    },
    'E': {
        'ETaskRV':'False',
        'ETaskCl':'3',
        'ETaskTopic':'The family tree of a student',
        'ETaskTopicTryAgain':'The family tree of a student'
    },
    'F': {
        'FTaskRV':'17,636€',
        'FTaskFCT':'Increasing or slightly increasing',
        'FTaskTopic':'Evolution of sales profits on office supplies and equipment',
        'FTaskTopicTryAgain':'Evolution of sales profits on office supplies and equipment'
    }
}

## Functions

In [33]:
def find_col_names(my_df,
                  my_string,
                  range_max=30):
    """Return the column names of ``my_df`` that contain ``my_string``.

    Only the first ``range_max + 1`` characters of each column name are
    searched, so a match located further in the name is ignored.
    The returned list keeps the column order of the frame.
    """
    search_end = range_max + 1
    return [col for col in list(my_df) if my_string in col[0:search_end]]


def combine_questions_codes(items_list, combine_with, comb_position='after_item', prefix='', suffix=''):
    """Build question codes by combining every item with every element of ``combine_with``.

    ``combine_with`` is iterated element by element (a string is therefore
    iterated character by character). The result is ordered by element of
    ``combine_with`` first, then by item.

    comb_position:
        'after_item'  -> prefix + item + comb + suffix
        'before_item' -> prefix + comb + item + suffix
        any other value yields an empty list.
    """
    if comb_position == 'after_item':
        def build(item, comb):
            return prefix + item + comb + suffix
    elif comb_position == 'before_item':
        def build(item, comb):
            return prefix + comb + item + suffix
    else:
        return []

    return [build(item, comb) for comb in combine_with for item in items_list]

def drop_times(my_df):
    """Return ``my_df`` without the columns whose name contains 'Time'.

    Prints how many columns were removed. The frame passed in is not modified.
    """
    time_cols = [col_name for col_name in list(my_df) if 'Time' in col_name]
    # nothing to drop: hand back the very same object
    trimmed_df = my_df.drop(time_cols, axis=1) if time_cols else my_df
    print(f'Dropped {len(list(my_df))-len(list(trimmed_df))} Question time columns')
    return trimmed_df

def drop_full_NaN_cols(my_df):
    """Return ``my_df`` without the columns that contain only NaN values.

    Prints the number of removed columns followed by the list of their names.
    """
    cols_before = list(my_df)
    pruned_df = my_df.dropna(axis=1, how='all')
    cols_after = list(pruned_df)
    print(f'Dropped {len(cols_before)-len(cols_after)} fully empty columns')
    dropped_cols_list = [col for col in cols_before if col not in cols_after]
    print(dropped_cols_list)
    return pruned_df

def drop_lines_by_NaN_in_col(my_df, my_col, na_param = "na"):
    """Split ``my_df`` on whether ``my_col`` is missing.

    Parameters
    ----------
    my_df : pandas.DataFrame
        Frame to filter. As before, the excluded rows are removed from this
        very object (in place) and the same object is returned.
    my_col : column label
        Column whose missing values drive the exclusion.
    na_param : {"na", "not_na"}
        "na" excludes the rows where ``my_col`` is NaN, "not_na" excludes the
        rows where it is not NaN. Any other value prints a message (once) and
        excludes nothing.

    Returns
    -------
    (my_df, excluded_df) : the filtered frame and a frame holding the excluded
    rows (same columns and dtypes as the input).
    """
    len_before = len(my_df)

    # Decide which rows to exclude with one vectorised test, instead of
    # dropping rows from the frame while iterating over it.
    if na_param == "na":
        to_exclude = my_df[my_col].isna()
    elif na_param == "not_na":
        to_exclude = my_df[my_col].notna()
    else:
        print("Improper value for na_param: either 'na' or 'not_na' (default='na')")
        to_exclude = pd.Series(False, index=my_df.index)

    # Copy the excluded rows out before removing them from the caller's frame
    excluded_df = my_df.loc[to_exclude].copy()
    my_df.drop(index=excluded_df.index, inplace=True)

    print(f'Dropped {len_before-len(my_df)} participants')
    return (my_df, excluded_df)

## Separate data by stimulus

In [34]:
# Load the cleaned results; 'seed' is read as text and used as the index
df = pd.read_csv(f'{dir}/results_cleaned.csv', dtype={'seed': object}).set_index('seed')

# Extend the demographic columns with the color-deficiency questions.
# Guarded so that re-running this cell does not append the same names again
# (the list is reused by later cells to select and summarise columns).
for extra_col in ['colorDeficiency','colorDeficiency_comment_']:
    if extra_col not in demographics:
        demographics.append(extra_col)

  df = pd.read_csv(f'{dir}/results_cleaned.csv', dtype={'seed': object}).set_index('seed')


In [35]:
# Seeds are used as the row index, so every participant must have a distinct one
all_seeds = list(df.index)
n_duplicated_seeds = len(all_seeds) - len(set(all_seeds))
if n_duplicated_seeds > 0:
    # report how many seeds are repeated before stopping
    print(f'{n_duplicated_seeds}')
    raise IndexError('Not all seeds are unique !!!')

In [36]:
#check if combine_questions_codes finds all rating item columns
for stimulus_letter in conditions:
    # list_1: column names produced by the generative function
    list_1 = combine_questions_codes(items, stimulus_letter, comb_position='after_item', prefix='', suffix='')
    # list_2: df columns ending with the stimulus letter, skipping those whose
    # second-to-last character is "M" or "F" (task codes ending in "MC" / "FE").
    # The length guard avoids an IndexError on one-character column names.
    list_2 = [
        col for col in list(df)
        if len(col) >= 2 and col[-1] == stimulus_letter and col[-2] not in ("M", "F")
    ]

    print(f'{len(list_1)} columns names found from generative function with {stimulus_letter}',
        f'\n{len(list_2)} columns names found in df ending with {stimulus_letter}')

    # Compare the names themselves: two lists of equal length can still differ
    missing_from_df = [value for value in list_1 if value not in list_2]
    missing_from_function = [value for value in list_2 if value not in list_1]
    if missing_from_df:
        # generated names that do not exist in the dataframe
        print(f'{missing_from_df} missing from the dataframe columns')
    if missing_from_function:
        # dataframe columns that the generative function did not produce
        print(f'{missing_from_function} missing from generative function')
    if not missing_from_df and not missing_from_function:
        print("All rating items columns found, we're good to go.")

30 columns names found from generative function with A 
30 columns names found in df ending with A
All rating items columns found, we're good to go.
30 columns names found from generative function with B 
30 columns names found in df ending with B
All rating items columns found, we're good to go.
30 columns names found from generative function with C 
30 columns names found in df ending with C
All rating items columns found, we're good to go.
30 columns names found from generative function with D 
30 columns names found in df ending with D
All rating items columns found, we're good to go.
30 columns names found from generative function with E 
30 columns names found in df ending with E
All rating items columns found, we're good to go.
30 columns names found from generative function with F 
30 columns names found in df ending with F
All rating items columns found, we're good to go.


In [37]:
def create_stimulus_df(original_df, stimulus_str, filtering_col, filtering_value):
    """Build the answer tables of one stimulus.

    Parameters
    ----------
    original_df : pandas.DataFrame
        Full results table.
    stimulus_str : str
        Stimulus letter, appended to the item codes to get the column names.
    filtering_col, filtering_value
        Rows are kept when ``filtering_col == filtering_value``; the test is
        evaluated through ``DataFrame.query``, so the value is inserted as-is
        in the query expression.

    Returns
    -------
    (dfAnswers, dfOthers, dfOthers_comments)
        dfAnswers: demographics, task answers and rating items; rating items
        are recoded with ``ratings`` and centered on the 'Neutral' score
        ("Other" becomes NaN).
        dfOthers: transposed table of the '<item>_other_' columns, or False
        when the stimulus has none.
        dfOthers_comments: same, restricted to the rows having at least one
        comment (transposed only when non-empty), or False.
    """
    print(f'=========\nCreating df(s) for stimulus "{stimulus_str}"')
    #filter rows with answers for this stimulus
    stimulus_df = original_df.query(f'{filtering_col} == {filtering_value}')

    #drop times
    stimulus_df = drop_times(stimulus_df)

    #drop full NaNs
    stimulus_df = drop_full_NaN_cols(stimulus_df)

    #create lists of answers codes
    itemsAnswers = combine_questions_codes(items, stimulus_str, comb_position='after_item', prefix='', suffix='')

    #create lists of reading task questions codes
    taskAnswers = [code for code in list(stimulus_df.columns) if stimulus_str+'Task' in code]

    #create answer df
    colsAnswers = ['seed']+demographics+taskAnswers+itemsAnswers #cols to keep
    dfAnswers = stimulus_df.filter(colsAnswers, axis=1).copy() #filter df with cols to keep

    # Recode the rating items only. Applying the replacement to the whole
    # frame would also turn a demographic or task answer that happens to equal
    # a rating label (e.g. "Other") into a score or a NaN.
    neutral_score = ratings['Neutral']
    for col_found in itemsAnswers:
        scores = dfAnswers[col_found].replace(ratings) # labels -> scores ('' for "Other")
        scores = scores.mask(scores == '') # empty strings ("Other" selection) -> NaN
        #center scores on 0
        dfAnswers[col_found] = scores.sub(neutral_score, fill_value=None, axis=0)

    #if there are others, we create a df too
    available_cols = set(stimulus_df.columns)
    itemsOthers = [item+'_other_' for item in itemsAnswers if item+'_other_' in available_cols]
    print(f'{len(itemsOthers)} other answers found') #since we already removed full NaN columns
    if len(itemsOthers)>0:
        print(itemsOthers)
        colsOthers = ['seed']+demographics+itemsOthers
        dfOthers = stimulus_df.filter(colsOthers, axis=1)

        #extract comments only
        dfOthers_comments = dfOthers.dropna(subset=itemsOthers, how='all')
        if len(dfOthers_comments) > 0:
            dfOthers_comments = dfOthers_comments.T

        dfOthers = dfOthers.T

    else:
        dfOthers = False
        dfOthers_comments = False

    return dfAnswers, dfOthers, dfOthers_comments

In [38]:
# Each stimulus letter corresponds to a value of the 'rand' column (A -> 1 ... F -> 6)
letter_to_number = {letter: rank for rank, letter in enumerate('ABCDEF', start=1)}

# Dictionary holding the separate tables of every stimulus
initial_dfs = {}

for condition in conditions:
    cond_dfs = create_stimulus_df(df, condition, 'rand', letter_to_number[condition])
    answers_df, others_df, others_comments_df = cond_dfs

    # store the three results under keys derived from the stimulus letter
    initial_dfs[condition] = answers_df
    initial_dfs[f'{condition}_others'] = others_df
    initial_dfs[f'{condition}_others_comments'] = others_comments_df

Creating df(s) for stimulus "A"
Dropped 207 Question time columns
Dropped 250 fully empty columns
['BTaskRV', 'BTaskFCT', 'BTaskTopic', 'answerB', 'answerB_other_', 'attentionCheckB', 'clearDataB', 'clearRepresentB', 'crowdB', 'complexB', 'complexB_other_', 'confidB', 'confusB', 'messiB', 'messiB_other_', 'deciphB', 'deciphB_other_', 'distinguishB', 'distractB', 'distractB_other_', 'effectB', 'findB', 'identifiB', 'informB', 'lostB', 'lostB_other_', 'meanElemB', 'meanOveralB', 'obviousB', 'organizB', 'representB', 'readB', 'readablB', 'readablB_other_', 'seeB', 'seeB_other_', 'simplB', 'understandEasiB', 'understandQuickB', 'valuB', 'visiblB', 'CTaskFE', 'CTaskMC', 'CTaskTopic', 'CTopicError', 'CTaskTopicTryAgain', 'answerC', 'attentionCheckC', 'clearDataC', 'clearRepresentC', 'clearRepresentC_other_', 'crowdC', 'complexC', 'confidC', 'confusC', 'confusC_other_', 'messiC', 'deciphC', 'distinguishC', 'distinguishC_other_', 'distractC', 'distractC_other_', 'effectC', 'findC', 'identifiC'

## Export stimuli full answers tables

In [39]:
# Export every table built above as csv
for k, v in initial_dfs.items():
    if isinstance(v, pd.DataFrame):
        if "others" not in k:
            # main answers table of a stimulus
            v.to_csv(f'{outdir}/initial-results-{k}.csv')
            print(f'Exported {k} as csv')
        else:
            # "others" and "others_comments" tables are written the same way.
            # NOTE(review): the file name reads 'intial' (sic); kept as-is so the
            # existing output paths do not change.
            v.to_csv(f'{outdir}/others/intial-results-{k}.csv', header=False)
            print(f'Exported {k} as csv in "other" dir')
    elif v == False:
        # create_stimulus_df returns False when there is nothing to export
        print(f'{k} has no "I don\'t know" answer to export')
    else:
        print(f'{k}: {type(v)}')

Exported A as csv
Exported A_others as csv in "other" dir
Exported A_others_comments as csv in "other" dir
Exported B as csv
Exported B_others as csv in "other" dir
Exported B_others_comments as csv in "other" dir
Exported C as csv
Exported C_others as csv in "other" dir
Exported C_others_comments as csv in "other" dir
Exported D as csv
Exported D_others as csv in "other" dir
Exported D_others_comments as csv in "other" dir
Exported E as csv
Exported E_others as csv in "other" dir
Exported E_others_comments as csv in "other" dir
Exported F as csv
Exported F_others as csv in "other" dir
Exported F_others_comments as csv in "other" dir


## Participants exclusion

### Functions we will use here

In [40]:
def count_rows_NaNs(my_df):
    """Store the number of NaN values of each row in a 'NaN_counts' column.

    The count covers every column of the row except 'NaN_counts' itself, so
    calling the function again on the same frame gives the same result.
    The column is written on ``my_df`` itself, which is also returned.
    """
    # Vectorised row-wise count; unlike a per-label loop it stays correct
    # when index labels are duplicated.
    counted_cols = my_df.drop(columns='NaN_counts', errors='ignore')
    my_df['NaN_counts'] = counted_cols.isna().sum(axis=1)
    return my_df

def exclude_participants(this_condition, correct_answers_dict, df_dict, NA_threshold = 12, topic_attempts_threshold = 2, multiple_exclusion_processing = 'join'):
    """List the participants to exclude for one condition.

    Parameters
    ----------
    this_condition : str
        Stimulus letter; key of ``df_dict`` and ``correct_answers_dict``.
    correct_answers_dict : dict
        Expected answers; the '<letter>TaskTopic' entry is used.
    df_dict : dict
        Answer tables per condition. A 'NaN_counts' column is added to the
        table of this condition (side effect of count_rows_NaNs).
    NA_threshold : int
        Rows with at least this many NaN values are excluded.
    topic_attempts_threshold : int
        When equal to 1, a wrong topic answer at the first attempt is an
        exclusion reason; any other value ignores the topic answer.
    multiple_exclusion_processing : str
        'join' joins the reasons of a participant with ', ';
        'first' keeps the first reason only;
        any other value leaves one row per (participant, reason).

    Returns
    -------
    dict of DataFrames: one per exclusion reason, plus 'all' for the merged table.
    """
    # Count NaNs before slicing, so that every exclusion table carries the
    # 'NaN_counts' column whether or not this function already ran on this
    # condition (previously the first run produced tables without it).
    # NOTE(review): the count covers every column of the row (demographics and
    # task answers included), not only the rating items -- confirm that this is
    # the intended basis for the "40%" rule.
    this_df = count_rows_NaNs(df_dict[this_condition])
    initial_len = len(this_df)

    # incorrect answers on first attempt at topic
    topic_code = this_condition+'TaskTopic'
    this_correct_answer = correct_answers_dict[this_condition][topic_code]
    dfTopicExcluded = this_df[this_df[topic_code] != this_correct_answer].copy()
    dfTopicExcluded.insert(0,"exclusion","Failed topic at first attempt")

    #incorrect rating in attentionCheck item (a NaN rating also compares as different from 1)
    dfCalibrationExcluded = this_df[this_df[f'attentionCheck{this_condition}'] != 1].copy()
    dfCalibrationExcluded.insert(0,"exclusion","Failed attention check")

    # more than 40% (more than 11 out of 29) "I don't know" selections
    dfNaNExcluded = this_df[this_df['NaN_counts'] >= NA_threshold].copy()
    dfNaNExcluded.insert(0,"exclusion","Over 40 percent of DK/NA answers")

    # insertion order = priority of the reasons when several apply
    this_exclusion_dfs = {
        'wrong_calibration':dfCalibrationExcluded,
        'too_many_NA':dfNaNExcluded,
    }

    #move this up if you want this reason to have priority over the others for participants with multiple exclusion causes
    if topic_attempts_threshold == 1:
        this_exclusion_dfs['wrong_topic'] = dfTopicExcluded

    #all excluded participants (one row per participant and reason)
    merged_df = pd.concat(list(this_exclusion_dfs.values()), axis=0, join="outer")

    #drop exact duplicates (is it still needed??)
    merged_df = merged_df.drop_duplicates(subset=None, keep='first')

    # some people have multiple reasons for exclusion
    if multiple_exclusion_processing == 'join': ## this option joins the string contents in "exclusion" column
        aggregate_parameters = {
            col: (', '.join if col == 'exclusion' else 'first')
            for col in list(merged_df)
        }
        merged_df = merged_df.groupby('seed').agg(aggregate_parameters)

    elif multiple_exclusion_processing == 'first': ## this option just keeps the first for all (so in order of dict iteration = order of insertion)
        merged_df = merged_df.groupby('seed').agg('first')

    this_exclusion_dfs['all'] = merged_df

    excluded_len = len(merged_df)
    print(f'\n{excluded_len} participants exluded for condition {this_condition} ({initial_len} participants in total).')
    if initial_len > 0:
        print(f'= {round((1-(excluded_len/initial_len))*100,2)}% valid answers')
    else:
        # no participant at all: the ratio is undefined
        print('= no participants in this condition')

    return this_exclusion_dfs


# Trial run on condition B with option 1 of the multiple-reason processing (join strings)
this = exclude_participants(
    'B',
    correct_answers,
    initial_dfs,
    NA_threshold=12,
    topic_attempts_threshold=1,
    multiple_exclusion_processing = 'join',
)



6 participants exluded for condition B (294 participants in total).
= 97.96% valid answers


### We set aside excluded participants

In [41]:
# Exclusion tables of every condition, keyed by stimulus letter.
# NA_threshold: 40% of 29 items = 11.6, so 12 and more.
# topic_attempts_threshold: 1 means we exclude based on a wrong answer at the first
# attempt; 2 means we keep all answers, because people who were wrong twice got
# redirected out of the survey immediately.
exclusion_dfs = {
    condition: exclude_participants(
        condition,
        correct_answers,
        initial_dfs,
        NA_threshold=12,
        topic_attempts_threshold=1,
        multiple_exclusion_processing = 'join',
    )
    for condition in conditions
}


3 participants exluded for condition A (291 participants in total).
= 98.97% valid answers

6 participants exluded for condition B (294 participants in total).
= 97.96% valid answers

6 participants exluded for condition C (293 participants in total).
= 97.95% valid answers

6 participants exluded for condition D (315 participants in total).
= 98.1% valid answers

6 participants exluded for condition E (299 participants in total).
= 97.99% valid answers

6 participants exluded for condition F (299 participants in total).
= 97.99% valid answers


In [42]:
# Export the merged exclusion table ('all') of every condition as csv
for condition in conditions:
    if 'all' not in exclusion_dfs[condition]:
        continue
    k, v = 'all', exclusion_dfs[condition]['all']
    if isinstance(v, pd.DataFrame):
        v.to_csv(f'{outdir}/excluded-{condition}.csv')
        print(f"Exported {k} exclusions for {condition}")
    else:
        print(f'{k}: {type(v)}') #in case something weird happened

Exported all exclusions for A
Exported all exclusions for B
Exported all exclusions for C
Exported all exclusions for D
Exported all exclusions for E
Exported all exclusions for F


#### Grouping and counting exclusions to report them

In [43]:
all_exclusions_dfs = []

for condition in conditions:
    v = exclusion_dfs[condition].get('all')
    if isinstance(v, pd.DataFrame):
        # Tag a copy with its condition: inserting into the stored frame would
        # make this cell fail on re-run ("Condition" already exists) and would
        # change what a later re-export of the exclusion tables writes.
        tagged = v.drop(columns="Condition", errors="ignore")
        tagged.insert(1,"Condition",condition)
        all_exclusions_dfs.append(tagged)

exclusions_df = pd.concat(all_exclusions_dfs, axis=0)

#export grouped by reason (all together)
all_exclusions_df = exclusions_df.groupby(['exclusion']).size().reset_index(name="nb excluded")
all_exclusions_df.to_csv(f'{outdir}/excluded-all-counts.csv', index=False)

#export grouped by reason and by stimulus
exclusions_by_stimulus_df = exclusions_df.groupby(['Condition', 'exclusion']).size().reset_index(name="nb excluded")
exclusions_by_stimulus_df.to_csv(f'{outdir}/excluded-by_stimulus-counts.csv', index=False)

## Valid participations

### Functions we will use here

In [44]:
# we remove participants excluded in the previous step
def remove_excluded_p(my_condition, my_initial_dfs, my_exclusion_dfs):
    """Return the rows of ``my_initial_dfs[my_condition]`` whose index label is
    not in the index of ``my_exclusion_dfs[my_condition]['all']``.
    """
    condition_df = my_initial_dfs[my_condition]
    excluded_p_list = my_exclusion_dfs[my_condition]['all'].index
    # labels of the participants we keep, in their original order
    kept_p_list = [p for p in condition_df.index if p not in excluded_p_list]
    return condition_df.filter(items = kept_p_list, axis=0)

### Create "clean" dfs with valid answers only

In [45]:
# Dictionary holding the clean tables: one (initially empty) sub-dictionary per stimulus letter
clean_dfs = {letter: {} for letter in ['A', 'B', 'C', 'D', 'E', 'F']}

for condition in conditions:
    clean_dfs[condition]['valid_answers'] = remove_excluded_p(condition, initial_dfs, exclusion_dfs)

# number of valid participations per condition
for k, v in clean_dfs.items():
    if k not in conditions: #avoid error for runs with partial conditions only
        continue
    print(f"{len(v['valid_answers'])} for {k}")

288 for A
288 for B
287 for C
309 for D
293 for E
293 for F


In [46]:
# Display which exclusion tables are stored for condition A
exclusion_dfs['A'].keys()

dict_keys(['wrong_calibration', 'too_many_NA', 'wrong_topic', 'all'])

In [47]:
# Reporting table: valid and excluded participations per condition
valid_counts = [len(clean_dfs[c]['valid_answers']) for c in conditions]
excluded_counts = [len(exclusion_dfs[c]['all']) for c in conditions]

report = {
    'valid_participations' : valid_counts,
    'excluded_participations' : excluded_counts,
}

# one row per count type, one column per condition
report_df = pd.DataFrame.from_dict(report, orient='index', columns=conditions)
# summary columns, all computed from the per-condition columns only
report_df = report_df.assign(
    Total=report_df.sum(axis=1),
    Mean=report_df[conditions].mean(axis=1),
    Std=report_df[conditions].std(axis=1),
)
report_df.to_csv(f'{outdir}/all-participations-counts.csv')
report_df


Unnamed: 0,A,B,C,D,E,F,Total,Mean,Std
valid_participations,288,288,287,309,293,293,1758,293.0,8.270429
excluded_participations,3,6,6,6,6,6,33,5.5,1.224745


### Extract valid items ratings for EFA

In [48]:
# For EFA we only use the item ratings: every item except the attention check
ratingItems = [item for item in items if item != 'attentionCheck']

for condition in conditions:
    itemsAnswers = combine_questions_codes(ratingItems, condition, comb_position='after_item', prefix='', suffix='')
    dfRatings = (
        clean_dfs[condition]['valid_answers']
        .filter(itemsAnswers, axis=1)
        # remove the stimulus letter (last character) from the item's name
        .rename(columns=lambda col: col[:-1])
    )
    clean_dfs[condition]['ratings'] = dfRatings
    print(f"{len(dfRatings.columns)} items for {condition}")

29 items for A
29 items for B
29 items for C
29 items for D
29 items for E
29 items for F


In [49]:
# Display which tables are stored for condition A at this point
clean_dfs['A'].keys()

dict_keys(['valid_answers', 'ratings'])

### Retrieve demographic data

In [50]:
def _demographic_stat(value, convert):
    """Return ``convert(value)``, or NaN when the statistic is undefined (NA/NaN)."""
    return float('nan') if pd.isna(value) else convert(value)


def retrieve_demographics(this_df, col_key, my_demographics):
    """Summarise the demographic columns of ``this_df`` in a one-column table.

    Parameters
    ----------
    this_df : pandas.DataFrame
        Participants table. Its 'Age' column, when summarised, is converted to
        the nullable 'Int64' dtype in place (unchanged behavior).
    col_key : str
        Name of the single column of the returned table.
    my_demographics : list
        Columns to summarise; names absent from ``this_df`` are reported with a
        message and skipped.

    Returns
    -------
    pandas.DataFrame (object dtype) indexed by statistic name:
    'Number of participants', 'Age - average/std/min/max' (average and std
    rounded to 3 decimals) and, for every other column, one
    '<column> - <value>' row holding the number of participants with that value.
    A statistic that cannot be computed (e.g. std of a single participant) is NaN.
    """
    #check we have demographics
    missing_d = [d for d in my_demographics if d not in this_df.columns]
    if len(missing_d) > 0:
        print(f'Demographics are missing: {missing_d} in {col_key}')
        my_demographics = [demo for demo in my_demographics if demo not in missing_d]

    this_data = {
        'Number of participants':len(this_df)
    }

    for d in my_demographics:
        if d == 'Age':
            this_df[d] = this_df[d].astype('Int64')
            ages = this_df[d]
            this_data.update({
                f'{d} - average':_demographic_stat(ages.mean(), lambda x: round(float(x), 3)),
                # rounded like the average; int() used to truncate the decimals
                f'{d} - std':_demographic_stat(ages.std(), lambda x: round(float(x), 3)),
                f'{d} - min':_demographic_stat(ages.min(), int),
                f'{d} - max':_demographic_stat(ages.max(), int),
            })
        else:
            # number of participants per distinct value of the column
            grouped_df = this_df.groupby([d])[d].count()
            for value, count in grouped_df.items():
                this_data[f'{d} - {value}'] = count

    out_df = pd.DataFrame.from_dict(this_data, orient='index', columns=[col_key])
    out_df[col_key] = out_df[col_key].astype(object)
    return out_df

# Demographic summary of every condition, kept in clean_dfs and collected for the global table
all_demographics = []
for condition in conditions:
    this_df = clean_dfs[condition]['valid_answers']
    this_demographics = retrieve_demographics(this_df, condition, demographics)
    clean_dfs[condition]['demographics'] = this_demographics
    all_demographics.append(this_demographics)

# one column per condition, then summary columns computed across conditions
all_demo_df = pd.concat(all_demographics, axis=1)
per_condition = all_demo_df[conditions]
all_demo_df['Mean'] = per_condition.mean(axis=1).astype(float).round(3)
all_demo_df['Std'] = per_condition.std(axis=1).astype(float).round(3)
all_demo_df['Sum'] = per_condition.sum(axis=1)
all_demo_df['Freq'] = per_condition.sum(axis=1)/all_demo_df.at['Number of participants','Sum']

# we get global mean and std from the original df
# NOTE(review): `df` is the table loaded from the csv, before exclusions, whereas
# the per-condition figures come from the valid answers only -- confirm intended.
all_demo_df.at['Age - average', 'Sum'] = df['Age'].mean()
all_demo_df.at['Age - std', 'Sum'] = df['Age'].std()

all_demo_df.to_csv(f'{outdir}/all_demographics.csv')

### Output ratings

In [51]:
# we replace letters with numbers for R script
numbers = {
    'A':'1',
    'B':'2',
    'C':'3',
    'D':'4',
    'E':'5',
    'F':'6'
}

#csv export
ratings_df_for_agg = []
for condition in conditions:
    for k, v in clean_dfs[condition].items():
        if isinstance(v, pd.DataFrame):
            match k:
                case 'ratings':
                    v.to_csv(f'{outdir}/data/{k}-{numbers[condition]}.csv')
                    v.insert(0,'stimulus',f'{condition}')
                    ratings_df_for_agg.append(v)
                case 'valid_answers' | 'demographics':
                    v.to_csv(f'{outdir}/{k}-{condition}.csv')
                case _:
                      print(f'Something weird happened in {k} with df {v}')
        else: 
            print(f'{k}: {type(v)}') #in case something weird happened

#concat all ratings df with the stimulus col
pd.concat(ratings_df_for_agg).to_csv((f'{outdir}/data/ratings-stimulus.csv'))
#concat all ratings df without the stimulus col
pd.concat([df.drop('stimulus', axis=1) for df in ratings_df_for_agg]).to_csv((f'{outdir}/data/ratings-7.csv'))