# Participant exclusion and participant log creation

In this notebook we build the participant log for the online data, applying the exclusion criteria: mother tongue, reported deficits, and the time the participant took to finish the task.

Written by Ana Hoban, 08/2024

In [191]:
import csv
import os

import numpy as np
import pandas as pd

# Importing the files containing the demographic information

### Be careful: run only the one import cell you need in this section (each import cell overwrites the same variables)


## Importing English APS 

In [329]:
############### FOR APS ###################
# Settings of the English APS run; the exclusion and export cells read these names.
lang, output = 'eng', 'Logs/LOG_English_online_aps.csv'
time_constraint = 120  # minutes

# Screener export; rows that carry no participant ID are discarded.
screener_file = "C:/Users/anaho/Desktop/Data_online_eng_aps/screener.csv"
raw_data = pd.read_csv(screener_file)
df = raw_data.dropna(subset=['Participant Public ID'])

# Formatted trial data, used later to evaluate the accuracy over all trials.
file_all_data = 'C:/Users/anaho/Desktop/research/Language/APS/analysis/Data/Processed data/EnglishOnline/english_online_apsData_formatted.csv'
for_accuracy = pd.read_csv(file_all_data)

# Attractiveness survey export, used later to check how long the experiment took.
end_of_task_file = "C:/Users/anaho/Desktop/Data_online_eng_aps/attractiveness_data_raw.csv"
final = pd.read_csv(end_of_task_file)

## Importing English Silent

In [332]:
############ FOR CONTROL ###############
# Settings of the English silent (control) run; the exclusion and export cells read these names.
lang, output = 'eng', 'Logs/LOG_English_online_silent.csv'
time_constraint = 60  # minutes

# Screener export; rows that carry no participant ID are discarded.
screener_file = "C:/Users/anaho/Desktop/silent online task data/screener.csv"
raw_data = pd.read_csv(screener_file)
df = raw_data.dropna(subset=['Participant Public ID'])

# Formatted trial data, used later to evaluate the accuracy over all trials.
file_all_data = 'C:/Users/anaho/Desktop/research/Language/APS/analysis/Data/Processed data/EnglishOnline/english_online_silentData_formatted.csv'
for_accuracy = pd.read_csv(file_all_data)

# Final questionnaire export, used later to check how long the experiment took.
end_of_task_file = "C:/Users/anaho/Desktop/silent online task data/questionnaire_final_message.csv"
final = pd.read_csv(end_of_task_file)

## Importing French APS

In [370]:
############## FOR APS ##################
# Settings of the French APS run; the exclusion and export cells read these names.
lang = 'fr'
output = 'Logs/LOG_French_online_aps.csv'
time_constraint = 120 #minutes

#data
# Screener export; rows that carry no participant ID are discarded.
raw_data = pd.read_csv("C:/Users/anaho/Desktop/Data_Amandine_APS/Data_Amandine_APS/screener.csv", encoding = 'latin1', delimiter = ',')
df = raw_data.dropna(subset=['Participant Public ID'])

#getting the time of the last answer
# Concatenate the raw rows of list1.csv ... list8.csv. The header row of every
# file is kept in `all_data`: the first one supplies the column names, and all
# of them are discarded by the 'END TASK' filter below.
all_data = []

for i in range(1,9):
    # `with` closes the file even if parsing fails; newline='' is what the csv
    # module requires so that line breaks inside quoted fields are read correctly.
    with open("C:/Users/anaho/Desktop/Data_Amandine_APS/Data_Amandine_APS/list{}.csv".format(i), "r", encoding = 'latin1', newline = '') as data_file:
        data = list(csv.reader(data_file, delimiter=","))
    all_data += data

all_keys = all_data[0] #read in column names

final = pd.DataFrame(all_data, columns = all_keys)

final = final.loc[final['Trial Number'] == 'END TASK',  ['Participant Public ID','UTC Timestamp']] #keep only end times

#get the data to evaluate the accuracy over all trials
file_all_data = 'C:/Users/anaho/Desktop/research/Language/APS/analysis/Data/Processed data/FrenchOnline/french_online_apsData_formatted.csv'
for_accuracy = pd.read_csv(file_all_data)


## Importing French Silent

In [360]:
############ FOR CONTROL ###############
# Settings of the French silent (control) run; the exclusion and export cells read these names.
lang = 'fr'
output = 'Logs/LOG_French_online_silent.csv'
time_constraint = 60 #minutes

#data
# Screener export (semicolon-separated); rows that carry no participant ID are discarded.
raw_data = pd.read_csv("C:/Users/anaho/Desktop/Amandine_silent_reading_csv/screener.csv", encoding = 'latin1', delimiter = ';')
df = raw_data.dropna(subset=['Participant Public ID'])

#getting the time of the last answer
# Concatenate the raw rows of list1.csv ... list4.csv. The header row of every
# file is kept in `all_data`: the first one supplies the column names, and all
# of them are discarded from `final` by the 'END TASK' filter below.
all_data = []

for i in range(1,5):
    # `with` closes the file even if parsing fails; newline='' is what the csv
    # module requires so that line breaks inside quoted fields are read correctly.
    with open("C:/Users/anaho/Desktop/Amandine_silent_reading_csv/list{}.csv".format(i), "r", encoding = 'latin1', newline = '') as data_file:
        data = list(csv.reader(data_file, delimiter=";"))
    all_data += data

all_keys = all_data[0] #read in column names

# Unfiltered copy of every row (still contains the repeated header rows).
raw_responses = pd.DataFrame(all_data, columns = all_keys)

final = pd.DataFrame(all_data, columns = all_keys)

final = final.loc[final['Trial Number'] == 'END TASK', ['Participant Public ID','UTC Timestamp']] #keep only end times

#get the data to evaluate the accuracy over all trials
file_all_data = 'C:/Users/anaho/Desktop/research/Language/APS/analysis/Data/Processed data/FrenchOnline/french_online_silentData_formatted.csv'
for_accuracy = pd.read_csv(file_all_data)


# Exclusion

In [371]:
# Column names of the demographics log, in output order
question_keys = [
    'id',
    'gender',
    'age_days',
    'age_month',
    'age_year',
    'studyLevel',
    'deficit',
    'maternalEnglish',
    'otherLanguages',
    'group',
]

In [372]:
# French question keys -> English question keys.
# The accented keys are spelled the way they appear once the export has been read
# with encoding='latin1' (e.g. "Ã©"), so they must not be "corrected" here.
transl = dict([
    # gender
    ('Genre-1', 'Gender-1'),
    ('Genre-2', 'Gender-2'),
    # level of study
    ("Niveau d'Ã©tude", 'Level of Study'),
    ("Niveau d'Ã©tude-quantised", 'Level of Study-quantised'),
    ("Niveau d'Ã©tude-text", 'Level of Study-text'),
    # deficit
    ('DÃ©ficit-1', 'Deficit-1'),
    ('DÃ©ficit-text', 'Deficit-text'),
    # mother tongue
    ('Langue maternelle-1', 'Maternal language-1'),
    ('Langue maternelle-text', 'Maternal language-text'),
    ('Langue maternelle-other', 'Maternal language-other'),
    # other spoken languages
    ('langue parlÃ©e-1', 'Spoken language-1'),
    ('langue parlÃ©e-other', 'Spoken Languages-other'),
    ('langue parlÃ©e-text', 'Spoken Languages-text'),
])

# Only the French export needs its 'Question Key' values translated.
if lang == 'fr':
    df = df.replace(to_replace={"Question Key": transl})

In [373]:
# Build the list of participants suggested for exclusion. A participant is flagged when
#   * none of their question keys contains 'Maternal language-1' (more than one mother tongue),
#   * none of their question keys contains 'Deficit-1' (they reported a deficit),
#   * they took more than `time_constraint` minutes between their first screener
#     timestamp and their first timestamp in `final`, or
#   * they have no timestamp in `final`, so the duration cannot be checked
#     (this used to stop the cell with an IndexError).
# The list is only a suggestion: it can be adjusted by hand in a later cell.
to_exclude = []

for ID in df['Participant Public ID'].unique():

    # all screener rows of this participant, selected once and reused below
    participant = df[df['Participant Public ID'] == ID]
    # na=False: a missing question key counts as "no match"; regex=False: literal match
    keys = participant['Question Key']

    #checking if other mother tongue
    if not keys.str.contains('Maternal language-1', na=False, regex=False).any():
        print(ID, 'doesnt have only english as a mother tongue:')
        print(participant.loc[keys == 'Maternal language-text', 'Response'])
        print('------------')
        to_exclude.append(ID)

    #checking if deficit
    if not keys.str.contains('Deficit-1', na=False, regex=False).any():
        print(ID, 'has a deficit: ')
        print(participant.loc[keys == 'Deficit-text', 'Response'])
        print('------------')
        to_exclude.append(ID)

    #check the time it took them to do the experiment
    if lang in ('eng', 'fr'):
        end_times = final.loc[final['Participant Public ID'] == ID, 'UTC Timestamp']

        if end_times.empty:
            # no end time recorded for this participant: flag them for manual review
            print(ID, 'has no end timestamp, the duration could not be checked')
            print('------------')
            to_exclude.append(ID)
        else:
            # float() accepts both the numeric timestamps of the English exports and
            # the text timestamps obtained from the csv.reader-based French import
            start_time = float(participant['UTC Timestamp'].iloc[0])
            end_time = float(end_times.iloc[0])
            #to compute the difference in MINUTES between two UTC timestamps we divide by 60 and 1000 since the times are in milliseconds
            total_time = (end_time - start_time)/60/1000

            if total_time > time_constraint: #if they took more than two hours for aps or 1 for control
                print(ID, 'took more than {} minutes : '.format(time_constraint), total_time)
                print('------------')
                to_exclude.append(ID)

#in case a participant is excluded for two reasons, just need to be on the list once
#(dict.fromkeys removes duplicates while keeping the order in which they were flagged)
to_exclude = list(dict.fromkeys(to_exclude))

sk111 took more than 120 minutes :  1618.07855
smaya28 doesnt have only english as a mother tongue:
318    L'allemand
Name: Response, dtype: object
------------
smaya28 has a deficit: 
316    Myopie
Name: Response, dtype: object
------------
smaya26 has a deficit: 
370    myope
Name: Response, dtype: object
------------
sf306 took more than 120 minutes :  2612.773466666667
sq217 has a deficit: 
839    Port de lunette
Name: Response, dtype: object
------------
sg207 has a deficit: 
929    leger deficit auditif
Name: Response, dtype: object
------------
sm113 took more than 120 minutes :  1738.0441


In [374]:
# we must exclude the participants who have not been on task
# to do this, we run this code one time to get the log, create the dataset, then we can use the raw dataset to test for this condition,
# and exclude more participants if needed
# keep only participants that have had more than 0.55 accuracy on ALL TRIALS

N_TRIALS = 144             # denominator of the accuracy; presumably the number of trials per participant - confirm
ACCURACY_THRESHOLD = 0.55  # participants strictly below this accuracy are flagged

for ID in for_accuracy['Session_Name_'].unique():
    is_correct = (for_accuracy['Session_Name_'] == ID) & (for_accuracy['PARAPHRASE_ACCURACY'] == 1)
    acc_tot = int(is_correct.sum())/N_TRIALS

    if acc_tot < ACCURACY_THRESHOLD:
        print(ID, acc_tot)
        # do not add a participant twice: keeps the final count right and lets the cell be re-run safely
        if ID not in to_exclude:
            to_exclude.append(ID)

## The following cell must be run only to manually adjust the list of participants to exclude

In [375]:
#to adjust the excluded participants:
print('suggested exclusion', to_exclude)

#put here the participants you want to keep (you can modify this)
# One ID per string, each followed by a comma: two adjacent string literals
# without a comma are silently merged into a single (wrong) ID.
to_keep = [
    'sq217', 'smaya26', 'smaya28', 'sg207', 'sxp124',
    '66c4460e502819ffa17ebe23', '66a74d3256a08f1e9a52aeb5', '668b9ef3f88084c4945bc24d',
    '6650a9ed115b3903e40e8943', '66709a4a49e9bb2c80c53c27', '6475e3b08c14ba10d9e6f70f',
]
to_exclude =  [i for i in to_exclude if i not in to_keep]

print('final exclusion list', to_exclude)
print('that is {} participants'.format(len(to_exclude)))

suggested exclusion ['sm113', 'sq217', 'smaya28', 'sf306', 'sg207', 'sk111', 'smaya26']
final exclusion list ['sm113', 'sf306', 'sk111']
that is 3 participants


In [376]:
# Report how many participants are about to be removed
print('must exclude:', len(to_exclude), 'participant(s)')

# Remove every screener row of each excluded participant, one participant at a time
for excluded_id in to_exclude:
    rows_to_drop = df.loc[df['Participant Public ID'] == excluded_id].index
    df = df.drop(rows_to_drop)

# Number of distinct participants left after the exclusion
n_remaining = len(df['Participant Public ID'].unique())
print('In total, we have {} participants'.format(n_remaining))

must exclude: 3 participant(s)
In total, we have 77 participants


# Building the demographics .csv file

In [367]:
# Collapse the gender and level-of-study question keys onto one short key each
key_renames = {
    "Gender-1": 'Gender',
    "Gender-2": 'Gender',
    "Gender-other": 'Gender',
    "Level of Study-quantised": 'Study',
    "Level of Study-text": 'Study',
}

for old_key, new_key in key_renames.items():
    df.loc[df['Question Key'] == old_key, 'Question Key'] = new_key

In [368]:
# Build the demographics table: one row per remaining participant, columns = question_keys.
# The 'deficit' and 'maternalEnglish' columns are not filled here and stay empty.

def _first_response(rows, key):
    """Return the first 'Response' in `rows` whose 'Question Key' equals `key`, or NaN if there is none."""
    answers = rows.loc[rows['Question Key'] == key, 'Response']
    return answers.iloc[0] if len(answers) else np.nan

records = []

for ID in df['Participant Public ID'].unique():

    # all screener rows of this participant, selected once
    rows = df[df['Participant Public ID'] == ID]

    records.append({
        'id': ID,
        'gender': _first_response(rows, 'Gender'),
        'age_days': _first_response(rows, 'Age-day'),
        'age_month': _first_response(rows, 'Age-month'),
        'age_year': _first_response(rows, 'Age-year'),
        'studyLevel': _first_response(rows, 'Study'),
        # optional answer: a participant without a 'Spoken Languages-text' row gets NaN
        # (the previous `is not np.nan` test was always true, so that case raised an IndexError)
        'otherLanguages': _first_response(rows, 'Spoken Languages-text'),
    })

# build the frame once from the collected records; columns missing from the records are NaN
demographics = pd.DataFrame(records, columns = question_keys)

# NOTE(review): the group is 'aps' for every dataset, including the silent/control ones - confirm this is intended
demographics['group'] = 'aps'

In [369]:
#save the file
# to_csv does not create missing folders, so make sure the target folder exists first
output_dir = os.path.dirname(output)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)

demographics.to_csv(output, index = False)