## This notebook serves to build a dataset similar to the one from the lab

 Written by Ana Hoban, last edited in 08/2024

In [1]:
import pandas as pd
import random
import numpy as np
import warnings
import csv
import openpyxl

# Silence every Python warning for the rest of the session. Note that this also
# hides any warnings pandas would raise in the cells below.
warnings.filterwarnings('ignore')

## Loading the raw data

##### Be careful to change:
    1. language
    2. group
##### in the following cell.

In [2]:
#only thing to touch in this notebook:

#select language: 'fr' or 'eng' (the loading cell compares against 'eng')
lang = 'fr'

#select group: 'silent' or 'aps'
gr = 'aps'

#### _make changes in the following cell only if you want the directories or input/output filenames to be different_
_*might have to change the delimiter as well_

In [3]:
#loading in all lists

# MAX is the exclusive upper bound of the list numbers read below:
# range(1, MAX) visits lists 1-4 for 'silent' and lists 1-8 for 'aps'.
list_bounds = {'silent': 5, 'aps': 9}
if gr not in list_bounds:
    raise ValueError("gr must be 'silent' or 'aps', got {!r}".format(gr))
MAX = list_bounds[gr]

print(MAX)

# Per-dataset settings keyed by (gr, lang). Each entry holds, in order:
#   - str.format template for the per-list task data files
#   - text encoding used to open those files
#   - participant log CSV (its 'id' column is used to select participants)
#   - output path for the correct-trials-only dataset
#   - output path for the formatted dataset
datasets = {
    # 1. online silent english data
    ('silent', 'eng'): (
        "C:/Users/anaho/Desktop/silent online task data/taskdata_list{}.csv",
        'utf-8',
        'C:/Users/anaho/Desktop/research/language/aps/analysis/Data/Logs/LOG_English_online_silent.csv',
        'C:/Users/anaho/Desktop/research/Language/APS/analysis/Data/Processed data/EnglishOnline/english_online_silentData.csv',
        'C:/Users/anaho/Desktop/research/Language/APS/analysis/Data/Processed data/EnglishOnline/english_online_silentData_formatted.csv',
    ),
    # 2. online aps english data
    ('aps', 'eng'): (
        "C:/Users/anaho/Desktop/Data_online_eng_aps/list{}.csv",
        'utf-8',
        'C:/Users/anaho/Desktop/research/language/aps/analysis/Data/Logs/LOG_English_online_aps.csv',
        'C:/Users/anaho/Desktop/research/Language/APS/analysis/Data/Processed data/EnglishOnline/english_online_apsData.csv',
        'C:/Users/anaho/Desktop/research/Language/APS/analysis/Data/Processed data/EnglishOnline/english_online_apsData_formatted.csv',
    ),
    # 3. online silent french data
    ('silent', 'fr'): (
        "C:/Users/anaho/Desktop/Amandine_silent_reading_csv/list{}.csv",
        'latin1',
        'C:/Users/anaho/Desktop/research/language/aps/analysis/Data/Logs/LOG_French_online_silent.csv',
        'C:/Users/anaho/Desktop/research/Language/APS/analysis/Data/Processed data/FrenchOnline/french_online_silentData.csv',
        'C:/Users/anaho/Desktop/research/Language/APS/analysis/Data/Processed data/FrenchOnline/french_online_silentData_formatted.csv',
    ),
    # 4. online aps french data
    ('aps', 'fr'): (
        "C:/Users/anaho/Desktop/Data_Amandine_APS/Data_Amandine_APS/list{}.csv",
        'latin1',
        'C:/Users/anaho/Desktop/research/language/aps/analysis/Data/Logs/LOG_French_online_aps.csv',
        'C:/Users/anaho/Desktop/research/Language/APS/analysis/Data/Processed data/FrenchOnline/french_online_apsData.csv',
        'C:/Users/anaho/Desktop/research/Language/APS/analysis/Data/Processed data/FrenchOnline/french_online_apsData_formatted.csv',
    ),
}

if (gr, lang) not in datasets:
    # Previously an unsupported language left data_file undefined and the cell
    # died with a NameError; fail with an explicit message instead.
    raise ValueError(
        "no dataset configured for gr={!r}, lang={!r}; lang must be 'fr' or 'eng'".format(gr, lang)
    )

list_template, list_encoding, log_path, output_alex, output_formatted = datasets[(gr, lang)]

# The log does not depend on the list number, so read it once.
log = pd.read_csv(log_path)

all_keys = None   # column names, taken from the first list's header row
all_data = []     # data rows of every list, header rows excluded

for i in range(1, MAX):
    # newline='' is what the csv module expects so quoted fields containing
    # line breaks are parsed correctly; `with` closes the file even on error.
    with open(list_template.format(i), "r", encoding=list_encoding, newline='') as data_file:
        reader = csv.reader(data_file, delimiter=",")
        header = next(reader, None)
        if header is None:
            continue  # completely empty file: nothing to add
        if all_keys is None:
            all_keys = header
        elif header != all_keys:
            # Rows are stacked positionally, so a different header would
            # silently put values under the wrong column names.
            raise ValueError(
                "list {} has a different header than the first list".format(i)
            )
        all_data.extend(reader)

if all_keys is None:
    raise ValueError("no list file contained a header row")

df = pd.DataFrame(all_data, columns = all_keys)

#but we don't need all keys, let's keep only the following
keys = [ 'Participant Public ID',
         'Participant Status',
         'Spreadsheet',
         'Spreadsheet Row',
         'Trial Number',
         'Screen Number',
         'Screen Name',
         'Reaction Time',
         'Response',
         'Correct',
         'Paraphrase',
         'Phrase',
         'Item',
         'Structure',
         'Plausibility']

#photo key is only for APS
if gr == 'aps':
    keys = keys + ['Photo']

9


In [4]:
#define columns
# Keep only the participants listed in the log.
df = df[df['Participant Public ID'].isin(log['id'].unique())]

#drop the rows that do not have data (empty Structure field);
#filler rows are deliberately kept at this stage
df = df[df['Structure'] != '']
df = df[df['Participant Status'] == 'complete']

#keep only needed keys
# .copy() makes df an independent frame, so the columns added in later cells
# are not written into a slice of the raw table.
df = df[keys].copy()

#making sure we have the right number of participants
n_participants = len(df['Participant Public ID'].unique())
print('we have ' , n_participants,'participants')

if n_participants == 0:
    raise ValueError('no participant left after filtering; check that the log matches gr and lang')

# The hand-edited version of this check divided by 2 for the control group and
# by 3 for aps; pick the divisor from gr so this cell needs no manual change.
# NOTE(review): assumes "control" refers to gr == 'silent' -- confirm.
rows_per_trial = 3 if gr == 'aps' else 2
print('we have ' , len(df)/(n_participants*rows_per_trial), 'target trials per participant on avg')

we have  77 participants
we have  144.004329004329 target trials per participant on avg


In [5]:
#putting the accuracy information on the same line as the RT
# For every (participant, spreadsheet row) pair, take the 'Correct' value of its
# first 'Screen : Paraphrase' row and store it in a new 'answer' column on the
# matching 'Screen : Phrase' rows. Every other row gets NaN.
# This is done with one de-duplicate + left merge instead of re-scanning the
# whole frame for each participant/item combination.

key_cols = ['Participant Public ID', 'Spreadsheet Row']
is_paraphrase = df['Screen Name'] == 'Screen : Paraphrase'
is_phrase = df['Screen Name'] == 'Screen : Phrase'

# First paraphrase result per key, in row order.
first_answers = (
    df.loc[is_paraphrase, key_cols + ['Correct']]
    .drop_duplicates(subset=key_cols, keep='first')
    .rename(columns={'Correct': 'answer'})
)

# A left merge keeps one output row per phrase row, in the same order, so the
# result can be written back positionally.
matched = df.loc[is_phrase, key_cols].merge(
    first_answers, on=key_cols, how='left', validate='many_to_one'
)

answers = np.full(len(df), np.nan, dtype=object)
answers[is_phrase.to_numpy()] = matched['answer'].to_numpy()
df['answer'] = answers
        

In [9]:
#sanity check: distinct values stored in the answer column
df['answer'].unique()

array([nan, '1', '0'], dtype=object)

In [10]:
#now we must remove all the paraphrase lines
# .copy() gives an independent frame, so the columns added in the following
# cells are not assigned onto a slice of the previous frame.
df = df[df['Screen Name'] == 'Screen : Phrase'].copy()
print('Total number of target items: ', len(df))
print('Nb of target items per participant check: ', len(df)/len(df['Participant Public ID'].unique()))

# (A per-participant loop used to follow here; it only evaluated a filtered
# frame and discarded it, so it was removed. Blocks are assigned in the next cell.)
        

Total number of target items:  11088
Nb of target items per participant check:  144.0


In [11]:
# Every row starts in block 0; only non-filler rows of participants with
# exactly 48 distinct trial numbers are moved into blocks 1-4 below.
df['block'] = 0

for participant in df['Participant Public ID'].unique():
    # Non-filler rows of this participant, one per distinct 'Trial Number'
    # (first occurrence kept). Blocks follow the row order of df.
    first_trials = df.loc[
        (df['Participant Public ID'] == participant) & (df['Structure'] != 'filler'),
        'Trial Number'
    ].drop_duplicates()
    n_trials = len(first_trials)

    if n_trials != 48:
        print(f'Participant {participant} does not have exactly 48 rows.')
        print(n_trials)
        continue

    # Four consecutive runs of 12 trials -> blocks 1, 2, 3, 4.
    for block_number, start in enumerate(range(0, 48, 12), start=1):
        df.loc[first_trials.index[start:start + 12], 'block'] = block_number

In [12]:
# Start with no condition; rows matching none of the pairs below stay None.
df['condition'] = None

# (Structure, Plausibility) -> numeric condition code
condition_codes = {
    ('SRC', 'plausible'): 1,
    ('SRC', 'implausible'): 2,
    ('ORC', 'plausible'): 3,
    ('ORC', 'implausible'): 4,
}

for (structure, plausibility), code in condition_codes.items():
    in_condition = (df['Structure'] == structure) & (df['Plausibility'] == plausibility)
    df.loc[in_condition, 'condition'] = code

In [13]:
# Rows whose paraphrase answer is the string '1'.
# .copy() makes correct_df independent of df, because a later cell adds a
# column to it.
correct_df = df[df['answer'] == '1'].copy()

print('Total number of correct target items: ', len(correct_df))
print('Proportion: ', len(correct_df)/len(df))

Total number of correct target items:  9751
Proportion:  0.8794191919191919


In [21]:
if gr == 'aps': #for aps dataset only (the only one with a Photo column)
    # Derive the speech condition from the photo shown on the trial. Photos
    # other than the two listed here get NaN.
    # The former `correct_df.speaker = ''` / `df.speaker = ''` lines were
    # dropped: they set a Python attribute on the DataFrame object rather than
    # creating a column, so they never affected the exported data.
    photo_to_speech = {'WF.jpg': 'non-native', 'AF.jpg': 'native'}

    correct_df['speech_condition'] = correct_df['Photo'].map(photo_to_speech)
    df['speech_condition'] = df['Photo'].map(photo_to_speech)

In [22]:
#export the correct-answers-only dataset for Alex (header row, no index column)
correct_df.to_csv(
    output_alex,
    index=False,
    header=True,
)

In [23]:
#create dictionary to make column names match those of the eyetracking data, to be able to use that code
col_names_dict = {"Reaction Time": "SENTENCE_RT", "Participant Public ID": "Session_Name_", "answer": "PARAPHRASE_ACCURACY",
             "Plausibility": "plausibility_condition", "Structure": "syntactic_condition"}

# Label rows with the group selected in the config cell. This used to be the
# literal 'aps', which mislabelled the output whenever gr == 'silent'.
# NOTE(review): confirm the downstream code expects the label 'silent' for the control group.
df['group'] = gr
# NOTE(review): every row is tagged 'target', including rows whose Structure is
# 'filler' -- confirm this is what the downstream code expects.
df['sentence_type'] = 'target'


df = df.rename(columns= col_names_dict)
# Reaction times were read from CSV as text; convert them to numbers.
df['SENTENCE_RT'] = pd.to_numeric(df['SENTENCE_RT'])

In [24]:
#write the formatted dataset (latin1-encoded, header row, no index column)
df.to_csv(
    output_formatted,
    encoding='latin1',
    index=False,
    header=True,
)