## Import packges

In [1]:
import numpy as np
import pandas as pd
import os
import scipy.stats as stats
import matplotlib.pyplot as plt
import statsmodels.api as sm
from statsmodels.graphics.gofplots import qqplot_2samples
import random
import warnings

In [2]:
root = './'
behavioural_data_root = root +  'behavioral_data/raw_data/' 

In [3]:
def remove_outliers(df, max_rt, min_rt, std_c=2.5):
    """
    Returns remove outliers from dataframes. Outlier RTs are bigger than
    max_rt and smaller than min_rt. Also RTsthat are out of -/+ (std_c * sd) 
    of mean RT interval are considered as outliers too.

    Parameters
    ----------
        df: pandas dataframe with rt column
        max_rt (float): maximum acceptable rt
        min_rt (float): minimum acceptable rt
        
    Optional Parameters
    ----------
        std_c (float) : Optional
            coefficient to define interval of non-outlier RTs
    
    Returns
    -------
        df: pandas dataframe without outliers  
    """
    mean = df['rt'].mean()
    sd = df['rt'].std()
    lower_thr = mean - std_c*sd
    upper_thr = mean + std_c*sd
    min_bound = max(min_rt, lower_thr)
    max_bound = min(max_rt, upper_thr)
    df = df[df['rt'] >= min_bound]
    df = df[df['rt'] <= max_bound]
    return df

### Reading behavioural raw data

Reading and modifing each behavioral data file and combining all of them into a single behavioral dataframe

omitting non-english speakers are omitted from data

data 1988 is removed from data due the problem in reading it with pandas

data 436 had two demogrphic data part that one tha incompelete demograhic data have been removed 

In [4]:
dataframes = []
counter = 1
for count, filename in enumerate(os.listdir(behavioural_data_root)):
    df = pd.read_csv(behavioural_data_root + filename, names=['trial', 'string_id', 'string_type', 'accuracy', 'rt', 'string'])
    if df.iloc[-2, 4] != "English":
        continue
    df = df.dropna().drop('string_id', axis=1).drop([0, 1])
    df = df.iloc[:-2, :]
    ind = df.loc[df['trial'] == 'Univ'].index[0]
    df = df.drop([ind, ind+1], axis=0)
    # Dropping rows with wrong accuracies
    df = df.loc[(df['accuracy'] == '0') | (df['accuracy'] == '1')]
    # Converting columns type to suitable data types
    convert_dict = {'string_type': 'int16',
                    'accuracy': 'int16',
                    'rt': float
                   }

    df = df.astype(convert_dict)
    # Convert RTs to seconds
    df['rt'] = df['rt'].apply(lambda x: x/1000) 
    # Removing Outliers
    df = remove_outliers(df, 3, .2, 2.5)
    # Extracting response of participant from his/her accuracy
    df['response'] = np.logical_not(np.logical_xor(df['string_type'], df['accuracy'])).astype('int')
    df = df.reset_index(drop=True)
    # Particpant number
    df['participant'] = counter
    df['quantile']=''
    df['minRT'] = df['rt'].min()
    dataframes.append(df)
    counter += 1
behavioural_df = pd.concat(dataframes)

In [5]:
behavioural_df

Unnamed: 0,trial,string_type,accuracy,rt,string,response,participant,quantile,minRT
0,1,0,0,0.548,pracker,1,1,,0.378
1,2,1,1,0.646,nearside,1,1,,0.378
2,3,1,1,0.511,jets,1,1,,0.378
3,4,0,1,0.815,vates,0,1,,0.378
4,5,1,1,0.680,onward,1,1,,0.378
...,...,...,...,...,...,...,...,...,...
3263,3370,1,1,0.589,welt,1,806,,0.423
3264,3371,0,1,0.708,clameworthy,0,806,,0.423
3265,3372,0,1,0.817,esperience,0,806,,0.423
3266,3373,0,1,0.736,Ebe,0,806,,0.423


## Participant Selection

choosing 5 random participant and checking thier mean with total mean with t-test

In [9]:
all_participant_rt = behavioural_df.groupby(['participant']).agg({'rt': ['mean']}).reset_index()
all_participant_rt.columns = ['participant','rt']

# Random selection
participants_id = random.sample(range(1, len(dataframes)), 10)
selected_participant_rt = all_participant_rt[all_participant_rt['participant'].isin(participants_id)]

### chechking sample variance

In [10]:
np.var(all_participant_rt['rt'].to_numpy()) / np.var(selected_participant_rt['rt'].to_numpy())

1.1205172504180558

The ratio of the larger sample variance to the smaller sample variance is less than 4.

In [11]:
stats.ttest_ind(a=selected_participant_rt['rt'].to_numpy(), b=all_participant_rt['rt'].to_numpy(), equal_var=True)

Ttest_indResult(statistic=-0.04587530245387638, pvalue=0.9634208910785669)

<b>There is no significant difference between selected participants RTs and all participants RT</b>

In [12]:
selected_participants = behavioural_df[behavioural_df['participant'].isin(participants_id)]

## Choosing 400 trials for 100 participants

Dividing data to 10 quantiles and sampling 40 random trials from each quantile for each participant

In [13]:
ids = np.unique(selected_participants['participant'].to_numpy())

In [14]:
selected_participants_and_trials = []
with warnings.catch_warnings(record=True):
    for index, id in enumerate(ids):
        selected_participant = selected_participants[selected_participants['participant']==id]
        selected_participant['quantile'] = pd.qcut(selected_participant['rt'], 10);
        selected_participant_400 = selected_participant.groupby("quantile").sample(n=40)        
        selected_participants_and_trials.append(selected_participant_400)

In [15]:
final_df = pd.concat(selected_participants_and_trials)

In [16]:
final_df.groupby('string_type').count()

Unnamed: 0_level_0,trial,accuracy,rt,string,response,participant,quantile,minRT
string_type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,2037,2037,2037,2037,2037,2037,2037,2037
1,1963,1963,1963,1963,1963,1963,1963,1963


In [17]:
final_df = final_df.drop(['trial', 'string_type', 'quantile'], axis=1)
final_df['participant'] = final_df['participant'].replace(ids, list(range(1, len(ids)+1)))
final_df.reset_index(inplace=True, drop=True)

In [18]:
final_df

Unnamed: 0,accuracy,rt,string,response,participant,minRT
0,1,0.465,tiebreuk,0,1,0.353
1,1,0.436,prefab,1,1,0.353
2,1,0.446,Greg,1,1,0.353
3,1,0.466,loodlum,0,1,0.353
4,1,0.457,needing,1,1,0.353
...,...,...,...,...,...,...
3995,1,1.246,hymon,0,10,0.336
3996,0,1.072,acuity,0,10,0.336
3997,1,1.185,unmentionable,1,10,0.336
3998,0,1.472,philologists,0,10,0.336


In [19]:
final_df.to_csv(root+"behavioral_data/selected_data/LDT_data_test.csv", header=0, index=False)