## Import packages

In [1]:
import numpy as np
import pandas as pd
import os
import random
import warnings
import matplotlib.pyplot as plt
import seaborn as sns
import pingouin as pg
import statsmodels.api as sm
from statsmodels.graphics.gofplots import qqplot_2samples

In [2]:
# Global plotting defaults for the whole notebook.
sns.set_style('whitegrid')
# plt.figure(figsize=..., dpi=...) only opens a single empty figure and does not
# affect figures created later, so the defaults are set through rcParams instead.
plt.rcParams['figure.figsize'] = (8, 6)
plt.rcParams['figure.dpi'] = 100

<Figure size 800x600 with 0 Axes>

In [3]:
# Project root and the folder that holds the raw per-participant files
root = './'
behavioural_data_root = f'{root}behavioral_data/raw_data/'

In [4]:
def remove_outliers(df, max_rt, min_rt, std_c=2.5):
    """
    Drop outlier trials from a dataframe based on its ``rt`` column.

    A row is kept only when its RT lies inside both of these inclusive ranges:
    the absolute range [min_rt, max_rt] and the band
    [mean - std_c * sd, mean + std_c * sd], where mean and sd are computed
    from all RTs in ``df`` before any row is removed.

    Parameters
    ----------
        df: pandas dataframe with rt column
        max_rt (float): maximum acceptable rt
        min_rt (float): minimum acceptable rt

    Optional Parameters
    ----------
        std_c (float) : Optional
            number of standard deviations around the mean RT that are kept

    Returns
    -------
        df: pandas dataframe containing only the non-outlier rows
            (original index labels are preserved)
    """
    rts = df['rt']
    centre, spread = rts.mean(), rts.std()
    # Effective limits: the tighter of the absolute limit and the sd band
    min_bound = max(min_rt, centre - std_c*spread)
    max_bound = min(max_rt, centre + std_c*spread)
    within_bounds = (rts >= min_bound) & (rts <= max_bound)
    return df[within_bounds]

### Reading behavioural raw data

Reading and cleaning each behavioural data file, then combining all of them into a single behavioural dataframe.

Non-English speakers are omitted from the data.

The participant with file number 1988 was removed from the data because the file could not be read with pandas.

The file of participant number 436 contained two sets of demographic data; the set with the incomplete demographic data has been removed from the file.

In [5]:
# Column names assigned to every raw participant file
raw_columns = ['trial', 'string_id', 'string_type', 'accuracy', 'rt', 'string']
# Target dtypes for the columns that are used numerically
convert_dict = {'string_type': 'int16',
                'accuracy': 'int16',
                'rt': float
               }

dataframes = []
counter = 1
# os.listdir returns entries in arbitrary order; sorting keeps the participant
# numbering identical from run to run and across machines.
for filename in sorted(os.listdir(behavioural_data_root)):
    file_path = os.path.join(behavioural_data_root, filename)
    if not os.path.isfile(file_path):
        # Only regular files can be participant data; skip sub-directories
        continue
    df = pd.read_csv(file_path, names=raw_columns)
    # Keep English speakers only: the language is read from column 4 of the
    # second-to-last row of the raw file.
    if df.iloc[-2, 4] != "English":
        continue
    # Drop incomplete rows, the unused string_id column and the first two rows
    df = df.dropna().drop('string_id', axis=1).drop([0, 1])
    # Drop the last two remaining rows
    df = df.iloc[:-2, :]
    # Drop the row whose trial field is 'Univ' and the row right after it
    # (presumably the demographic block -- verify against the raw file format)
    ind = df.loc[df['trial'] == 'Univ'].index[0]
    df = df.drop([ind, ind+1], axis=0)
    # Dropping rows with wrong accuracies
    df = df.loc[(df['accuracy'] == '0') | (df['accuracy'] == '1')]
    # Converting columns type to suitable data types
    df = df.astype(convert_dict)
    # Convert RTs to seconds (vectorised; no per-element lambda needed)
    df['rt'] = df['rt'] / 1000
    # Removing Outliers; resetting the index here yields a fresh frame, so the
    # column assignments below do not act on a slice of the unfiltered data
    df = remove_outliers(df, 3, .2, 2.5).reset_index(drop=True)
    # Extracting response of participant from his/her accuracy:
    # response is 1 when string_type and accuracy agree (XNOR), otherwise 0
    df['response'] = np.logical_not(np.logical_xor(df['string_type'], df['accuracy'])).astype('int')
    # Participant number (consecutive over the files that were kept)
    df['participant'] = counter
    # Placeholder; filled with the RT quantile bin later in the notebook
    df['quantile'] = ''
    # Smallest RT of this participant after outlier removal
    df['minRT'] = df['rt'].min()
    dataframes.append(df)
    counter += 1
behavioural_df = pd.concat(dataframes)

In [6]:
# Preview the first five rows of the combined behavioural data
behavioural_df.head(n=5)

Unnamed: 0,trial,string_type,accuracy,rt,string,response,participant,quantile,minRT
0,1,0,0,0.548,pracker,1,1,,0.378
1,2,1,1,0.646,nearside,1,1,,0.378
2,3,1,1,0.511,jets,1,1,,0.378
3,4,0,1,0.815,vates,0,1,,0.378
4,5,1,1,0.68,onward,1,1,,0.378


## Participant Selection

Choosing 100 random participants and checking with t-tests whether their mean RT, accuracy and response differ from the means over all participants.

In [7]:
# Per-participant means of RT, accuracy and response (one row per participant)
all_participants = (
    behavioural_df
    .groupby('participant', as_index=False)[['rt', 'accuracy', 'response']]
    .mean()
)

# Random selection
N_SELECTED_PARTICIPANTS = 100
SELECTION_SEED = 42  # fixed seed so the same participants are drawn on every run
# Draw from the IDs that actually have data. The previous
# range(1, len(dataframes)) stopped one short, so the last participant could
# never be selected, and it could also pick an ID with no remaining trials.
candidate_ids = all_participants['participant'].tolist()
participants_id = random.Random(SELECTION_SEED).sample(candidate_ids, N_SELECTED_PARTICIPANTS)
selected_participants = all_participants[all_participants['participant'].isin(participants_id)]

We want to check whether the mean RT of our sample (the 100 selected participants) is significantly different from the population mean RT (all participants).
<br>
so we use <b>One Sample T-test</b>

In [8]:
# One-sample t-test: selected participants' mean RTs against the mean over all participants
sample_rt = selected_participants['rt'].to_numpy()
population_mean_rt = all_participants['rt'].mean()
pg.ttest(sample_rt, population_mean_rt)

Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,0.064766,99,two-sided,0.948491,"[0.78, 0.85]",0.006477,0.111,0.050471


<b>There is no significant difference between the selected participants' mean RT and all participants' mean RT</b>

We also want to check whether the mean accuracy of our sample (the 100 selected participants) is significantly different from the population mean accuracy (all participants).
<br>
so we use <b>One Sample T-test</b>

In [9]:
# One-sample t-test: selected participants' mean accuracies against the mean over all participants
sample_accuracy = selected_participants['accuracy'].to_numpy()
population_mean_accuracy = all_participants['accuracy'].mean()
pg.ttest(sample_accuracy, population_mean_accuracy)

Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,-0.260803,99,two-sided,0.794786,"[0.85, 0.87]",0.02608,0.114,0.057677


<b>There is no significant difference between selected participants mean Accuracy and all participants mean Accuracy</b>

We also want to check whether the mean response (choice) of our sample (the 100 selected participants) is significantly different from the population mean response (all participants).
<br>
so we use <b>One Sample T-test</b>

In [10]:
# One-sample t-test: selected participants' mean responses against the mean over all participants
sample_response = selected_participants['response'].to_numpy()
population_mean_response = all_participants['response'].mean()
pg.ttest(sample_response, population_mean_response)

Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,-0.600524,99,two-sided,0.54953,"[0.47, 0.49]",0.060052,0.132,0.091402


<b>There is no significant difference between selected participants mean response (choice) and all participants mean response</b>

In [11]:
# From here on `selected_participants` holds the trial-level rows of the
# selected participants (it previously held their per-participant means)
is_selected = behavioural_df['participant'].isin(participants_id)
selected_participants = behavioural_df[is_selected]

## Choosing 400 trials for 100 participants

Each participant's trials are divided into 10 RT quantile bins, and 40 trials are sampled at random from each bin, giving 400 trials per participant.

The sampled RTs are checked against the participant's full set of RTs with Q-Q plots.

In [12]:
# Sorted unique IDs of the selected participants
selected_id_values = selected_participants['participant'].to_numpy()
ids = np.unique(selected_id_values)

In [13]:
N_QUANTILES = 10           # RT bins per participant
TRIALS_PER_QUANTILE = 40   # trials drawn from each bin (10 x 40 = 400 per participant)
N_QQ_POINTS = 400          # quantile levels compared in the second Q-Q plot
TRIAL_SAMPLING_SEED = 42   # fixed seed so the same trials are drawn on every run
PROBPLOT_DIR = "Plots/qqplots/1/"
REGPLOT_DIR = "Plots/qqplots/2/"

# savefig does not create missing folders, so make sure both targets exist
os.makedirs(PROBPLOT_DIR, exist_ok=True)
os.makedirs(REGPLOT_DIR, exist_ok=True)

# One generator shared by all participants: reproducible overall, but each
# participant still gets a different draw
trial_rng = np.random.RandomState(TRIAL_SAMPLING_SEED)
quantile_levels = np.linspace(0, 1, N_QQ_POINTS)

selected_participants_and_trials = []
with warnings.catch_warnings():
    # np.quantile's `interpolation` keyword is deprecated on newer NumPy (it is
    # called `method` there) but is kept for compatibility with older
    # installations; silence only that category instead of every warning.
    warnings.simplefilter('ignore', DeprecationWarning)
    for index, participant in enumerate(ids):
        # .assign builds a new frame, so the quantile column is never written
        # into a slice of `selected_participants`
        selected_participant = (
            selected_participants[selected_participants['participant'] == participant]
            .assign(quantile=lambda trials: pd.qcut(trials['rt'], N_QUANTILES))
        )
        # Requires at least TRIALS_PER_QUANTILE trials in every bin
        selected_participant_400 = (
            selected_participant
            .groupby("quantile", observed=True)
            .sample(n=TRIALS_PER_QUANTILE, random_state=trial_rng)
        )
        plot_title = 'qq plot of participant ' + str(index+1)

        # Quantile-quantile plot (statsmodels): sampled RTs vs. all RTs
        fig, ax = plt.subplots()
        pp_x = sm.ProbPlot(selected_participant_400['rt'])
        pp_y = sm.ProbPlot(selected_participant['rt'])
        qqplot_2samples(pp_x, pp_y, xlabel="Selected RTs Quantiles",
                        ylabel="All RTs quantiles", line=None, ax=ax)
        xlim = np.linspace(*ax.get_xlim())
        ax.plot(xlim, xlim, color='orange', label="45 degree line")
        ax.legend()
        ax.set_title(plot_title)
        fig.savefig(PROBPLOT_DIR + plot_title + '.pdf')
        plt.close(fig)

        # Second check: matched empirical quantiles with a fitted regression line.
        # np.quantile does not depend on input order, so no explicit sort is needed.
        fig, ax = plt.subplots()
        x = np.quantile(selected_participant['rt'].to_numpy(), quantile_levels,
                        interpolation='nearest')
        y = np.quantile(selected_participant_400['rt'].to_numpy(), quantile_levels,
                        interpolation='nearest')
        sns.regplot(x=x, y=y, ax=ax,
                    scatter_kws={'color': 'Orange', 'alpha': 0.3},
                    line_kws={'color': 'steelblue', 'label': 'regression line',
                              'linewidth': 3})
        xlim = np.linspace(*ax.get_xlim())
        ax.plot(xlim, xlim, color='red', alpha=0.2,
                label="45 degree line", linewidth=3)
        ax.legend()
        ax.set_title(plot_title)
        fig.savefig(REGPLOT_DIR + plot_title + '.pdf')
        plt.close(fig)

        selected_participants_and_trials.append(selected_participant_400)

In [14]:
# Stack the per-participant trial samples into one trial-level dataframe
final_df = pd.concat(selected_participants_and_trials, axis=0)

In [15]:
# Number of selected trials per string type
trials_by_string_type = final_df.groupby('string_type')
trials_by_string_type.count()

Unnamed: 0_level_0,trial,accuracy,rt,string,response,participant,quantile,minRT
string_type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,19929,19929,19929,19929,19929,19929,19929,19929
1,20071,20071,20071,20071,20071,20071,20071,20071


In [16]:
# Remove the columns that are no longer needed after trial sampling
final_df = final_df.drop(columns=['trial', 'string_type', 'quantile'])
# Adding new participant ID column for Stan: consecutive IDs 1..len(ids),
# assigned in the order of `ids`
consecutive_ids = list(range(1, len(ids)+1))
final_df['participant_id'] = final_df['participant'].replace(ids, consecutive_ids)
final_df = final_df.reset_index(drop=True)

In [17]:
# Display the final trial-level table of the selected participants
final_df

Unnamed: 0,accuracy,rt,string,response,participant,minRT,participant_id
0,0,0.634,sypnotized,1,2,0.477,1
1,0,0.554,daybreek,1,2,0.477,1
2,1,0.523,stir,1,2,0.477,1
3,1,0.583,Holkes,0,2,0.477,1
4,1,0.645,vurtain,0,2,0.477,1
...,...,...,...,...,...,...,...
39995,1,1.106,tasmask,0,805,0.435,100
39996,0,1.081,torkscrew,1,805,0.435,100
39997,1,0.972,titmeuse,0,805,0.435,100
39998,1,0.992,coercion,1,805,0.435,100


In [18]:
# Write the selected trials to disk without header or index; the column order
# in the file is the column order of final_df.
output_dir = root + "behavioral_data/selected_data/"
# to_csv does not create missing folders, so make sure the target exists
os.makedirs(output_dir, exist_ok=True)
final_df.to_csv(output_dir + "LDT_data.csv", header=False, index=False)