## Import packages

In [1]:
import sys
import os
sys.path.append(os.path.dirname(os.path.abspath(os.getcwd())))

import numpy as np
import pandas as pd
import random
import warnings
import matplotlib.pyplot as plt
import seaborn as sns
import pingouin as pg
import statsmodels.api as sm

from statsmodels.graphics.gofplots import qqplot_2samples
from utils.utils import get_rt_quantiles, remove_outliers

In [2]:
# Global plot styling for the whole notebook.
# The default figure size and resolution are set through rcParams so that every
# figure created later (e.g. by plt.subplots()) picks them up. Calling
# plt.figure(figsize=..., dpi=...) here would only create a single empty figure
# that no later plot uses.
sns.set_style("whitegrid")
plt.rcParams.update({"figure.figsize": (8, 6), "figure.dpi": 100})

<Figure size 800x600 with 0 Axes>

In [3]:
# Locations used throughout the notebook, expressed relative to the notebook's
# working directory.
root = "../"
# Folder containing one raw behavioural file per participant.
behavioural_data_root = f"{root}Datasets/behavioral_data/raw_data/"
# Destination of the final selected-trials CSV.
data_output_path = f"{root}Datasets/behavioral_data/selected_data/LDT_data.csv"
# Folder receiving one QQ-plot PDF per selected participant.
# NOTE(review): unlike the two paths above this one is not prefixed with `root`,
# so it resolves against the current working directory — confirm this is intended.
qqplots_dir = "Results/Plots/qqplots/"

### Reading behavioural raw data

Reading and modifying each behavioural data file and combining all of them into a single behavioural dataframe.

Non-English speakers are omitted from the data.

The participant with file number 1988 is removed from the data due to a problem reading the file with pandas.

The file of the participant with file number 436 contained two sets of demographic data; the set with the incomplete demographic data has been removed from the file.

In [4]:
# Build one cleaned trial-level dataframe per English-speaking participant and
# stack them into `behavioural_df`.

# Column names assigned to the raw files (they are read without a header row).
raw_columns = ["trial", "string_id", "string_type", "accuracy", "rt", "string"]
# Target dtypes, applied once only valid trial rows remain.
convert_dict = {"string_type": "int16",
                "accuracy": "int16",
                "rt": float
               }

dataframes = []
counter = 1
# os.listdir() returns entries in arbitrary order; sorting makes the
# participant numbering identical across runs and machines.
for filename in sorted(os.listdir(behavioural_data_root)):
    df = pd.read_csv(os.path.join(behavioural_data_root, filename), names=raw_columns)
    # Skip non-English speakers: the check reads the cell in the second-to-last
    # row of the fifth column (presumably the participant's language in the
    # trailing demographic lines — verify against the raw file layout).
    if df.iloc[-2, 4] != "English":
        continue
    # Drop rows with any missing value, the unused string_id column and the
    # rows labelled 0 and 1 (presumably non-trial header lines — TODO confirm).
    df = df.dropna().drop("string_id", axis=1).drop([0, 1])
    # Drop the last two remaining rows (presumably trailing non-trial lines).
    df = df.iloc[:-2, :]
    # Drop the first row whose `trial` cell is "Univ" together with the row
    # labelled directly after it (presumably a mid-file demographic block).
    ind = df.loc[df["trial"] == "Univ"].index[0]
    df = df.drop([ind, ind+1], axis=0)
    # Dropping rows with wrong accuracies (values are still strings here)
    df = df.loc[(df["accuracy"] == "0") | (df["accuracy"] == "1")]
    # Converting columns type to suitable data types
    df = df.astype(convert_dict)
    # Convert RTs to seconds (vectorised division instead of a per-row lambda)
    df["rt"] = df["rt"] / 1000
    # Removing Outliers
    # NOTE(review): the meaning of the arguments (3, .2, 2.5) is defined by
    # utils.remove_outliers and is not visible here — confirm against the helper.
    df = remove_outliers(df, 3, .2, 2.5)
    # Extracting response of participant from his/her accuracy:
    # NOT(string_type XOR accuracy), i.e. the response equals string_type on
    # correct trials and its opposite on incorrect trials.
    df["response"] = np.logical_not(np.logical_xor(df["string_type"], df["accuracy"])).astype("int")
    df = df.reset_index(drop=True)
    # Participant number (consecutive over the kept files, starting at 1)
    df["participant"] = counter
    # Empty placeholder column.
    df["quantile"] = ""
    # Fastest remaining RT of this participant, repeated on every row.
    df["minRT"] = df["rt"].min()
    dataframes.append(df)
    counter += 1
behavioural_df = pd.concat(dataframes)

In [5]:
# Preview the first five rows of the combined trial-level dataframe.
behavioural_df.head(n=5)

Unnamed: 0,trial,string_type,accuracy,rt,string,response,participant,quantile,minRT
0,1,0,0,0.548,pracker,1,1,,0.378
1,2,1,1,0.646,nearside,1,1,,0.378
2,3,1,1,0.511,jets,1,1,,0.378
3,4,0,1,0.815,vates,0,1,,0.378
4,5,1,1,0.68,onward,1,1,,0.378


## Participant Selection

Choosing 100 random participants and checking their mean RT and accuracy against the overall means with a t-test.

In [14]:
# Per-participant means of reaction time, accuracy and response (one row per
# participant, columns: participant, rt, accuracy, response).
all_participants = (
    behavioural_df
    .groupby("participant")[["rt", "accuracy", "response"]]
    .mean()
    .reset_index()
)

# Random selection
N_SELECTED_PARTICIPANTS = 100
SELECTION_SEED = 42
# Sample from the participant numbers that actually occur in the data. The
# previous pool, range(1, len(dataframes)), stopped one short and could never
# pick the last participant.
participant_pool = all_participants["participant"].tolist()
# A dedicated seeded generator makes the selection reproducible without
# touching the global `random` state used elsewhere.
participants_id = random.Random(SELECTION_SEED).sample(participant_pool, N_SELECTED_PARTICIPANTS)
selected_participants = all_participants[all_participants["participant"].isin(participants_id)]

We want to check whether the mean RT of our sample (100 selected participants) is significantly different from the population (all participants) mean RT.
<br>
so we use <b>One Sample T-test</b>

In [15]:
# One-sample t-test: per-participant mean RTs of the selected sample against
# the mean of the per-participant mean RTs over all participants.
pg.ttest(
    x=selected_participants["rt"].to_numpy(),
    y=all_participants["rt"].mean(),
)

Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,0.021839,99,two-sided,0.98262,"[0.78, 0.84]",0.002184,0.111,0.050054


<b>There is no significant difference between the selected participants' mean RT and the mean RT of all participants</b>

We also want to check whether the mean accuracy of our sample (100 selected participants) is significantly different from the population (all participants) mean accuracy.
<br>
so we use <b>One Sample T-test</b>

In [16]:
# One-sample t-test: per-participant mean accuracies of the selected sample
# against the mean of the per-participant mean accuracies over all participants.
pg.ttest(
    x=selected_participants["accuracy"].to_numpy(),
    y=all_participants["accuracy"].mean(),
)

Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,-0.948686,99,two-sided,0.345091,"[0.84, 0.87]",0.094869,0.171,0.155624


<b>There is no significant difference between the selected participants' mean accuracy and the mean accuracy of all participants</b>

We also want to check whether the mean response (choice) of our sample (100 selected participants) is significantly different from the population (all participants) mean response.
<br>
so we use <b>One Sample T-test</b>

In [17]:
# One-sample t-test: per-participant mean responses of the selected sample
# against the mean of the per-participant mean responses over all participants.
pg.ttest(
    x=selected_participants["response"].to_numpy(),
    y=all_participants["response"].mean(),
)

Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,0.082735,99,two-sided,0.934229,"[0.48, 0.49]",0.008273,0.111,0.050769


<b>There is no significant difference between the selected participants' mean response (choice) and the mean response of all participants</b>

In [18]:
# Rebind `selected_participants` to the trial-level rows of the sampled
# participants (until here the name held their per-participant means).
selected_participants = behavioural_df.loc[
    behavioural_df["participant"].isin(participants_id)
]

## Choosing 400 trials for 100 participants

Dividing the data into 10 quantiles and sampling 40 random trials from each quantile for each participant.

The selection is checked with Q-Q plots.

In [19]:
# Sorted array of the distinct participant numbers present in the selection.
ids = np.sort(selected_participants["participant"].unique())

In [20]:
# For every selected participant: pick trials with get_rt_quantiles, save a
# QQ plot comparing the picked RTs with all of the participant's RTs, and
# collect the picked trials.

# savefig() does not create missing folders, so make sure the target exists.
os.makedirs(qqplots_dir, exist_ok=True)

probs = np.linspace(0, 1, 400)
selected_participants_and_trials = []
# record=True captures warnings raised inside the block instead of showing
# them; the captured list is not inspected.
with warnings.catch_warnings(record=True):
    # `participant` (rather than `id`) avoids shadowing the builtin id().
    for index, participant in enumerate(ids):
        selected_participant = selected_participants[selected_participants["participant"] == participant]
        # NOTE(review): presumably returns the trials at the 400 requested RT
        # quantiles — confirm against utils.get_rt_quantiles.
        selected_participant_400 = get_rt_quantiles(selected_participant, probs)

        # Quantile-quantile plot
        fig, ax = plt.subplots()
        pp_x = sm.ProbPlot(selected_participant_400["rt"])
        pp_y = sm.ProbPlot(selected_participant["rt"])
        qqplot_2samples(pp_x, pp_y, xlabel="Selected RTs Quantiles",
                        ylabel="All RTs quantiles", line=None, ax=ax)
        # Reference diagonal spanning the current x-range.
        xlim = np.linspace(*ax.get_xlim())
        ax.plot(xlim, xlim, color="orange", label="45 degree line")
        ax.legend()
        # Plots are numbered by position in `ids` (1-based), not by the
        # original participant number.
        plot_title = "qq plot of participant " + str(index + 1)
        ax.set_title(plot_title)
        fig.savefig(qqplots_dir + plot_title + ".pdf")
        # Close this figure explicitly so figures do not pile up in memory.
        plt.close(fig)

        selected_participants_and_trials.append(selected_participant_400)

In [21]:
# Stack the per-participant selections row-wise into one dataframe.
final_df = pd.concat(objs=selected_participants_and_trials, axis=0)

In [22]:
# Number of selected trials per string_type value.
final_df.groupby(by="string_type").count()

Unnamed: 0_level_0,trial,accuracy,rt,string,response,participant,quantile,minRT
string_type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,19431,19431,19431,19431,19431,19431,19431,19431
1,20569,20569,20569,20569,20569,20569,20569,20569


In [23]:
# Display the selected trials of all participants.
final_df

Unnamed: 0,trial,string_type,accuracy,rt,string,response,participant,quantile,minRT
2524,2633,0,1,0.378,bodule,0,1,,0.378
3011,3126,1,1,0.415,remember,1,1,,0.378
2482,2591,1,1,0.425,mellow,1,1,,0.378
2501,2610,0,1,0.430,gluff,0,1,,0.378
2328,2437,0,1,0.435,imversion,0,1,,0.378
...,...,...,...,...,...,...,...,...,...
2762,2850,1,1,1.374,galvanism,1,796,,0.391
156,164,0,1,1.389,Pivlov,0,796,,0.391
1109,1157,0,1,1.419,snop,0,796,,0.391
828,870,0,1,1.434,discredituble,0,796,,0.391


In [24]:
# Drop the columns that are not exported. errors="ignore" keeps this cell
# re-runnable: without it a second execution raises KeyError because the
# columns are already gone.
final_df = final_df.drop(["trial", "string_type", "quantile"], axis=1, errors="ignore")
# Adding new participant ID column for Stan: the sorted original participant
# numbers in `ids` are mapped onto consecutive IDs 1..len(ids). An explicit
# dict lookup does this in one pass and states the one-to-one mapping directly,
# instead of a list-to-list replace().
participant_id_map = {original: new for new, original in enumerate(ids, start=1)}
final_df["participant_id"] = final_df["participant"].map(participant_id_map)
final_df = final_df.reset_index(drop=True)

In [29]:
# Preview the first five rows of the export-ready dataframe.
final_df.head(n=5)

Unnamed: 0,accuracy,rt,string,response,participant,minRT,participant_id
0,1,0.378,bodule,0,1,0.378,1
1,1,0.415,remember,1,1,0.378,1
2,1,0.425,mellow,1,1,0.378,1
3,1,0.43,gluff,0,1,0.378,1
4,1,0.435,imversion,0,1,0.378,1


### Saving Data

In [30]:
# Write the selected trials to disk without a header row and without the
# index column.
final_df.to_csv(data_output_path, index=False, header=False)