In [15]:
import os
from collections import defaultdict
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
import numpy as np
from scipy.special import expit
from bisect import bisect_left

In [273]:
PATHS_TO_CSVS = [
     '/Users/jchjiangcheng/Desktop/RAW/pilot_003_s_vowel_discrimination_task_2023_Jun_06_1151.csv'
     '/Users/jchjiangcheng/Desktop/RAW/pilot_004_s_vowel_discrimination_task_2023_Jun_06_1151.csv',
     '/Users/jchjiangcheng/Desktop/RAW/pilot_004_vowel_discrimination_task_2023_Jun_06_1057.csv',
     '/Users/jchjiangcheng/Desktop/RAW/pilot_007_s_vowel_discrimination_task_2023_Jun_06_1650.csv',
     '/Users/jchjiangcheng/Desktop/RAW/pilot_007_vowel_discrimination_task_2023_Jun_06_1604.csv',
     '/Users/jchjiangcheng/Desktop/RAW/pilot_008_s_vowel_discrimination_task_2023_Jun_07_1006.csv',
     '/Users/jchjiangcheng/Desktop/RAW/pilot_008_vowel_discrimination_task_2023_Jun_07_0917.csv',
     '/Users/jchjiangcheng/Desktop/RAW/pilot_009_s_vowel_discrimination_task_2023_Jun_07_1125.csv',
     '/Users/jchjiangcheng/Desktop/RAW/pilot_009_vowel_discrimination_task_2023_Jun_07_1051.csv',
     '/Users/jchjiangcheng/Desktop/RAW/pilot_011_s_vowel_discrimination_task_2023_Jun_07_1506.csv',
     '/Users/jchjiangcheng/Desktop/RAW/pilot_011_vowel_discrimination_task_2023_Jun_07_1428.csv',
     '/Users/jchjiangcheng/Desktop/RAW/pilot_012_s_vowel_discrimination_task_2023_Jun_07_1635.csv',
     '/Users/jchjiangcheng/Desktop/RAW/pilot_012_vowel_discrimination_task_2023_Jun_07_1601.csv',
     '/Users/jchjiangcheng/Desktop/RAW/pilot_014_s_vowel_discrimination_task_2023_Jun_08_1118.csv',
     '/Users/jchjiangcheng/Desktop/RAW/pilot_014_vowel_discrimination_task_2023_Jun_08_1046.csv'
]

In [275]:
def formant_extractor(vowel_vec):
    '''
    Function to retrieve the f1 values from the stimuli string
    
    INPUTS:
    vowel_vec:      A Pandas Series with the stimuli string containing the F1 value of the oa vowel

    OUPUT:
    f1_series:      A Panda Series with the F1 values as integers
    '''
    # make sure there are no NaNs
    vowel_vec.dropna(inplace=True)
    # init list of F1 values
    f1_values = []
    # loop through input series and extract F1 values
    for i in range(len(vowel_vec)):
        vowel_string = vowel_vec.iloc[i]
        vowel_string_split = vowel_string.split('_')
        f1_value = int(vowel_string_split[-1][:-4])
        f1_values.append(f1_value)
    f1_series = pd.Series(f1_values)

    return f1_series

def session_data_extractor(ppt_df, session_n):
    '''
    Function to retrieve the data from a given session from a participant's data
    
    INPUTS:
    ppt_df:         A Pandas DataFrame from the raw .csv data from PsychoPy
    session_n:      (Integer) The session number to extract data from

    OUPUT:
    session_n_df:   A Pandas DataFrame with the F1 values, the participant's responses and response times from session n
    '''
    # get series for F1 info, response key and response time
    session_n_str = str(session_n)
    session_n_vowel_vec = ppt_df['session_'+session_n_str+'_vowel']
    session_n_resp_vec = ppt_df['session_'+session_n_str+'_key_resp.keys']
    session_n_resp_time_vec = ppt_df['session_'+session_n_str+'_key_resp.rt']

    # drop NaNs and reset index
    session_n_vowel_vec = session_n_vowel_vec.iloc[session_n_vowel_vec.first_valid_index():session_n_vowel_vec.first_valid_index()+300]
    session_n_resp_vec = session_n_resp_vec.iloc[session_n_resp_vec.first_valid_index():session_n_resp_vec.first_valid_index()+300]
    session_n_resp_time_vec = session_n_resp_time_vec.iloc[session_n_resp_time_vec.first_valid_index():session_n_resp_time_vec.first_valid_index()+300]
    session_n_vowel_vec.reset_index(drop=True, inplace=True); session_n_resp_vec.reset_index(drop=True, inplace=True);session_n_resp_time_vec.reset_index(drop=True, inplace=True)

    # get new series with F1 values as integers
    session_n_f1_vec = formant_extractor(session_n_vowel_vec)

    # get new series with response keys as codes
    # f = 1; j = 0
    session_n_resp_int_vec = session_n_resp_vec.replace(to_replace='f', value=1)
    session_n_resp_int_vec.replace(to_replace='j', value=0, inplace=True)

    session_n_df = pd.concat([session_n_f1_vec, session_n_resp_vec, session_n_resp_int_vec, session_n_resp_time_vec], axis=1)
    session_n_df.columns = ['session_'+session_n_str+'_f1_value',
                            'session_'+session_n_str+'_response_key',
                            'session_'+session_n_str+'_response_code',
                            'session_'+session_n_str+'_response_time']
    return session_n_df

def all_sessions_data_extractor(ppt_df, n_sessions):
    for i in np.arange(1, n_sessions+1):
        if i == 1:
            session_1_df = session_data_extractor(ppt_df, i)
            ppt_processed_df = session_1_df
        else:
            session_i_df = session_data_extractor(ppt_df, i)
            ppt_processed_df = pd.concat([ppt_processed_df, session_i_df], axis=1)
    return ppt_processed_df

def raw_to_processed_df(ppt_df, n_sessions):
    # get ppt meta data
    ppt_id = ppt_df.participant.iloc[0]
    # get session data
    session_df = all_sessions_data_extractor(ppt_df, n_sessions)
    # get number of observations
    n_observations = session_df.shape[0]
    # concat meta-data and session data
    processed_df = pd.concat([
        pd.Series(np.full(n_observations, ppt_id)),
        session_df     
    ], axis=1)
    processed_df.rename({0: 'ppt_id', 1: 'age',
                        2: 'gender', 3: 'language'}, axis=1, inplace=True)
    return processed_df

In [276]:
processed_df_i = raw_to_processed_df(ex_ppt_df, 3)
processed_df_i

Unnamed: 0,ppt_id,session_1_f1_value,session_1_response_key,session_1_response_code,session_1_response_time,session_2_f1_value,session_2_response_key,session_2_response_code,session_2_response_time,session_3_f1_value,session_3_response_key,session_3_response_code,session_3_response_time
0,,622,j,0,0.489084,596,f,1,0.407349,717,f,1,-2.009659
1,,443,j,0,0.553610,495,j,0,0.554527,596,j,0,-0.677834
2,,627,f,1,0.358859,662,f,1,0.584096,637,j,0,-0.361374
3,,640,f,1,0.340282,629,f,1,0.372182,522,f,1,0.107265
4,,471,j,0,0.434034,690,f,1,0.553764,553,f,1,0.339092
...,...,...,...,...,...,...,...,...,...,...,...,...,...
295,,624,f,1,0.350034,639,j,0,0.312153,441,f,1,-0.652432
296,,544,f,1,0.164075,580,j,0,0.335101,445,j,0,-0.679995
297,,699,j,0,0.370329,602,j,0,0.451840,665,f,1,-0.434239
298,,574,f,1,0.486005,608,f,1,0.822743,701,j,0,-0.079045


In [277]:
#create a new dataframe with combined columns
df = processed_df_i
RT, code, F1, Session, ID = [], [], [], [], [] 
for sess in range(1, 4):
    num = len(df[f"session_{sess}_f1_value"])
    RT.extend(df[f"session_{sess}_response_time"])
    code.extend(df[f"session_{sess}_response_code"])
    F1.extend(df[f"session_{sess}_f1_value"])
    Session.extend([sess] * num)

new_df = {}
new_df["RT"] = RT
new_df["code"] = code
new_df["F1"] = F1
new_df["Session"] = Session    
new_df = pd.DataFrame(data=new_df)
df = new_df
df

Unnamed: 0,RT,code,F1,Session
0,0.489084,0,622,1
1,0.553610,0,443,1
2,0.358859,1,627,1
3,0.340282,1,640,1
4,0.434034,0,471,1
...,...,...,...,...
895,-0.652432,1,441,3
896,-0.679995,0,445,3
897,-0.434239,1,665,3
898,-0.079045,0,701,3


In [278]:
#clean the data
df = df[df["RT"] > 0]
df = df[df["code"] != 'None']
mean, std = df["F1"].mean(), df["F1"].std()
df = df[(df["F1"] - mean) / std <= 2]
df = df[(df["F1"] - mean) / std >= -2]
df.to_csv('/Users/jchjiangcheng/Desktop/processed/Cleaned data/pilot with stimulation/003s.csv', index=False)
df

Unnamed: 0,RT,code,F1,Session
0,0.489084,0,622,1
1,0.553610,0,443,1
2,0.358859,1,627,1
3,0.340282,1,640,1
4,0.434034,0,471,1
...,...,...,...,...
862,0.043096,1,649,3
863,0.109587,1,682,3
864,0.021226,1,635,3
865,0.031503,1,636,3


In [279]:
def get_stats(df):
    xs, ys = list(df["F1"]), list(df["code"])
    merged = sorted([(xs[i], ys[i]) for i in range(len(xs))], key=lambda t: t[0])
    xs, ys = np.array([t[0] for t in merged]), np.array([t[1] for t in merged])
    xs = xs.reshape(-1, 1)
    clf = LogisticRegression()
    clf.fit(xs, ys)
    loss = expit(xs * clf.coef_ + clf.intercept_).ravel()
    boundary = xs[bisect_left(loss, 0.5)-1][0] 
    return xs, ys, loss, clf, boundary

In [280]:
#overall perceptual boundary (across sessions)
xs, ys, loss, clf, boundary = get_stats(df)
boundary

559

In [281]:
df1, df2 = pd.read_csv('/Users/jchjiangcheng/Desktop/processed/Cleaned data/pilot with stimulation/12s_new.csv'), pd.read_csv('/Users/jchjiangcheng/Desktop/processed/Cleaned data/pilot with stimulation/12_new.csv')

In [271]:
all(df1 == df2)

True