# Pre-processing

Here we create the datafile for the logistic regression model with one history predictor. 

(c) Anna-Lena Eckert

In [1]:
import os, sys, glob
import pandas as pd
import operator
import numpy as np
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [5]:
# find and read datafiles. 
def find_datafiles(path): 
    os.chdir(path)
    files = glob.glob('*.csv')
    return files

files = find_datafiles('C:\\Users\\annae\\Dropbox\\PhD\\RDK\\all_data\\all')

print('Subjects N: ', len(files))

dfs = []
for file in files:
    df = pd.read_csv(file)
    sbj_name = file.split('.')[0]
    df['sbj_id'] = sbj_name
    
    # one case where csv is written with ; instad of ,...
    if df.shape[1] == 1: 
        df = pd.read_csv(file, delimiter=';')
        df['sbj_id'] = sbj_name
    
    # make sure incomplete datafiles are not added into final dataframe. 
    if df.shape == (768, 16): 
        dfs.append(df)
    

# Concatenate all data into one DataFrame
all_df = pd.concat(dfs, ignore_index=True)

# make sure no broken datafiles are in there. 
assert all_df.shape[0]/len(dfs) == 96*8
print('Test passed, all datafiles intact. ')

Subjects N:  51
Test passed, all datafiles intact. 


In [3]:
# now we need all sorts of transformations to our datafile in order to get started with the regression models. 
# motion direction: 0 (right) and 1 (left) instead of 0 and 180. 
df = all_df.copy()
df['motionDirection'].replace({180: 1}, inplace=True)
df['response'].replace({180: 1}, inplace=True)

# coherence: 1-6 instead of float numbers, increasing coh = stronger evidence
# We want: 1 (easy), 6 (hard)
# df['coherence'].replace({0.0005: 6, 0.0162: 5, 0.0315: 4, 0.0792: 3, 0.1991: 2, 0.5: 1}, inplace=True)
# df = df.rename(columns={'coherence': 'diff'})

# block coding was reversed compared to auditory task, now 0= rep, 1=neut
df['block_type'].replace({0: 1, 1:0}, inplace=True)

# rename into target column
df = df.rename(columns={'motionDirection':'target'})

# create cue column, where 0= right, 1=left
df['cue'] = np.nan

for index, row in df.iterrows(): 
    if row['cueValid'] == 1: 
        cue = row['target']
        
    elif row['cueValid'] == 0: 
        if row['target'] == 0: 
            cue = 1 # left
        elif row['target'] == 1: 
            cue = 0 #right
            
    df.at[index, 'cue'] = cue

In [4]:
# history column - some functions
def make_nan_col(df, name): 
    df[name] = np.nan

# collect previous k response and make them NaN at beginning of blocks
def collect_history(df, k, columnName, target_col):
     # iterate over df
    for index, row in df.iterrows():
        
        # find previous response and transform to int type
        prev = df[df.index==index-k][columnName].values.astype(int)
        
        # missed responses? and the beginning of blocks should also be at zero. 
        if prev.shape == (0,):
            prev = np.nan
            
        elif prev.shape == (): 
            prev = np.nan
        
        # if block counter incremented (i.e. new block started), it should also be nan. 
        elif df.at[index, 'Block'] != df.at[index-1, 'Block']: 
            prev = np.nan 
        
        # otherwise use previous response...
        else: 
            prev = prev[0]
            
          
        # ...and write it into target column. 
        #target_col = str('resp_%i'%k)
        df.at[index, target_col] = prev

def make_nans_history(df, k):
    
    target_col = 'resp_%i'%k
    resps = [0.0, 1.0]
    
    indexes = []
    
    for index, row in df.iterrows():
    
        if df.at[index, target_col] not in resps:
            indexes.append(index)
            
    for i in indexes: 
    
        if i > 7:
            
            for j in range(1,k): 

                df.at[i+j, target_col] = np.nan
              
    return indexes

def make_nans_stim_hists(df, k):
    
    target_col = 'stim_%i'%k
    resps = [0.0, 1.0]
    
    indexes = []
    
    for index, row in df.iterrows():
    
        if df.at[index, target_col] not in resps:
            indexes.append(index)
            
    for i in indexes: 
    
        if i > 7:
            
            for j in range(1,k): 

                df.at[i+j, target_col] = np.nan
              
    return indexes

In [5]:
# make choice history column with previous t-1 response
# change here to create datafiles for supplementary analyses
# hists = ['resp_%i'%i for i in range(1,8)]

hists = ['resp_1']

for name in hists: 
    make_nan_col(df, name)
    
df = df.reset_index(drop=True)

for k in range(1,2): 
    collect_history(df, k, 'response', 'resp_%i'%k)
    
# make stimulus history column with previous stimulus. 
# to check supplementary analyses, uncomment the list comprehension statement 
# stim_hists = ['stim_%i' % i for i in range(1,8)]

stim_hists = ['stim_1']

for name in stim_hists[0:1]: 
    make_nan_col(df, name)
    
df = df.reset_index(drop=True)

# collect stimulus history ( this part may take long )
for k in range(1,2): 
    collect_history(df, k, 'target', 'stim_%i'%k)
    
# make the first i responses at block beginning = nan
for k in range(1,2):
    make_nans_history(df,k)
    
for k in range(1,2): 
    make_nans_stim_hists(df,k)

In [6]:
# read PPS data. 
# read CAPS and PDI scores. 
os.chdir('C:\\Users\\annae\\Dropbox\\PhD\\RDK\\all_data\\pps\\all')
onlyfiles = [f for f in os.listdir(os.getcwd()) if os.path.isfile(os.path.join(os.getcwd(),f))]
caps_data = [f for f in onlyfiles if f[6:] == '_caps.csv']
pdi_data = [f for f in onlyfiles if f[6:] == '_pdi.csv']

# exclude the dude that only filled out caps
# caps_data.remove('rga09b_caps.csv')

def read_caps(file, sbj_caps): 
    sbj_name = file.split('_')[0]
    
    columns = ['agree', 'distress', 'distract', 'freq']
    caps = pd.read_csv(file,header=None,index_col=0)
    caps.columns = columns
    
    caps_sum = caps.loc['Global'].agree
    caps_distress = caps.loc['Global'].distress
    caps_distract = caps.loc['Global'].distract
    caps_freq = caps.loc['Global'].freq
    
    sbj_caps[sbj_name] = caps_sum
    
    return

sbj_caps = dict()
for file in caps_data:    
    read_caps(file, sbj_caps)
np.mean(list(sbj_caps.values()))

5.085106382978723

In [7]:
def read_pdi(file, sbj_caps): 
    sbj_name = file.split('_')[0]
    
    columns = ['agree', 'distress', 'distract', 'freq']
    pdi = pd.read_csv(file,header=None,index_col=0)
    pdi.columns = columns
    
    pdi_sum = pdi.loc['Global'].agree
    pdi_distress = pdi.loc['Global'].distress
    pdi_distract = pdi.loc['Global'].distract
    pdi_freq = pdi.loc['Global'].freq
    
    sbj_caps[sbj_name] = pdi_sum
    
    return

sbj_pdi = dict()
for file in pdi_data:    
    read_pdi(file, sbj_pdi)
np.mean(list(sbj_pdi.values()))

7.326086956521739

In [8]:
scz_list = [sbj_caps, sbj_pdi] 
scz = pd.DataFrame.from_dict(scz_list)
scz.index=['caps','pdi']
scz = scz.T

In [9]:
# delete from df all of those who did not complete scz instruments
scz = scz.dropna() # 1 person has a NaN value for CAPS - he was fed up and didnt want to fill it. 
sbj_exp = list(df['sbj_id'].unique())
sbj_scz = list(scz.index)
print('Experimental data N: ',len(sbj_exp), '| Survey data N: ',len(sbj_scz))

no_scz = list(set(sbj_exp).difference(set(sbj_scz)))
print('No survey data for subjects: ', no_scz)

Experimental data N:  46 | Survey data N:  46
No survey data for subjects:  ['tgw10b', 'yhy01m', 'yni11l']


In [10]:
for subject in no_scz: 
    df.drop(df[df.sbj_id == subject].index, inplace=True)
    
assert df.shape[0]/ len(list(df['sbj_id'].unique())) == 768

In [11]:
df['caps'] = np.nan
df['pdi'] = np.nan
subjects = list(df['sbj_id'].unique())
scz = scz.T

for index, row in df.iterrows(): 
    sbj_name = row['sbj_id']
    
    if sbj_name in subjects: 
        caps_score = scz[scz.index=='caps'][sbj_name][0]
        pdi_score = scz[scz.index=='pdi'][sbj_name][0]
        
        df.at[index, 'caps'] = int(caps_score)
        df.at[index, 'pdi'] = int(pdi_score)

In [12]:
# add PPS score, sum of z-trans pdi and caps
pps_measures = ['pdi', 'caps']

for measure in pps_measures: 
    col_zscore = measure + '_zscore'
    df[col_zscore] = (df[measure] - df[measure].mean())/df[measure].std(ddof=0)

df['PPS'] = df['pdi_zscore'] + df['caps_zscore']

# z-standardise columns
cols = list(df.columns)
[cols.remove(item) for item in ['sbj_id', 'Trial', 'Block', 'onset_tone', 'onset_rdk', 'end_rdk', 'onset_responseWin', 'key_press', 'pdi_zscore', 'caps_zscore']]

for col in cols: 
    col_z = col + '_z'
    df[col_z] = (df[col]- df[col].mean())/df[col].std(ddof=0)
    
df = df.dropna()

In [13]:
# exclude based on performance criteria from our pre-reg!

subjects = list(df['sbj_id'].unique())
underperformers = []

performance = {}

print('Underperformers: ')

for sbj in subjects: 
    
    df_sbj = df[df['sbj_id']==sbj]
    
    corr = df_sbj.correct.replace(99, 0).sum()
    
    perf = corr / df_sbj.shape[0]
    if perf > 0.9: 
        print(sbj, perf)
    elif perf < 0.6: 
        print(sbj, perf)
        underperformers.append(sbj)
        
    performance[sbj] = perf


for sbj in underperformers: 
    df.drop(df[df.sbj_id == sbj].index, inplace=True)
    
print('Total included: %i'%len(list(df.sbj_id.unique())))

Underperformers: 
ANI05H 0.4644736842105263
HUT15A 0.5631578947368421
Total included: 41


In [14]:
performance.pop('ANI05H',None)
performance.pop('HUT15A',None)

performance[min(performance.keys())]

0.6552631578947369

In [15]:
# save final csv file into correct repo
os.chdir('C:\\Users\\annae\\Desktop\\ChoiceHistory_Psych\\Exp2_visual\\pre-processing')
df.to_csv('model1_visual_final.csv')

df_neut = df[df['block_type']==1]
df_neut.to_csv('model1_visual_neutOnly.csv')

df_rep = df[df['block_type']==0]
df_rep.to_csv('model1_visual_repOnly.csv')

In [3]:
os.chdir('C:\\Users\\annae\\Desktop\\ChoiceHistory_Psych\\Data\\Exp2_visual\\')
df_vis = pd.read_csv('exp2_prevMotor_prevDiff.csv')

In [7]:
# create column for old design participants to check whether influence

subjects = list(df['sbj_id'].unique())

# if capitalized, the subject went through new design (pseudo-randomized)
old_sbj = []
for sbj in subjects: 
    if sbj[0].isupper() == False: 
        old_sbj.append(sbj)
     
df_vis['old'] = 0
for index, row in df_vis.iterrows(): 
    if row['sbj_id'] in old_sbj: 
        df_vis.at[index, 'old'] = 1
        
df_vis['old_z'] = (df_vis['old'] - df_vis['old'].mean())/df_vis['old'].std(ddof=0)
        
df_vis.to_csv('model1_visual_inclOldDesignCol.csv')

In [8]:
df_vis.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,Trial,Block,block_type,coherence,target,response,correct,cueValid,response_key,onset_rdk,end_rdk,onset_tone,onset_responseWin,rt,key_press,sbj_id,cue,resp_1,stim_1,caps,pdi,pdi_zscore,caps_zscore,PPS,block_type_z,coherence_z,target_z,response_z,correct_z,cueValid_z,response_key_z,rt_z,cue_z,resp_1_z,stim_1_z,caps_z,pdi_z,PPS_z,motor-1,coh-1,prev_mot_z,prev_coh_z,old,old_z
0,0,1,2,1,0,0.0315,0,0,1,1,0,1629810000.0,1629810000.0,1629810000.0,1629810000.0,2.02419,1629810000.0,ANE14H,0.0,0.0,0.0,2.0,0.0,-1.074209,-0.586776,-1.660985,-1.0,-0.607858,-1.0,-1.02155,0.006189,0.57735,-0.150727,-0.069685,-1.001455,-1.021717,-0.999449,-0.586776,-1.074209,-0.869391,0.0,0.0,-0.153026,-0.788613,0,-0.158304
1,1,2,3,1,0,0.0005,1,0,0,0,0,1629810000.0,1629811000.0,1629810000.0,1629811000.0,1.98846,1629811000.0,ANE14H,0.0,0.0,0.0,2.0,0.0,-1.074209,-0.586776,-1.660985,-1.0,-0.785209,1.0,-1.02155,-0.192638,-1.732051,-0.150727,-0.076908,-1.001455,-1.021717,-0.999449,-0.586776,-1.074209,-0.869391,0.0,0.0315,-0.153026,-0.757113,0,-0.158304
2,2,3,4,1,0,0.0792,1,0,0,1,0,1629811000.0,1629811000.0,1629811000.0,1629811000.0,1.92622,1629811000.0,ANE14H,1.0,0.0,1.0,2.0,0.0,-1.074209,-0.586776,-1.660985,-1.0,-0.334965,1.0,-1.02155,-0.192638,0.57735,-0.150727,-0.089491,0.998548,-1.021717,1.000551,-0.586776,-1.074209,-0.869391,0.0,0.0005,-0.153026,-0.788113,0,-0.158304
3,3,4,5,1,0,0.0005,1,1,1,1,1,1629811000.0,1629811000.0,1629811000.0,1629811000.0,1.76285,1629811000.0,ANE14H,1.0,0.0,1.0,2.0,0.0,-1.074209,-0.586776,-1.660985,-1.0,-0.785209,1.0,0.978905,0.006189,0.57735,0.047505,-0.122518,0.998548,-1.021717,1.000551,-0.586776,-1.074209,-0.869391,0.0,0.0792,-0.153026,-0.709413,0,-0.158304
4,4,5,6,1,0,0.0162,1,0,0,0,0,1629811000.0,1629811000.0,1629811000.0,1629811000.0,1.88462,1629811000.0,ANE14H,0.0,1.0,1.0,2.0,0.0,-1.074209,-0.586776,-1.660985,-1.0,-0.695389,1.0,-1.02155,-0.192638,-1.732051,-0.150727,-0.097901,-1.001455,0.978745,1.000551,-0.586776,-1.074209,-0.869391,1.0,0.0005,0.846974,-0.788113,0,-0.158304


In [22]:
df = pd.read_csv('model1_visual_final.csv')

In [24]:
os.getcwd()

'C:\\Users\\annae\\Desktop\\ChoiceHistory_Psych\\Exp2_visual\\pre-processing'

#### for PMF, we need signed stimulus intensities. 
for index, row in df.iterrows(): 
    
    if row['target'] == 0: 
        coh = -row['coherence']/10
    elif row['target'] == 1: 
        coh = row['coherence']/10
    
    df.at[index, 'coherence'] = coh
    
df.to_csv('visual_pmf_final_TEST.csv')