In [1]:
import numpy as np
import pandas as pd
import pickle

In [2]:
df = pd.read_table("../Data/RN2_Pix.dat", sep="\s+", names=['subject','session','list','recog_pos','picture','category','study_pos','old_lag','study_lag','confidence','rt'])
df

Unnamed: 0,subject,session,list,recog_pos,picture,category,study_pos,old_lag,study_lag,confidence,rt
0,103,1,0,1,VA084,_OLD_,6,-999,59,6,705
1,103,1,0,2,GBEN253,_OLD_,5,-1,61,6,936
2,103,1,0,3,ONTO011,_OLD_,3,-2,64,6,678
3,103,1,0,4,ISBO1,_NEW_,-99,-999,-9999,4,1461
4,103,1,0,5,PAGETT11,_OLD_,38,-999,31,5,1799
...,...,...,...,...,...,...,...,...,...,...,...
69883,99,1,5,124,CHN639,_NEW_,-99,-999,-9999,4,1460
69884,99,1,5,125,CT173,_OLD_,39,-999,150,2,1040
69885,99,1,5,126,GRTO2,_NEW_,-99,-999,-9999,2,1269
69886,99,1,5,127,GR022,_OLD_,46,-999,145,1,887


In [3]:
# pic to itemno & old
pics = np.unique(df.picture)
pic2itemno = {}
for i in range(len(pics)):
    pic2itemno[pics[i]] = i + 1

itemnos = []
for pic in df.picture:
    itemnos.append(pic2itemno[pic])

df['itemno'] = itemnos
df['old'] = df.category == '_OLD_'

In [4]:
# unique list index
subjectlist = np.sort(np.unique(df.subject))
df['list_uni'] = df.apply(lambda x: np.where(subjectlist == x['subject'])[0].item()*6 + x['list'], axis =1)
df = df.sort_values(by = ['list_uni','recog_pos'])
df

Unnamed: 0,subject,session,list,recog_pos,picture,category,study_pos,old_lag,study_lag,confidence,rt,itemno,old,list_uni
36096,37,1,0,1,CHN211,_OLD_,9,-999,56,6,1053,195,True,0
36097,37,1,0,2,GBEN655,_OLD_,55,46,11,2,2306,493,True,0
36098,37,1,0,3,ECU104,_OLD_,52,-3,15,3,1457,376,True,0
36099,37,1,0,4,MXACP4,_OLD_,47,-5,21,1,2333,998,True,0
36100,37,1,0,5,CHN118,_NEW_,-99,-999,-9999,1,1704,184,False,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
36091,172,1,5,124,JEROCTWL,_NEW_,-99,-999,-9999,2,1076,801,False,545
36092,172,1,5,125,THA064,_NEW_,-99,-999,-9999,6,808,1335,False,545
36093,172,1,5,126,RI003,_OLD_,28,-999,162,4,999,1245,True,545
36094,172,1,5,127,IND117,_NEW_,-99,-999,-9999,2,1114,747,False,545


In [5]:
# organize
df = df.drop(columns=['session','picture','category','confidence','rt','subject','list'])
df = df[['list_uni','recog_pos', 'itemno', 'old', 'old_lag', 'study_pos', 'study_lag']]
df

Unnamed: 0,list_uni,recog_pos,itemno,old,old_lag,study_pos,study_lag
36096,0,1,195,True,-999,9,56
36097,0,2,493,True,46,55,11
36098,0,3,376,True,-3,52,15
36099,0,4,998,True,-5,47,21
36100,0,5,184,False,-999,-99,-9999
...,...,...,...,...,...,...,...
36091,545,124,801,False,-999,-99,-9999
36092,545,125,1335,False,-999,-99,-9999
36093,545,126,1245,True,-999,28,162
36094,545,127,747,False,-999,-99,-9999


### design

In [6]:
rng = np.random.default_rng(seed=42)
simu_sess_num = 1000
simu_old_num = 64
simu_new_num = 64

list_unique = np.unique(df.list_uni)
sess = rng.choice(list_unique, simu_sess_num)
sess

array([ 48, 422, 357, 239, 236, 468,  46, 380, 110,  51, 287, 532, 401,
       415, 391, 429, 280,  69, 458, 245, 273, 202,  99, 506, 426, 351,
       219, 449, 297, 242, 245, 124,  50, 302, 484,  34, 468, 451, 151,
       344,  90, 413, 382, 193,  37, 530, 243, 487, 370, 424, 414, 106,
       198, 254, 271,  23, 298,  84, 405, 372, 503, 406, 200, 528, 224,
       177, 494, 202,  41, 256, 434, 103, 252,  70, 374, 259, 180, 123,
       308, 365, 513, 238,  87, 454, 343, 382,  53, 170, 419, 454, 237,
       439, 459, 211, 490, 157, 130, 372, 347,  76, 454, 109, 439,   4,
       435, 429, 426, 363, 257, 385, 151, 426, 303, 250, 276, 310,  20,
        76, 134,  62, 240, 364, 357, 257, 466, 308,  43, 417, 313, 346,
       308, 302,  49, 305, 434, 165, 329,  16, 189, 238, 536, 117, 150,
       223, 541, 465,  18, 127, 448,  31, 467, 153, 501, 160, 237, 361,
        68, 304, 275, 428, 543, 362, 223, 221, 228, 444, 175,  91, 182,
        12,  57,  49, 421, 394, 380, 252, 391,  88, 491, 273, 51

In [7]:
df_study = pd.DataFrame()
df_test = pd.DataFrame()

for i in range(len(sess)):
    tmp = df.loc[df.list_uni == sess[i]]
    tmp["session"] = i
    
    df_test = df_test.append(tmp)
    
    tmp_study = tmp.loc[tmp.old == True]
    tmp_study = tmp_study.sort_values(by = "study_pos")
    df_study = df_study.append(tmp_study)
    
df_study.reset_index(drop = True, inplace = True)
df_test.reset_index(drop = True, inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [8]:
df_study = df_study.drop(columns=['recog_pos', 'old', 'old_lag','study_lag'])
df_study

Unnamed: 0,list_uni,itemno,study_pos,session
0,48,1328,1,0
1,48,768,2,0
2,48,518,3,0
3,48,21,4,0
4,48,914,5,0
...,...,...,...,...
63995,279,801,60,999
63996,279,997,61,999
63997,279,1186,62,999
63998,279,1279,63,999


In [9]:
df_test

Unnamed: 0,list_uni,recog_pos,itemno,old,old_lag,study_pos,study_lag,session
0,48,1,490,True,-999,52,13,0
1,48,2,541,True,9,61,5,0
2,48,3,1066,False,-999,-99,-9999,0
3,48,4,942,True,-999,34,34,0
4,48,5,1464,False,-999,-99,-9999,0
...,...,...,...,...,...,...,...,...
127995,279,124,530,True,-4,26,162,999
127996,279,125,357,False,-999,-99,-9999,999
127997,279,126,307,False,-999,-99,-9999,999
127998,279,127,1349,False,-999,-99,-9999,999


In [10]:
with open('../Data/simu2_design.pkl', 'wb') as outp:
    pickle.dump(df_study, outp, pickle.HIGHEST_PROTOCOL)
    pickle.dump(df_test, outp, pickle.HIGHEST_PROTOCOL)