In [12]:
# randomly pick the existing design, self-generated semantic matrix
import pandas as pd
import numpy as np
import pickle

### Load Real Design

In [13]:
df = pd.read_csv('simu1_data/cr_preproc_data_mturk.csv')
df

Unnamed: 0,rt,time_elapsed,subject_ID,item,lag,category,confidence,correct,correct_num,category_label,...,position,old,yes,block_type,prev_cat,prev_cat_match,prev_cat_label,prev_cat_label_match,curr_cat_length,curr_cat_label_length
0,1640.345,71781.0,120,ARM,0,Uncategorized,1.0,True,1,BodyParts,...,0,False,False,Uncategorized,,,,False,,0
1,845.750,71783.0,422,CELLO,0,Uncategorized,5.0,False,0,Instruments,...,0,False,True,Uncategorized,,,,False,,0
2,,72233.0,52,HALLWAY,0,Building,,False,0,Building,...,0,False,,Categorized,,False,,False,0.0,0
3,,72813.0,20,MOUNTAIN,0,Uncategorized,,False,0,Landscapes,...,0,False,,Uncategorized,,,,False,,0
4,,73029.0,108,ROBIN,0,Uncategorized,,False,0,Birds,...,0,False,,Uncategorized,,,,False,,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
381055,,12140412.0,228,CARROT,163,Vegetables,,False,0,Vegetables,...,575,True,,Categorized,Landscapes,False,Landscapes,False,20.0,20
381056,,12144498.0,228,GOOSE,112,Uncategorized,,False,0,Birds,...,576,True,,Uncategorized,Vegetables,,Vegetables,False,,21
381057,,12148572.0,228,PEAS,68,Vegetables,,False,0,Vegetables,...,577,True,,Categorized,Uncategorized,False,Birds,False,21.0,21
381058,,12152653.0,228,SAW,159,Tools,,False,0,Tools,...,578,True,,Categorized,Vegetables,False,Vegetables,False,22.0,22


In [14]:
# drop redundant columns
df = df.drop(['rt', 'time_elapsed', 'correct', 'correct_num', 'block_type', 'item_name', 'prev_cat', 'prev_cat_match', 'prev_cat_label', 'prev_cat_label_match','curr_cat_length', 'curr_cat_label_length', 'yes', 'confidence', 'category'], axis=1)
df

Unnamed: 0,subject_ID,item,lag,category_label,position,old
0,120,ARM,0,BodyParts,0,False
1,422,CELLO,0,Instruments,0,False
2,52,HALLWAY,0,Building,0,False
3,20,MOUNTAIN,0,Landscapes,0,False
4,108,ROBIN,0,Birds,0,False
...,...,...,...,...,...,...
381055,228,CARROT,163,Vegetables,575,True
381056,228,GOOSE,112,Birds,576,True
381057,228,PEAS,68,Vegetables,577,True
381058,228,SAW,159,Tools,578,True


In [15]:
# drop subject 200!!!
df = df.loc[df.subject_ID != 200]

In [16]:
# how many subjects
subjlist = df.subject_ID.to_numpy()
subjlist = np.unique(subjlist)
len(subjlist)

656

In [17]:
# add itemno
items = np.unique(df.item)
item2no = {}
for i in range(len(items)):
    item2no[items[i]] = i+1
df['itemno'] = df.apply(lambda x: item2no[x.loc['item']], 1)
df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['itemno'] = df.apply(lambda x: item2no[x.loc['item']], 1)


Unnamed: 0,subject_ID,item,lag,category_label,position,old,itemno
0,120,ARM,0,BodyParts,0,False,3
1,422,CELLO,0,Instruments,0,False,48
2,52,HALLWAY,0,Building,0,False,124
3,20,MOUNTAIN,0,Landscapes,0,False,173
4,108,ROBIN,0,Birds,0,False,220
...,...,...,...,...,...,...,...
381055,228,CARROT,163,Vegetables,575,True,43
381056,228,GOOSE,112,Birds,576,True,119
381057,228,PEAS,68,Vegetables,577,True,194
381058,228,SAW,159,Tools,578,True,228


In [18]:
# organize
df = df.sort_values(by = ['subject_ID', 'position'])
df = df.reset_index(drop = True)
df = df[['subject_ID','position', 'item','itemno','category_label', 'lag', 'old']]
df

Unnamed: 0,subject_ID,position,item,itemno,category_label,lag,old
0,0,0,CHICKEN,53,FarmAnimals,0,False
1,0,1,BLUEBERRY,26,Fruit,0,False
2,0,2,BUS,33,Vehicles,0,False
3,0,3,LEMON,151,Fruit,0,False
4,0,4,OYSTER,187,OceanAnimals,0,False
...,...,...,...,...,...,...,...
380475,656,575,COLA,60,Beverages,37,True
380476,656,576,BASEBALL,10,Toys,115,True
380477,656,577,TEA,265,Beverages,24,True
380478,656,578,RADIO,213,Electronics,41,True


### Design

In [19]:
rng = np.random.default_rng(seed=42)
simu_sess_num = 1000

subjectlist = np.unique(df.subject_ID)
sess = subjectlist
sess

array([  0,   1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,
        13,  14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,
        26,  27,  28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,
        39,  40,  41,  42,  43,  44,  45,  46,  47,  48,  49,  50,  51,
        52,  53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,  64,
        65,  66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,
        78,  79,  80,  81,  82,  83,  84,  85,  86,  87,  88,  89,  90,
        91,  92,  93,  94,  95,  96,  97,  98,  99, 100, 101, 102, 103,
       104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116,
       117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129,
       130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142,
       143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155,
       156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168,
       169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 18

In [20]:
df_test = pd.DataFrame()
for i in range(len(sess)):
    tmp = df.loc[df.subject_ID == sess[i],:].copy()
    tmp["session"] = i
    df_test = pd.concat([df_test, tmp])
df_test = df_test.reset_index(drop=True)
df_test['study_itemno1'] = df_test['itemno']
df_test['study_itemno2'] = -1
df_test['test_itemno1'] = df_test['itemno']
df_test['test_itemno2'] = -1
df_test

Unnamed: 0,subject_ID,position,item,itemno,category_label,lag,old,session,study_itemno1,study_itemno2,test_itemno1,test_itemno2
0,0,0,CHICKEN,53,FarmAnimals,0,False,0,53,-1,53,-1
1,0,1,BLUEBERRY,26,Fruit,0,False,0,26,-1,26,-1
2,0,2,BUS,33,Vehicles,0,False,0,33,-1,33,-1
3,0,3,LEMON,151,Fruit,0,False,0,151,-1,151,-1
4,0,4,OYSTER,187,OceanAnimals,0,False,0,187,-1,187,-1
...,...,...,...,...,...,...,...,...,...,...,...,...
380475,656,575,COLA,60,Beverages,37,True,655,60,-1,60,-1
380476,656,576,BASEBALL,10,Toys,115,True,655,10,-1,10,-1
380477,656,577,TEA,265,Beverages,24,True,655,265,-1,265,-1
380478,656,578,RADIO,213,Electronics,41,True,655,213,-1,213,-1


In [21]:
# save
with open('simu1_data/simu1_design_unique.pkl', 'wb') as outp:
    pickle.dump(df_test, outp, pickle.HIGHEST_PROTOCOL)