In [1]:
# randomly pick the existing design, self-generated semantic matrix
import pandas as pd
import numpy as np
import pickle

### Load Real Design

In [2]:
df = pd.read_csv('simu1_data/cr_preproc_data_mturk.csv')
df

Unnamed: 0,rt,time_elapsed,subject_ID,item,lag,category,confidence,correct,correct_num,category_label,...,position,old,yes,block_type,prev_cat,prev_cat_match,prev_cat_label,prev_cat_label_match,curr_cat_length,curr_cat_label_length
0,1640.345,71781.0,120,ARM,0,Uncategorized,1.0,True,1,BodyParts,...,0,False,False,Uncategorized,,,,False,,0
1,845.750,71783.0,422,CELLO,0,Uncategorized,5.0,False,0,Instruments,...,0,False,True,Uncategorized,,,,False,,0
2,,72233.0,52,HALLWAY,0,Building,,False,0,Building,...,0,False,,Categorized,,False,,False,0.0,0
3,,72813.0,20,MOUNTAIN,0,Uncategorized,,False,0,Landscapes,...,0,False,,Uncategorized,,,,False,,0
4,,73029.0,108,ROBIN,0,Uncategorized,,False,0,Birds,...,0,False,,Uncategorized,,,,False,,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
381055,,12140412.0,228,CARROT,163,Vegetables,,False,0,Vegetables,...,575,True,,Categorized,Landscapes,False,Landscapes,False,20.0,20
381056,,12144498.0,228,GOOSE,112,Uncategorized,,False,0,Birds,...,576,True,,Uncategorized,Vegetables,,Vegetables,False,,21
381057,,12148572.0,228,PEAS,68,Vegetables,,False,0,Vegetables,...,577,True,,Categorized,Uncategorized,False,Birds,False,21.0,21
381058,,12152653.0,228,SAW,159,Tools,,False,0,Tools,...,578,True,,Categorized,Vegetables,False,Vegetables,False,22.0,22


In [3]:
# drop redundant columns
df = df.drop(['rt', 'time_elapsed', 'correct', 'correct_num', 'block_type', 'item_name', 'prev_cat', 'prev_cat_match', 'prev_cat_label', 'prev_cat_label_match','curr_cat_length', 'curr_cat_label_length', 'yes', 'confidence', 'category'], axis=1)
df

Unnamed: 0,subject_ID,item,lag,category_label,position,old
0,120,ARM,0,BodyParts,0,False
1,422,CELLO,0,Instruments,0,False
2,52,HALLWAY,0,Building,0,False
3,20,MOUNTAIN,0,Landscapes,0,False
4,108,ROBIN,0,Birds,0,False
...,...,...,...,...,...,...
381055,228,CARROT,163,Vegetables,575,True
381056,228,GOOSE,112,Birds,576,True
381057,228,PEAS,68,Vegetables,577,True
381058,228,SAW,159,Tools,578,True


In [4]:
# drop subject 200!!!
df = df.loc[df.subject_ID != 200]

In [5]:
# how many subjects
subjlist = df.subject_ID.to_numpy()
subjlist = np.unique(subjlist)
len(subjlist)

656

In [6]:
# add itemno
items = np.unique(df.item)
item2no = {}
for i in range(len(items)):
    item2no[items[i]] = i+1
df['itemno'] = df.apply(lambda x: item2no[x.loc['item']], 1)
df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['itemno'] = df.apply(lambda x: item2no[x.loc['item']], 1)


Unnamed: 0,subject_ID,item,lag,category_label,position,old,itemno
0,120,ARM,0,BodyParts,0,False,3
1,422,CELLO,0,Instruments,0,False,48
2,52,HALLWAY,0,Building,0,False,124
3,20,MOUNTAIN,0,Landscapes,0,False,173
4,108,ROBIN,0,Birds,0,False,220
...,...,...,...,...,...,...,...
381055,228,CARROT,163,Vegetables,575,True,43
381056,228,GOOSE,112,Birds,576,True,119
381057,228,PEAS,68,Vegetables,577,True,194
381058,228,SAW,159,Tools,578,True,228


In [7]:
# organize
df = df.sort_values(by = ['subject_ID', 'position'])
df = df.reset_index(drop = True)
df = df[['subject_ID','position', 'item','itemno','category_label', 'lag', 'old']]
df

Unnamed: 0,subject_ID,position,item,itemno,category_label,lag,old
0,0,0,CHICKEN,53,FarmAnimals,0,False
1,0,1,BLUEBERRY,26,Fruit,0,False
2,0,2,BUS,33,Vehicles,0,False
3,0,3,LEMON,151,Fruit,0,False
4,0,4,OYSTER,187,OceanAnimals,0,False
...,...,...,...,...,...,...,...
380475,656,575,COLA,60,Beverages,37,True
380476,656,576,BASEBALL,10,Toys,115,True
380477,656,577,TEA,265,Beverages,24,True
380478,656,578,RADIO,213,Electronics,41,True


### Design

In [8]:
rng = np.random.default_rng(seed=42)
simu_sess_num = 1000

subjectlist = np.unique(df.subject_ID)
sess = rng.choice(subjectlist, simu_sess_num)
sess

array([ 58, 508, 430, 288, 285, 564,  56, 458, 132,  61, 346, 641, 483,
       500, 471, 516, 337,  84, 551, 296, 329, 244, 119, 608, 513, 423,
       264, 540, 358, 291, 296, 149,  60, 364, 583,  41, 564, 543, 181,
       415, 108, 498, 460, 233,  44, 637, 293, 586, 445, 511, 499, 127,
       239, 307, 327,  28, 359, 101, 488, 449, 606, 489, 241, 635, 270,
       214, 595, 244,  50, 309, 522, 124, 304,  85, 451, 313, 217, 148,
       371, 440, 617, 287, 105, 547, 414, 460,  63, 205, 504, 546, 286,
       528, 553, 255, 590, 189, 157, 448, 418,  91, 547, 131, 529,   4,
       523, 517, 512, 437, 310, 463, 181, 513, 365, 302, 332, 374,  24,
        91, 161,  75, 289, 439, 430, 310, 561, 371,  51, 502, 377, 417,
       371, 364,  59, 367, 522, 199, 396,  20, 228, 287, 645, 140, 181,
       268, 652, 560,  22, 153, 540,  38, 562, 184, 602, 192, 285, 435,
        82, 366, 332, 515, 654, 436, 269, 267, 275, 534, 211, 109, 220,
        14,  69,  59, 507, 474, 458, 303, 471, 105, 591, 329, 61

In [9]:
df_test = pd.DataFrame()
for i in range(len(sess)):
    tmp = df.loc[df.subject_ID == sess[i],:].copy()
    tmp["session"] = i
    df_test = pd.concat([df_test, tmp])
df_test = df_test.reset_index(drop=True)
df_test['study_itemno1'] = df_test['itemno']
df_test['study_itemno2'] = -1
df_test['test_itemno1'] = df_test['itemno']
df_test['test_itemno2'] = -1
df_test

Unnamed: 0,subject_ID,position,item,itemno,category_label,lag,old,session,study_itemno1,study_itemno2,test_itemno1,test_itemno2
0,58,0,UNDERWEAR,282,Clothing,0,False,0,282,-1,282,-1
1,58,1,JEANS,139,Clothing,0,False,0,139,-1,139,-1
2,58,2,JACKET,138,Clothing,0,False,0,138,-1,138,-1
3,58,3,TABLET,263,Electronics,0,False,0,263,-1,263,-1
4,58,4,COMPUTER,62,Electronics,0,False,0,62,-1,62,-1
...,...,...,...,...,...,...,...,...,...,...,...,...
579995,337,575,LAKE,148,Landscapes,0,False,999,148,-1,148,-1
579996,337,576,PIE,201,Desserts,78,True,999,201,-1,201,-1
579997,337,577,LOBSTER,160,OceanAnimals,90,True,999,160,-1,160,-1
579998,337,578,MAPLE,161,Trees,118,True,999,161,-1,161,-1


In [10]:
# save
with open('simu1_data/simu1_design.pkl', 'wb') as outp:
    pickle.dump(df_test, outp, pickle.HIGHEST_PROTOCOL)