In [6]:
import csv
from collections import defaultdict
from constants import *
import pandas as pd
from statsmodels.stats.proportion import proportion_confint   

In [2]:
valid_users = set()
user_train_count = defaultdict(int)     # number of training grids completed for each user
user_test_count = defaultdict(int)      # number of test grids completed for each user
user_train_score = defaultdict(int)     # cumulative training score for each user
user_test_timeouts = defaultdict(set)   # test grids the user timed out on for each user

with open('data/insight_data_raw.csv', encoding='utf-8-sig') as csvfile:
    reader = csv.DictReader(csvfile)
    for row in reader:
        if row['step'] == '6':          # step 6 contains metadata for the trial as a whole including total reward
            user = row['userid']
            valid_users.add(user)
            if row['type'] == "train":
                user_train_count[user] += 1
                user_train_score[user] += int(row['reward_cum'])
            elif row['type'] == "test":
                user_test_count[user] += 1
               
# exclude users who did not complete all training and test grids, scored less than 0 points in training
def include_user(user):
    return not (user_train_count[user]!=60 or user_test_count[user]!=50 or user_train_score[user]<0)

print("Number of users before exclusions: " + str(len(valid_users)))       
valid_users = set(filter(include_user, valid_users))
print("Number of users after exclusions: " + str(len(valid_users)))

Number of users before exclusions: 364
Number of users after exclusions: 255


In [3]:
df = pd.read_csv('data/insight_data_raw.csv', encoding='utf-8-sig') 
df['userid'] = df['userid'].apply(str)
filtered = df[(df['userid'].isin(valid_users)) & (df['step']==6) & (df['type']=='train')]
filtered['correct'] = filtered['reward_cum'] == filtered['reward_max']
training_accuracy = filtered['correct'].mean()
print("training accuracy", training_accuracy)

training accuracy 0.773202614379085


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered['correct'] = filtered['reward_cum'] == filtered['reward_max']


In [4]:
def label(x):
    if x < 100: return "filler"
    if 100 < x < 200: return "Push"
    if 200 < x < 300: return "Switch"
    if 300 < x < 400: return "Switch Control"
    if 400 < x: return "Push Control"
test = df[(df['userid'].isin(valid_users)) & (df['step']==6) & (df['type']=='test')]
test['gridtype'] = test['gridnum'].apply(label)
test['trial'] = test.groupby(['gridtype','userid']).cumcount()+1
test['score_dif'] = test['reward_cum'] - test['reward_max'] + 1
test['userid'] = test['userid'].astype(int)
push = test[(test['gridtype'] == 'Push')][['trial','score_dif', 'userid', 'gridtype']]
push.to_csv('data/insight_push_data_coded.csv')
switch = test[(test['gridtype'] == 'Switch')][['trial','score_dif', 'userid', 'gridtype']]
switch.to_csv('data/insight_switch_data_coded.csv')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test['gridtype'] = test['gridnum'].apply(label)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test['trial'] = test.groupby(['gridtype','userid']).cumcount()+1
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test['score_dif'] = test['reward_cum'] - test['reward_max'] + 1
A value is trying to be set 

In [24]:
test.loc['correct'] = test['reward_cum'] == test['reward_max']

def get_mean_ci(gridtype):
    correct = test.loc[test['gridtype']==gridtype]['correct'].sum()
    total = test.loc[test['gridtype']==gridtype]['correct'].shape[0]
    mean = correct/total
    ci = proportion_confint(count=correct,    # Number of "successes"
                    nobs=total,    # Number of trials
                    alpha=(1 - 0.95))
    print(gridtype, mean,ci)

get_mean_ci('Push')
get_mean_ci('Switch')
get_mean_ci('Push Control')
get_mean_ci('Switch Control')

Push 0.625 (0.6039918093575691, 0.6460081906424309)
Switch 0.9450980392156862 (0.9352132965863618, 0.9549827818450107)
Push Control 0.9397058823529412 (0.9293766879309451, 0.9500350767749373)
Switch Control 0.9598039215686275 (0.9512804579883385, 0.9683273851489165)


In [16]:
test

Unnamed: 0,id,userid,trialnum,gridnum,type,timed,step,action,reaction_millis,reward_step,...,hit1,hit2,get1,get2,state,timestamp,gridtype,trial,score_dif,correct
6299,10644,3,61,2,test,,6,,,,...,,,,,,2023-06-20 14:36:10,filler,1,1.0,True
6611,10956,3,62,5,test,,6,,,,...,,,,,,2023-06-20 14:36:20,filler,2,1.0,True
6953,11298,3,63,1,test,,6,,,,...,,,,,,2023-06-20 14:36:31,filler,3,1.0,True
7271,11616,3,64,3,test,,6,,,,...,,,,,,2023-06-20 14:36:42,filler,4,0.0,False
7589,11934,3,65,6,test,,6,,,,...,,,,,,2023-06-20 14:36:52,filler,5,1.0,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
228623,232968,363,106,405,test,,6,,,,...,,,,,,2023-06-21 20:00:45,Push Control,7,1.0,True
228629,232974,363,107,104,test,,6,,,,...,,,,,,2023-06-21 20:00:58,Push,7,1.0,True
228635,232980,363,108,403,test,,6,,,,...,,,,,,2023-06-21 20:01:10,Push Control,8,1.0,True
228641,232986,363,109,17,test,,6,,,,...,,,,,,2023-06-21 20:01:21,filler,18,1.0,True
