In [1]:
import pandas as pd, numpy as np, json, pickle, os, warnings
from glob import glob
from datetime import datetime

from socialbrainapp import get_demographics, get_oci, get_sds, get_lsas, get_hardball, get_hardball_ratings, get_journey
warnings.simplefilter(action='ignore', category=FutureWarning)

# Parsing

In [2]:
# Get today's date
todays_date = datetime.now().strftime("%Y%m%d")

# Load json data
json_dir = '../json'
out_dir = '../data/runs'
json_files = glob(f'{json_dir}/*json')
print(f'{len(json_files)} jsons to parse')

25 jsons to parse


In [3]:
# Parse json data
id_lists = {'Demographics':[], 'Hardball':[], 'HardballSubjectiveRatings':[], 
            'OCI':[], 'SDS':[], 'LSAS':[], 
            'Journey_decisions':[], 'Journey_memory':[], 'Journey_dots':[], 'Journey_characters':[]}

df_lists = {'Demographics':[], 'Hardball':[], 'HardballSubjectiveRatings':[], 
            'OCI':[], 'SDS':[], 'LSAS':[],
            'Journey_decisions':[], 'Journey_memory':[], 'Journey_dots':[], 'Journey_characters':[]}

for json_file in json_files:
    
    try:
        with open(json_file, 'rb') as handle:
            tmp_data = json.load(handle)

        for element in tmp_data:

            # Get Demographics
            if 'Age' in element.keys():
                id_lists['Demographics'] += [element['UserId']]
                df_lists['Demographics'] += [get_demographics(element)]

            # Get Survey Data
            if 'SurveyName' in element.keys():

                # Get OCI
                if element['SurveyName'] == 'OCI':
                    id_lists['OCI'] += [element['UserId']]
                    df_lists['OCI'] += [get_oci(element)]

                # Get SDS
                elif element['SurveyName'] == 'SDS':
                    id_lists['SDS'] += [element['UserId']]
                    df_lists['SDS'] += [get_sds(element)]

                # Get LSAS
                elif element['SurveyName'] == 'LSAS':
                    id_lists['LSAS'] += [element['UserId']]
                    df_lists['LSAS'] += [get_lsas(element)]

            # Get Game Data
            if 'Game' in element.keys():
                
                # Get Hardball
                if element['Game'] == 'Hardball':
                    if 'Screen' not in element.keys():
                        id_lists['Hardball'] += [element['UserId']]
                        df_lists['Hardball'] += [get_hardball(element)]

                    if 'Screen' in element.keys():
                        id_lists['HardballSubjectiveRatings'] += [element['UserId']]
                        df_lists['HardballSubjectiveRatings'] += [get_hardball_ratings(element)]

                # Get Journey
                if element['Game'] == 'Journey':
                    
                    try: 
                        task_name, snt_df = get_journey(element)
                        id_lists['Journey_'+task_name] += [element['UserId']]
                        df_lists['Journey_'+task_name] += [snt_df]
                        
                    except: # not interested in some trials but want a better solution here
                        continue

    except:
        continue

# concatenate all df lists
full_dfs = {}
for (key, value) in df_lists.items():
    if len(value) > 0: 
        full_dfs[key] = []
        full_dfs[key] = pd.concat(value)

# Preprocessing

In [4]:
# Preprocess Data
preproc_dfs = {}

## Hardball

In [5]:
# remove Hardball subjects with less than 60 completed trials
preproc_dfs['Hardball'] = full_dfs['Hardball'].copy()
preproc_dfs['HardballSubjectiveRatings'] = full_dfs['HardballSubjectiveRatings'].copy()

for subj_id in np.unique(full_dfs['Hardball'].index):
    ntrials = len(full_dfs['Hardball'][full_dfs['Hardball'].index==subj_id])
    if ntrials < 60:
        preproc_dfs['Hardball'] = preproc_dfs['Hardball'].drop(subj_id)
        try:
            preproc_dfs['HardballSubjectiveRatings'] = preproc_dfs['HardballSubjectiveRatings'].drop(subj_id)
        except:
            continue
    elif ntrials > 60:
        print(subj_id, f'{ntrials} trials')
        preproc_dfs['Hardball'].at[subj_id, 'NTrials'] = ntrials
        preproc_dfs['Hardball'].at[subj_id, 'Include'] = np.nan
    else:
        preproc_dfs['Hardball'].at[subj_id, 'NTrials'] = ntrials
        preproc_dfs['Hardball'].at[subj_id, 'Include'] = 1

preproc_dfs['HardballSubjectiveRatings'].to_csv(f'{out_dir}/HardballSubjectiveRatings-data-{todays_date}.csv', index_label='SubjectID')

# combine dfs
text_to_display = []
preproc_dfs['Hardball'] = preproc_dfs['Hardball'].reset_index()

for idx in preproc_dfs['Hardball'].index:
    subj_id = preproc_dfs['Hardball']['index'][idx]
    team_id = preproc_dfs['Hardball']['TeamName'][idx]
    year_id = preproc_dfs['Hardball']['Year'][idx]
    month_id = preproc_dfs['Hardball']['Month'][idx]
    day_id = preproc_dfs['Hardball']['Day'][idx]

    try:
        influence_rating = float(preproc_dfs['HardballSubjectiveRatings'][(preproc_dfs['HardballSubjectiveRatings']['TeamName']==team_id) & (preproc_dfs['HardballSubjectiveRatings']['Year']==year_id) & (preproc_dfs['HardballSubjectiveRatings']['Month']==month_id) & (preproc_dfs['HardballSubjectiveRatings']['Day']==day_id)].at[subj_id, 'Rate'])
        preproc_dfs['Hardball'].at[idx, 'InfluenceRating'] = influence_rating
    except:
        if f'No influence rating for {subj_id}' not in text_to_display:
            text_to_display += [f'No influence rating for {subj_id}']
            print(f'No influence rating for {subj_id}')
        continue

preproc_dfs['Hardball'] = preproc_dfs['Hardball'].set_index('index')
preproc_dfs['Hardball'].to_csv(f'{out_dir}/Hardball-data-{todays_date}.csv', index_label='SubjectID')


309399CE-63A7-472E-8619-65576780E6A2 180 trials
405F5765-F99C-4A61-9D21-55C4BCD818B0 94 trials
6CC2F911-589F-40F5-B27E-460291076521 120 trials
8312DACF-5F5F-4BEC-9AC8-2F3CE0D0C47E 82 trials
969B718B-CD49-47E2-BA31-56643220BC84 120 trials
D832F3A0-C39F-4BA5-9F13-4C1FD8966159 120 trials
No influence rating for 969B718B-CD49-47E2-BA31-56643220BC84
No influence rating for 309399CE-63A7-472E-8619-65576780E6A2
No influence rating for 6CC2F911-589F-40F5-B27E-460291076521
No influence rating for 405F5765-F99C-4A61-9D21-55C4BCD818B0
No influence rating for 8312DACF-5F5F-4BEC-9AC8-2F3CE0D0C47E


## Journey

In [6]:
for journey in ['Journey_decisions', 'Journey_memory', 'Journey_characters']:
    df = full_dfs[journey].copy().reset_index()
    preproc_dfs[journey] = df.rename(columns={'index': 'sub_id'})

# remove journey subjects with less than 63 completed trials
snt = []
ver = []
mem = []
for sub_id in np.unique(full_dfs['Journey_decisions'].index):
    
    ntrials = len(full_dfs['Journey_decisions'][full_dfs['Journey_decisions'].index == sub_id])
    
    # preprocess anyone w/ 63 or more trials 
    if ntrials >= 63:
        
        sub_df = full_dfs['Journey_decisions'][full_dfs['Journey_decisions'].index == sub_id].copy()
        sub_df['Num_trials'] = ntrials
        sub_df['Include'] = 1
        
        # if two trials are > 30 minutes apart define them as separate sessions...
        s = 1
        sessions = [[1, 0]]
        datetimes = [datetime.strptime(dt, "%Y-%m-%d %H:%M:%S.%f") for dt in sub_df['Datetime'].values]
        for d, dt in enumerate(datetimes[1:]):
            elapsed = dt - datetimes[d]
            if elapsed.seconds > (60 * 30): 
                s += 1
            sessions.append([s, elapsed.seconds])
        sub_df[['Session', 'Elapsed(s)']] = sessions
        
        # count number of repeats for each decision
        counts = sub_df['decision_num'].value_counts()
        sub_df['Num_repeats'] = [counts[c] for c in sub_df['decision_num'].values]
        
        # put together
        sub_ver = full_dfs['Journey_characters'][full_dfs['Journey_characters'].index == sub_id].copy()
        if len(sub_ver) > 1: 
            print(sub_id + ' multiple task versions')
            sub_df['Mult_versions'] = 1
        else:
            sub_df['Mult_versions'] = 0
            
        snt.append(sub_df)
        ver.append(sub_ver)
        mem.append(full_dfs['Journey_memory'][full_dfs['Journey_memory'].index == sub_id].copy())

# output
pd.concat(snt).to_csv(f'{out_dir}/SNT_data_{todays_date}.csv', index_label='sub_id')
pd.concat(mem).to_csv(f'{out_dir}/SNT-memory_data_{todays_date}.csv', index_label='sub_id')
pd.concat(ver).to_csv(f'{out_dir}/SNT-ver_data_{todays_date}.csv', index_label='sub_id')

# TO DO: output role of the characters selected in memory by referencing ver 

68a59a4a271b255df02ddff89ceeff0b multiple task versions


## Demographics + Questionnaires

In [7]:
full_dfs['OCI'].to_csv(f'{out_dir}/OCI-rawdata-{todays_date}.csv', index_label='SubjectID')

# remove OCI subjects with less than 19 completed trials
tmp_dflist = []
for subj_id in np.unique(full_dfs['OCI'].index):
    subj_df = full_dfs['OCI'].loc[subj_id]
    if len(subj_df) == 19: # completed survey once
        # grab attention check
        attn_check = float(subj_df[subj_df['SurveyQuestion']==15]['Response'])

        # long to wide, exclude attention check
        subj_df = subj_df[subj_df['SurveyQuestion']!=15].pivot(columns='SurveyQuestion', values='Response').add_prefix('OCI_')

        subj_df['OCI_Total'] = subj_df.sum(axis=1, skipna=False)

        if np.isnan(attn_check):
            subj_df['OCI_AttnCheck'] = 1
        else:
            subj_df['OCI_AttnCheck'] = 0

        tmp_dflist += [subj_df]
    elif len(subj_df) > 19:
        print(subj_id, len(subj_df))

preproc_dfs['OCI'] = pd.concat(tmp_dflist)
preproc_dfs['OCI'].to_csv(f'{out_dir}/OCI-data-{todays_date}.csv', index_label='SubjectID')

AECC8FE9-F873-4EC2-B4BA-40386726DC33 41


In [8]:
full_dfs['SDS'].to_csv(f'{out_dir}/SDS-rawdata-{todays_date}.csv', index_label='SubjectID')

# remove SDS subjects with less than 21 completed trials
tmp_dflist = []
for subj_id in np.unique(full_dfs['SDS'].index):
    subj_df = full_dfs['SDS'].loc[subj_id]
    if len(subj_df) == 21: # completed survey once
        # grab attention check
        attn_check = float(subj_df[subj_df['SurveyQuestion']==16]['Response'])

        # long to wide, exclude attention check
        subj_df = subj_df[subj_df['SurveyQuestion']!=16].pivot(columns='SurveyQuestion', values='Response').add_prefix('SDS_')

        subj_df['SDS_Total'] = subj_df.sum(axis=1, skipna=False)

        if np.isnan(attn_check):
            subj_df['SDS_AttnCheck'] = 1
        else:
            subj_df['SDS_AttnCheck'] = 0

        tmp_dflist += [subj_df]
    elif len(subj_df) > 21:
        print(subj_id, len(subj_df))

preproc_dfs['SDS'] = pd.concat(tmp_dflist)
preproc_dfs['SDS'].to_csv(f'{out_dir}/SDS-data-{todays_date}.csv', index_label='SubjectID')

2FCBC76A-8621-427C-9197-3B0241D06206 22
405F5765-F99C-4A61-9D21-55C4BCD818B0 1701


In [9]:
full_dfs['LSAS'].to_csv(f'{out_dir}/LSAS-rawdata-{todays_date}.csv', index_label='SubjectID')

# remove LSAS subjects with less than 24 completed trials
tmp_dflist = []
for subj_id in np.unique(full_dfs['LSAS'].index):
    subj_df = full_dfs['LSAS'].loc[subj_id]
    if len(subj_df) == 24: # completed survey once
        # long to wide, exclude attention check
        subj_df = subj_df.pivot(columns='SurveyQuestion', values='Response').add_prefix('LSAS_')

        subj_df['LSAS_Total'] = subj_df.sum(axis=1, skipna=False)

        tmp_dflist += [subj_df]
    elif len(subj_df) > 24:
        print(subj_id, len(subj_df))

preproc_dfs['LSAS'] = pd.concat(tmp_dflist)
preproc_dfs['LSAS'].to_csv(f'{out_dir}/LSAS-data-{todays_date}.csv', index_label='SubjectID')

1F3D051E-8385-4C5A-9627-B30383846268 25
20386890-DC58-451B-95D4-D8873C33D195 26
309399CE-63A7-472E-8619-65576780E6A2 31
405F5765-F99C-4A61-9D21-55C4BCD818B0 474
68a59a4a271b255df02ddff89ceeff0b 26


In [10]:
# save demographics
preproc_dfs['Demographics'] = full_dfs['Demographics'].copy()
preproc_dfs['Demographics'] = preproc_dfs['Demographics'].join(preproc_dfs['LSAS'])
preproc_dfs['Demographics'] = preproc_dfs['Demographics'].join(preproc_dfs['OCI'])
preproc_dfs['Demographics'] = preproc_dfs['Demographics'].join(preproc_dfs['SDS'])
preproc_dfs['Demographics'].to_csv(f'{out_dir}/Demographics-data-{todays_date}.csv', index_label='SubjectID')

In [11]:
# save as pickle
with open(f'{out_dir}/SocialBrainAppData-{todays_date}.pickle', 'wb') as handle:
    pickle.dump(preproc_dfs, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [12]:
# move files after preprocessing is completed
for json_file in json_files:
    os.rename(json_file, f'{json_dir}/processed/{os.path.basename(json_file)}')

In [13]:
print('Number of New Unique IDs with Any Data')
for name, id_list in id_lists.items():
    print(name, len(np.unique(id_list)))

Number of New Unique IDs with Any Data
Demographics 28
Hardball 23
HardballSubjectiveRatings 20
OCI 22
SDS 23
LSAS 14
Journey_decisions 32
Journey_memory 31
Journey_dots 0
Journey_characters 33


In [14]:
print('Number of New Unique IDs with Complete Data')
for name, id_list in preproc_dfs.items():
    print(name, len(np.unique(id_list.index)))

Number of New Unique IDs with Complete Data
Hardball 20
HardballSubjectiveRatings 20
Journey_decisions 1120
Journey_memory 328
Journey_characters 37
OCI 21
SDS 20
LSAS 9
Demographics 28


# Append new data to Master 

In [15]:
# append to Master csvs
preproc_dfs['Hardball'] = preproc_dfs['Hardball'].rename_axis('SubjectID')
preproc_dfs['Demographics'] = preproc_dfs['Demographics'].rename_axis('SubjectID')

master_dfs = {'Hardball': pd.read_csv('../data/Hardball-data-Master.csv', index_col='SubjectID'), 
              'Demographics': pd.read_csv('../data/Demographics-data-Master.csv', index_col='SubjectID')}

for subj_id in list(np.unique(preproc_dfs['Hardball'].index)):
    if subj_id in list(np.unique(master_dfs['Hardball'].index)):
        print(subj_id)
        preproc_dfs['Hardball'].loc[subj_id,'Include'] = 0.0
        preproc_dfs['Hardball'].loc[subj_id,'Notes'] = 'Participated previously'

master_dfs['Hardball'] = master_dfs['Hardball'].append(preproc_dfs['Hardball'])
master_dfs['Hardball'].to_csv('../data/Hardball-data-Master2.csv', index_label='SubjectID')

master_dfs['Demographics'] = master_dfs['Demographics'].append(preproc_dfs['Demographics'])
master_dfs['Demographics'].to_csv('../data/Demographics-data-Master2.csv', index_label='SubjectID')

8312DACF-5F5F-4BEC-9AC8-2F3CE0D0C47E
A428C902-3440-4AA8-9617-56BA7D16277D
E98C656F-137B-4603-9934-7697BF73323D


In [16]:
print('Total Number of Unique IDs with Complete Data')
for name, master_df in master_dfs.items():
    if name == 'Hardball':
        print(name, len(np.unique(master_df[master_df.Include==1].index)))
    else:
        print(name, len(np.unique(master_df.index)))

Total Number of Unique IDs with Complete Data
Hardball 122
Demographics 333
