In [1]:
import pandas as pd, numpy as np, json, pickle, os, warnings
from glob import glob
from datetime import datetime

from socialbrainapp import get_demographics, get_oci, get_sds, get_lsas, get_hardball, get_hardball_ratings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [2]:
# Get today's date (YYYYMMDD); used as a suffix on every output file name
todays_date = datetime.now().strftime("%Y%m%d")

# Load json data
json_dir = '../json'   # raw json exports waiting to be processed
out_dir = '../data'    # destination for the csv / pickle outputs

# Match only files with a real ".json" extension, and sort them so the parsing
# order (and therefore the row order of the concatenated tables) does not
# depend on the order in which the filesystem happens to list the files.
json_files = sorted(glob(f'{json_dir}/*.json'))
len(json_files)

50

In [3]:
# Parse json data
# Every kind of record extracted from the raw exports. The same keys index both
# the per-record subject ids (id_lists) and the parsed frames (df_lists).
data_keys = ['Demographics', 'Hardball', 'HardballSubjectiveRatings',
             'OCI', 'SDS', 'LSAS',
             'Journey_decisions', 'Journey_memory', 'Journey_dots', 'Journey_characters']
id_lists = {key: [] for key in data_keys}
df_lists = {key: [] for key in data_keys}

# SurveyName value -> parser for that survey
survey_parsers = {'OCI': get_oci, 'SDS': get_sds, 'LSAS': get_lsas}

# One short description per file / element that could not be parsed
parse_failures = []


def _store(key, element, parsed):
    """Record the element's UserId and its parsed frame under `key`.

    Both lists are only touched once the id and the target lists have been
    looked up successfully, so id_lists and df_lists always stay in step.
    Raises KeyError if the element has no 'UserId' or `key` is unknown.
    """
    user_id = element['UserId']
    ids, frames = id_lists[key], df_lists[key]
    ids.append(user_id)
    frames.append(parsed)


for json_file in json_files:
    # Unreadable / malformed files are skipped, but reported in the summary below
    try:
        with open(json_file, 'rb') as handle:
            tmp_data = json.load(handle)
    except (OSError, ValueError) as err:
        parse_failures.append(f'{os.path.basename(json_file)}: {type(err).__name__}: {err}')
        continue

    if not isinstance(tmp_data, list):
        parse_failures.append(f'{os.path.basename(json_file)}: top-level json value is not a list')
        continue

    for element in tmp_data:
        # Errors are handled per element: one bad record no longer silently
        # discards every record that follows it in the same file.
        try:
            keys = element.keys()

            # Get Demographics
            if 'Age' in keys:
                _store('Demographics', element, get_demographics(element))

            # Get Survey Data (OCI / SDS / LSAS; other surveys are ignored)
            if 'SurveyName' in keys:
                parser = survey_parsers.get(element['SurveyName'])
                if parser is not None:
                    _store(element['SurveyName'], element, parser(element))

            # Get Game Data
            if 'Game' in keys:
                # Get Hardball: elements with a 'Screen' key are parsed as
                # subjective ratings, all other Hardball elements as trial data
                if element['Game'] == 'Hardball':
                    if 'Screen' in keys:
                        _store('HardballSubjectiveRatings', element, get_hardball_ratings(element))
                    else:
                        _store('Hardball', element, get_hardball(element))

                # Get Journey
                # NOTE(review): get_journey is not among the names imported from
                # socialbrainapp in the import cell. Unless it is defined elsewhere,
                # every Journey element fails with a NameError (now visible in the
                # summary below instead of being silently swallowed).
                if element['Game'] == 'Journey':
                    task_name, snt_df = get_journey(element)
                    if len(task_name) > 0:
                        _store('Journey_' + task_name, element, snt_df)

        except Exception as err:  # best effort: skip the element, keep the reason
            parse_failures.append(f'{type(err).__name__}: {err}')

# Report what was skipped, grouped by reason, so data loss is never silent
if parse_failures:
    print(f'{len(parse_failures)} file(s)/element(s) could not be parsed:')
    print(pd.Series(parse_failures).value_counts().to_string())

# concatenate all df lists (record kinds without any data get no entry)
full_dfs = {key: pd.concat(frames) for key, frames in df_lists.items() if len(frames) > 0}

In [4]:
# Preprocess Data
# Container for the cleaned tables, keyed by table name; starts out empty.
preproc_dfs = dict()

In [5]:
# remove Hardball subjects with less than 60 completed trials
# NOTE(review): 60 is presumably the length of one complete Hardball game — confirm.
HARDBALL_N_TRIALS = 60

hardball = full_dfs['Hardball'].copy()
ratings = full_dfs['HardballSubjectiveRatings'].copy()

# Rows (trials) per subject id, sorted by id so the report below is stable
trial_counts = hardball.index.value_counts().sort_index()

# Subjects with too few trials are dropped from both tables; subjects that
# have no ratings at all are simply ignored when dropping from `ratings`.
incomplete_ids = trial_counts[trial_counts < HARDBALL_N_TRIALS].index
hardball = hardball.drop(index=incomplete_ids)
ratings = ratings.drop(index=incomplete_ids, errors='ignore')

# Subjects with more trials than expected are kept but reported
for subj_id, ntrials in trial_counts[trial_counts > HARDBALL_N_TRIALS].items():
    print(subj_id, f'{ntrials} trials')

# NTrials: trial count of the row's subject.
# Include: 1 for exactly HARDBALL_N_TRIALS trials, NaN (undecided) for more.
hardball['NTrials'] = hardball.index.map(trial_counts).to_numpy(dtype=float)
hardball['Include'] = np.where(hardball['NTrials'] == HARDBALL_N_TRIALS, 1.0, np.nan)

preproc_dfs['Hardball'] = hardball
preproc_dfs['HardballSubjectiveRatings'] = ratings
ratings.to_csv(f'{out_dir}/HardballSubjectiveRatings-data-{todays_date}.csv', index_label='SubjectID')

# combine dfs
print('\nCombining DFs')
text_to_display = []  # messages already printed, so each subject is reported once

# Move the subject id into an 'index' column so every trial row has a unique label
hardball = hardball.reset_index()
session_cols = ['TeamName', 'Year', 'Month', 'Day']

for idx, subj_id, team_id, year_id, month_id, day_id in \
        hardball[['index'] + session_cols].itertuples(index=True, name=None):

    # A rating is attached only when exactly one rating row matches the trial's
    # subject, team and date; zero or several matches count as "no rating".
    influence_rating = None
    try:
        same_session = ((ratings['TeamName'] == team_id)
                        & (ratings['Year'] == year_id)
                        & (ratings['Month'] == month_id)
                        & (ratings['Day'] == day_id)
                        & (ratings.index == subj_id))
        matches = ratings.loc[same_session.to_numpy(), 'Rate']
        if len(matches) == 1:
            # Explicit element access: calling float() on a Series is deprecated
            influence_rating = float(matches.iloc[0])
    except (KeyError, TypeError, ValueError):
        # Missing rating columns or a non-numeric rating: treat as no rating.
        # Deliberately not a bare except, so the cell can still be interrupted.
        influence_rating = None

    if influence_rating is None:
        message = f'No influence rating for {subj_id}'
        if message not in text_to_display:
            text_to_display.append(message)
            print(message)
        continue

    hardball.at[idx, 'InfluenceRating'] = influence_rating

preproc_dfs['Hardball'] = hardball.set_index('index')
# NOTE(review): the Hardball table is not written to csv here (line below is
# disabled); it is only persisted through the pickle saved later in the notebook.
# preproc_dfs['Hardball'].to_csv(f'{out_dir}/Hardball-data-{todays_date}.csv', index_label='SubjectID')


01C09A15-F285-4CD1-A1B8-623F60882DE2 120 trials
108E056E-5450-4FCF-9B49-7934ECB62AD6 240 trials
16ED2B73-2D91-4188-95B3-FE78A3D25B5E 61 trials
173C8FF4-084A-4D16-B098-790C15D90CA6 80 trials
242DC553-A15C-4150-8067-5081F6FC073A 180 trials
26a372c5a72ded55db6e41a2d0df363c 180 trials
34D820E8-6CA7-4637-9EE3-EC86C9A7D48D 120 trials
40421E3F-4390-4B6B-B062-AF44024E602D 120 trials
4A83291D-C22E-4396-985F-536A0BA35F5B 120 trials
5523e30b2fa790bb36387f35dd6a8cdb 243 trials
5e949e269cc08de572108129acd44eb4 72 trials
6FA939D1-7CCB-4D67-8198-7FC95AB013F7 61 trials
78F52907-6E6B-4FBE-BF52-78300B51B77B 120 trials
7ED551B3-9DA3-4FBA-8D2E-F2B50B7ED0BB 180 trials
96E84605-8291-42FE-88E3-EA1A8B1F44D6 120 trials
99D926F9-32FF-40E8-A5A2-82D29C5BA66C 61 trials
AFDE02A6-E690-40AE-A67D-3B38DE3570D9 137 trials
BB0C4835-20B4-4219-8041-477C90870CC1 120 trials
BB5DA2DF-7B3C-454E-8493-14BC0A93A149 102 trials
C5F8E458-27CF-4F09-A320-CCA0ECCCF54A 180 trials
D54BE9D4-118D-4E31-9451-60BA924EAF42 71 trials
F9D147C1-5

In [6]:
full_dfs['OCI'].to_csv(f'{out_dir}/OCI-rawdata-{todays_date}.csv', index_label='SubjectID')

# Keep only OCI subjects who completed the survey exactly once (19 rows):
# fewer rows are dropped silently, more rows are reported and dropped.
OCI_N_ROWS = 19          # rows per complete survey, attention check included
OCI_ATTN_QUESTION = 15   # SurveyQuestion number of the attention check

oci_wide_rows = []
# groupby always yields a DataFrame per subject (plain .loc[subj_id] returns a
# Series for a single-row subject, whose len() would count columns, not rows)
for subj_id, subj_long in full_dfs['OCI'].groupby(level=0):
    n_rows = len(subj_long)
    if n_rows > OCI_N_ROWS:
        print(subj_id, n_rows)
        continue
    if n_rows < OCI_N_ROWS:
        continue

    # Guard against a malformed question set (repeated question or missing
    # attention check), which would otherwise crash the pivot below
    is_attn = subj_long['SurveyQuestion'] == OCI_ATTN_QUESTION
    if is_attn.sum() != 1 or not subj_long['SurveyQuestion'].is_unique:
        print(subj_id, 'skipped: unexpected OCI question set')
        continue

    # grab attention check
    attn_response = subj_long.loc[is_attn.to_numpy(), 'Response'].iloc[0]

    # long to wide, exclude attention check
    subj_wide = (subj_long[~is_attn]
                 .pivot(columns='SurveyQuestion', values='Response')
                 .add_prefix('OCI_'))

    # Total is NaN as soon as any item is missing (skipna=False)
    subj_wide['OCI_Total'] = subj_wide.sum(axis=1, skipna=False)

    # 1 when the attention-check response is missing, 0 otherwise
    # NOTE(review): presumably "no response" means the check was passed — confirm
    subj_wide['OCI_AttnCheck'] = 1 if pd.isna(attn_response) else 0

    oci_wide_rows.append(subj_wide)

# pd.concat raises on an empty list; fall back to an empty table instead
preproc_dfs['OCI'] = pd.concat(oci_wide_rows) if oci_wide_rows else pd.DataFrame()
preproc_dfs['OCI'].to_csv(f'{out_dir}/OCI-data-{todays_date}.csv', index_label='SubjectID')

070048E2-CE06-491C-880D-6C954943E843 20
1A207FB8-923E-4EB2-81AA-D1AC181E670D 20
202788DB-1A75-448A-81EB-6482C5B97C1E 20
34D820E8-6CA7-4637-9EE3-EC86C9A7D48D 20
40421E3F-4390-4B6B-B062-AF44024E602D 20
5523e30b2fa790bb36387f35dd6a8cdb 32
5FD03A4A-77A6-41BD-A4C8-D2EB9130E911 20
669DD1DC-EB3A-4A7B-A75E-3037C9A38778 38
6884BFED-4F1C-4C91-9AE5-DCCCFC029491 76
695A9C75-000E-4781-8552-33E2C528F582 25
7ED551B3-9DA3-4FBA-8D2E-F2B50B7ED0BB 38
8A2D0E6B-11C7-40EF-9E99-D1C3D8CAFA62 20
8BA72A5B-E3EF-4DF6-9F6F-E28085099DD4 21
A298839E-25D2-4BB3-A43E-346FE2D73920 20
A428C902-3440-4AA8-9617-56BA7D16277D 41
A73B20D3-B9CE-4099-9BB7-6C138D862A82 39
B5C7CB4C-3059-4A5C-9896-93E16CE1B56D 38
B642A2A5-9082-4BDA-A2B4-A9C3EA98A2FF 21
BB5DA2DF-7B3C-454E-8493-14BC0A93A149 22
E6EDEED0-0542-439B-8E28-B63D2D44D1F8 38
F0D85366-F2B3-4AFC-A0DA-3FD41AC72B45 20


In [7]:
full_dfs['SDS'].to_csv(f'{out_dir}/SDS-rawdata-{todays_date}.csv', index_label='SubjectID')

# Keep only SDS subjects who completed the survey exactly once (21 rows):
# fewer rows are dropped silently, more rows are reported and dropped.
SDS_N_ROWS = 21          # rows per complete survey, attention check included
SDS_ATTN_QUESTION = 16   # SurveyQuestion number of the attention check

sds_wide_rows = []
# groupby always yields a DataFrame per subject (plain .loc[subj_id] returns a
# Series for a single-row subject, whose len() would count columns, not rows)
for subj_id, subj_long in full_dfs['SDS'].groupby(level=0):
    n_rows = len(subj_long)
    if n_rows > SDS_N_ROWS:
        print(subj_id, n_rows)
        continue
    if n_rows < SDS_N_ROWS:
        continue

    # Guard against a malformed question set (repeated question or missing
    # attention check), which would otherwise crash the pivot below
    is_attn = subj_long['SurveyQuestion'] == SDS_ATTN_QUESTION
    if is_attn.sum() != 1 or not subj_long['SurveyQuestion'].is_unique:
        print(subj_id, 'skipped: unexpected SDS question set')
        continue

    # grab attention check
    attn_response = subj_long.loc[is_attn.to_numpy(), 'Response'].iloc[0]

    # long to wide, exclude attention check
    subj_wide = (subj_long[~is_attn]
                 .pivot(columns='SurveyQuestion', values='Response')
                 .add_prefix('SDS_'))

    # Total is NaN as soon as any item is missing (skipna=False)
    subj_wide['SDS_Total'] = subj_wide.sum(axis=1, skipna=False)

    # 1 when the attention-check response is missing, 0 otherwise
    # NOTE(review): presumably "no response" means the check was passed — confirm
    subj_wide['SDS_AttnCheck'] = 1 if pd.isna(attn_response) else 0

    sds_wide_rows.append(subj_wide)

# pd.concat raises on an empty list; fall back to an empty table instead
preproc_dfs['SDS'] = pd.concat(sds_wide_rows) if sds_wide_rows else pd.DataFrame()
preproc_dfs['SDS'].to_csv(f'{out_dir}/SDS-data-{todays_date}.csv', index_label='SubjectID')

0ECD14EC-631D-4923-8DD4-D4A3E2FAFDBC 42
15A081D3-32F5-4740-9A93-D133D707C099 22
15C38FDF-2C9C-431A-BDF3-D886E47AAFAD 22
2558849D-F345-40B9-BC1B-22987A2EB9C8 22
321BCD37-998E-4F7C-A8FF-6C4E799FB6D4 22
43111F06-A728-47B9-B13E-288F48AA3238 42
451edd83489e8b126787c91fc1ca0fa6 22
5523e30b2fa790bb36387f35dd6a8cdb 23
5FD03A4A-77A6-41BD-A4C8-D2EB9130E911 42
669DD1DC-EB3A-4A7B-A75E-3037C9A38778 42
6884BFED-4F1C-4C91-9AE5-DCCCFC029491 42
8312DACF-5F5F-4BEC-9AC8-2F3CE0D0C47E 42
8611425A-D1FE-4F99-A92E-A67D05E08460 22
86DD495C-3C25-41F1-9872-47A60A992DAC 22
8BA72A5B-E3EF-4DF6-9F6F-E28085099DD4 22
8F4C21EF-5003-4DCA-BE31-1B2055D9C002 42
96E84605-8291-42FE-88E3-EA1A8B1F44D6 24
A12F15C6-4123-49F6-A2F9-C32A9A3F1205 22
B6C35BD6-EE25-4243-96BF-A9D85BC97D22 25
E3E9B23D-D3EB-4F3F-99E9-8D6E8FEB3246 22
F9E76FC7-94F3-4C41-A3A0-F80274451097 42


In [8]:
full_dfs['LSAS'].to_csv(f'{out_dir}/LSAS-rawdata-{todays_date}.csv', index_label='SubjectID')

# Keep only LSAS subjects who completed the survey exactly once (24 rows):
# fewer rows are dropped silently, more rows are reported and dropped.
LSAS_N_ROWS = 24   # rows per complete survey

lsas_wide_rows = []
# groupby always yields a DataFrame per subject (plain .loc[subj_id] returns a
# Series for a single-row subject, whose len() would count columns, not rows)
for subj_id, subj_long in full_dfs['LSAS'].groupby(level=0):
    n_rows = len(subj_long)
    if n_rows > LSAS_N_ROWS:
        print(subj_id, n_rows)
        continue
    if n_rows < LSAS_N_ROWS:
        continue

    # A repeated question would crash the pivot below; report and skip instead
    if not subj_long['SurveyQuestion'].is_unique:
        print(subj_id, 'skipped: unexpected LSAS question set')
        continue

    # long to wide (every row is pivoted; no attention check is removed here)
    subj_wide = (subj_long
                 .pivot(columns='SurveyQuestion', values='Response')
                 .add_prefix('LSAS_'))

    # Total is NaN as soon as any item is missing (skipna=False)
    subj_wide['LSAS_Total'] = subj_wide.sum(axis=1, skipna=False)

    lsas_wide_rows.append(subj_wide)

# pd.concat raises on an empty list; fall back to an empty table instead
preproc_dfs['LSAS'] = pd.concat(lsas_wide_rows) if lsas_wide_rows else pd.DataFrame()
preproc_dfs['LSAS'].to_csv(f'{out_dir}/LSAS-data-{todays_date}.csv', index_label='SubjectID')

08C278D1-4F43-4B78-9442-69CA52224A97 48
26a372c5a72ded55db6e41a2d0df363c 48
4B574E29-4B9D-4B6B-9AEB-40A981407001 25
62C52355-6880-45EF-B09D-7DD3546E5A24 25
6884BFED-4F1C-4C91-9AE5-DCCCFC029491 48
A298839E-25D2-4BB3-A43E-346FE2D73920 25
CFC6E9E1-5CF8-4971-9B7A-512179DEB71B 25
DC7D480F-01FB-4352-ABB8-D16E673A378F 47
E3E9B23D-D3EB-4F3F-99E9-8D6E8FEB3246 27
F0D85366-F2B3-4AFC-A0DA-3FD41AC72B45 25
F67AA14F-4FCB-4FFF-903A-A469DB86EC74 25
d485fe9a2ade8e0a8ea7cc43a0773032 25
ee7be7b03598aebbff76aa12004bdca9 25


In [9]:
# save demographics
# Start from a copy of the raw demographics and left-join the wide survey
# tables onto it by index, in the order LSAS, OCI, SDS.
demographics = full_dfs['Demographics'].copy()
for survey_name in ('LSAS', 'OCI', 'SDS'):
    demographics = demographics.join(preproc_dfs[survey_name])

preproc_dfs['Demographics'] = demographics
demographics.to_csv(f'{out_dir}/Demographics-data-{todays_date}.csv', index_label='SubjectID')

In [10]:
# save as pickle
# Writes the whole preproc_dfs dict to one dated file in out_dir.
pickle_path = f'{out_dir}/SocialBrainAppData-{todays_date}.pickle'
with open(pickle_path, 'wb') as pickle_file:
    pickle.dump(preproc_dfs, pickle_file, protocol=pickle.HIGHEST_PROTOCOL)

In [11]:
# Number of distinct subject ids collected for each record kind during parsing
print('Total Number of Unique IDs with Any Data')
for data_key in id_lists:
    n_unique_ids = np.unique(id_lists[data_key]).size
    print(data_key, n_unique_ids)

Total Number of Unique IDs with Any Data
Demographics 270
Hardball 123
HardballSubjectiveRatings 95
OCI 143
SDS 127
LSAS 70
Journey 225


In [12]:
# Number of distinct index labels (subject ids) in each preprocessed table
print('Total Number of Unique IDs with Complete Data')
for table_name, table in preproc_dfs.items():
    n_unique_ids = np.unique(table.index).size
    print(table_name, n_unique_ids)

Total Number of Unique IDs with Complete Data
Hardball 98
HardballSubjectiveRatings 95
OCI 113
SDS 99
LSAS 55
Demographics 270


In [13]:
# move files after preprocessing is completed
# Every file found by the glob is moved into <json_dir>/processed so that it is
# not picked up again on the next run. The target folder is created on first
# use; os.rename fails if the destination directory does not exist.
processed_dir = f'{json_dir}/processed'
os.makedirs(processed_dir, exist_ok=True)

for json_file in json_files:
    os.rename(json_file, f'{processed_dir}/{os.path.basename(json_file)}')

In [14]:
# For Development: this is a good data file
# json_file = '..\\data\\2022-07-15_07-00__sb_data.json'
# with open(json_file, 'rb') as handle:
#     tmp_data = json.load(handle)