In [2]:
import pandas as pd, numpy as np, json, pickle
from glob import glob

from socialbrainapp import get_demographics, get_oci, get_sds, get_lsas, get_hardball, get_hardball_ratings

In [3]:
# Locate the raw JSON exports and the output directory.
# Both paths are relative to the notebook's working directory.
json_dir = '../json'
out_dir = '../data'
# glob() returns matches in arbitrary (filesystem-dependent) order; sort so the
# files are parsed -- and rows are concatenated -- in a reproducible order.
json_files = sorted(glob(f'{json_dir}/*json'))

In [4]:
# Parse json data
#
# Every record of every JSON file is routed to the matching parser imported
# from `socialbrainapp`. Two parallel dicts are filled, keyed by data type:
#   id_lists[type] -> UserId of each record seen (duplicates included)
#   df_lists[type] -> parser output for each record
# For 'Journey' no parser exists yet, so only the UserId is stored as a
# placeholder and the key is skipped when concatenating below.
data_types = ('Demographics', 'Hardball', 'HardballSubjectiveRatings',
              'OCI', 'SDS', 'LSAS', 'Journey')
id_lists = {data_type: [] for data_type in data_types}
df_lists = {data_type: [] for data_type in data_types}

# SurveyName value -> parser; survey names double as keys of the dicts above.
survey_parsers = {'OCI': get_oci, 'SDS': get_sds, 'LSAS': get_lsas}

for json_file in json_files:
    try:
        with open(json_file, 'rb') as handle:
            tmp_data = json.load(handle)

        for element in tmp_data:
            fields = element.keys()

            # Get Demographics
            if 'Age' in fields:
                id_lists['Demographics'].append(element['UserId'])
                df_lists['Demographics'].append(get_demographics(element))

            # Get Survey Data (unknown survey names are ignored)
            if 'SurveyName' in fields:
                survey_name = element['SurveyName']
                parser = survey_parsers.get(survey_name)
                if parser is not None:
                    id_lists[survey_name].append(element['UserId'])
                    df_lists[survey_name].append(parser(element))

            # Get Game Data
            if 'Game' in fields:

                # Get Hardball: records carrying a 'Screen' field are parsed as
                # subjective ratings, all other Hardball records as trial data.
                if element['Game'] == 'Hardball':
                    if 'Screen' in fields:
                        id_lists['HardballSubjectiveRatings'].append(element['UserId'])
                        df_lists['HardballSubjectiveRatings'].append(get_hardball_ratings(element))
                    else:
                        id_lists['Hardball'].append(element['UserId'])
                        df_lists['Hardball'].append(get_hardball(element))

                # Get Journey
                if element['Game'] == 'Journey':
                    id_lists['Journey'].append(element['UserId'])
                    # TBD: no Journey parser yet, keep the UserId as a placeholder
                    df_lists['Journey'].append(element['UserId'])
    except Exception as err:
        # Best effort: a broken file must not abort the whole run, but say so
        # instead of silently dropping it. Records parsed before the failure
        # are kept; the remainder of the file is skipped.
        print(f'Skipping rest of {json_file}: {type(err).__name__}: {err}')
        continue

# concatenate all df lists
full_dfs = {}
for key, value in df_lists.items():
    if key == 'Journey':
        continue
    # pd.concat raises ValueError on an empty list; fall back to an empty frame
    # so a data type with zero records does not abort the notebook here.
    full_dfs[key] = pd.concat(value) if value else pd.DataFrame()

In [5]:
# Preprocess Data
# Container for the cleaned frames; the following cells add one entry per data type.
preproc_dfs = dict()

In [6]:
# Remove Hardball subjects with fewer than 60 rows (trials), together with
# their subjective ratings. Subjects with MORE than 60 rows are kept, but
# they are printed so they can be inspected.
HARDBALL_MIN_TRIALS = 60

# One pass over the index instead of one boolean-mask scan per subject.
# sort_index() keeps the report below in sorted subject order.
hardball_counts = full_dfs['Hardball'].index.value_counts().sort_index()
hardball_incomplete = hardball_counts.index[hardball_counts < HARDBALL_MIN_TRIALS]

preproc_dfs['Hardball'] = full_dfs['Hardball'][
    ~full_dfs['Hardball'].index.isin(hardball_incomplete)].copy()
# isin() simply matches nothing for subjects that have no ratings, so no
# exception handling is needed here.
preproc_dfs['HardballSubjectiveRatings'] = full_dfs['HardballSubjectiveRatings'][
    ~full_dfs['HardballSubjectiveRatings'].index.isin(hardball_incomplete)].copy()

for subj_id, ntrials in hardball_counts[hardball_counts > HARDBALL_MIN_TRIALS].items():
    print(subj_id, f'{ntrials} trials')

preproc_dfs['HardballSubjectiveRatings'].to_csv(f'{out_dir}/HardballSubjectiveRatings-data.csv', index_label='SubjectID')

# combine dfs: copy each session's subjective 'Rate' onto the matching trial
# rows as 'InfluenceRating'. A session is matched on subject + TeamName +
# Year + Month + Day.
print('\nCombining DFs')
text_to_display = []
hardball_ratings = preproc_dfs['HardballSubjectiveRatings']
# reset_index() moves the subject ids into a column named 'index' and gives
# every trial row a unique integer label that .at can address.
preproc_dfs['Hardball'] = preproc_dfs['Hardball'].reset_index()
hardball_trials = preproc_dfs['Hardball']

for idx in hardball_trials.index:
    subj_id = hardball_trials.at[idx, 'index']
    team_id = hardball_trials.at[idx, 'TeamName']
    year_id = hardball_trials.at[idx, 'Year']
    month_id = hardball_trials.at[idx, 'Month']
    day_id = hardball_trials.at[idx, 'Day']

    try:
        same_session = ((hardball_ratings['TeamName'] == team_id)
                        & (hardball_ratings['Year'] == year_id)
                        & (hardball_ratings['Month'] == month_id)
                        & (hardball_ratings['Day'] == day_id))
        influence_rating = float(hardball_ratings[same_session].at[subj_id, 'Rate'])
        hardball_trials.at[idx, 'InfluenceRating'] = influence_rating
    except (KeyError, TypeError, ValueError):
        # KeyError: no rating row for this subject/session (or column missing);
        # TypeError/ValueError: the lookup did not yield a single numeric value.
        message = f'No influence rating for {subj_id}'
        if message not in text_to_display:
            # report each subject only once
            text_to_display.append(message)
            print(message)
        continue

preproc_dfs['Hardball'] = hardball_trials.set_index('index')
preproc_dfs['Hardball'].to_csv(f'{out_dir}/Hardball-data.csv', index_label='SubjectID')


01C09A15-F285-4CD1-A1B8-623F60882DE2 120 trials
108E056E-5450-4FCF-9B49-7934ECB62AD6 240 trials
16ED2B73-2D91-4188-95B3-FE78A3D25B5E 61 trials
173C8FF4-084A-4D16-B098-790C15D90CA6 80 trials
242DC553-A15C-4150-8067-5081F6FC073A 180 trials
26a372c5a72ded55db6e41a2d0df363c 180 trials
34D820E8-6CA7-4637-9EE3-EC86C9A7D48D 120 trials
40421E3F-4390-4B6B-B062-AF44024E602D 120 trials
4A83291D-C22E-4396-985F-536A0BA35F5B 120 trials
5523e30b2fa790bb36387f35dd6a8cdb 243 trials
5e949e269cc08de572108129acd44eb4 72 trials
6FA939D1-7CCB-4D67-8198-7FC95AB013F7 61 trials
7ED551B3-9DA3-4FBA-8D2E-F2B50B7ED0BB 120 trials
96E84605-8291-42FE-88E3-EA1A8B1F44D6 120 trials
99D926F9-32FF-40E8-A5A2-82D29C5BA66C 61 trials
AFDE02A6-E690-40AE-A67D-3B38DE3570D9 137 trials
BB0C4835-20B4-4219-8041-477C90870CC1 120 trials
BB5DA2DF-7B3C-454E-8493-14BC0A93A149 102 trials
C5F8E458-27CF-4F09-A320-CCA0ECCCF54A 180 trials
D54BE9D4-118D-4E31-9451-60BA924EAF42 71 trials
F9D147C1-557E-4B46-89DB-9593CD794F13 93 trials
a3e72760af4

In [7]:
# Keep only OCI subjects that have exactly 19 rows. Subjects with fewer OR
# more rows are removed (the check is "!= 19", not "less than 19").
# NOTE(review): 19 is presumably the number of rows in a complete OCI survey -- confirm.
OCI_N_ROWS = 19

# Count rows per subject in one pass instead of one boolean-mask scan per subject.
oci_row_counts = full_dfs['OCI'].index.value_counts()
oci_complete_ids = oci_row_counts.index[oci_row_counts == OCI_N_ROWS]
preproc_dfs['OCI'] = full_dfs['OCI'][full_dfs['OCI'].index.isin(oci_complete_ids)].copy()

preproc_dfs['OCI'].to_csv(f'{out_dir}/OCI-data.csv', index_label='SubjectID')

In [8]:
# Keep only SDS subjects that have exactly 21 rows. Subjects with fewer OR
# more rows are removed (the check is "!= 21", not "less than 21").
# NOTE(review): 21 is presumably the number of rows in a complete SDS survey -- confirm.
SDS_N_ROWS = 21

# Count rows per subject in one pass instead of one boolean-mask scan per subject.
sds_row_counts = full_dfs['SDS'].index.value_counts()
sds_complete_ids = sds_row_counts.index[sds_row_counts == SDS_N_ROWS]
preproc_dfs['SDS'] = full_dfs['SDS'][full_dfs['SDS'].index.isin(sds_complete_ids)].copy()

preproc_dfs['SDS'].to_csv(f'{out_dir}/SDS-data.csv', index_label='SubjectID')

In [9]:
# Keep only LSAS subjects that have exactly 24 rows. Subjects with fewer OR
# more rows are removed (the check is "!= 24", not "less than 24").
# NOTE(review): 24 is presumably the number of rows in a complete LSAS survey -- confirm.
LSAS_N_ROWS = 24

# Count rows per subject in one pass instead of one boolean-mask scan per subject.
lsas_row_counts = full_dfs['LSAS'].index.value_counts()
lsas_complete_ids = lsas_row_counts.index[lsas_row_counts == LSAS_N_ROWS]
preproc_dfs['LSAS'] = full_dfs['LSAS'][full_dfs['LSAS'].index.isin(lsas_complete_ids)].copy()

preproc_dfs['LSAS'].to_csv(f'{out_dir}/LSAS-data.csv', index_label='SubjectID')

In [10]:
# Demographics are exported as parsed (no filtering step): store an
# independent copy in preproc_dfs, then write that copy to CSV.
demographics_df = full_dfs['Demographics'].copy()
preproc_dfs['Demographics'] = demographics_df
demographics_df.to_csv(f'{out_dir}/Demographics-data.csv', index_label='SubjectID')

In [11]:
# Persist the whole dict of preprocessed frames as a single pickle file
pickle_path = f'{out_dir}/SocialBrainAppData.pickle'
with open(pickle_path, 'wb') as pickle_file:
    pickle.dump(preproc_dfs, pickle_file, protocol=pickle.HIGHEST_PROTOCOL)

In [12]:
# Per data type: number of distinct UserIds that produced at least one parsed record
print('Total Number of Unique IDs with Any Data')
for data_name in id_lists:
    n_unique_ids = len(np.unique(id_lists[data_name]))
    print(data_name, n_unique_ids)

Total Number of Unique IDs with Any Data
Demographics 258
Hardball 117
HardballSubjectiveRatings 91
OCI 136
SDS 121
LSAS 66
Journey 215


In [13]:
# Per preprocessed frame: number of distinct index labels (subject ids) left
print('Total Number of Unique IDs with Complete Data')
for data_name, frame in preproc_dfs.items():
    unique_subjects = np.unique(frame.index)
    print(data_name, len(unique_subjects))

Total Number of Unique IDs with Complete Data
Hardball 94
HardballSubjectiveRatings 91
OCI 110
SDS 94
LSAS 52
Demographics 258


In [14]:
# For Development: this is a good data file
# json_file = '..\\data\\2022-07-15_07-00__sb_data.json'
# with open(json_file, 'rb') as handle:
#     tmp_data = json.load(handle)