In [None]:
import json
import os
import pickle
import warnings
from datetime import datetime
from glob import glob

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from socialbrainapp import (
    get_demographics,
    get_oci,
    get_sds,
    get_lsas,
    get_hardball,
    get_hardball_ratings,
    get_hardball_blocks,
    get_hardball_sessions,
    get_journey,
    get_withdrawn_ids,
)

warnings.simplefilter(action='ignore', category=FutureWarning)

# Parsing

In [None]:
# Get today's date (YYYYMMDD); it is stamped into every output file name
todays_date = datetime.now().strftime("%Y%m%d")

# Input / output locations, relative to the notebook's working directory
json_dir = '../json'
out_dir = '../data/runs'

# Later cells write CSV / pickle files into out_dir and do not create it
# themselves, so make sure it exists before any processing starts.
os.makedirs(out_dir, exist_ok=True)

# Load json data (sorted so files are always parsed in the same order)
json_files = sorted(glob(f'{json_dir}/*json'))
print(f'{len(json_files)} jsons to parse')

In [None]:
# Parse json data
data_keys = ['Demographics', 'Hardball', 'HardballSubjectiveRatings',
             'OCI', 'SDS', 'LSAS',
             'Journey_decisions', 'Journey_memory', 'Journey_dots', 'Journey_characters']

# For every data type: the UserId of each parsed record, and the parsed frames.
# Two independent dicts (not shared lists) with identical keys.
id_lists = {key: [] for key in data_keys}
df_lists = {key: [] for key in data_keys}

# SurveyName value -> parser imported from socialbrainapp
survey_parsers = {'OCI': get_oci, 'SDS': get_sds, 'LSAS': get_lsas}

withdraw_ids = []
withdrawshare_ids = []
for json_file in json_files:

    try:
        with open(json_file, 'rb') as handle:
            tmp_data = json.load(handle)

        for element in tmp_data:

            # Records without an explicit AppVersion are treated as version "1.0".
            # (VersionNum is not used any further in this cell.)
            VersionNum = element.get("AppVersion", "1.0")

            # Check for withdrawal (the flags are compared as the string 'True')
            if element.get('Withdrawal') == 'True':
                withdraw_ids.append(element['UserId'])

            if element.get('WithdrawalShare') == 'True':
                withdrawshare_ids.append(element['UserId'])

            # Get Demographics (records are recognised by their 'Age' key)
            if 'Age' in element:
                # Parse first so an id is only recorded together with its frame
                parsed_df = get_demographics(element)
                id_lists['Demographics'].append(element['UserId'])
                df_lists['Demographics'].append(parsed_df)

            # Get Survey Data (OCI / SDS / LSAS; other surveys are ignored)
            survey_name = element.get('SurveyName')
            if survey_name in survey_parsers:
                parsed_df = survey_parsers[survey_name](element)
                id_lists[survey_name].append(element['UserId'])
                df_lists[survey_name].append(parsed_df)

            # Get Game Data
            game_name = element.get('Game')

            # Get Hardball: records with a 'Screen' key are subjective ratings,
            # all others are behavioural trials
            if game_name == 'Hardball':
                if 'Screen' in element:
                    parsed_df = get_hardball_ratings(element)
                    id_lists['HardballSubjectiveRatings'].append(element['UserId'])
                    df_lists['HardballSubjectiveRatings'].append(parsed_df)
                else:
                    parsed_df = get_hardball(element)
                    id_lists['Hardball'].append(element['UserId'])
                    df_lists['Hardball'].append(parsed_df)

            # Get Journey
            elif game_name == 'Journey':
                try:
                    task_name, snt_df = get_journey(element)
                    id_lists['Journey_' + task_name].append(element['UserId'])
                    df_lists['Journey_' + task_name].append(snt_df)

                except Exception:
                    # not interested in some trials but want a better solution here.
                    # `Exception` (not a bare except) so Ctrl-C still interrupts the run.
                    continue

    except Exception as err:
        # Best effort: an unreadable / malformed file must not stop the whole run,
        # but say so instead of dropping it silently. Records parsed from this
        # file before the failure are kept.
        print(f'Skipping remainder of {json_file}: {err!r}')
        continue

# concatenate all df lists (data types without any record get no entry)
full_dfs = {key: pd.concat(frames) for key, frames in df_lists.items() if frames}

In [None]:
# Split the two withdrawal id lists collected during parsing into the three
# groups returned (in this order) by socialbrainapp.get_withdrawn_ids
(
    both_withdraw_noshare,
    withdraw_only,
    noshare_only,
) = get_withdrawn_ids(withdraw_ids, withdrawshare_ids)

# Preprocessing

In [None]:
# Preprocess Data
# Container for the preprocessed frames, keyed by data type; the following
# cells fill it and it is pickled at the end of the notebook.
preproc_dfs = dict()

## Hardball

In [None]:
# Drop exact duplicate records.
# DataFrame.drop_duplicates() compares column values only and ignores the index,
# which holds the subject ID here. The index is therefore added as a helper
# column for the comparison, so identical rows from *different* subjects are kept.
for key in ('Hardball', 'HardballSubjectiveRatings'):
    is_repeat = full_dfs[key].assign(_row_owner=full_dfs[key].index).duplicated().to_numpy()
    full_dfs[key] = full_dfs[key][~is_repeat]

time_cols = ['Year', 'Month', 'Day', 'Hour', 'Minute', 'Second']
withdraw_cols = ['WithdrawType', 'WithdrawAll', 'WithdrawStudy', 'WithdrawData']

Hardball_df_list       = []
HardballRating_df_list = []
for subj_id in np.unique(full_dfs['Hardball'].index):
    # Get subject data and then sort in time. (Every row of the selection carries
    # the same index label, so an additional sort_index() would be a no-op.)
    subj_hardball_df = full_dfs['Hardball'][full_dfs['Hardball'].index==subj_id]
    subj_hardball_df = subj_hardball_df.sort_values(by=time_cols)

    # Subjects with fewer than 60 trials have not completed the task: skip them
    # before doing any further work.
    ntrials = len(subj_hardball_df)
    if ntrials < 60:
        continue

    # Get subjective ratings
    subj_ratings_df = full_dfs['HardballSubjectiveRatings'][full_dfs['HardballSubjectiveRatings'].index==subj_id]

    # Check if withdrawn from study -> values for withdraw_cols, in that order
    if subj_id in both_withdraw_noshare:
        withdraw_flags = ('FromStudy', 1, 0, 0)
    elif subj_id in withdraw_only:
        withdraw_flags = ('FromStudy', 0, 1, 0)
    elif subj_id in noshare_only:
        withdraw_flags = ('Data', 0, 0, 1)
    else:
        withdraw_flags = ('NA', 0, 0, 0)

    # The index label is shared by all of the subject's rows, so each .at
    # assignment writes the value to every trial of this subject.
    for col, value in zip(withdraw_cols, withdraw_flags):
        subj_hardball_df.at[subj_id, col] = value
    subj_hardball_df.at[subj_id, 'NTrials'] = ntrials

    # identify blocks, then sessions, for each subject
    subj_hardball_df_blocks = get_hardball_blocks(subj_hardball_df.reset_index())
    subj_hardball_df_sessions = get_hardball_sessions(subj_hardball_df_blocks)

    # Add to list of dfs:
    Hardball_df_list       += [subj_hardball_df_sessions]
    HardballRating_df_list += [subj_ratings_df]

# combine subject dfs and build a single timestamp column from the date parts
preproc_dfs['Hardball'] = pd.concat(Hardball_df_list)
preproc_dfs['Hardball'] = preproc_dfs['Hardball'].reset_index()
preproc_dfs['Hardball']['DateTime'] = pd.to_datetime(preproc_dfs['Hardball'][time_cols])

preproc_dfs['HardballSubjectiveRatings'] = pd.concat(HardballRating_df_list)
preproc_dfs['HardballSubjectiveRatings'] = preproc_dfs['HardballSubjectiveRatings'].reset_index()
preproc_dfs['HardballSubjectiveRatings']['DateTime'] = pd.to_datetime(preproc_dfs['HardballSubjectiveRatings'][time_cols])

In [None]:
# Attach every subjective rating to the behavioural rows it belongs to:
# same subject, same team, and the session of the most recent trial that
# happened before the rating was given.
hardball_df = preproc_dfs['Hardball']  # alias; the frame is modified in place

# Columns receiving the rating value and the time it was given.
# RateDateTime starts as NaT (not float NaN) so the column has a datetime dtype
# from the beginning and Timestamps can be written into it without a dtype clash.
hardball_df['PerceivedControl'] = np.nan
hardball_df['RateDateTime'] = pd.NaT

for _, rating in preproc_dfs['HardballSubjectiveRatings'].iterrows():
    subj_id = rating['index']
    team_id = rating['TeamName']
    rating_time = rating['DateTime']

    # Trials of this subject that took place before the rating
    is_subject = hardball_df['index'] == subj_id
    behav_times = hardball_df.loc[is_subject, 'DateTime']
    earlier_times = behav_times[behav_times < rating_time]
    if earlier_times.empty:
        # A rating with no preceding behaviour cannot be matched to a session
        print(f'{subj_id}: rating at {rating_time} precedes all behaviour, skipped')
        continue

    # Latest earlier trial == smallest time distance to the rating
    idx = earlier_times.idxmax()
    sess_id = hardball_df.at[idx, 'SessionID']

    # All rows with matching subject, TeamName and SessionID
    rows_meeting_criteria = is_subject & (hardball_df['TeamName'] == team_id) & (hardball_df['SessionID'] == sess_id)

    if hardball_df.loc[rows_meeting_criteria, 'PerceivedControl'].isna().all():
        # First rating for these rows
        hardball_df.loc[rows_meeting_criteria, 'PerceivedControl'] = rating['Rate']
        hardball_df.loc[rows_meeting_criteria, 'RateDateTime'] = rating_time
    else:
        # A rating is already stored: check which one is closer to the behaviour
        behav_time = hardball_df.loc[rows_meeting_criteria, 'DateTime'].iloc[-1]
        prev_rate = hardball_df.loc[rows_meeting_criteria, 'PerceivedControl'].iloc[0]
        prev_rating_time = hardball_df.loc[rows_meeting_criteria, 'RateDateTime'].iloc[0]

        if abs(rating_time - behav_time) < abs(prev_rating_time - behav_time):
            # The new rating is closer: it becomes the primary rating and the
            # one it replaces is kept in the secondary columns instead of being lost.
            hardball_df.loc[rows_meeting_criteria, 'PerceivedControl2'] = prev_rate
            hardball_df.loc[rows_meeting_criteria, 'Rate2DateTimeTime'] = prev_rating_time
            hardball_df.loc[rows_meeting_criteria, 'PerceivedControl'] = rating['Rate']
            hardball_df.loc[rows_meeting_criteria, 'RateDateTime'] = rating_time
        else:
            # The stored rating is at least as close: keep it and put the new
            # one in the secondary columns (subject id printed for inspection).
            print(subj_id)
            hardball_df.loc[rows_meeting_criteria, 'PerceivedControl2'] = rating['Rate']
            hardball_df.loc[rows_meeting_criteria, 'Rate2DateTimeTime'] = rating_time


In [None]:
# Give the two columns produced by the earlier reset_index() calls descriptive
# names, then write the Hardball table to a dated CSV file.
preproc_dfs['Hardball'] = preproc_dfs['Hardball'].rename(
    columns={'index': 'SubjectID', 'level_0': 'TrialNum'}
)
preproc_dfs['Hardball'].to_csv(
    f'{out_dir}/Hardball-data-{todays_date}.csv',
    index_label='Index',
)

## Journey

In [None]:
# Store the Journey tables with the subject ID as a regular 'sub_id' column.
# NOTE(review): these copies contain *all* subjects; the trial-count filter
# below only applies to the CSV files written at the end of this cell.
for journey in ['Journey_decisions', 'Journey_memory', 'Journey_characters']:
    df = full_dfs[journey].copy().reset_index()
    preproc_dfs[journey] = df.rename(columns={'index': 'sub_id'})

# remove journey subjects with less than 63 completed trials
session_gap_s = 60 * 30  # trials more than 30 minutes apart start a new session
snt = []
ver = []
mem = []
for sub_id in np.unique(full_dfs['Journey_decisions'].index):

    sub_df = full_dfs['Journey_decisions'][full_dfs['Journey_decisions'].index == sub_id].copy()
    ntrials = len(sub_df)

    # preprocess anyone w/ 63 or more trials
    if ntrials < 63:
        continue

    sub_df['Num_trials'] = ntrials
    sub_df['Include'] = 1

    # if two trials are > 30 minutes apart define them as separate sessions...
    # total_seconds() is used because timedelta.seconds drops whole days
    # (a gap of 1 day + 5 min would otherwise read as 5 min).
    # NOTE(review): rows are taken in their stored order; this assumes the trials
    # are already chronological - confirm against get_journey.
    datetimes = [datetime.strptime(dt, "%Y-%m-%d %H:%M:%S.%f") for dt in sub_df['Datetime'].values]
    session_ids = [1]
    elapsed_s = [0]
    for previous, current in zip(datetimes, datetimes[1:]):
        gap_s = int((current - previous).total_seconds())
        starts_new_session = gap_s > session_gap_s
        session_ids.append(session_ids[-1] + int(starts_new_session))
        elapsed_s.append(gap_s)
    sub_df['Session'] = session_ids
    sub_df['Elapsed(s)'] = elapsed_s

    # count number of repeats for each decision
    counts = sub_df['decision_num'].value_counts()
    sub_df['Num_repeats'] = sub_df['decision_num'].map(counts).to_numpy()

    # put together; more than one 'characters' record is flagged as multiple task versions
    sub_ver = full_dfs['Journey_characters'][full_dfs['Journey_characters'].index == sub_id].copy()
    if len(sub_ver) > 1:
        print(f'{sub_id} multiple task versions')
        sub_df['Mult_versions'] = 1
    else:
        sub_df['Mult_versions'] = 0

    snt.append(sub_df)
    ver.append(sub_ver)
    mem.append(full_dfs['Journey_memory'][full_dfs['Journey_memory'].index == sub_id].copy())

# output
pd.concat(snt).to_csv(f'{out_dir}/SNT_data_{todays_date}.csv', index_label='sub_id')
pd.concat(mem).to_csv(f'{out_dir}/SNT-memory_data_{todays_date}.csv', index_label='sub_id')
pd.concat(ver).to_csv(f'{out_dir}/SNT-ver_data_{todays_date}.csv', index_label='sub_id')

# TO DO: output role of the characters selected in memory by referencing ver

## Demographics + Questionnaires

In [None]:
full_dfs['OCI'].to_csv(f'{out_dir}/OCI-rawdata-{todays_date}.csv', index_label='SubjectID')

# Keep OCI subjects who answered all 19 distinct items
# (item 15 is the attention check, the remaining 18 are summed).
tmp_dflist = []
for subj_id in np.unique(full_dfs['OCI'].index):
    # List selector -> always a DataFrame, also for a subject with a single row
    subj_df = full_dfs['OCI'].loc[[subj_id]]
    if len(np.unique(subj_df['SurveyQuestion'])) == 19: # every item answered at least once
        # grab attention check (first answer if the item was submitted repeatedly)
        attn_rows = subj_df[subj_df['SurveyQuestion']==15]
        attn_check = float(attn_rows['Response'].iloc[0])

        # long to wide, exclude attention check. One row per item is kept (the
        # first), as in the LSAS cell, so repeated answers cannot break pivot().
        subj_df = (
            subj_df[subj_df['SurveyQuestion']!=15]
            .drop_duplicates(subset=['SurveyQuestion'])
            .pivot(columns='SurveyQuestion', values='Response')
            .add_prefix('OCI_')
        )

        # skipna=False: a missing item makes the total NaN instead of too small
        subj_df['OCI_Total'] = subj_df.sum(axis=1, skipna=False)

        # NOTE(review): the flag is 1 when the attention-check response is NaN -
        # confirm against get_oci how a correct answer is encoded.
        subj_df['OCI_AttnCheck'] = 1 if np.isnan(attn_check) else 0

        tmp_dflist += [subj_df]
    elif len(subj_df) > 19:
        # more rows than items but not all items present: report for inspection
        print(subj_id, len(subj_df))

preproc_dfs['OCI'] = pd.concat(tmp_dflist)
preproc_dfs['OCI'].to_csv(f'{out_dir}/OCI-data-{todays_date}.csv', index_label='SubjectID')

In [None]:
full_dfs['SDS'].to_csv(f'{out_dir}/SDS-rawdata-{todays_date}.csv', index_label='SubjectID')

# Keep SDS subjects who answered all 21 distinct items
# (item 16 is the attention check, the remaining 20 are summed).
tmp_dflist = []
for subj_id in np.unique(full_dfs['SDS'].index):
    # List selector -> always a DataFrame, also for a subject with a single row
    subj_df = full_dfs['SDS'].loc[[subj_id]]
    if len(np.unique(subj_df['SurveyQuestion'])) == 21: # every item answered at least once
        # grab attention check (first answer if the item was submitted repeatedly)
        attn_rows = subj_df[subj_df['SurveyQuestion']==16]
        attn_check = float(attn_rows['Response'].iloc[0])

        # long to wide, exclude attention check. One row per item is kept (the
        # first), as in the LSAS cell, so repeated answers cannot break pivot().
        subj_df = (
            subj_df[subj_df['SurveyQuestion']!=16]
            .drop_duplicates(subset=['SurveyQuestion'])
            .pivot(columns='SurveyQuestion', values='Response')
            .add_prefix('SDS_')
        )

        # skipna=False: a missing item makes the total NaN instead of too small
        subj_df['SDS_Total'] = subj_df.sum(axis=1, skipna=False)

        # NOTE(review): the flag is 1 when the attention-check response is NaN -
        # confirm against get_sds how a correct answer is encoded.
        subj_df['SDS_AttnCheck'] = 1 if np.isnan(attn_check) else 0

        tmp_dflist += [subj_df]
    elif len(subj_df) > 21:
        # more rows than items but not all items present: report for inspection
        print(subj_id, len(subj_df))

preproc_dfs['SDS'] = pd.concat(tmp_dflist)
preproc_dfs['SDS'].to_csv(f'{out_dir}/SDS-data-{todays_date}.csv', index_label='SubjectID')

In [None]:
full_dfs['LSAS'].to_csv(f'{out_dir}/LSAS-rawdata-{todays_date}.csv', index_label='SubjectID')

# Keep LSAS subjects who answered all 24 distinct items
tmp_dflist = []
for subj_id in np.unique(full_dfs['LSAS'].index):
    # List selector -> always a DataFrame, also for a subject with a single row
    subj_df = full_dfs['LSAS'].loc[[subj_id]]
    if len(np.unique(subj_df['SurveyQuestion'])) == 24: # every item answered at least once
        # long to wide; only the first answer to each item is kept
        # (no attention-check item is removed in this cell)
        subj_df = (
            subj_df
            .drop_duplicates(subset=['SurveyQuestion'])
            .pivot(columns='SurveyQuestion', values='Response')
            .add_prefix('LSAS_')
        )

        # skipna=False: a missing item makes the total NaN instead of too small
        subj_df['LSAS_Total'] = subj_df.sum(axis=1, skipna=False)

        tmp_dflist += [subj_df]
    elif len(subj_df) > 24:
        # more rows than items but not all items present: report for inspection
        print(subj_id, len(subj_df))

preproc_dfs['LSAS'] = pd.concat(tmp_dflist)
preproc_dfs['LSAS'].to_csv(f'{out_dir}/LSAS-data-{todays_date}.csv', index_label='SubjectID')

In [None]:
# Hardball rows with SessionID 1 and OpponentNum 1
hardball_sess1 = preproc_dfs['Hardball'].loc[
    lambda trials: (trials['SessionID'] == 1) & (trials['OpponentNum'] == 1)
]

In [None]:
# Split the selected Hardball rows by condition. Each part gets its own name for
# the perceived-control column and is indexed by SubjectID for the later join
# onto the demographics table.
hardball_sess1_IC = (
    hardball_sess1[hardball_sess1['Condition'] == "Control"]
    .rename(columns={'PerceivedControl': 'PerceivedControl_IC'})
    .set_index('SubjectID')
)

hardball_sess1_NC = (
    hardball_sess1[hardball_sess1['Condition'] == "NoControl"]
    .rename(columns={'PerceivedControl': 'PerceivedControl_NC'})
    .set_index('SubjectID')
)


In [None]:
# save demographics
# Remove exact duplicate records. drop_duplicates() ignores the index, which
# holds the subject ID, and would also merge *different* subjects with identical
# answers; the index is therefore included in the comparison.
raw_demographics = full_dfs['Demographics']
is_repeat = raw_demographics.assign(_row_owner=raw_demographics.index).duplicated().to_numpy()
preproc_dfs['Demographics'] = raw_demographics[~is_repeat].copy()

# Add the questionnaire tables (one wide row per subject)
for survey in ('LSAS', 'OCI', 'SDS'):
    preproc_dfs['Demographics'] = preproc_dfs['Demographics'].join(preproc_dfs[survey])

# Add perceived control per condition. The Hardball tables are trial-level, so
# they are reduced to one value per subject (first non-missing) before joining;
# joining them directly would repeat every demographics row once per trial.
for sess1_df, rating_col in ((hardball_sess1_NC, 'PerceivedControl_NC'),
                             (hardball_sess1_IC, 'PerceivedControl_IC')):
    rating_per_subject = sess1_df[rating_col].groupby(level=0).first()
    preproc_dfs['Demographics'] = preproc_dfs['Demographics'].join(rating_per_subject)

withdraw_cols = ['WithdrawType', 'WithdrawAll', 'WithdrawStudy', 'WithdrawData']
for subj_id in np.unique(preproc_dfs['Demographics'].index):

    # Check if withdrawn from study -> values for withdraw_cols, in that order
    if subj_id in both_withdraw_noshare:
        withdraw_flags = ('FromStudy', 1, 0, 0)
    elif subj_id in withdraw_only:
        withdraw_flags = ('FromStudy', 0, 1, 0)
    elif subj_id in noshare_only:
        withdraw_flags = ('Data', 0, 0, 1)
    else:
        withdraw_flags = ('NA', 0, 0, 0)

    for col, value in zip(withdraw_cols, withdraw_flags):
        preproc_dfs['Demographics'].at[subj_id, col] = value

preproc_dfs['Demographics'].to_csv(f'{out_dir}/Demographics-data-{todays_date}.csv', index_label='SubjectID')

In [None]:
# save as pickle: the whole dict of preprocessed frames goes into one dated file
pickle_path = f'{out_dir}/SocialBrainAppData-{todays_date}.pickle'
with open(pickle_path, 'wb') as pickle_file:
    pickle.dump(preproc_dfs, pickle_file, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# move files after preprocessing is completed
processed_dir = f'{json_dir}/processed'
# os.rename does not create the target folder, so make sure it exists
os.makedirs(processed_dir, exist_ok=True)

# NOTE(review): the [1:] slice leaves the first file of the sorted list in
# json_dir, so it is parsed again on the next run - confirm this is intended
# and not an off-by-one.
for json_file in json_files[1:]:
    os.rename(json_file, f'{processed_dir}/{os.path.basename(json_file)}')

In [None]:
# Per data type: how many distinct user ids appeared in the parsed json records
print('Number of New Unique IDs with Any Data')
for name in id_lists:
    n_unique_ids = len(np.unique(id_lists[name]))
    print(name, n_unique_ids)

In [None]:
# Per preprocessed table: number of distinct subjects it contains
print('Number of New Unique IDs with Complete Data')

# Tables that were reset_index()'ed keep the subject ID in a column, not in the index
id_columns = {'Hardball': 'SubjectID', 'HardballSubjectiveRatings': 'index'}

for name, table in preproc_dfs.items():
    if name in id_columns:
        subject_ids = table[id_columns[name]]
    elif name.startswith('Journey_') and 'sub_id' in table.columns:
        # Journey tables carry the subject in 'sub_id'; their index is only a row
        # counter, so counting the index would report rows instead of subjects.
        # NOTE(review): the Journey tables in preproc_dfs are not filtered by
        # trial count, so this counts every subject with Journey data.
        subject_ids = table['sub_id']
    else:
        subject_ids = table.index
    print(name, len(np.unique(subject_ids)))

In [None]:
# Number of demographics records per calendar day, plotted over time.
# (matplotlib.pyplot is imported as plt in the import cell at the top.)
preproc_dfs['Demographics']['Date'] = pd.to_datetime(preproc_dfs['Demographics'][['Year','Month','Day']])
dates_df = preproc_dfs['Demographics'].groupby(['Date']).size()

# Explicit figure/axes so the plot does not depend on pyplot's "current figure"
fig, ax = plt.subplots(figsize=(15,3))
dates_df.plot(ax=ax)
ax.set(xlabel='Date', ylabel='Demographics records', title='Demographics records per day')
fig.autofmt_xdate()

# Saved into the notebook's working directory
fig.savefig(f'counts_{todays_date}.jpg',bbox_inches='tight',dpi=450)
plt.show()