# Data Cleaning

In [None]:
import sys
import pandas as pd
from datetime import timedelta
from importlib import reload
import numpy as np

In [None]:
# Put the local ../code/ directory first on the module search path
# (presumably where the util and cleaning_util modules imported below live -- confirm)
sys.path.insert(0, '../code/')

In [None]:
import util
import cleaning_util

## CrossCheck Daily Data Feature Prep

In [None]:
# Read the CrossCheck daily-level file into a DataFrame.
# NOTE: the path is a placeholder and has to be pointed at the real file.
daily_data = pd.read_csv(
    'path/to/crosscheck/daily/data/file',
)

In [None]:
# Parse the YYYYMMDD-formatted 'day' column into timestamps stored under 'date'
daily_data['date'] = pd.to_datetime(
    daily_data['day'],
    format='%Y%m%d',
)

In [None]:
# Every column except the identifier / date columns is a candidate feature
feature_cols = [
    col
    for col in daily_data.columns.values
    if col not in ('study_id', 'eureka_id', 'day', 'date')
]
# Columns whose name contains 'ema' are EMA columns; all others are behavior columns
ema_cols = [col for col in feature_cols if 'ema' in col]
behavior_cols = [col for col in feature_cols if not ('ema' in col)]

In [None]:
# Sort so each study ID's rows are contiguous and in date order
daily_data = daily_data.sort_values(['study_id', 'date']).reset_index(drop=True)
# Copy over the identifier and EMA columns; behavior columns are filled in below
crosscheck_df = daily_data[['study_id', 'eureka_id', 'date'] + ema_cols].copy()
# Initialise with NaN (not None) so the behavior columns are float rather than object dtype
for f in behavior_cols:
    crosscheck_df[f] = np.nan
# Number of days absent from each EMA's 3-day window (3 minus the rows found in it)
crosscheck_df['missing_days'] = 0

# Rows in which every EMA column is present; computed once instead of once per study
ema_complete = daily_data[ema_cols].notnull().all(axis=1)

# Go through each study ID (curr is only a progress counter)
for curr, s in enumerate(daily_data.study_id.unique()):
    print(curr)
    # Restrict to this study ID once rather than re-masking the full frame for every EMA
    study_df = daily_data.loc[daily_data.study_id == s, :]
    study_ema_index = study_df.index[ema_complete.loc[study_df.index].values]
    # Go through each EMA date, discarding the first EMA taken
    for ind in study_ema_index[1:]:
        # Get date
        d = study_df.loc[ind, 'date']
        # The window is the EMA day itself plus the two days before it (inclusive)
        start_date = d - timedelta(days=2)
        end_date = d
        filtered_df = study_df.loc[
            (study_df.date >= start_date) & (study_df.date <= end_date), :
        ]
        if filtered_df.shape[0] > 0:
            # Average each behavior column over the rows available in the window
            crosscheck_df.loc[ind, behavior_cols] = filtered_df[behavior_cols].mean().values
        # Record how many of the 3 window days have no row at all
        crosscheck_df.loc[ind, 'missing_days'] = 3 - filtered_df.shape[0]

In [None]:
# Drop all rows that have a missing value in any EMA column
crosscheck_df_cleaned = crosscheck_df.dropna(subset=ema_cols)
# Drop all rows where there is no behavioral data at all and fewer than 3 missing
# days were recorded. These should be the first EMA of each study ID.
no_behavior = crosscheck_df_cleaned[behavior_cols].isnull().all(axis=1)
few_missing = crosscheck_df_cleaned['missing_days'] < 3
crosscheck_df_cleaned = crosscheck_df_cleaned.loc[~(no_behavior & few_missing), :]

In [None]:
# Save the cleaned CrossCheck table; index=True also writes the row index as a CSV column
crosscheck_df_cleaned.to_csv(
    '../data/crosscheck_daily_data_cleaned_w_sameday.csv',
    index=True,
)

## StudentLife Data

### EMA Files

In [None]:
# Root folder of the raw StudentLife download (placeholder -- set to the real location)
studentlife_server_loc = 'path/to/raw/studentlife/folder'

In [None]:
# Load the JSON files found in the Social EMA response directory
ema_social_files = util.upload_directory(
    studentlife_server_loc
    + '/studentlife/dataset/EMA/response/Social/',
    file_type='json',
)

In [None]:
# Load the JSON files found in the Stress EMA response directory
ema_stress_files = util.upload_directory(
    studentlife_server_loc
    + '/studentlife/dataset/EMA/response/Stress/',
    file_type='json',
)

In [None]:
# Load the JSON files found in the Sleep EMA response directory
ema_sleep_files = util.upload_directory(
    studentlife_server_loc
    + '/studentlife/dataset/EMA/response/Sleep/',
    file_type='json',
)

In [None]:
# Load the JSON files found in the Behavior EMA response directory
ema_behavior_files = util.upload_directory(
    studentlife_server_loc
    + '/studentlife/dataset/EMA/response/Behavior/',
    file_type='json',
)

In [None]:
# Load the JSON files found in the Mood EMA response directory
ema_mood_files = util.upload_directory(
    studentlife_server_loc
    + '/studentlife/dataset/EMA/response/Mood/',
    file_type='json',
)

In [None]:
# Load the JSON files found in the PAM EMA response directory
ema_pam_files = util.upload_directory(
    studentlife_server_loc
    + '/studentlife/dataset/EMA/response/PAM/',
    file_type='json',
)

#### Prep EMA data

In [None]:
# Run cleaning_util.prep_studentlife_df over each EMA file collection, in the
# order mood, social, stress, sleep, behavior, PAM
(
    ema_mood_df,
    ema_social_df,
    ema_stress_df,
    ema_sleep_df,
    ema_behavior_df,
    ema_pam_df,
) = [
    cleaning_util.prep_studentlife_df(ema_files)
    for ema_files in (
        ema_mood_files,
        ema_social_files,
        ema_stress_files,
        ema_sleep_files,
        ema_behavior_files,
        ema_pam_files,
    )
]

In [None]:
# Hand the six per-category EMA frames, in this order, to cleaning_util.prep_ema_data
studentlife_ema_df = cleaning_util.prep_ema_data([
    ema_mood_df,
    ema_social_df,
    ema_stress_df,
    ema_sleep_df,
    ema_behavior_df,
    ema_pam_df,
])

In [None]:
# Save the prepared EMA table without the row index
studentlife_ema_df.to_csv(
    '../data/studentlife_ema_df_01192020.csv',
    index=False,
)

In [None]:
# Reload the saved EMA table from disk
studentlife_ema_df = pd.read_csv(
    '../data/studentlife_ema_df_01192020.csv',
)

#### Prep behavior files

In [None]:
# Root folder of the raw StudentLife download (placeholder path).
# NOTE(review): this repeats the assignment made before the EMA files were loaded,
# presumably so this section can be run on its own -- confirm.
studentlife_server_loc = 'path/to/raw/studentlife/folder'

In [None]:
# Load the files found in the activity sensing directory
activity_files = util.upload_directory(
    studentlife_server_loc + '/studentlife/dataset/sensing/activity/'
)

In [None]:
# Load the files found in the conversation sensing directory
conversation_files = util.upload_directory(
    studentlife_server_loc + '/studentlife/dataset/sensing/conversation/'
)

In [None]:
# Load the files found in the GPS sensing directory
gps_files = util.upload_directory(
    studentlife_server_loc + '/studentlife/dataset/sensing/gps/'
)

In [None]:
# Load the files found in the phone-lock sensing directory
phone_lock_files = util.upload_directory(
    studentlife_server_loc + '/studentlife/dataset/sensing/phonelock/'
)

##### Activity

In [None]:
# Clean the raw activity files with cleaning_util.clean_studentlife_activity
activity_df = cleaning_util.clean_studentlife_activity(
    activity_files,
)

In [None]:
# Save the cleaned activity table without the row index
activity_df.to_csv(
    '../data/studentlife_activity_03102020.csv',
    index=False,
)

In [None]:
# Reload the saved activity table from disk
activity_df = pd.read_csv(
    '../data/studentlife_activity_03102020.csv',
)

##### Conversations

In [None]:
# Clean the raw conversation files with cleaning_util.clean_studentlife_conversations
conversation_df = cleaning_util.clean_studentlife_conversations(
    conversation_files,
)

In [None]:
# Save the cleaned conversation table without the row index
conversation_df.to_csv(
    '../data/studentlife_conversations_01192020.csv',
    index=False,
)

##### Phone unlock

In [None]:
# Clean the raw phone-lock files with cleaning_util.clean_studentlife_unlock
unlock_df = cleaning_util.clean_studentlife_unlock(
    phone_lock_files,
)

In [None]:
# Save the cleaned unlock table without the row index
unlock_df.to_csv(
    '../data/studentlife_unlock_08282021.csv',
    index=False,
)

##### GPS location

In [None]:
# Clean the raw GPS files with cleaning_util.clean_studentlife_location
gps_df = cleaning_util.clean_studentlife_location(
    gps_files,
)

In [None]:
# Save the cleaned GPS table without the row index
gps_df.to_csv(
    '../data/studentlife_gps_01192020.csv',
    index=False,
)

##### Sleep

In [None]:
# Derive the sleep table from the phone-lock files via cleaning_util.clean_sleep_data,
# passing the EMA table along with the cutoff, start-time and correction settings
sleep_df = cleaning_util.clean_sleep_data(
    phone_lock_files,
    cutoff_duration=15,
    start_time=23,
    ema_df=studentlife_ema_df,
    correction='median',
)

In [None]:
# Save the sleep table without the row index
sleep_df.to_csv(
    '../data/studentlife_sleep_03192020.csv',
    index=False,
)

### Get "good days" (>= 19 hours of data per day)

In [None]:
# Compute the "good days" table from the raw activity files
good_days = cleaning_util.get_good_days(
    dfs=activity_files,
)

In [None]:
# Save the good-days table without the row index
good_days.to_csv(
    '../data/studentlife_good_days_03192020.csv',
    index=False,
)

### StudentLife Merge

In [None]:
# Reload the per-modality daily tables saved earlier
activity_df = pd.read_csv('../data/studentlife_activity_03102020.csv')
unlock_df = pd.read_csv('../data/studentlife_unlock_08282021.csv')
conversation_df = pd.read_csv('../data/studentlife_conversations_01192020.csv')
gps_df = pd.read_csv('../data/studentlife_gps_01192020.csv')
sleep_df = pd.read_csv('../data/studentlife_sleep_03192020.csv')

# Good days of data
good_days = pd.read_csv('../data/studentlife_good_days_03192020.csv')

# Round-trip 'day' through datetime and back to text so that every table
# carries the same string form of the date
for frame in (activity_df, unlock_df, conversation_df, gps_df, sleep_df, good_days):
    frame['day'] = pd.to_datetime(frame['day']).astype(str)

In [None]:
# Daily tables to attach, in merge order (this also fixes the output column order)
dfs = [studentlife_ema_df, activity_df, conversation_df, gps_df, sleep_df, unlock_df]

# Start from the (study_id, day) pairs listed in good_days; the left joins below
# keep exactly these pairs and attach whatever each table has for them
merged_df = good_days[['study_id', 'day']].copy()
# Normalise the join key to the same datetime-derived string form used for every
# table; this leaves an already-normalised 'day' column unchanged
merged_df['day'] = pd.to_datetime(merged_df['day']).astype(str)
for df in dfs:
    # studentlife_ema_df does not get its 'day' normalised anywhere else, so
    # normalise on a copy here to keep every join key comparable
    right_df = df.assign(day=pd.to_datetime(df['day']).astype(str))
    merged_df = pd.merge(left=merged_df, right=right_df, on=['study_id', 'day'], how='left')

In [None]:
# Save the merged StudentLife daily table without the row index
merged_df.to_csv(
    '../data/studentlife_daily_data_08282021.csv',
    index=False,
)

### StudentLife prep for prediction

In [None]:
# Reload the merged StudentLife daily table from disk
merged_df = pd.read_csv(
    '../data/studentlife_daily_data_08282021.csv',
)

In [None]:
# Work on an independent (deep) copy so merged_df is not modified
sl_daily_df = merged_df.copy(deep=True)

In [None]:
# Convert 'day' to timestamps so date arithmetic and comparisons work on it
sl_daily_df['day'] = pd.to_datetime(
    sl_daily_df['day'],
)

In [None]:
# Every column except the identifier / date columns is a candidate feature
sl_feature_cols = [
    col
    for col in sl_daily_df.columns.values
    if col not in ('study_id', 'day')
]
# Columns whose name contains 'ema' are EMA columns; all others are behavior columns
sl_ema_cols = [col for col in sl_feature_cols if 'ema' in col]
sl_behavior_cols = [col for col in sl_feature_cols if not ('ema' in col)]

In [None]:
# Sort so each study ID's rows are contiguous and in date order
sl_daily_df = sl_daily_df.sort_values(['study_id', 'day']).reset_index(drop=True)
# Copy over the identifier and EMA columns; behavior columns are filled in below
sl_df = sl_daily_df[['study_id', 'day'] + sl_ema_cols].copy()
# Initialise with NaN (not None) so the behavior columns are float rather than object dtype
for f in sl_behavior_cols:
    sl_df[f] = np.nan
# Number of days absent from each EMA's 3-day window (3 minus the rows found in it)
sl_df['missing_days'] = 0

# NOTE(review): never filled or read in this cell; kept in case later cells rely on it
keep_index = []

# Rows with at least one non-null EMA column; computed once instead of once per study
has_ema = sl_daily_df[sl_ema_cols].notnull().any(axis=1)

# Go through each study ID (curr is only a progress counter)
for curr, s in enumerate(sl_daily_df.study_id.unique()):
    print(curr)
    # Restrict to this study ID once rather than re-masking the full frame for every EMA
    study_df = sl_daily_df.loc[sl_daily_df.study_id == s, :]
    study_ema_index = study_df.index[has_ema.loc[study_df.index].values]
    # Go through each EMA date, discarding the first EMA taken
    for ind in study_ema_index[1:]:
        # Get date
        d = study_df.loc[ind, 'day']
        # The window is the EMA day itself plus the two days before it (inclusive)
        start_date = d - timedelta(days=2)
        end_date = d
        filtered_df = study_df.loc[
            (study_df.day >= start_date) & (study_df.day <= end_date), :
        ]
        if filtered_df.shape[0] > 0:
            # Average each behavior column over the rows available in the window
            sl_df.loc[ind, sl_behavior_cols] = filtered_df[sl_behavior_cols].mean().values
        # Record how many of the 3 window days have no row at all
        sl_df.loc[ind, 'missing_days'] = 3 - filtered_df.shape[0]

In [None]:
# Filter a copy so sl_df itself is left as it is
sl_df_cleaned = sl_df.copy()
# Drop all rows where there is no behavioral data at all and fewer than 3 missing
# days were recorded. These should be the first EMA of each study ID.
sl_no_behavior = sl_df_cleaned[sl_behavior_cols].isnull().all(axis=1)
sl_few_missing = sl_df_cleaned['missing_days'] < 3
sl_df_cleaned = sl_df_cleaned.loc[~(sl_no_behavior & sl_few_missing), :]

In [None]:
# Save the cleaned StudentLife table without the row index
sl_df_cleaned.to_csv(
    '../data/studentlife_daily_data_cleaned_w_sameday_08282021.csv',
    index=False,
)