In [1]:
import pandas as pd
import numpy as np
import pickle
import datetime
import json
import re
import os, gzip, shutil, fnmatch

import sklearn
import matplotlib
from zipfile import BadZipfile
from sklearn import preprocessing
from matplotlib import pyplot as plt 

___
#### Dataframe of Extracted Data from Daniel
___

In [2]:
# Locations of the tables that were already extracted for run 1.
deviceUsagePath = "/Users/farhan/DNL/BuddingScholar/Budding_Scholar_22-23/ExtractedData/run1_sk_deviceUsage.csv"
keyboardPath = "/Users/farhan/DNL/BuddingScholar/Budding_Scholar_22-23/ExtractedData/run1_sk_keyboard.csv"
surveyPath = "/Users/farhan/DNL/BuddingScholar/Budding_Scholar_22-23/ExtractedData/run1_survey_results.csv"

# Read the three CSVs in the same order as the paths above; every frame gets
# a fresh 0..n-1 index.
device, keyboard, survey = [
    pd.read_csv(csv_path).reset_index(drop=True)
    for csv_path in (deviceUsagePath, keyboardPath, surveyPath)
]

In [3]:
# Keep the participant id, the trial date and every keyboard sentiment column.
wanted_prefixes = ("keyboard_sentiment", "ParticipantIdentifier", "trial_date")
col = [column_name for column_name in keyboard.columns if column_name.startswith(wanted_prefixes)]
sentimentDF = keyboard[col]

___
##### Apple Sensorkit Sentiment Definitions:
- case absolutist || A mood that embodies absolutism. = 0
- case anger || A mood that embodies anger. = 4
- case anxiety || A mood that embodies worrying. = 3
- case confused || A mood that embodies confusion. = 9
- case death || A mood that expresses death. = 2
- case down || A mood that embodies depression. = 1
- case health || A general concern for health. = 5
- case lowEnergy || A mood that indicates low energy. = 8
- case positive || A mood that embodies positivity. = 6
- case sad || A mood that embodies sadness. = 7
___

___
Path to raw data from myDataHelps
___

In [4]:
# Root folder of the raw myDataHelps data; the export loop lists its sub-folders.
directory = "/Users/farhan/DNL/BuddingScholar/Budding_Scholar_22-23/Data/"
# Sub-path appended to each export folder to reach the keyboard metrics.
metric_folder = "sensorkit-keyboard-metrics/iPhone"
# NOTE(review): presumably a sample participant/device folder kept for manual
# inspection -- TODO confirm it is still needed.
participant = "2baee05a-5e5a-4436-8c25-2628d46d1e08/4684F36F-66CC-4FB1-9383-3BC2B008D365/"

___
Helper Functions
- Decompression
- Date fixing
- Formatting Data
___

In [7]:

## Iterative decompression
from gzip import BadGzipFile


def gz_extract(directory):
    """Decompress every ``.gz`` file that sits directly inside ``directory``.

    Each ``<name>.gz`` is written out as ``<name>`` in the same folder and the
    archive is deleted once it has been decompressed successfully. Archives
    that cannot be read (not gzip data, or truncated) are skipped and left
    untouched.

    The process working directory is not changed, so relative ``directory``
    values work and callers keep their current directory.

    Args:
        directory: Path of the folder to scan (not recursive).
    """
    extension = ".gz"
    for item in os.listdir(directory):  # loop through items in dir
        if not item.endswith(extension):  # only ".gz" files are handled
            continue

        gz_name = os.path.join(directory, item)
        file_name = os.path.join(directory, item.rsplit('.', 1)[0])
        # Decompress into a scratch file first so a failure can never leave a
        # half-written (or clobbered) output behind.
        partial_name = file_name + ".part"
        try:
            with gzip.open(gz_name, "rb") as f_in, open(partial_name, "wb") as f_out:
                shutil.copyfileobj(f_in, f_out)
        except (BadZipfile, BadGzipFile, EOFError):
            # Corrupt or truncated archive: discard the scratch file, keep the .gz.
            if os.path.exists(partial_name):
                os.remove(partial_name)
            continue

        os.replace(partial_name, file_name)
        os.remove(gz_name)  # delete zipped file

## returns a properly formatted word/emojiList
def get_sentiment_list(emojiList: list):
    """Turn a flat ``[code, count, code, count, ...]`` list into ten counts.

    Even positions hold a sentiment code (0-9, see the SensorKit sentiment
    table in this notebook) and the following odd position holds its count.
    The result has one slot per code; codes that do not appear stay at 0.

    Only the first ten pairs are read. Lists shorter than twenty items
    (including an empty list) are accepted, and a trailing code without a
    count is ignored.

    Args:
        emojiList: Flat list of alternating codes and counts.

    Returns:
        A list of ten ints where index ``k`` is the count reported for code ``k``.
    """
    returnList = [0] * 10

    usable = min(len(emojiList), 2 * len(returnList))
    for i in range(0, usable - 1, 2):
        code, count = emojiList[i], emojiList[i + 1]
        # Compare with == (not indexing by ``code``) so unexpected codes are
        # simply ignored.
        for slot in range(len(returnList)):
            if code == slot:
                returnList[slot] = int(count)
                break
    return returnList

In [6]:
from dateutil import parser

def fix_date(end_date):
    """Return the trial day that a timestamp string belongs to.

    The string is parsed with ``dateutil``'s parser and moved back four hours
    before its calendar date is taken, so anything before 4am counts towards
    the previous day (4am is when the day flips). No time-zone conversion is
    done here.
    """
    day_flip_offset = datetime.timedelta(hours=4)
    parsed = parser.parse(end_date)
    return (parsed - day_flip_offset).date()

def get_date_from_timestamp(timestamp):
    """Parse ``timestamp`` as a UTC ``%Y-%m-%d`` value and return its calendar date."""
    parsed_utc = pd.to_datetime(timestamp, utc=True, format='%Y-%m-%d')
    return parsed_utc.date()

def get_time_from_timestamp(timestamp):
    """Parse ``timestamp`` as a UTC ``%Y-%m-%d`` value and return its time of day."""
    parsed_utc = pd.to_datetime(timestamp, utc=True, format='%Y-%m-%d')
    return parsed_utc.time()

def set_am_pm(end_date):
    """Label a timestamp "am" or "pm" from its hour in US/Eastern time.

    The value is parsed as UTC and converted to US/Eastern. Hours strictly
    between 5 and 17 give "am"; every other hour gives "pm".
    """
    eastern = pd.to_datetime(end_date, format= '%Y-%m-%d', utc=True).tz_convert('US/Eastern')
    in_am_window = 5 < eastern.hour < 17
    return "am" if in_am_window else "pm"

def average_corrections_am_pm_values(dataframe):
    """Average the four correction counters per (name, ET_Date, am/pm) group.

    A single grouped mean over all four columns replaces the earlier four
    separate groupbys that were merged back together on the same keys; the
    resulting rows, column order and values are the same.

    Args:
        dataframe: Frame with the columns ``name``, ``ET_Date``, ``am/pm`` and
            the four ``total*Corrections`` columns.

    Returns:
        One row per group: the three key columns followed by the mean of
        each correction column.
    """
    group_keys = ['name', 'ET_Date', 'am/pm']
    correction_columns = [
        "totalRetroCorrections",
        "totalNearKeyCorrections",
        "totalSubstitutionCorrections",
        "totalSpaceCorrections",
    ]

    correctionsDF = dataframe.groupby(group_keys, as_index=False)[correction_columns].mean()

    return correctionsDF

def average_errors_am_pm_values(dataframe):
    """Average the three error-distance columns per (name, ET_Date, am/pm) group.

    A single grouped mean over all three columns replaces the earlier three
    separate groupbys that were merged back together on the same keys; the
    resulting rows, column order and values are the same.

    Args:
        dataframe: Frame with the columns ``name``, ``ET_Date``, ``am/pm`` and
            the three ``*ErrorDistance`` columns.

    Returns:
        One row per group: the three key columns followed by the mean of
        each error-distance column.
    """
    group_keys = ['name', 'ET_Date', 'am/pm']
    error_columns = [
        "shortWordCharKeyUpErrorDistance",
        "shortWordCharKeyDownErrorDistance",
        "spaceUpErrorDistance",
    ]

    errorDF = dataframe.groupby(group_keys, as_index=False)[error_columns].mean()

    return errorDF


___
Loop over raw data from myDataHelps to make an organized dataframe
- Extracting totalWords and WordSentiments
- Extracting totalEmojis and emojiSentiments
___

In [9]:
## Loop over all the exported data folders/directories
from json import JSONDecodeError
emojiSentimentList = []
wordSentimentList = []

# Output column suffix -> index into the list returned by get_sentiment_list,
# listed in the column order used for the result rows.
sentiment_codes = {
    "Absolutionist": 0, "Anger": 4, "Anxiety": 3, "Confused": 9, "Death": 2,
    "Down": 1, "Health": 5, "LowEnergy": 8, "Positive": 6, "Sad": 7,
}

for folder in os.listdir(directory):
    path = os.path.join(directory, folder, metric_folder)

    # Skips ".DS_Store" and any entry that has no keyboard-metrics folder.
    if not os.path.isdir(path):
        continue

    # A dedicated loop name keeps the module-level `participant` value intact.
    for participant_id in os.listdir(path):
        pFolder = os.path.join(path, participant_id)
        if not os.path.isdir(pFolder):  # stray files such as ".DS_Store"
            continue

        for data_folder in os.listdir(pFolder):
            final_path = os.path.join(pFolder, data_folder)
            if not os.path.isdir(final_path):
                continue

            # Decompress any .gz files so they show up in the listing below.
            gz_extract(final_path)

            ## Loop over all files in this path/directory
            for fname in os.listdir(final_path):
                if not fname.endswith("json"):
                    continue
                filename = os.path.join(final_path, fname)

                ## Load the JSON File (json.load, not json.loads, because we
                ## pass a file object); the handle is closed even on failure.
                try:
                    with open(filename) as file:
                        loaded_file = json.load(file)
                except JSONDecodeError:
                    continue

                ## Get the name
                name = loaded_file["device"]["name"]

                ## Iterate over all samples of this file
                for sample in loaded_file["samples"]:
                    timeStamp = sample["timestamp"]
                    sample_dict = sample["sample"]
                    sentimentDict = sample_dict["sentimentMetrics"]

                    ## Per-category counts for emojis and for words
                    emojiCountList = get_sentiment_list(sentimentDict["emojiCount"])
                    wordCountList = get_sentiment_list(sentimentDict["wordCount"])

                    emojiDictTemp = {
                        "name": name,
                        "ParticipantIdentifier": participant_id,
                        "TotalEmojis": sample_dict["totalEmojis"],
                        "timeStamp": timeStamp,
                    }
                    emojiDictTemp.update(
                        ("emoji" + label, emojiCountList[code])
                        for label, code in sentiment_codes.items()
                    )

                    wordDicttemp = {
                        "name": name,
                        "TotalWords": sample_dict["totalWords"],
                        "ParticipantIdentifier": participant_id,
                        "timeStamp": timeStamp,
                    }
                    wordDicttemp.update(
                        ("word" + label, wordCountList[code])
                        for label, code in sentiment_codes.items()
                    )

                    emojiSentimentList.append(emojiDictTemp)
                    wordSentimentList.append(wordDicttemp)


___
- Create the emoji Sentiment Dataframe
- Create the word Sentiment Dataframe
___

In [10]:
emojiSentimentDF = pd.DataFrame(emojiSentimentList)
wordSentimentDF = pd.DataFrame(wordSentimentList)

# Get date: only the timeStamp column is needed, so apply fix_date to that
# column directly instead of building a row object for every sample.
emojiSentimentDF["trial_date"] = emojiSentimentDF["timeStamp"].apply(fix_date)
wordSentimentDF["trial_date"] = wordSentimentDF["timeStamp"].apply(fix_date)


___
Sum samples pertaining to the same participant and trial date
___

In [11]:
import warnings
# NOTE(review): this silences every warning for the rest of the session,
# including any raised by later model fits -- consider narrowing the filter.
warnings.filterwarnings('ignore')

# Sum only the numeric count columns. Without numeric_only=True newer pandas
# versions also "sum" the timeStamp strings (concatenating them) instead of
# dropping that column.
group_keys = ['trial_date', 'ParticipantIdentifier', 'name']
emojis = emojiSentimentDF.groupby(group_keys, as_index =False).sum(numeric_only=True)
words = wordSentimentDF.groupby(group_keys, as_index =False).sum(numeric_only=True)

___
Filter for a certain emoji and word count
___

In [12]:
# Minimum number of emojis a participant/trial-day row needs to be kept.
MIN_TOTAL_EMOJIS = 15

# .copy() makes the filtered frame independent of the unfiltered one, so the
# later column assignments on `emojis` do not write into a slice.
emojis = emojis.loc[emojis['TotalEmojis'] >= MIN_TOTAL_EMOJIS].copy()
## words = words.loc[words['TotalWords'] >= 200]

___
Load the affect DF
- Which is created from raw data from myDataHelps using the self_report.ipynb notebook
___

In [13]:
affectDF = pd.read_csv("/Users/farhan/DNL/BuddingScholar/Budding_Scholar_22-23/output_tables/self_report.csv")

## Only keep affect scores in df: drop every column whose name ends in "gap"
col = list(filter(lambda column_name: not column_name.endswith("gap"), affectDF.columns))
affectDF = affectDF[col]

In [14]:

# Store trial_date as text in all three frames.
for dated_frame in (affectDF, emojis, words):
    dated_frame['trial_date'] = dated_frame['trial_date'].astype(str)


In [43]:
# Spot check: the word counts of one participant on one trial day.
check_participant = "90592e06-bcf6-4150-85b0-c5daf7e7569c"
check_day = "2022-10-14"
verifier = words.loc[(words.ParticipantIdentifier == check_participant) & (words.trial_date == check_day)]
verifier

Unnamed: 0,trial_date,ParticipantIdentifier,name,TotalWords,wordAbsolutionist,wordAnger,wordAnxiety,wordConfused,wordDeath,wordDown,wordHealth,wordLowEnergy,wordPositive,wordSad
503,2022-10-14,90592e06-bcf6-4150-85b0-c5daf7e7569c,iPhone,502,5,1,1,0,1,2,1,0,9,0


___
- Merge the affect and words DF
- Merge the affect and emojis DF
___

In [15]:
merge_keys = ['ParticipantIdentifier', 'trial_date']

# Left-join the affect scores onto each sentiment frame; every row of `words`
# / `emojis` is kept even when no affect row matches.
final_df_words = words.merge(affectDF, how='left', on=merge_keys).reset_index(drop=True)
final_df_emojis = emojis.merge(affectDF, how='left', on=merge_keys).reset_index(drop=True)

final_df_words.head(2)

Unnamed: 0,trial_date,ParticipantIdentifier,name,TotalWords,wordAbsolutionist,wordAnger,wordAnxiety,wordConfused,wordDeath,wordDown,...,SR_affect_pos_focused,SR_affect_pos_focused_am,SR_affect_pos_happy,SR_affect_pos_happy_am,SR_affect_pos_hopeful,SR_affect_pos_hopeful_am,SR_affect_pos_motivated,SR_affect_pos_motivated_am,SR_affect_pos_relaxedCalm,SR_affect_pos_relaxedCalm_am
0,2022-10-02,14b58072-ae3b-491e-a8ca-207f0d27ccf6,iPhone,4377,40,6,8,0,7,8,...,,,,,,,,,,
1,2022-10-03,14b58072-ae3b-491e-a8ca-207f0d27ccf6,iPhone,1298,20,3,1,0,7,3,...,,,,,,,,,,


___
Multi Level modelling
- Fixed effects occur due to our independent variable
- Random effects occur due to clustering of data
    - We can treat effect due to our repeated measures variable as a random effect
    - So in our case, we can treat our participants as a source of random effects in our model

Random effects models can be random slope, random intercept, or both
- Random intercept only = The random effects introduce different intercepts amongst groups (most common)
- Random slope = The random effects introduce different slopes (not common)
- Can have a model where the intercepts and slopes can both vary (best approach)
___

____
- Use a multi level model with participant identifier as the grouping variable
- Model most likely uses an analogue of random intercept method
___

In [20]:
import statsmodels.api as sm
import statsmodels.formula.api as smf
# Remove the display limits so dataframes are shown with all rows and columns.
for display_option in ('display.max_rows', 'display.max_columns'):
    pd.set_option(display_option, None)


In [22]:
def fit_random_intercept_model(dataframe, grouping_variable, independent_variable, dependent_variable):
    """Fit a mixed linear model grouped by ``grouping_variable``.

    Rows with a missing value in ``trial_date`` or in any of the three named
    columns are dropped before fitting.

    Note: the formula is built as ``"<independent_variable> ~ <dependent_variable>"``,
    i.e. ``independent_variable`` is placed on the left of ``~``.

    Args:
        dataframe: Frame holding ``trial_date`` and the three named columns.
        grouping_variable: Column whose values define the groups.
        independent_variable: Column name placed on the left of ``~``.
        dependent_variable: Column name placed on the right of ``~``.

    Returns:
        The object returned by calling ``fit()`` on the ``smf.mixedlm`` model.
    """
    needed_columns = ["trial_date", grouping_variable, independent_variable, dependent_variable]
    complete_rows = dataframe[needed_columns].dropna()

    formula = " ~ ".join([independent_variable, dependent_variable])

    model = smf.mixedlm(formula, complete_rows, groups=complete_rows[grouping_variable])
    return model.fit()


In [23]:
# Self-report affect measures; the model cells prefix each name with "SR_".
affect_measure_list = [
    # negative affect
    'affect_neg_angry',
    'affect_neg_ashamed',
    'affect_neg_bored',
    'affect_neg_depressed',
    'affect_neg_embarrassed',
    'affect_neg_frustrated',
    'affect_neg_guilty',
    'affect_neg_lazy',
    'affect_neg_lonelyIsolated',
    'affect_neg_nervousAnxious',
    'affect_neg_sad',
    'affect_neg_stressed',
    # positive affect
    'affect_pos_amused',
    'affect_pos_appreciated',
    'affect_pos_excited',
    'affect_pos_focused',
    'affect_pos_happy',
    'affect_pos_hopeful',
    'affect_pos_motivated',
    'affect_pos_relaxedCalm',
]

___
Find correlation between Total Words and Affect Measures
___

In [24]:
# One fitted model per affect measure, pairing SR_<measure> with TotalWords.
totalWordsxAffectCorrResults = [
    fit_random_intercept_model(final_df_words, "ParticipantIdentifier", "SR_" + affect_name, "TotalWords")
    for affect_name in affect_measure_list
]

___
Find correlation between Total Emojis and Affect Measures
- Use a multi level model with participant identifier as the grouping variable
- Model uses random intercept method
___

In [25]:
# One fitted model per affect measure, pairing SR_<measure> with TotalEmojis.
totalEmojisxAffectCorrResults = [
    fit_random_intercept_model(final_df_emojis, "ParticipantIdentifier", "SR_" + affect_name, "TotalEmojis")
    for affect_name in affect_measure_list
]

___
Find correlation between emojiCounts sentiment metrics and Affect Measures
___

In [26]:
# One fitted model per affect measure, pairing SR_<measure> with emojiSad.
emojiSad_x_Affect_CorrResults = [
    fit_random_intercept_model(final_df_emojis, "ParticipantIdentifier", "SR_" + affect_name, "emojiSad")
    for affect_name in affect_measure_list
]

In [27]:
# One fitted model per affect measure, pairing SR_<measure> with emojiAnger.
emojiAnger_x_Affect_CorrResults = [
    fit_random_intercept_model(final_df_emojis, "ParticipantIdentifier", "SR_" + affect_name, "emojiAnger")
    for affect_name in affect_measure_list
]

In [28]:
# One fitted model per affect measure, pairing SR_<measure> with emojiAnxiety.
emojiAnxiety_x_Affect_CorrResults = [
    fit_random_intercept_model(final_df_emojis, "ParticipantIdentifier", "SR_" + affect_name, "emojiAnxiety")
    for affect_name in affect_measure_list
]

In [29]:
# One fitted model per affect measure, pairing SR_<measure> with emojiHealth.
emojiHealth_x_Affect_CorrResults = [
    fit_random_intercept_model(final_df_emojis, "ParticipantIdentifier", "SR_" + affect_name, "emojiHealth")
    for affect_name in affect_measure_list
]

In [30]:
# One fitted model per affect measure, pairing SR_<measure> with emojiConfused.
emojiConfused_x_Affect_CorrResults = [
    fit_random_intercept_model(final_df_emojis, "ParticipantIdentifier", "SR_" + affect_name, "emojiConfused")
    for affect_name in affect_measure_list
]

In [32]:
# One fitted model per affect measure, pairing SR_<measure> with emojiLowEnergy.
emojiLowEnergy_x_Affect_CorrResults = [
    fit_random_intercept_model(final_df_emojis, "ParticipantIdentifier", "SR_" + affect_name, "emojiLowEnergy")
    for affect_name in affect_measure_list
]

In [33]:
# One fitted model per affect measure, pairing SR_<measure> with emojiPositive.
emojiPositive_x_Affect_CorrResults = [
    fit_random_intercept_model(final_df_emojis, "ParticipantIdentifier", "SR_" + affect_name, "emojiPositive")
    for affect_name in affect_measure_list
]

___
Find correlation between wordCounts sentiment metrics and Affect Measures
___

In [34]:
# wordSad_x_Affect_CorrResults = []
# for item in affect_measure_list:
#     wordSad_x_Affect_CorrResults.append(fit_random_intercept_model(final_df_words, "ParticipantIdentifier", "SR_" + item, "wordSad"))

In [35]:
# One fitted model per affect measure, pairing SR_<measure> with wordAnger.
wordAnger_x_Affect_CorrResults = [
    fit_random_intercept_model(final_df_words, "ParticipantIdentifier", "SR_" + affect_name, "wordAnger")
    for affect_name in affect_measure_list
]

In [36]:
# One fitted model per affect measure, pairing SR_<measure> with wordAnxiety.
wordAnxiety_x_Affect_CorrResults = [
    fit_random_intercept_model(final_df_words, "ParticipantIdentifier", "SR_" + affect_name, "wordAnxiety")
    for affect_name in affect_measure_list
]

In [37]:
# One fitted model per affect measure, pairing SR_<measure> with wordHealth.
wordHealth_x_Affect_CorrResults = [
    fit_random_intercept_model(final_df_words, "ParticipantIdentifier", "SR_" + affect_name, "wordHealth")
    for affect_name in affect_measure_list
]

In [38]:
# wordConfused_x_Affect_CorrResults = []
# for item in affect_measure_list:
#     wordConfused_x_Affect_CorrResults.append(fit_random_intercept_model(final_df_words, "ParticipantIdentifier", "SR_" + item, "wordConfused"))

In [39]:
# wordLowEnergy_x_Affect_CorrResults = []
# for item in affect_measure_list:
#     wordLowEnergy_x_Affect_CorrResults.append(fit_random_intercept_model(final_df_words, "ParticipantIdentifier", "SR_" + item, "wordLowEnergy"))

___
Print the desired findings
___

In [None]:
printer = totalWordsxAffectCorrResults ## Change this line to which model results need to be visualized

# Print the summary of every fitted model in the chosen result list.
for fitted_model in printer:
    print(fitted_model.summary())

___
Use pearson correlation to see correlations between variables within participant
- To be done again
___