In [1]:
import os
from pathlib import Path

import numpy as np
import pandas as pd

from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier

%load_ext autoreload
%autoreload 2

In [2]:
# Folder holding the CSV inputs; a relative path, so it is resolved against
# the directory the notebook is run from.
DATADIR: Path = Path("data")

In [3]:
# Show which files are present in the data folder before loading anything.
print(os.listdir(path=DATADIR))

['df_location_ratio.csv', 'df_phq9.csv', 'df_baseline_phq9.csv', 'df_passive_mobility_features.csv', 'df_demographics.csv', 'data_analysis.ipynb']


# Preparing the Data for Training

## Reading the Data

In [4]:
# Read df_phq9.csv from the data folder and preview its first five rows.
phq9 = pd.read_csv(DATADIR.joinpath('df_phq9.csv'))
phq9.head(n=5)

Unnamed: 0.1,Unnamed: 0,participant_id,week,sum_phq9,phq9Date,phq9_1,phq9_2,phq9_3,phq9_4,phq9_5,phq9_6,phq9_7,phq9_8,phq9_9,phq9_sum,phq9_level,phq9_level_diff,phq9_sum_diff
0,0,BLUE-00048,1,7,2014-08-08,0,1.0,1.0,2.0,1.0,0.0,0.0,0.0,0.0,5.0,1,,
1,1,BLUE-00050,2,7,2014-08-23,1,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,7.0,2,,
2,2,BLUE-00050,3,4,2014-08-30,1,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,4.0,1,-1.0,-3.0
3,3,BLUE-00050,4,5,2014-09-08,1,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,5.0,1,0.0,1.0
4,4,BLUE-00050,6,5,2014-09-22,1,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,5.0,1,0.0,0.0


In [56]:
# Read df_demographics.csv from the data folder and preview its first five rows.
demographics = pd.read_csv(DATADIR.joinpath('df_demographics.csv'))
demographics.head(n=5)

Unnamed: 0.1,Unnamed: 0,participant_id,working,income_satisfaction,income_lastyear,age,startdate,study,gen_Female,gen_Male,...,edu_None,edu_University,race_African-American/Black,race_American Indian/Alaskan Native,race_Asian,race_Hispanic/Latino,race_More than one,race_Native Hawaiian/other Pacific Islander,race_Non-Hispanic White,race_Other
0,213,EN00033,1.0,3.0,6.0,52.0,2016-08-12 12:41:00,Brighten-v2,0,1,...,0,1,0,0,0,0,0,0,1,0
1,214,EN00034,1.0,3.0,4.0,32.0,2016-08-12 12:58:00,Brighten-v2,1,0,...,0,1,0,0,0,0,0,0,1,0
2,215,EN00035,0.0,2.0,2.0,57.0,2016-08-12 15:41:00,Brighten-v2,0,1,...,0,1,0,0,0,0,0,0,1,0
3,216,EN00036,1.0,3.0,3.0,55.0,2016-08-13 10:24:00,Brighten-v2,1,0,...,0,1,0,0,0,0,0,0,1,0
4,217,EN00037,0.0,2.0,6.0,34.0,2016-08-15 09:04:00,Brighten-v2,0,1,...,0,1,0,0,0,0,0,0,1,0


In [6]:
# Read df_location_ratio.csv from the data folder and preview its first five rows.
locations = pd.read_csv(DATADIR.joinpath('df_location_ratio.csv'))
locations.head(n=5)

Unnamed: 0,participant_id,date,automotive,consumer_goods,dining_out,education,entertainment,finance,government_offices,health,...,home_store,lodging,park,personal_services,place_of_mourning,place_of_worship,repair,supermarket,transit,work
0,EN00039,2016-09-15,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,EN00039,2016-09-17,0.0,0.0,0.0,0.011111,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,EN00039,2016-11-11,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,EN00039,2016-11-13,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,EN00039,2016-11-14,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Cleaning a Bit More

In [7]:
# Give the PHQ9 date column the same name (`date`) that the locations table
# uses. Rebinding the result instead of passing inplace=True leaves the frame
# object that earlier cells displayed untouched.
phq9 = phq9.rename(columns={'phq9Date': 'date'})

In [57]:
# Preview the first five rows again to check the column is now called `date`.
phq9.head(n=5)

Unnamed: 0.1,Unnamed: 0,participant_id,week,sum_phq9,date,phq9_1,phq9_2,phq9_3,phq9_4,phq9_5,phq9_6,phq9_7,phq9_8,phq9_9,phq9_sum,phq9_level,phq9_level_diff,phq9_sum_diff,has_phq9
0,0,BLUE-00048,1,7,2014-08-08,0,1.0,1.0,2.0,1.0,0.0,0.0,0.0,0.0,5.0,1,,,True
1,1,BLUE-00050,2,7,2014-08-23,1,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,7.0,2,,,True
2,2,BLUE-00050,3,4,2014-08-30,1,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,4.0,1,-1.0,-3.0,True
3,3,BLUE-00050,4,5,2014-09-08,1,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,5.0,1,0.0,1.0,True
4,4,BLUE-00050,6,5,2014-09-22,1,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,5.0,1,0.0,0.0,True


## Joining the Tables

In [210]:
# NOTE(review): the star import is kept only because cells outside this one may
# still call data_processing helpers by their bare names. Everything in this
# cell goes through the explicit `dp.` namespace instead.
from data_processing import *
import data_processing as dp

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor


# --- Experiment configuration ---
TYPE = 'classification'  # regression|classification
TARGET = 'value'  # diff|value
SEED = 433  # used for both the train/test split and the forest

# --- Load the three tables through the project helpers ---
phq9 = dp.load_phq9_targets(DATADIR / 'df_phq9.csv', type=TYPE, target=TARGET)
locations = dp.load_locations(DATADIR / 'df_location_ratio.csv')
demographics = dp.load_demographics(DATADIR / 'df_demographics.csv')

# --- Join them and prepare the feature matrix ---
# (the exact join/preprocessing rules live in data_processing)
combined = dp.combine(phq9, dailies=[('locations', locations)], constants=[demographics])
combined = dp.rf_preprocess(combined)

# Hold out 15% of the rows for testing.
# NOTE(review): this is a row-wise random split; presumably rows of the same
# participant can land in both sets. Confirm whether a grouped split is wanted.
x, y = dp.xy_split(combined)
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.15, random_state=SEED)

print('Train set shape:', x_train.shape)
print('Test set shape:', x_test.shape)
print()

if TYPE == 'regression':
    def rmse(actual, predicted):
        """Return the root-mean-squared error between two aligned sequences."""
        return np.sqrt(((actual - predicted) ** 2).mean())

    model = RandomForestRegressor(n_estimators=50, n_jobs=-1, random_state=SEED)
    model.fit(x_train, y_train)

    train_rmse = rmse(y_train, model.predict(x_train))
    test_rmse = rmse(y_test, model.predict(x_test))
    print(f'Train set RMSE: {train_rmse:.4f}')
    print(f'Test set RMSE:  {test_rmse:.4f}')

    # For a regressor, `score` is the coefficient of determination (R^2).
    print('Train score:', model.score(x_train, y_train))
    print('Test score:', model.score(x_test, y_test))
elif TYPE == 'classification':
    model = RandomForestClassifier(n_estimators=50, n_jobs=-1, random_state=SEED)
    model.fit(x_train, y_train)

    # For a classifier, `score` is the mean accuracy; shown as a percentage.
    train_acc = 100 * model.score(x_train, y_train)
    test_acc = 100 * model.score(x_test, y_test)
    print(f'Train set accuracy: {train_acc:.2f}%')
    print(f'Test set accuracy:  {test_acc:.2f}%')
else:
    # Without this branch a typo in TYPE trained nothing and left whatever
    # `model` an earlier run had put in the kernel silently in place.
    raise ValueError(f"Unknown TYPE {TYPE!r}; expected 'regression' or 'classification'")

Train set shape: (986, 42)
Test set shape: (175, 42)

Train set accuracy: 94.52%
Test set accuracy:  49.71%


In [203]:
# Number of distinct participants in the PHQ9 targets. It is printed explicitly
# because a notebook cell renders only its last expression; the bare expression
# that used to stand here was evaluated and then discarded.
print('Unique PHQ9 participants:', phq9.participant_id.unique().size)
# combined.participant_id.unique().size
# locations.participant_id.unique().size
locations

Unnamed: 0,participant_id,date,automotive,consumer_goods,dining_out,education,entertainment,finance,government_offices,health,...,lodging,park,personal_services,place_of_mourning,place_of_worship,repair,supermarket,transit,work,has_locations
0,EN00039,2016-09-15,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,True
1,EN00039,2016-09-17,0.0,0.0,0.0,0.011111,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,True
2,EN00039,2016-11-11,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,True
3,EN00039,2016-11-13,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,True
4,EN00039,2016-11-14,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10422,ES05073,2017-03-16,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,True
10423,ES05073,2017-03-17,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,True
10424,ES05073,2017-03-18,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,True
10425,ES05073,2017-03-20,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,True


#### Random Forest Trial

In [204]:
# Show only the location-ratio columns of the combined table. A label slice
# with .loc includes both end labels, so this runs from 'automotive' through
# 'work'. The bare `combined` expression that preceded this line was dropped:
# it was not the last expression of the cell, so it was never displayed.
combined.loc[:, 'automotive':'work']

Unnamed: 0,automotive,consumer_goods,dining_out,education,entertainment,finance,government_offices,health,home,home_store,lodging,park,personal_services,place_of_mourning,place_of_worship,repair,supermarket,transit,work
0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1156,0.0,0.0,0.000801,0.0,0.0,0.0,0.0,0.0,0.450850,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1157,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.541909,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1158,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.409618,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1159,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.552518,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


1.976435548356943 4.160818012698526


We need some kind of hand-made context. Since we only model the dates that actually have a PHQ9 score, how about averaging the location features recorded since the previous PHQ9 and using that average as context for the current PHQ9? This is done separately for each participant, of course.

In [75]:
# There's probably a faster pure pandas method, but whatever!
participants = daily.participant_id.unique()
out_rows = []

participants = ['EN00041']
for participant in participants:
    prows = daily[daily.participant_id == participant]
    if len(prows) == 1: # skip if there is no data
        continue
    prows
    break
prows

Unnamed: 0.1,Unnamed: 0,participant_id,week,sum_phq9,date,phq9_1,phq9_2,phq9_3,phq9_4,phq9_5,...,lodging,park,personal_services,place_of_mourning,place_of_worship,repair,supermarket,transit,work,has_location
4180,4180.0,EN00041,1.0,9.0,2016-08-19,1.0,1.0,1.0,1.0,1.0,...,,,,,,,,,,
4152,4152.0,EN00041,3.0,6.0,2016-09-02,1.0,1.0,0.0,2.0,0.0,...,,,,,,,,,,
3723,3723.0,EN00041,4.0,5.0,2016-09-09,0.0,1.0,0.0,1.0,0.0,...,,,,,,,,,,
4839,,EN00041,,,2016-09-12,,,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,True
4840,,EN00041,,,2016-09-15,,,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,True
4413,4413.0,EN00041,5.0,4.0,2016-09-16,0.0,1.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,True
4841,,EN00041,,,2016-09-17,,,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,True
4842,,EN00041,,,2016-09-18,,,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,True
4843,,EN00041,,,2016-09-19,,,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,True
4844,,EN00041,,,2016-09-20,,,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,True


In [73]:
# Ids of the participants that have at least one row whose `has_location` flag
# is not null. The row mask is built once and applied together with the column
# selection; the filtered frame that used to be evaluated on the first line
# was discarded without being displayed.
p = daily.loc[daily['has_location'].notna(), 'participant_id'].unique()
p

array(['EN00039', 'EN00041', 'EN00046', 'EN00047', 'EN00052', 'EN00055',
       'EN00056', 'EN00057', 'EN00060', 'EN00061', 'EN00064', 'EN00065',
       'EN00067', 'EN00068', 'EN00071', 'EN00073', 'EN00076', 'EN00077',
       'EN00080', 'EN00082', 'EN00083', 'EN00084', 'EN00088', 'EN00089',
       'EN00091', 'EN00101', 'EN00102', 'EN00106', 'EN00111', 'EN00124',
       'EN00125', 'EN00127', 'EN00129', 'EN00130', 'EN00132', 'EN00136',
       'EN00138', 'EN00142', 'EN00145', 'EN00147', 'EN00150', 'EN00152',
       'EN00155', 'EN00156', 'EN00164', 'EN00168', 'EN00171', 'EN00178',
       'EN00181', 'EN00185', 'EN00192', 'EN00194', 'EN00196', 'EN00198',
       'EN00208', 'EN00218', 'EN00219', 'EN00221', 'EN00225', 'EN00228',
       'EN00229', 'EN00231', 'EN00232', 'EN00243', 'EN00249', 'EN00250',
       'EN00251', 'EN00252', 'EN00262', 'EN00263', 'EN00272', 'EN00275',
       'EN00276', 'EN00281', 'EN00285', 'EN00295', 'EN00301', 'EN00309',
       'EN00313', 'EN00319', 'EN00322', 'EN00325', 

In [164]:
phq9 = pd.read_csv(DATADIR / 'df_phq9.csv')
s = phq9.loc[:, 'phq9_1':'phq9_9'].sum(axis=1)