In [1]:
import csv
import torch
print(torch.__version__)
import torch.nn as nn
from torch.autograd import Variable
import torch.utils.data as Data
import torchvision
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from matplotlib import cm
import seaborn as sns
import scipy.stats as stats
import numpy as np
import pandas as pd
import sklearn
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn import svm
from sklearn import metrics
from sklearn import cross_validation
from sklearn.preprocessing import MinMaxScaler
from xgboost import XGBClassifier
import random
import operator
%matplotlib inline

#ignore all warnings
import warnings
warnings.filterwarnings("ignore")

import datetime
from math import radians, cos, sin, asin, sqrt
from operator import itemgetter

from collections import OrderedDict

import os

0.2.0_4


In [57]:
# Fix the seed of PyTorch's random number generator so repeated runs match.
# NOTE(review): only torch is seeded here; `random` and numpy are imported above
# but not seeded in this cell - confirm whether later cells rely on them.
torch.manual_seed(1)    # reproducible

<torch._C.Generator at 0x7f02d06ee1b0>

In [6]:
class Time:
    """Conversions from millisecond Unix timestamps to day numbers and date strings."""

    @staticmethod
    def get_epoch_date_from(millis):
        """Return the number of whole days contained in `millis` milliseconds.

        Floor division is spelled out so that an integer timestamp always gives
        an integer day number, whether or not `/` performs true division.
        Callers in this notebook use the result as a `range()` bound and as an
        equality key, both of which need an integer.
        """
        return millis // (24 * 60 * 60 * 1000)

    @staticmethod
    def get_date_from(millis):
        """Format a millisecond timestamp as 'YYYY-MM-DD'.

        Uses `datetime.fromtimestamp`, i.e. the machine's local time zone.
        NOTE(review): `get_epoch_date_from` splits days on raw epoch
        milliseconds instead, so the two can disagree near midnight - confirm
        this is acceptable.
        """
        return datetime.datetime.fromtimestamp(millis / 1000).strftime('%Y-%m-%d')

    @staticmethod
    def get_date_time_from(millis):
        """Format a millisecond timestamp as 'YYYY-MM-DD HH:MM:SS' (local time zone)."""
        return datetime.datetime.fromtimestamp(millis / 1000).strftime('%Y-%m-%d %H:%M:%S')


In [7]:
class Distance:
    """Geographic distance helpers."""

    @staticmethod
    def haversine(p1, p2, radius=6371):
        """Return the great-circle distance between two points.

        p1, p2 -- sequences whose items 0 and 1 are latitude and longitude in
                  decimal degrees.
        radius -- sphere radius; the default 6371 is the Earth's radius in
                  kilometres (use 3956 for miles). The result is in the same
                  unit as `radius`.
        """
        # get lat/lons and convert decimal degrees to radians
        lat1, lon1, lat2, lon2 = map(radians, [p1[0], p1[1], p2[0], p2[1]])

        # haversine formula
        dlon = lon2 - lon1
        dlat = lat2 - lat1
        a = sin(dlat / 2) ** 2 + cos(lat1) * cos(lat2) * sin(dlon / 2) ** 2

        # For (nearly) antipodal points floating-point rounding can leave `a`
        # a hair above 1, which would make asin() raise ValueError. Written as
        # a comparison (not min()) so a NaN input still propagates as NaN.
        if a > 1.0:
            a = 1.0

        c = 2 * asin(sqrt(a))
        return c * radius


In [8]:
##### read data  ###

class LocationData:
    """Loads location samples and reshapes them into per-user daily trajectories.

    After `readLocationData` every entry of `self.locations` is a list laid out as
    [uuid, latitude, longitude, time_ms, provider, accuracy, date]
    where `date` is the value returned by `Time.get_epoch_date_from`.
    """

    def __init__(self):
        self.locations = []
        self.header = []
        self.users = list()
        self.ts_date_multiplier_value = 24 * 60 * 60 * 1000  # milliseconds per day
        # Both stay at -1 until convertToEqualTimeSeries() has been called.
        self.interval_value = -1
        self.day_samples_count = -1

    # precision_value = 3 ## neighborhood, 4 ## street
    def readLocationData(self, file_path="moodtraces_data/location.csv", precision_value=3):
        """Read the location CSV and populate `locations`, `header` and `users`.

        file_path       -- CSV file whose first row is a header.
        precision_value -- number of decimals latitude/longitude are rounded to.

        Rows whose accuracy value is above 50 (metres, per the original
        comment) are dropped.
        """
        # "rb" is what the Python 2 csv module expects; `with` guarantees the
        # handle is closed even if reading raises.
        with open(file_path, "rb") as f:
            reader = csv.reader(f)
            self.header = next(reader)
            rows = list(reader)

        ### arranging the columns as similar to mytraces ###
        # mytraces:   uuid, latitude, longitude, time, provider, accuracy
        # moodtraces: uuid, time, latitude, longitude, accuracy
        locations = [[row[0], row[2], row[3], row[1], 'gps', row[4]] for row in rows]

        ### removing location points with accuracy more than 50m ###
        locations = [loc for loc in locations if float(loc[5]) <= 50]

        for loc in locations:
            ### adding date column ###
            loc.append(Time.get_epoch_date_from(int(loc[3])))
            ### reduce precision of location points ###
            loc[1] = round(float(loc[1]), precision_value)
            loc[2] = round(float(loc[2]), precision_value)
        self.locations = locations

        # NOTE(review): the header keeps the file's own column order (plus
        # 'date'); it is not rearranged to match the reordered rows above.
        self.header.append('date')

        # set up users
        self.users = list(set(loc[0] for loc in self.locations))

    def filterUser(self, min_days):
        """Keep in `self.users` only users with samples on at least `min_days` distinct dates.

        Only the user list is narrowed; `self.locations` is left untouched.
        """
        # One pass over the samples instead of one full scan per user.
        days_by_user = {}
        for loc in self.locations:
            days_by_user.setdefault(loc[0], set()).add(loc[6])
        self.users = [u for u in self.users
                      if len(days_by_user.get(u, ())) >= min_days]

    # interval_value in millis
    def convertToEqualTimeSeries(self, interval_value = 30 * 60 * 1000):
        """Resample every (user, day) that has data onto a regular time grid.

        For each user in `self.users` and each date on which that user has at
        least one sample, one row is produced per grid step of
        `interval_value` milliseconds, copied from the sample selected by
        `getClostestLocValue` with its time replaced by the grid time.
        `self.locations` is replaced by the resampled rows.
        """
        # set up interval values (explicit floor division keeps the count an integer)
        self.interval_value = interval_value
        self.day_samples_count = self.ts_date_multiplier_value // interval_value

        # copy each sample and snap its time down to a multiple of interval_value
        snapped = []
        for loc in self.locations:
            item = [loc[0], loc[1], loc[2], loc[3], loc[4], loc[5], loc[6]]
            item[3] = (int(item[3]) // interval_value) * interval_value
            snapped.append(item)

        location_ts = []
        for u, d, d_data in self._iterUserDays(snapped):
            day_start = d * self.ts_date_multiplier_value
            day_end = day_start + self.ts_date_multiplier_value
            for t in range(day_start, day_end, self.interval_value):
                loc_t = list(self.getClostestLocValue(d_data, t))  # copy, do not alias
                loc_t[3] = t  # change the time value
                location_ts.append(loc_t)
        self.locations = location_ts

    def _iterUserDays(self, locations):
        """Yield (user, date, rows) for each user in `self.users` and each of that user's dates.

        Rows are grouped once up front rather than re-scanning `locations`
        for every user and every date. Dates are visited in
        `list(set(...))` order, exactly as the nested loops this replaces did.
        """
        rows_by_user = {}
        for loc in locations:
            rows_by_user.setdefault(loc[0], []).append(loc)

        for u in self.users:
            user_rows = rows_by_user.get(u, [])
            rows_by_day = {}
            for loc in user_rows:
                rows_by_day.setdefault(loc[6], []).append(loc)
            for d in list(set(loc[6] for loc in user_rows)):
                yield u, d, rows_by_day[d]

    # finds the last closest location sample (used above)
    def getClostestLocValue(self, locs, time):
        """Return the entry of `locs` whose time (index 3) best matches `time`.

        Preference order: a sample exactly at `time`; otherwise the latest
        sample before `time`; otherwise (all samples are later) the earliest
        sample. Raises ValueError when `locs` is empty.
        """
        diff = [l[3] - time for l in locs]
        if 0 in diff:  # there is an element at exact time
            return locs[diff.index(0)]
        earlier = [i for i in diff if i < 0]
        if earlier:  # closest sample before this time
            return locs[diff.index(max(earlier))]
        return locs[diff.index(min(diff))]  # no element before this time

    def createUserDailyTrajectoryLists(self):
        """Return a list of [user, date, [[lat, lon], ...]] with points ordered by time."""
        user_lists = []
        for u, d, d_data in self._iterUserDays(self.locations):
            ordered = sorted(d_data, key=itemgetter(3))  # sorting with time
            user_lists.append([u, d, [loc[1:3] for loc in ordered]])
        return user_lists
        

In [58]:
class PhqData:
    """Loads daily PHQ questionnaire responses and derives rolling 14-day scores.

    Indices the code reads from each entry of `self.moods`:
      0      uuid
      1      timestamp in milliseconds
      2      day of week (parsed with int())
      3..10  eight item answers, counted when equal to the string 'true'
      11     a time value (presumably how long the response took - confirm)
      12     epoch day, appended by readPhqData
      13     speed index, appended by computeScores

    `self.phq_scores` holds [uuid, date, score, day_of_week] records, to which
    convertTo2Labels appends a 0/1 label.
    """

    def __init__(self):
        self.header = []
        self.moods = []
        # Defined up front so computeDivergence/convertTo2Labels are safe to
        # call (as no-ops) before computeScores has run.
        self.phq_scores = []

    def readPhqData(self):
        """Read moodtraces_data/phq_raw.csv into `header` / `moods` and add the epoch-day column."""
        # "rb" is what the Python 2 csv module expects; `with` closes the
        # handle even if reading raises.
        with open("moodtraces_data/phq_raw.csv", "rb") as f:
            reader = csv.reader(f)
            self.header = next(reader)
            self.moods = list(reader)

        ### adding date column  ###
        for m in self.moods:
            m.append(Time.get_epoch_date_from(int(m[1])))
        self.header.append('date')

    @staticmethod
    def _scoreForTrueCount(true_count):
        """Map the number of 'true' answers in the window to an item score 0..3."""
        # the PHQ-8 scores for 0-1, 2-6, 7-11 and 12-14 days are 0, 1, 2 and 3
        # for 1-week it is 0-1 days, 2-3 days, 4-5 days and 6-7 days are 0, 1, 2 and 3
        if true_count <= 1:
            return 0
        if true_count <= 6:
            return 1
        if true_count <= 11:
            return 2
        return 3

    def computeScores(self, users, min_days):
        """Filter responses by speed index and build `self.phq_scores`.

        users    -- uuids to compute scores for.
        min_days -- users with fewer distinct response dates are skipped.

        For each kept user, every response date from the 14th distinct date
        onwards gets a score, provided the 14-day window ending on that date
        holds at least 11 responses. The score is the sum of the eight item
        scores (see _scoreForTrueCount).
        """
        ### assigning speed index ###
        times = [float(m[11]) for m in self.moods]
        if times:  # avoid taking the median of nothing
            median_time = np.median(times)
            for m, t in zip(self.moods, times):
                if t <= 0:
                    m.append(0)
                elif t > median_time:
                    m.append(1)
                else:
                    m.append(t / median_time)
        self.header.append('SI')

        ### filter with time less than 0.5 ###
        # NOTE(review): the two "user count" lines print the number of
        # responses, not users; the text is kept as-is so output is unchanged.
        print("Initial user count %s" % (len(self.moods),))
        self.moods = [m for m in self.moods if m[13] >= 0.5]
        print("User count %s" % (len(self.moods),))
        if self.moods:  # min() of an empty list would raise ValueError
            print("Min time ratio %s" % (min(m[13] for m in self.moods),))

        ### compute scores ###
        self.phq_scores = []
        user_count = 0
        response_threshold = 11  ## i.e., 80% of 14 days
        for u in users:
            u_data = [m for m in self.moods if m[0] == u]
            u_dates = sorted(set(m[12] for m in u_data))

            if len(u_dates) < min_days:
                continue

            user_count += 1
            # skip the user's first 13 distinct response dates
            for d in u_dates[13:]:
                # d comes from u_data, so there is always at least one row
                d_data = [m for m in u_data if m[12] == d]

                # get day of week
                day_of_week = int(d_data[0][2])

                phq_days = range((d - 13), (d + 1))  # current & past 13 days
                # get user data for phq_days
                phq_days_data = [m for m in u_data if m[12] in phq_days]

                # check if there is enough data to compute phq score for this date
                if len(phq_days_data) < response_threshold:
                    continue

                ### computing score for phq_days_data ###
                scores = []
                for j in range(3, 11):
                    true_count = sum(1 for m in phq_days_data if m[j] == 'true')
                    scores.append(self._scoreForTrueCount(true_count))

                self.phq_scores.append([u, d, sum(scores), day_of_week])
        self.header = ['uuid', 'date', 'score', 'day_of_week']
        print("Number of users %s" % (user_count,))

    def computeDivergence(self):
        """Replace each score by its offset from that user's mean for the same day of week.

        Records are modified in place. Means are taken only over day-of-week
        values that actually occur, so no mean of an empty list is computed
        and a day_of_week of 0 cannot wrap around to another day's slot.
        """
        records_by_user = {}
        for rec in self.phq_scores:
            records_by_user.setdefault(rec[0], []).append(rec)

        for u_data in records_by_user.values():
            # get avg phq values for each day of week
            scores_by_day = {}
            for rec in u_data:
                scores_by_day.setdefault(rec[3], []).append(rec[2])
            day_means = dict((day, np.mean(vals)) for day, vals in scores_by_day.items())

            # compute divergence
            for rec in u_data:
                rec[2] = rec[2] - day_means[rec[3]]

    def convertTo2Labels(self, sd_ratio):
        """Append a 0/1 label to every record and print the class counts.

        A record gets label 1 when its score exceeds that user's
        mean + sd_ratio * standard deviation, otherwise 0.
        """
        records_by_user = {}
        for rec in self.phq_scores:
            records_by_user.setdefault(rec[0], []).append(rec)

        for u_data in records_by_user.values():
            # get avg and sd of phq scores
            scores = [rec[2] for rec in u_data]
            u_avg_phq = np.mean(scores)
            u_sd_phq = np.std(scores)
            # compute labels
            for rec in u_data:
                label = 0
                if rec[2] > (u_avg_phq + u_sd_phq * sd_ratio):
                    label = 1
                rec.append(label)

        pos_count = sum(1 for rec in self.phq_scores if rec[4] == 1)
        print("Pos count: %s" % (pos_count,))
        print("Neg count: %s" % (len(self.phq_scores) - pos_count,))
        

In [None]:
class PhqScoreConverter:
    """Turns PHQ score records into labels using statistics over a whole record set.

    Records are lists in which index 2 is the score and index 3 the day of
    week; every method that labels or adjusts records does so in place and
    also returns the same list.
    """

    def __init__(self):
        # Filled in by computeAvgSd(); used by convertTo2Labels().
        self.overall_avg_phq = 0
        self.overall_sd_phq = 0

    def removeCyclicEffect(self, phq_scores):
        """Subtract from each score the mean score of all records with the same day of week.

        Day-of-week values are looked up as 1..7.
        NOTE(review): a value of 0 would index the slot of day 7 through
        Python's negative indexing - confirm the data never contains 0.
        """
        # mean score for day 1, day 2, ... day 7 (slot = day - 1)
        weekday_means = []
        for day in range(1, 8):
            same_day = [rec[2] for rec in phq_scores if rec[3] == day]
            weekday_means.append(np.mean(same_day))

        for rec in phq_scores:
            offset = weekday_means[rec[3] - 1]
            rec[2] = rec[2] - offset
        return phq_scores

    def computeAvgSd(self, phq_scores):
        """Store the mean and standard deviation of all scores on the instance."""
        values = [rec[2] for rec in phq_scores]
        self.overall_avg_phq, self.overall_sd_phq = np.mean(values), np.std(values)

    def convertTo2Labels(self, phq_scores, sd_ratio=0):
        """Append 1 to records scoring above mean + sd_ratio * sd, else append 0."""
        for rec in phq_scores:
            above = rec[2] > self.overall_avg_phq + self.overall_sd_phq * sd_ratio
            rec.append(1 if above else 0)
        return phq_scores

    def convertTo5Labels(self, phq_scores):
        """Append a band label 1..5 to each record, or -1 when the score matches no band.

        Matching is by equality against the listed whole-number scores, so a
        fractional score such as 5.5 falls in no band.
        """
        bands = (
            (1, (0, 1, 2, 3, 4, 5)),
            (2, (6, 7, 8, 9, 10)),
            (3, (11, 12, 13, 14, 15)),
            (4, (16, 17, 18, 19, 20)),
            (5, (21, 22, 23, 24)),
        )
        for rec in phq_scores:
            label = -1
            for band_label, members in bands:
                if rec[2] in members:
                    label = band_label
                    break
            rec.append(label)
        return phq_scores
    

In [36]:
class PhqLocMerge:
    """Pairs each PHQ record with the daily trajectories of the days leading up to it."""

    def __init__(self):
        # merged records accumulate here across mergeData() calls
        self.data = []

    def mergeData(self, daily_traj, phq_data, t_hist=14):
        """Append to `self.data` every PHQ record that has enough trajectory days.

        daily_traj -- entries of the form [user, date, trajectory].
        phq_data   -- records whose index 0 is the user and index 1 the date.
        t_hist     -- window length in days, ending on the record's own date.

        A record is kept when at least int(t_hist * 0.7) matching trajectory
        entries exist (at least 1 when t_hist is 1). Kept records are copied
        and extended with the list of trajectories ordered by date.
        """
        for record in phq_data:
            day = record[1]  # current day
            window = range((day - (t_hist - 1)), (day + 1))  # current & past t_hist-1 days

            matches = [entry for entry in daily_traj
                       if entry[0] == record[0] and entry[1] in window]
            matches.sort(key=lambda entry: int(entry[1]))  # sort with date
            paths = [entry[2] for entry in matches]  # extract only trajectories

            # minimum number of days required: 70% of t_hist, but never
            # less than one day for a single-day window
            if t_hist == 1:
                needed = 1
            else:
                needed = int(t_hist * 0.7)

            if len(paths) >= needed:
                merged = [record[i] for i in range(len(record))]
                merged.append(paths)
                self.data.append(merged)
            
        
#         ### merging phq to user_lists ###
#         for pd in phq_data:
#             d = pd[1] # current day
#             days = range((d-13), (d+1)) # list of current & past 13 days
#             trajectories = [traj for traj in daily_traj if traj[0] == pd[0] and traj[1] in days]
#             trajectories = sorted(trajectories, key = lambda x: int(x[1])) # sort with date
#             trajectories = [traj[2] for traj in trajectories] # extract only trajectories
            
#             # skip if not enough data is there
#             response_threshold = 10 ## i.e., 80% of 14 days
#             if len(trajectories) < response_threshold: 
#                 continue
            
#             ul_item = []
#             for i in range(len(pd)):
#                 ul_item.append(pd[i])
#             ul_item.append(trajectories)
#             self.data.append(ul_item)
            
        

In [None]:
class PhqLocMergeForDivergence:
    """Pairs each PHQ record with the daily trajectories of the days leading up to it.

    Same merge as PhqLocMerge, but the minimum number of trajectory days is
    given as an absolute count instead of a fraction of the window.
    """

    def __init__(self):
        # merged records accumulate here across mergeData() calls
        self.data = []

    def mergeData(self, daily_traj, phq_data, t_hist=14, response_threshold=10):
        """Append to `self.data` every PHQ record that has enough trajectory days.

        daily_traj         -- entries of the form [user, date, trajectory].
        phq_data           -- records whose index 0 is the user and index 1 the date.
        t_hist             -- window length in days, ending on the record's
                              own date (default 14, the previously fixed value).
        response_threshold -- minimum number of matching trajectory entries
                              (default 10, the previously fixed value, i.e.
                              roughly 70% of 14 days).

        Kept records are copied and extended with the list of trajectories
        ordered by date; the input records are not modified.
        """
        ### merging phq to user_lists ###
        for record in phq_data:
            d = record[1]  # current day
            days = range((d - (t_hist - 1)), (d + 1))  # current & past t_hist-1 days
            trajectories = [traj for traj in daily_traj
                            if traj[0] == record[0] and traj[1] in days]
            trajectories = sorted(trajectories, key=lambda x: int(x[1]))  # sort with date
            trajectories = [traj[2] for traj in trajectories]  # extract only trajectories

            # skip if not enough data is there
            if len(trajectories) < response_threshold:
                continue

            ul_item = list(record)
            ul_item.append(trajectories)
            self.data.append(ul_item)
            
        
#         a = [d for d in self.data if d[4] == 1]
#         print "Pos count:", len(a)
#         print "Neg count:", (len(self.data)-len(a))