In [22]:
import csv
import os
import numpy as np
from get_data_at_location import get_data_at_location
from calculate_confusion_matrix import calculate_confusion_matrix
import math
import pickle
import pandas as pd
import datetime
from scipy import stats
from count_transitions import count_transitions
from sklearn.preprocessing import OneHotEncoder
from sklearn import preprocessing
from preprocess import *

save_results = True

data_dir = 'data/'
data_dir_orig = '/home/sohrob/Dropbox/Data/CS120/'

fsq_map = {'Nightlife Spot':'Nightlife Spot (Bar, Club)', 'Outdoors & Recreation':'Outdoors & Recreation',\
          'Arts & Entertainment':'Arts & Entertainment (Theater, Music Venue, Etc.)',\
          'Professional & Other Places':'Professional or Medical Office',\
          'Food':'Food (Restaurant, Cafe)', 'Residence':'Home', 'Shop & Service':'Shop or Store'}

# building one hot encoder for foursquare locations (as extra features)
state7 = np.array(fsq_map.values()+['Unknown'])
le = preprocessing.LabelEncoder()
le.fit(state7)
state7_code = le.transform(state7)
enc = OneHotEncoder()
enc.fit(state7_code.reshape(-1, 1))

subjects = os.listdir(data_dir)
# subjects = [subjects[0]]

for (cnt,subj) in enumerate(subjects):
    
    print str(cnt) + ' ' + subj
    
    subject_dir = data_dir + subj + '/'
    samples = os.listdir(subject_dir)
    
    # checking in the original directory if the subject has app data
    sensors = os.listdir(data_dir_orig+subj)
    if 'app.csv' in sensors:
        has_app_data = True
    else:
        has_app_data = False
    
    # initialization
    feature = pd.DataFrame()
    target = pd.DataFrame()
    
    ind_last = 0
    
    for (i,samp) in enumerate(samples):
        
        sensor_dir = subject_dir + samp + '/'
        sensors = os.listdir(sensor_dir)
        
        # reading semantic location data and skipping if it does not exist
        if 'eml.csv' in sensors:
            filename = sensor_dir+'eml.csv'
            data = pd.read_csv(filename, delimiter='\t', header=None)
            target.loc[ind_last, 'location'] = preprocess_location(data.loc[0,6], parse=False)
            target.loc[ind_last, 'reason'] = preprocess_reason(data.loc[0,7], parse=False)
        else:
            print 'subject {} does not have location report data at i. skipping'.format(subject,samp)
            continue
        
        if 'fsq2.csv' in sensors:
            data_fsq = pd.read_csv(sensor_dir+'fsq2.csv', delimiter='\t', header=None)
            loc_fsq = data_fsq.loc[10,1]
            distance_fsq = float(data_fsq.loc[11,1])
            
            # converting foursquare category name to standard name
            if loc_fsq in fsq_map:
                loc_fsq = fsq_map[loc_fsq]
            else:
                loc_fsq = 'Unknown'
                
        else:
            loc_fsq = 'Unknown'
            distance_fsq = np.nan
        
        target.loc[ind_last, 'fsq'] = loc_fsq
        
        ## sensor features
        # light
        if 'lgt.csv' in sensors:
            data = pd.read_csv(sensor_dir+'lgt.csv', delimiter='\t', header=None)
            lgt = data[:][1]
            feature.loc[ind_last, 'lgt mean'] = np.nanmean(lgt)
            feature.loc[ind_last, 'lgt std'] = np.nanstd(lgt)
            feature.loc[ind_last, 'lgt off'] = np.sum(lgt==0)/float(lgt.size)
            feature.loc[ind_last, 'lgt zcrossing'] = np.sum(np.diff(np.sign(lgt-np.nanmean(lgt))))/float(lgt.size)
            feature.loc[ind_last, 'lgt skew'] = stats.skew(lgt)
            feature.loc[ind_last, 'lgt kurt'] = stats.kurtosis(lgt)
        else:
            feature.loc[ind_last, 'lgt mean'] = np.nan
            feature.loc[ind_last, 'lgt std'] = np.nan
            feature.loc[ind_last, 'lgt off'] = np.nan
            feature.loc[ind_last, 'lgt zcrossing'] = np.nan
            feature.loc[ind_last, 'lgt skew'] = np.nan
            feature.loc[ind_last, 'lgt kurt'] = np.nan

        # audio
        if 'aud.csv' in sensors:
            data = pd.read_csv(sensor_dir+'aud.csv', delimiter='\t', header=None)
            feature.loc[ind_last, 'aud mean'] = np.nanmean(data[:][1])
            feature.loc[ind_last, 'aud std'] = np.nanstd(data[:][1])
            feature.loc[ind_last, 'aud skew'] = stats.skew(data[:][1])
            feature.loc[ind_last, 'aud kurt'] = stats.kurtosis(data[:][1])
            feature.loc[ind_last, 'aud frq mean'] = np.nanmean(data[:][2])
            feature.loc[ind_last, 'aud frq std'] = np.nanstd(data[:][2])
            feature.loc[ind_last, 'aud frq skew'] = stats.skew(data[:][2])
            feature.loc[ind_last, 'aud frq kurt'] = stats.kurtosis(data[:][2])
        else:
            feature.loc[ind_last, 'aud mean'] = np.nan
            feature.loc[ind_last, 'aud std'] = np.nan
            feature.loc[ind_last, 'aud skew'] = np.nan
            feature.loc[ind_last, 'aud kurt'] = np.nan
            feature.loc[ind_last, 'aud frq mean'] = np.nan
            feature.loc[ind_last, 'aud frq std'] = np.nan
            feature.loc[ind_last, 'aud frq skew'] = np.nan
            feature.loc[ind_last, 'aud frq kurt'] = np.nan


        # screen
        if 'scr.csv' in sensors:
            data = pd.read_csv(sensor_dir+'scr.csv', delimiter='\t', header=None)
            if data[:][0].size>=2:
                deltat = data[0][data[0][:].size-1] - data[0][0]
                if deltat!=0:
                    scr_dur = np.array([])
                    scr_frq = 0
                    for j in range(data[1][:].size-1):
                        if data[1][j]=='True' and data[1][j+1]=='False':
                            scr_dur = np.append(scr_dur, data[0][j+1]-data[0][j])
                            scr_frq += 1
                    feature.loc[ind_last, 'scr frq'] = scr_frq/float(deltat)
                    feature.loc[ind_last, 'scr dur mean'] = np.mean(scr_dur)
                    feature.loc[ind_last, 'scr dur std'] = np.std(scr_dur)
                else:
                    feature.loc[ind_last, 'scr frq'] = np.nan
                    feature.loc[ind_last, 'scr dur mean'] = np.nan
                    feature.loc[ind_last, 'scr dur std'] = np.nan
            else:
                feature.loc[ind_last, 'scr frq'] = 0
                feature.loc[ind_last, 'scr dur mean'] = 0
                feature.loc[ind_last, 'scr dur std'] = np.nan
        else:
            feature.loc[ind_last, 'scr frq'] = 0
            feature.loc[ind_last, 'scr dur mean'] = 0
            feature.loc[ind_last, 'scr dur std'] = np.nan
        
        # activity
        if 'act.csv' in sensors:
            data = pd.read_csv(sensor_dir+'act.csv', delimiter='\t', header=None)
            n = float(data[0][:].size)
            feature.loc[ind_last, 'still'] = np.sum(data[1][:]=='STILL')/n
            feature.loc[ind_last, 'tilting'] = np.sum(data[1][:]=='TILTING')/n
            feature.loc[ind_last, 'walking'] = np.sum(data[1][:]=='ONFOOT')/n
            feature.loc[ind_last, 'unknown act'] = np.sum(data[1][:]=='UNKNOWN')/n
            feature.loc[ind_last, 'still-walking'] = count_transitions(data[1][:],'STILL','ONFOOT')/n
            feature.loc[ind_last, 'still-tilting'] = count_transitions(data[1][:],'STILL','TILTING')/n
            feature.loc[ind_last, 'still-unknown'] = count_transitions(data[1][:],'STILL','UNKNOWN')/n
            feature.loc[ind_last, 'walking-unknown'] = count_transitions(data[1][:],'ONFOOT','UNKNOWN')/n
        else:
            feature.loc[ind_last, 'still'] = np.nan
            feature.loc[ind_last, 'tilting'] = np.nan
            feature.loc[ind_last, 'walking'] = np.nan
            feature.loc[ind_last, 'unknown act'] = np.nan
            feature.loc[ind_last, 'still-walking'] = np.nan
            feature.loc[ind_last, 'still-tilting'] = np.nan
            feature.loc[ind_last, 'still-unknown'] = np.nan
            feature.loc[ind_last, 'walking-unknown'] = np.nan
            
        # apps
        if 'app.csv' in sensors:
            data = pd.read_csv(sensor_dir+'app.csv', delimiter='\t', header=None)
            feature.loc[ind_last, 'messaging'] = np.sum(data[2][:]=='Messaging')
            feature.loc[ind_last, 'facebook'] = np.sum(data[2][:]=='Facebook')
            feature.loc[ind_last, 'chrome'] = np.sum(data[2][:]=='Chrome')
            feature.loc[ind_last, 'mobilyze'] = np.sum(data[2][:]=='Mobilyze')
            feature.loc[ind_last, 'phone'] = np.sum(data[2][:]=='Phone')
            feature.loc[ind_last, 'gmail'] = np.sum(data[2][:]=='Gmail')
            feature.loc[ind_last, 'contacts'] = np.sum(data[2][:]=='Contacts')
            feature.loc[ind_last, 'internet'] = np.sum(data[2][:]=='Internet')
            feature.loc[ind_last, 'gallery'] = np.sum(data[2][:]=='Gallery')
            feature.loc[ind_last, 'email'] = np.sum(data[2][:]=='Email')
            feature.loc[ind_last, 'settings'] = np.sum(data[2][:]=='Settings')
            feature.loc[ind_last, 'messenger'] = np.sum(data[2][:]=='Messenger')
            feature.loc[ind_last, 'camera'] = np.sum(data[2][:]=='Camera')
            feature.loc[ind_last, 'clock'] = np.sum(data[2][:]=='Clock')
            feature.loc[ind_last, 'maps'] = np.sum(data[2][:]=='Maps')
            feature.loc[ind_last, 'calendar'] = np.sum(data[2][:]=='Calendar')
            feature.loc[ind_last, 'youtube'] = np.sum(data[2][:]=='Youtube')
            feature.loc[ind_last, 'calculator'] = np.sum(data[2][:]=='Calculator')
            feature.loc[ind_last, 'purple robot'] = np.sum(data[2][:]=='Purple Robot')
            feature.loc[ind_last, 'system ui'] = np.sum(data[2][:]=='System UI')
        else:
            if has_app_data: # if not, leave them as NaN
                feature.loc[ind_last, 'messaging'] = 0
                feature.loc[ind_last, 'facebook'] = 0
                feature.loc[ind_last, 'chrome'] = 0
                feature.loc[ind_last, 'mobilyze'] = 0
                feature.loc[ind_last, 'phone'] = 0
                feature.loc[ind_last, 'gmail'] = 0
                feature.loc[ind_last, 'contacts'] = 0
                feature.loc[ind_last, 'internet'] = 0
                feature.loc[ind_last, 'gallery'] = 0
                feature.loc[ind_last, 'email'] = 0
                feature.loc[ind_last, 'settings'] = 0
                feature.loc[ind_last, 'messenger'] = 0
                feature.loc[ind_last, 'camera'] = 0
                feature.loc[ind_last, 'clock'] = 0
                feature.loc[ind_last, 'maps'] = 0
                feature.loc[ind_last, 'calendar'] = 0
                feature.loc[ind_last, 'youtube'] = 0
                feature.loc[ind_last, 'calculator'] = 0
                feature.loc[ind_last, 'purple robot'] = 0
                feature.loc[ind_last, 'system ui'] = 0
            else:
                feature.loc[ind_last, 'messaging'] = np.nan
                feature.loc[ind_last, 'facebook'] = np.nan
                feature.loc[ind_last, 'chrome'] = np.nan
                feature.loc[ind_last, 'mobilyze'] = np.nan
                feature.loc[ind_last, 'phone'] = np.nan
                feature.loc[ind_last, 'gmail'] = np.nan
                feature.loc[ind_last, 'contacts'] = np.nan
                feature.loc[ind_last, 'internet'] = np.nan
                feature.loc[ind_last, 'gallery'] = np.nan
                feature.loc[ind_last, 'email'] = np.nan
                feature.loc[ind_last, 'settings'] = np.nan
                feature.loc[ind_last, 'messenger'] = np.nan
                feature.loc[ind_last, 'camera'] = np.nan
                feature.loc[ind_last, 'clock'] = np.nan
                feature.loc[ind_last, 'maps'] = np.nan
                feature.loc[ind_last, 'calendar'] = np.nan
                feature.loc[ind_last, 'youtube'] = np.nan
                feature.loc[ind_last, 'calculator'] = np.nan
                feature.loc[ind_last, 'purple robot'] = np.nan
                feature.loc[ind_last, 'system ui'] = np.nan
            
        # communication
        if 'coe.csv' in sensors:
            data = pd.read_csv(sensor_dir+'coe.csv', delimiter='\t', header=None)
            feature.loc[ind_last, 'call in'] = np.sum(np.logical_and(data[3][:]=='PHONE',data[4][:]=='INCOMING'))
            feature.loc[ind_last, 'call out'] = np.sum(np.logical_and(data[3][:]=='PHONE',data[4][:]=='OUTGOING'))
            feature.loc[ind_last, 'sms in'] = np.sum(np.logical_and(data[3][:]=='SMS',data[4][:]=='INCOMING'))
            feature.loc[ind_last, 'sms out'] = np.sum(np.logical_and(data[3][:]=='SMS',data[4][:]=='OUTGOING'))
            feature.loc[ind_last, 'call missed'] = np.sum(data[4][:]=='MISSED')
        else:
            feature.loc[ind_last, 'call in'] = 0
            feature.loc[ind_last, 'call out'] = 0
            feature.loc[ind_last, 'sms in'] = 0
            feature.loc[ind_last, 'sms out'] = 0
            feature.loc[ind_last, 'call missed'] = 0
        
        # wifi
        if 'wif.csv' in sensors:
            data = pd.read_csv(sensor_dir+'wif.csv', delimiter='\t', header=None)
            feature.loc[ind_last, 'n wifi'] = np.mean(data[3][:])
        else:
            feature.loc[ind_last, 'n wifi'] = np.nan
        
        # weather
        if 'wtr.csv' in sensors:
            data = pd.read_csv(sensor_dir+'wtr.csv', delimiter='\t', header=None)
            wtr_cond = stats.mode(data[9][:])[0][0]
            if not isinstance(wtr_cond, basestring):
                wtr_cond = str(wtr_cond)
            feature.loc[ind_last, 'temperature'] = np.mean(data[1][:])
            feature.loc[ind_last, 'dew point'] = np.mean(data[3][:])
            feature.loc[ind_last, 'weather'] = sum(ord(c) for c in wtr_cond)
        else:
            feature.loc[ind_last, 'temperature'] = np.nan
            feature.loc[ind_last, 'dew point'] = np.nan
            feature.loc[ind_last, 'weather'] = np.nan
        
        # GPS and time
        if 'fus.csv' in sensors:
            data = pd.read_csv(sensor_dir+'fus.csv', delimiter='\t', header=None)
            t_start = data[0][0]
            t_end = data[0][data[0][:].size-1]
            feature.loc[ind_last, 'lat mean'] = np.mean(data[1][:])
            feature.loc[ind_last, 'lng mean'] = np.mean(data[2][:])
            feature.loc[ind_last, 'loc var'] = np.log(np.var(data[1][:])+np.var(data[2][:])+1e-16)
            feature.loc[ind_last, 'duration'] = t_end-t_start
            feature.loc[ind_last, 'midtime'] = ((t_end+t_start)/2.0)%86400
            feature.loc[ind_last, 'dow start'] = datetime.datetime.fromtimestamp(t_start).weekday()
            feature.loc[ind_last, 'dow end'] = datetime.datetime.fromtimestamp(t_end).weekday()
        else:
            feature.loc[ind_last, 'lat mean'] = np.nan
            feature.loc[ind_last, 'lng mean'] = np.nan
            feature.loc[ind_last, 'loc var'] = np.nan
            feature.loc[ind_last, 'duration'] = np.nan
            feature.loc[ind_last, 'midtime'] = np.nan
            feature.loc[ind_last, 'dow start'] = np.nan
            feature.loc[ind_last, 'dow end'] = np.nan
        
        # foursquare location in binary form
        loc_fsq_code = le.transform(loc_fsq)
        loc_fsq_bin = enc.transform(loc_fsq_code.reshape(-1,1)).toarray()
        loc_fsq_bin = loc_fsq_bin[0]
        for j in range(loc_fsq_bin.size):
            feature.loc[ind_last, 'fsq {}'.format(j)] = loc_fsq_bin[j]
        
        # distance to closest foursquare location (m)
        feature.loc[ind_last, 'fsq distance'] = distance_fsq
        
        ind_last += 1

    print feature.shape, target.shape
    if save_results:
        with open('features/features_'+subj+'.dat', 'w') as file_out:
            pickle.dump([feature, target], file_out)
        file_out.close()

# os._exit(0)

0 EW057EV
(78, 70) (78, 3)
1 MQ077WG
(117, 70) (117, 3)
2 1203725
(93, 70) (93, 3)
3 BD921DW
(240, 70) (240, 3)
4 1573207
(102, 70) (102, 3)
5 1578395
(120, 70) (120, 3)
6 1464458
(63, 70) (63, 3)
7 1135515
(82, 70) (82, 3)
8 1183252
(84, 70) (84, 3)
9 1521517
(134, 70) (134, 3)
10 1210517
(116, 70) (116, 3)
11 FM387DI
(112, 70) (112, 3)
12 952207
(731, 70) (731, 3)
13 1142152
(126, 70) (126, 3)
14 984221
(172, 70) (172, 3)
15 1553373
(148, 70) (148, 3)
16 1244644
(83, 70) (83, 3)
17 1327952
(124, 70) (124, 3)
18 1524496
(30, 70) (30, 3)
19 1559190
(118, 70) (118, 3)
20 1130955
(150, 70) (150, 3)
21 1535103
(89, 70) (89, 3)
22 1381257
(53, 70) (53, 3)
23 1041667
(98, 70) (98, 3)
24 1055808
(135, 70) (135, 3)
25 1483186
(53, 70) (53, 3)
26 1054952
(67, 70) (67, 3)
27 1197009
(181, 70) (181, 3)
28 1385032
(211, 70) (211, 3)
29 IK750RN
(161, 70) (161, 3)
30 IP417XX
(151, 70) (151, 3)
31 1564420
(137, 70) (137, 3)
32 1288818
(82, 70) (82, 3)
33 1571376
(113, 70) (113, 3)
34 1567871
(166, 7

In [38]:
feature

Unnamed: 0,lgt mean,lgt std,lgt off,lgt zcrossing,lgt skew,lgt kurt,aud mean,aud std,aud skew,aud kurt,...,dow end,fsq 0,fsq 1,fsq 2,fsq 3,fsq 4,fsq 5,fsq 6,fsq 7,fsq distance
0,120.000000,198.066908,0.125000,0.000000,1.154553,-0.665080,4.945625e-05,6.055538e-05,1.696857e+00,1.383159,...,5.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,21.0
1,134.381974,176.257091,0.216738,0.000000,1.704733,3.064648,5.184172e-06,2.150258e-05,8.570440e+00,82.649045,...,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,
2,91.208333,47.371738,0.083333,0.000000,-1.295776,-0.087563,7.611884e-06,9.757461e-06,1.657361e+00,1.280365,...,6.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,2820.0
3,844.459459,583.361952,0.135135,0.054054,0.679472,1.010065,1.030829e-04,7.036114e-05,3.714863e-01,-1.074268,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,15.0
4,,,,,,,3.039063e-05,4.799619e-05,3.676142e+00,13.177940,...,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,325.0
5,21.291209,27.970507,0.538462,0.000000,1.135230,0.592428,6.665738e-06,5.154024e-05,9.654764e+00,96.695776,...,6.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,
6,61.379061,106.216474,0.122744,0.000000,3.750393,14.829158,2.396252e-05,1.929619e-04,1.019012e+01,110.436299,...,6.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,
7,0.666667,0.471405,0.333333,-0.666667,-0.707107,-1.500000,2.568629e-04,7.070317e-05,3.371000e-01,-1.500000,...,6.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,
8,0.596154,1.376445,0.826923,0.000000,2.033717,2.457724,3.433037e-05,1.463248e-04,4.634349e+00,21.529713,...,4.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,401.0
9,0.000000,0.000000,1.000000,0.000000,0.000000,-3.000000,4.299818e-04,7.009664e-04,2.632588e+00,6.629473,...,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,7.0


In [None]:
# spatial visualization
import matplotlib.pyplot as plt
%matplotlib inline
colors = plt.cm.jet(np.linspace(0,1,len(loc_uniq)))
plt.figure(figsize=(18,15))
plt.rcParams['figure.figsize'] = (10, 6)
plt.plot(np.array(lng_gps),np.array(lat_gps),'ko',alpha=0.1, markersize=12)
for i in range(len(loc_uniq)):
    inds = loc.index(loc_uniq[i])
    plt.plot(np.array(lng_report[inds]), np.array(lat_report[inds]), 'o', color=colors[i], alpha=1, markersize=12)
plt.legend(['gps']+loc_uniq, frameon=False, loc='center left', bbox_to_anchor=(0.6, 0.8))
plt.box()

In [None]:
# temporal visualization
from sklearn import preprocessing
print loc
le = preprocessing.LabelEncoder()
le.fit(loc)
loc_code = le.transform(loc)
plt.figure(figsize=(12,6))
plt.plot(loc_code,'.k',markersize=10)
plt.yticks(range(len(loc_uniq)), loc_uniq)
axes = plt.gca()
axes.set_xlim([0, len(loc_code)])
axes.set_ylim([-1, len(loc_uniq)])
print t_report

In [None]:
# temporal visualization
plt.figure(figsize=(12,6))
plt.plot(state_code,'.k',markersize=10)
plt.yticks(range(len(loc_uniq)), loc_uniq)
axes = plt.gca()
axes.set_xlim([0, len(state_code)])
axes.set_ylim([-1, len(loc_uniq)])
print loc_uniq

In [None]:
# distribution of features across locations
ft = 0
plt.figure(figsize=(18,8))
plt.plot(state_code+np.random.uniform(-.1,.1,len(state_code)), feature[:,ft],'.',markersize=20, alpha=.5)
axes = plt.gca()
axes.set_xlim([-.5, len(loc_uniq)-.5])
plt.xticks(range(len(loc_uniq)), loc_uniq)
