In [None]:
import json
import pandas as pd
import math

%matplotlib inline

### Commands to remember

In [None]:
# Interactive lookup: shows the built-in help text for the pandas package.
help(pd)

# Loading the data

In [None]:
# Read the location export with json.load().
# The encoding is given explicitly: JSON files are UTF-8, while open() would
# otherwise fall back to the platform's locale encoding.
file = './data/GoogleLocation.json'
with open(file, encoding='utf-8') as train_file:
    data_json = json.load(train_file)

In [None]:
# Show a small preview of the raw data instead of echoing the whole export,
# which would flood the cell output with every recorded location.
data_json['locations'][:5]

In [None]:
# Create the data frame: one row per location record, kept as a dict in 'raw'
df = pd.DataFrame({'raw': data_json['locations']})

# Verify number of data points (row count; df.size would count cells instead)
len(df)

In [None]:
# Preview the first rows of df
df.head()

# Extracting the basic data (timestamp, latitude, longitude, ...)

In [None]:
# Extract the base data.
# Series.map over the 'raw' column reads each record dict directly instead of
# materialising every row as a Series via DataFrame.apply(axis=1).
df['timestampMs'] = df['raw'].map(lambda record: int(record['timestampMs']))
df['latitudeE7'] = df['raw'].map(lambda record: record['latitudeE7'])
df['longitudeE7'] = df['raw'].map(lambda record: record['longitudeE7'])
df['accuracy'] = df['raw'].map(lambda record: record['accuracy'])

# Derivatives
df['date'] = pd.to_datetime(df['timestampMs'], unit='ms')
df['year'] = df['date'].dt.year
df['month'] = df['date'].dt.month
df['day'] = df['date'].dt.day

# Hours elapsed since midnight on the epoch clock (no time-zone shift applied):
# milliseconds into the day divided by milliseconds per hour.
df['time_of_day'] = (df['timestampMs'] % (1000*60*60*24)) / (1000*60*60)
# Sign-flipped gap (ms) between each record and the one before it in the frame
df['date_diff'] = -df['timestampMs'].diff()

df.tail(10)

In [None]:
# Histogram of the 'time_of_day' column using 100 bins
df.hist(column=['time_of_day'], bins=100)

# Plotting the locations

In [None]:
# Scatter plot of the raw E7 coordinates: longitude on x, latitude on y
df.plot(x='longitudeE7', y='latitudeE7', figsize=(16, 10), kind='scatter')

In [None]:
# Box plot of the 'accuracy' column; outlier points are hidden (showfliers=False)
df.boxplot(column=['accuracy'], showfliers=False)

In [None]:
# Box plot of the gap between consecutive records; outlier points are hidden
df.boxplot(column=['date_diff'], showfliers=False)

# Activity

In [None]:
# Show one raw record that carries an 'activity' entry (None if there is none)
# rather than echoing the complete export into the cell output.
next((record for record in data_json['locations'] if 'activity' in record), None)

In [None]:
# Restrict the activity exploration to the first 1000 location records
selected_data = data_json['locations'][:1000]

def extract_activities(i, records=None):
    """List the ``[gps_index, activity_index]`` pairs for one location record.

    Parameters
    ----------
    i : int
        Position of the location record to inspect.
    records : list of dict, optional
        Location records to index into. Defaults to the notebook-level
        ``selected_data`` so existing ``extract_activities(i)`` calls keep
        working unchanged.

    Returns
    -------
    list of [int, int]
        One ``[i, j]`` pair per entry of the record's ``'activity'`` list;
        an empty list when the record has no ``'activity'`` value.
    """
    if records is None:
        records = selected_data

    activity = records[i].get('activity')

    # No activity registered
    if activity is None:
        return []

    return [[i, j] for j in range(len(activity))]

# Flatten the per-record [gps_index, activity_index] pairs into one list
activities = [
    pair
    for gps_index, _ in enumerate(selected_data)
    for pair in extract_activities(gps_index)
]

# One row per (location record, activity entry) pair
df_activities = pd.DataFrame({'source': activities})

df_activities.head()

In [None]:
def to_date(timestamp):
    """Convert a millisecond epoch timestamp with ``pd.to_datetime(unit='ms')``."""
    converted = pd.to_datetime(timestamp, unit='ms')
    return converted

def _gps_timestamp_or_none(index):
    """Return ``timestampMs`` of ``selected_data[index]``, or None when out of range.

    Guards the neighbour lookups below: a plain ``selected_data[index]`` would
    wrap around to the last record for index -1 and raise IndexError one past
    the end.
    """
    if 0 <= index < len(selected_data):
        return selected_data[index]['timestampMs']
    return None


# Split each [gps_index, activity_index] pair into its own column
df_activities['gps_i'] = df_activities['source'].map(lambda pair: pair[0])
df_activities['act_i'] = df_activities['source'].map(lambda pair: pair[1])

# Timestamp of the owning location record and of its two neighbours.
# NOTE(review): 'next' = index - 1 and 'prev' = index + 1 presumably because
# the export is ordered newest-first - confirm against the data.
df_activities['gps_time'] = df_activities['gps_i'].map(lambda i: selected_data[i]['timestampMs'])
df_activities['gps_time_next'] = df_activities['gps_i'].map(lambda i: _gps_timestamp_or_none(i - 1))
df_activities['gps_time_prev'] = df_activities['gps_i'].map(lambda i: _gps_timestamp_or_none(i + 1))

# The activity entry itself and its own timestamp
df_activities['raw'] = [selected_data[gps_i]['activity'][act_i] for gps_i, act_i in df_activities['source']]
df_activities['act_time'] = df_activities['raw'].map(lambda entry: entry['timestampMs'])

# Offsets (ms) between the GPS timestamps and the activity timestamp.
# pd.to_numeric turns a missing neighbour (None) into NaN, so rows at either
# end of selected_data get NaN instead of a bogus or failing lookup.
df_activities['gps_min_act'] = pd.to_numeric(df_activities['gps_time']) - pd.to_numeric(df_activities['act_time'])
df_activities['gps_prev_min_act'] = pd.to_numeric(df_activities['gps_time_prev']) - pd.to_numeric(df_activities['act_time'])
df_activities['gps_next_min_act'] = pd.to_numeric(df_activities['gps_time_next']) - pd.to_numeric(df_activities['act_time'])

df_activities.head()

In [None]:
# Compare the three GPS-minus-activity time offsets; outlier points are hidden
df_activities.boxplot(column=['gps_min_act', 'gps_prev_min_act', 'gps_next_min_act'], showfliers=False)

In [None]:
# Count the number of activities per gps log

def activity_count(raw):
    """Count the activity entries attached to one raw location record.

    Parameters
    ----------
    raw : dict
        A location record; its optional ``'activity'`` value is a list.

    Returns
    -------
    int or None
        Length of the ``'activity'`` list, or None when the record has no
        ``'activity'`` value.
    """
    activity = raw.get('activity')

    # No activity registered
    if activity is None:
        return None

    return len(activity)

# Map over the 'raw' column directly; a row-wise DataFrame.apply(axis=1) would
# build a full row Series for every record just to read one field.
df['activity_count'] = df['raw'].map(activity_count)

df.boxplot(column=['activity_count'])

In [None]:
# Collect all the different kinds of activities (checked on full data set)

def activity_filter(raw):
    """Classify a raw location record by the activity types it mentions.

    Returns
    -------
    int or str
        ``1`` when the record has no ``'activity'`` value, the first type
        string that is not in the known list when one is found, and ``2``
        when every listed type is known.
    """
    samples = raw.get('activity')

    # No activity registered
    if samples is None:
        return 1

    # Checked on full data set (does not contain extra activity types)
    known = (
        'UNKNOWN', 'TILTING', 'STILL', 'ON_FOOT', 'IN_VEHICLE', 'WALKING',
        'IN_RAIL_VEHICLE', 'IN_ROAD_VEHICLE', 'ON_BICYCLE', 'RUNNING',
        'EXITING_VEHICLE',
    )

    for sample in samples:
        for candidate in sample['activity']:
            label = candidate['type']
            if label not in known:
                return label

    return 2

# Map over the 'raw' column directly instead of a row-wise DataFrame.apply
df['activity_filter'] = df['raw'].map(activity_filter)

# Share of records per outcome: 1 = no activity, 2 = only known types,
# any string = an activity type missing from the known list
counts = df['activity_filter'].value_counts()
counts.plot(kind = 'pie', y = 'N');

In [None]:
# Work in progress: select the most appropriate activity

def activity(raw):
    """Pick one candidate activity for a raw location record (work in progress).

    Looks only at the first activity sample of the record. Its first candidate
    is returned, unless that candidate's type is ``'UNKNOWN'`` and a second
    candidate exists, in which case the second one is returned.

    Parameters
    ----------
    raw : dict
        A location record; its optional ``'activity'`` value is a list of
        samples, each holding a list of candidates under ``'activity'``.

    Returns
    -------
    dict or None
        The chosen candidate, or None when the record has no activity samples
        or the first sample has no candidates.
    """
    samples = raw.get('activity')

    # No activity registered (missing, None, or an empty list)
    if not samples:
        return None

    candidates = samples[0]['activity']
    if not candidates:
        return None

    attempt = candidates[0]
    # Only fall back to the runner-up when there is one; a lone UNKNOWN
    # candidate is returned as-is instead of raising IndexError.
    if attempt['type'] == 'UNKNOWN' and len(candidates) > 1:
        attempt = candidates[1]
    return attempt

# Map over the 'raw' column directly instead of a row-wise DataFrame.apply
df['activity'] = df['raw'].map(activity)