In [None]:
import json
import pandas as pd
import math
import matplotlib.pyplot as plt
import matplotlib as mpl

%matplotlib inline

In [None]:
# Read the location export with json.load().
# The encoding is passed explicitly because open() otherwise falls back to a
# platform/locale dependent default, while JSON files are exchanged as UTF-8.
file = './data/GoogleLocation2018.json'
with open(file, encoding='utf-8') as train_file:
    data_json = json.load(train_file)

# One row per entry of the top-level 'locations' list; each entry is kept
# untouched in the 'raw' column so later cells can pull fields out of it.
df = pd.DataFrame({'raw': data_json['locations']})

In [None]:
# Extract the base data
# Series.map visits only the 'raw' column, so pandas does not have to build a
# full row object for every record the way DataFrame.apply(axis=1) does.
# A record that lacks one of these keys still raises KeyError.
df['timestampMs'] = df['raw'].map(lambda loc: int(loc['timestampMs']))
df['latitudeE7'] = df['raw'].map(lambda loc: loc['latitudeE7'])
df['longitudeE7'] = df['raw'].map(lambda loc: loc['longitudeE7'])
df['accuracy'] = df['raw'].map(lambda loc: loc['accuracy'])

# Derivatives
# timestampMs is interpreted as milliseconds since the Unix epoch, which gives
# timezone-naive datetimes.
# NOTE(review): the calendar fields below are therefore presumably UTC rather
# than local time - confirm this is what the per-day analysis wants.
df['date'] = pd.to_datetime(df['timestampMs'], unit='ms')
df['year'] = df['date'].dt.year
df['month'] = df['date'].dt.month
df['day'] = df['date'].dt.day
df['hour'] = df['date'].dt.hour

# Fractional hours elapsed since the last multiple of 24h in timestampMs.
df['time_of_day'] = (df['timestampMs'] % (1000*60*60*24)) / (1000*60*60)

### TODO add midnight data points for better day accuracy

In [None]:
# Remove low accuracy data points: only rows whose accuracy value is at most
# 100 are kept (units presumably metres - TODO confirm).
# .copy() gives the filtered frame its own data, so the column assignment
# below does not trigger pandas' SettingWithCopyWarning.
df = df[df.accuracy <= 100].copy()

# Duration dependent on the removed data points
# Each row is credited with half of the time span between its two neighbouring
# rows, converted from milliseconds to hours. The first and last rows lack a
# neighbour on one side and therefore get NaN.
# NOTE(review): previous-minus-next is only positive when the rows are ordered
# newest first - confirm that ordering for the input file.
df['duration'] = (df.timestampMs.shift(1) - df.timestampMs.shift(-1)) / 2 / (1000*60*60)

In [None]:
# Preview the first five rows of the prepared frame.
df.head(n=5)

In [None]:
# Sum of the 'duration' column (hours) for every (year, month, day) group.
(
    df
    .groupby(['year', 'month', 'day'])
    .agg({'duration': ['sum']})
)

In [None]:
# Bounding box used by at_kunlabora: the x bounds are compared against
# 'longitudeE7' and the y bounds against 'latitudeE7'.
# NOTE(review): the E7 suffix presumably means degrees * 1e7 - confirm.
min_x_kunlabora, max_x_kunlabora = 47_126_000, 47_140_000
min_y_kunlabora, max_y_kunlabora = 508_831_000, 508_838_000

def at_kunlabora(row):
    """Tell whether a position lies strictly inside the Kunlabora bounding box.

    `row` is anything indexable by 'longitudeE7' and 'latitudeE7'. Only
    comparisons and `&` are used, so the result is a single boolean for scalar
    values and an element-wise boolean result for column-like values.
    Points exactly on a bound are treated as outside.
    """
    lon = row['longitudeE7']
    lat = row['latitudeE7']
    inside_lon = (min_x_kunlabora < lon) & (lon < max_x_kunlabora)
    inside_lat = (min_y_kunlabora < lat) & (lat < max_y_kunlabora)
    return inside_lon & inside_lat

# Flag every row that lies inside the Kunlabora bounding box. at_kunlabora is
# built from comparisons and `&` only, so it is evaluated on whole columns in
# one call instead of once per row through DataFrame.apply(axis=1).
df['at_kunlabora'] = at_kunlabora(df)

# Select the flagged rows once and reuse them for both histogram axes.
kunlabora_points = df[df['at_kunlabora']]

# 2D histogram of the flagged positions with a logarithmic colour scale.
# hist2d returns (counts, xedges, yedges, image); only the image is kept, and
# `_` is not assigned so IPython's last-output variable is left alone.
plt.figure(figsize=(16, 10))
img = plt.hist2d(
    x=kunlabora_points.longitudeE7,
    y=kunlabora_points.latitudeE7,
    bins=25,
    norm=mpl.colors.LogNorm(),
)[3]

In [None]:
# Histogram of the per-day summed 'duration', restricted to rows whose
# at_kunlabora flag is set.
(
    df[df.at_kunlabora]
    .groupby(['year', 'month', 'day', 'at_kunlabora'])
    .agg({'duration': ['sum']})
    .hist(figsize=(16, 10))
)