In [None]:
import json
import pandas as pd
import math
import matplotlib.pyplot as plt
import matplotlib as mpl
from mpu import haversine_distance as mpu_distance

%matplotlib inline

In [None]:
# Read the location export with json.load().
# JSON text is UTF-8 by specification, so the encoding is passed explicitly
# instead of relying on the platform default used by open().
file = './data/GoogleLocation2018.json'
with open(file, encoding='utf-8') as json_file:
    data_json = json.load(json_file)

# One row per entry of the top-level 'locations' list; each cell of 'raw'
# holds the untouched record so later cells can pull fields out of it.
df = pd.DataFrame({'raw': data_json['locations']})

In [None]:
# Extract the base data from each raw record (one record per row of df['raw']).
# Series.map walks the column directly, which avoids the per-row Series object
# that DataFrame.apply(axis=1) has to build for every record.
df['timestampMs'] = df['raw'].map(lambda rec: int(rec['timestampMs']))
# NOTE: these columns keep their "E7" names but hold the raw value divided by
# 1e7 (presumably decimal degrees -- confirm against the export format).
df['latitudeE7'] = df['raw'].map(lambda rec: rec['latitudeE7'] / 10_000_000)
df['longitudeE7'] = df['raw'].map(lambda rec: rec['longitudeE7'] / 10_000_000)
df['accuracy'] = df['raw'].map(lambda rec: rec['accuracy'])

# Derivatives of: timestampMs
# The timestamps are parsed as milliseconds since the epoch and stay
# timezone-naive, so every date part below is in epoch (UTC) time, not local time.
df['date'] = pd.to_datetime(df.timestampMs, unit='ms')
df['year'] = df['date'].dt.year
df['month'] = df['date'].dt.month
df['day'] = df['date'].dt.day
df['hour'] = df['date'].dt.hour

# Fractional hours elapsed since midnight (same epoch/UTC basis as above).
df['time_of_day'] = (df.timestampMs % (1000*60*60*24)) / (1000*60*60)
# Half of the time span, in hours, between the previous and the next row.
# The first and last rows have no neighbour on one side and come out as NaN.
# NOTE(review): the value is positive only when rows are ordered newest-first;
# confirm the ordering of the export.
df['duration'] = (df.timestampMs.shift(1) - df.timestampMs.shift(-1))/ 2 / (1000*60*60)

In [None]:
# Derivatives of: latitudeE7 & longitudeE7

# Bounding box used to flag samples as "in Belgium" (strict inequalities).
lat_min_belgium = 49.496309
lat_max_belgium = 51.506297
long_min_belgium = 2.542484
long_max_belgium = 6.408894
# Column-wise comparison: same result as testing each row individually,
# without a Python-level call per row.
df['in_belgium'] = (
    (df['latitudeE7'] > lat_min_belgium) & (df['latitudeE7'] < lat_max_belgium)
    & (df['longitudeE7'] > long_min_belgium) & (df['longitudeE7'] < long_max_belgium)
)

# Reference points as (latitude, longitude); the key is the name of the
# distance column that gets created. Insertion order fixes the column order.
reference_points = {
    'd_home':       (50.8757, 4.6904),
    'd_squash':     (50.8677, 4.7113),
    'd_curtis':     (50.8808, 4.6933),
    'd_saman_new':  (50.8798, 4.6882),
    'd_saman_old':  (50.8831, 4.6900),
    'd_kunlabora':  (50.8836, 4.7133),
    'd_cegeka':     (50.8477, 4.7273),
    'd_maisuradze': (50.8292, 4.4518),
    'd_ensure':     (50.8470, 4.3700),
    'd_famifed':    (50.8410, 4.3740),
    'd_lannoo':     (51.3207, 4.9262),
    'd_valthorens': (45.2970, 6.5787),
    'd_tbilisi':    (41.4255, 44.456),
    'd_bangkok':    (13.7526, 100.50),
}

# Build the (lat, lon) pair of every sample once and reuse it for each place.
sample_points = list(zip(df['latitudeE7'], df['longitudeE7']))
for column, origin in reference_points.items():
    # mpu_distance is mpu.haversine_distance(origin, destination);
    # presumably it returns kilometres -- confirm in the mpu documentation.
    df[column] = [mpu_distance(origin, point) for point in sample_points]


In [None]:
# Select the per-place distance columns once. A prefix match is used because a
# plain substring test ('d_' in col) would also pick up any column that merely
# contains "d_" somewhere in its name.
distance_cols = [col for col in df.columns if col.startswith('d_')]

# 'basis' is the name of the closest reference column for each row,
# 'distance' is the corresponding (smallest) distance value.
df['basis'] = df[distance_cols].idxmin(axis=1)
df['distance'] = df[distance_cols].min(axis=1)

In [None]:
# Histogram of the nearest-place distance, restricted to rows flagged as
# in_belgium whose distance is below 0.1.
df.loc[df.in_belgium & (df.distance < 0.1), 'distance'].hist(figsize=(16, 10), bins=100)

In [None]:
# Distribution of samples over the day, using 24 * 60 bins.
df['time_of_day'].hist(figsize=(16, 10), bins=24 * 60)

In [None]:
# Scatter of duration against time of day, with a logarithmic y axis.
df.plot.scatter(x='time_of_day', y='duration', figsize=(16, 10), logy=True, alpha=0.2)

In [None]:
# Per calendar day (year, month, day): median and variance of both coordinates.
# The result has two-level columns: (coordinate, statistic).
days = (
    df.groupby(['year', 'month', 'day'])[['latitudeE7', 'longitudeE7']]
      .agg(['median', 'var'])
)

# Box plot of the four aggregated columns on a logarithmic y axis.
days.plot(kind='box', figsize=(16, 10), logy=True)

In [None]:
# 2D histogram of the daily median position (longitude on x, latitude on y),
# with log-scaled bin colours.
plt.figure(figsize=(16, 10))
# hist2d returns four values; only the last one is kept, as `img`.
_, _, _, img = plt.hist2d(
    x=days[('longitudeE7', 'median')],
    y=days[('latitudeE7', 'median')],
    norm=mpl.colors.LogNorm(),
    bins=100,
)