In [None]:
import json

import geopandas as gpd
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from geopandas import GeoDataFrame
from shapely.geometry import Point

%matplotlib inline

In [None]:
# Show where geopandas keeps its bundled 'naturalearth_lowres' dataset.
naturalearth_path = gpd.datasets.get_path('naturalearth_lowres')
naturalearth_path

In [None]:
# Load the bundled 'naturalearth_lowres' country polygons and drop the
# 'pop_est' and 'gdp_md_est' columns in a single chained expression.
unused_columns = ['pop_est', 'gdp_md_est']
world = (
    gpd.read_file(gpd.datasets.get_path('naturalearth_lowres'))
    .drop(columns=unused_columns)
)

In [None]:
# Quick look at the basemap, colored with the 'Set3' colormap.
world_plot_options = dict(figsize=(10, 6), cmap='Set3')
world.plot(**world_plot_options)

In [None]:
# Read the location-history export with json.load().
# The encoding is given explicitly so the result does not depend on the
# platform's default text encoding.
file = './data/GoogleLocation2018.json'
with open(file, encoding='utf-8') as location_file:
    data_json = json.load(location_file)

# One row per entry of the top-level 'locations' list; each cell of 'raw'
# holds that entry's original record.
df = pd.DataFrame({'raw': data_json['locations']})

In [None]:
# Extract the base data.
# Series.map visits each raw record once; this avoids the per-row Series
# that DataFrame.apply(axis=1) has to build for every extracted field.
raw = df['raw']
df['timeMs'] = raw.map(lambda rec: int(rec['timestampMs']))
# The E7 fields are scaled by 1e7, so dividing by 10_000_000 undoes the scaling.
df['lat'] = raw.map(lambda rec: rec['latitudeE7']) / 10_000_000
df['lon'] = raw.map(lambda rec: rec['longitudeE7']) / 10_000_000
df['acc'] = raw.map(lambda rec: rec['accuracy'])

df.drop(columns='raw', inplace=True)

# Derivatives
df['date'] = pd.to_datetime(df['timeMs'], unit='ms')
df['year'] = df['date'].dt.year
df['month'] = df['date'].dt.month
df['day'] = df['date'].dt.day

# Each row's duration is half the time span between its two neighbouring
# rows, converted from milliseconds to hours. The first and last rows have
# only one neighbour, so their duration is NaN.
# NOTE(review): the sign is positive only when rows are ordered newest-first
# -- confirm against the ordering of the export.
MS_PER_HOUR = 1000 * 60 * 60
df['duration'] = (df['timeMs'].shift(1) - df['timeMs'].shift(-1)) / 2 / MS_PER_HOUR

In [None]:
# Build one shapely Point per row from its longitude/latitude pair.
geometry = list(map(Point, zip(df['lon'], df['lat'])))
gdf = GeoDataFrame(df, geometry=geometry)

# Declare the coordinate reference system by hand:
# http://geopandas.org/projections.html
gdf.crs = {'init' :'epsg:4326'}

In [None]:
# Draw every recorded position in red on top of the gray world basemap.
base_ax = world.plot(facecolor='lightgray', edgecolor='gray', figsize=(10, 6))
gdf.plot(ax=base_ax, marker='o', color='red', markersize=15)

# Zoom to the extent of all points, padded by a fixed margin on each side.
bounds = gdf.geometry.bounds
margin = 5

base_ax.set_xlim(bounds.minx.min() - margin, bounds.maxx.max() + margin)
base_ax.set_ylim(bounds.miny.min() - margin, bounds.maxy.max() + margin);

In [None]:
# Same overlay as before: all positions in red over the gray basemap.
base_ax = world.plot(facecolor='lightgray', edgecolor='gray', figsize=(10, 6))
gdf.plot(ax=base_ax, marker='o', color='red', markersize=15)

# Zoom to the points whose longitude is below 20, padded by a fixed margin.
western_points = gdf[gdf.lon < 20]
bounds = western_points.geometry.bounds
margin = 2

base_ax.set_xlim(bounds.minx.min() - margin, bounds.maxx.max() + margin)
base_ax.set_ylim(bounds.miny.min() - margin, bounds.maxy.max() + margin);

In [None]:
# All positions in red over the gray basemap.
base_ax = world.plot(facecolor='lightgray', edgecolor='gray', figsize=(10, 6))
gdf.plot(ax=base_ax, marker='o', color='red', markersize=15)

# Zoom to the points whose longitude is above 80, padded by a fixed margin.
eastern_points = gdf[gdf.lon > 80]
bounds = eastern_points.geometry.bounds
margin = 2

base_ax.set_xlim(bounds.minx.min() - margin, bounds.maxx.max() + margin)
base_ax.set_ylim(bounds.miny.min() - margin, bounds.maxy.max() + margin);

In [None]:
# All positions in red over the gray basemap.
base_ax = world.plot(facecolor='lightgray', edgecolor='gray', figsize=(10, 6))
gdf.plot(ax=base_ax, marker='o', color='red', markersize=15)

# Zoom to the points whose longitude lies strictly between 30 and 45,
# padded by a fixed margin.
in_band = (gdf.lon > 30) & (gdf.lon < 45)
bounds = gdf[in_band].geometry.bounds
margin = 2

base_ax.set_xlim(bounds.minx.min() - margin, bounds.maxx.max() + margin)
base_ax.set_ylim(bounds.miny.min() - margin, bounds.maxy.max() + margin);

In [None]:
# Attach to every point the attributes of the world polygon it intersects;
# the inner join keeps only points that match a polygon.
joined = gpd.sjoin(gdf, world, how='inner', op='intersects')

# Drop the join's 'index_right' column, convert the index labels to strings,
# and expose the polygon's 'name' column as 'country'.
gdf = (
    joined
    .drop(columns='index_right')
    .rename(index=str, columns={'name': 'country'})
)

In [None]:
# Total recorded duration per country (NaN durations are skipped by 'sum').
country_summary = gdf.groupby('country').agg({'duration': 'sum'})

tot_duration = country_summary['duration'].sum()

# Share of the overall duration, in percent. Plain column arithmetic gives
# the same numbers as a row-wise apply without the per-row overhead.
country_summary['%'] = country_summary['duration'] / tot_duration * 100

# Display the countries with the largest totals first.
country_summary.sort_values('duration', ascending=False)

In [None]:
# Peek at the first five rows of the country-tagged frame.
gdf.head(n=5)

In [None]:
# Select the Belgian points once and reuse them for plotting and zooming.
belgium = gdf[gdf.country == 'Belgium']

# Draw them in red on top of the gray world basemap.
base_ax = world.plot(facecolor='lightgray', edgecolor='gray', figsize=(10, 6))
belgium.plot(ax=base_ax, marker='o', color='red', markersize=15)

# Zoom to their extent, padded by a fixed margin on each side.
bounds = belgium.geometry.bounds
margin = 2

base_ax.set_xlim(bounds.minx.min() - margin, bounds.maxx.max() + margin)
base_ax.set_ylim(bounds.miny.min() - margin, bounds.maxy.max() + margin);

In [None]:
def mode(x):
    """Return the most frequent value(s) of ``x``, or all of its values.

    When ``x`` has more than two entries the result is ``x.mode()``;
    otherwise the entries are returned unchanged as a NumPy array. The
    short branch needs ``np``, so NumPy must be imported by the notebook.

    NOTE(review): both branches return array-likes rather than scalars;
    some pandas versions reject non-scalar results from ``agg`` -- confirm
    against the pandas version in use.
    """
    if len(x) > 2:
        return x.mode()
    return np.array(x)

# For every calendar day with points outside Belgium, summarize the
# countries seen that day with `mode`.
abroad = gdf[gdf.country != 'Belgium']
abroad.groupby(['year', 'month', 'day'])['country'].agg(mode)