In [None]:
# Data source (OS Data Hub, OpenRoads download): https://osdatahub.os.uk/downloads/open/OpenRoads

In [None]:
from os import listdir, path

import shapely
import pandas as pd
import geopandas as gpd

from lets_plot import *
from lets_plot.geo_data import *

In [None]:
# Set up lets-plot's HTML output for this notebook; run once before any plot cell.
LetsPlot.setup_html()

In [None]:
# Directory scanned for the "??_RoadLink.shp" input files; being a relative
# path, it is resolved against the notebook's working directory.
INPUT_DIR = "data"

In [None]:
def data_files(input_dir):
    """Map two-character file prefixes to the RoadLink shapefiles in ``input_dir``.

    A directory entry qualifies when everything after its first three
    characters is exactly ``"RoadLink.shp"`` (e.g. ``HP_RoadLink.shp``); the
    first two characters of the name become the dictionary key.

    Args:
        input_dir: Directory to scan (not recursive).

    Returns:
        dict: ``{prefix: full path}``, inserted in ascending filename order so
        that positional slicing of the items is reproducible between runs.
        Empty when no entry matches.
    """
    return {
        filename[:2]: path.join(input_dir, filename)
        # listdir() returns entries in arbitrary order; sort to make the
        # resulting dict order (and any slice taken from it) deterministic.
        for filename in sorted(listdir(input_dir))
        if filename[3:] == "RoadLink.shp"
    }

def load_dataset(fullpath):
    """Read a shapefile, keep its "length" and "geometry" columns, reproject to EPSG:4326.

    Args:
        fullpath: Path of the file handed to ``gpd.read_file``.

    Returns:
        The two-column GeoDataFrame after ``to_crs(4326)``.
    """
    roads = gpd.read_file(fullpath)
    wanted_columns = ["length", "geometry"]
    roads = roads[wanted_columns]
    return roads.to_crs(4326)

def prepare_dataset(fullpath):
    """Flatten the road geometries of one file into a table of path vertices.

    Each row of the loaded GeoDataFrame is expanded into one record per
    coordinate pair of its geometry, so a single road link contributes as
    many rows as it has vertices.

    Args:
        fullpath: Path of the file passed on to ``load_dataset``.

    Returns:
        pandas.DataFrame with columns ``id`` (index label of the source row),
        ``lon``, ``lat`` and ``length`` (copied from the source row).
    """
    roads = load_dataset(fullpath)
    records = []
    for road_id, attrs in roads.to_dict(orient='index').items():
        # NOTE(review): relies on every geometry exposing `.xy` as a pair of
        # coordinate sequences - confirm the data has no multi-part geometries.
        lons, lats = attrs['geometry'].xy
        for lon, lat in zip(lons, lats):
            records.append(
                {'id': road_id, 'lon': lon, 'lat': lat, 'length': attrs['length']}
            )
    return pd.DataFrame(records)

def add_prefix_to_id(df, prefix):
    """Rewrite the ``id`` column as ``"<prefix>-<id>"`` strings.

    The frame is modified in place and the very same object is returned.

    Args:
        df: DataFrame that has an ``id`` column.
        prefix: String placed in front of every id, followed by a dash.

    Returns:
        ``df`` itself, with the updated ``id`` column.
    """
    tag = prefix + "-"
    id_as_text = df['id'].astype(str)
    df['id'] = tag + id_as_text
    return df

def get_df(input_dir, input_slice=None):
    """Build one vertex table from the RoadLink files found in ``input_dir``.

    Args:
        input_dir: Directory handed to ``data_files``.
        input_slice: Optional ``slice`` applied positionally to the
            ``(prefix, path)`` pairs returned by ``data_files``; ``None``
            loads every file.

    Returns:
        pandas.DataFrame: the per-file frames from ``prepare_dataset``
        stacked together, with each ``id`` prefixed by its file's
        two-character key and a fresh 0..n-1 index.

    Note:
        ``pd.concat`` raises ``ValueError`` when the selection contains no
        files.
    """
    selected = data_files(input_dir)
    if input_slice is not None:
        selected = dict(list(selected.items())[input_slice])
    frames = [
        add_prefix_to_id(prepare_dataset(fullpath), region_id)
        for region_id, fullpath in selected.items()
    ]
    # Every per-file frame carries its own 0-based index; without
    # ignore_index the combined frame would repeat index labels across files.
    return pd.concat(frames, ignore_index=True)

In [None]:
# Disabled benchmark cell: the code is wrapped in a string literal, so none of
# it is executed. It times get_df() on one input file at a time using the
# IPython %timeit magic and prints "(longest)" whenever a new slowest file is hit.
# NOTE(review): N = 52 is presumably the number of RoadLink files - confirm.
'''
N = 52
times = []
for i in range(N):
    print("\n{0}:".format(i))
    t = %timeit -n 2 -r 3 -o get_df(INPUT_DIR, slice(i, i + 1))
    a = t.average
    if len(times) == 0 or a > max(times):
        print("(longest)")
    times.append(a)
'''

In [None]:
# Load only the first RoadLink file yielded by data_files() (slice(0, 1)).
df = get_df(INPUT_DIR, slice(0, 1))
print(df.shape)  # (rows, columns); prepare_dataset emits one row per path vertex
df.head()

In [None]:
# Draw each road link as its own path (grouped by "id"), coloured by its "length" value.
(
    ggplot(df, aes("lon", "lat"))
    + geom_path(aes(group="id", color="length"))
)

---

In [None]:
# Disabled scratch cell (string literal, nothing below is executed): loads the
# single file data/HP_RoadLink.shp, keeps "length" and "geometry", and
# reprojects to EPSG:4326.
'''
gdf = gpd.read_file("data/HP_RoadLink.shp")[["length", "geometry"]].to_crs(4326)
print(gdf.shape)
gdf.head()
'''

In [None]:
# Disabled scratch cell (string literal, not executed): inline form of the
# vertex flattening that prepare_dataset() performs; it expects a `gdf`
# variable such as the one built in the preceding disabled cell.
'''
df = pd.DataFrame([
    {'id': i, 'lon': p[0], 'lat': p[1], 'length': row['length']}
    for i, row in gdf.to_dict(orient='index').items()
    for p in zip(*row['geometry'].xy)
])
print(df.shape)
df.head()
'''

In [None]:
# Disabled scratch cell (string literal, not executed): path plot grouped by
# "id" and coloured by "length", with a grey colour scale added.
'''
ggplot(df, aes("lon", "lat")) + \
    geom_path(aes(group="id", color="length")) + \
    scale_color_grey()
'''

---

In [None]:
# Disabled scratch cell (string literal, not executed): loads data/HP_RoadLink.shp
# and round-trips every geometry through WKB with output_dimension=2, which
# drops any third coordinate.
'''
gdf = gpd.read_file("data/HP_RoadLink.shp")[["length", "geometry"]].to_crs(4326)
gdf["geometry"] = gdf["geometry"].apply(lambda v: shapely.wkb.loads(shapely.wkb.dumps(v, output_dimension=2)))
print(gdf.shape)
gdf.head()
'''

In [None]:
# Disabled scratch cell (string literal, not executed): passes the GeoDataFrame
# straight to geom_path as `data`, coloured by "length".
'''
ggplot() + \
    geom_path(aes(color="length"), data=gdf)
'''

---

In [None]:
def data_files(input_dir):
    """Map two-character file prefixes to the RoadLink shapefiles in ``input_dir``.

    A directory entry qualifies when everything after its first three
    characters is exactly ``"RoadLink.shp"`` (e.g. ``HP_RoadLink.shp``); the
    first two characters of the name become the dictionary key.

    Note:
        This notebook also defines ``data_files`` in an earlier cell; once
        this cell runs, this definition shadows the earlier one.

    Args:
        input_dir: Directory to scan (not recursive).

    Returns:
        dict: ``{prefix: full path}``, inserted in ascending filename order so
        that the order in which files are loaded is reproducible between
        runs. Empty when no entry matches.
    """
    return {
        filename[:2]: path.join(input_dir, filename)
        # listdir() returns entries in arbitrary order; sort to make the
        # resulting dict order deterministic.
        for filename in sorted(listdir(input_dir))
        if filename[3:] == "RoadLink.shp"
    }

def load_dataset(fullpath, outer_geom=None):
    """Load one shapefile as 2-D road geometries in EPSG:4326.

    Args:
        fullpath: Path of the file handed to ``gpd.read_file``.
        outer_geom: Optional geometry; when given, only rows whose geometry
            intersects it are kept. The test runs after reprojection, so it
            should be expressed in EPSG:4326 coordinates.

    Returns:
        GeoDataFrame with the columns ``geometry`` and ``length``, each
        geometry re-encoded without a third coordinate.
    """
    # Import the submodule explicitly: a bare `import shapely` only exposes
    # `shapely.wkb` if some other package happened to import it already.
    from shapely import wkb

    gdf = gpd.read_file(fullpath)
    gdf = gdf[["geometry", "length"]]  # remove extra columns
    gdf = gdf.to_crs(4326)  # transform coordinates to EPSG:4326
    if outer_geom is not None:
        # Restrict paths by the outer geometry. Take an explicit copy so the
        # column assignment below does not write into a filtered slice
        # (which makes pandas emit SettingWithCopyWarning).
        gdf = gdf[gdf["geometry"].intersects(outer_geom)].copy()
    # Round-trip through WKB with output_dimension=2 to drop the third
    # coordinate (per the original author's note it is always 0).
    gdf["geometry"] = gdf["geometry"].apply(
        lambda geom: wkb.loads(wkb.dumps(geom, output_dimension=2))
    )
    return gdf

def get_gdf(input_dir, *, outer_geom=None, input_slice=None):
    """Combine the RoadLink files of ``input_dir`` into one GeoDataFrame.

    Args:
        input_dir: Directory handed to ``data_files``.
        outer_geom: Optional geometry forwarded to ``load_dataset`` to keep
            only intersecting rows.
        input_slice: Optional container of file keys (e.g. ``["TL", "TQ"]``);
            only files whose key is ``in`` it are loaded. ``None`` loads all.
            Keys that match no file are ignored.

    Returns:
        GeoDataFrame of all selected files stacked under a fresh index, with
        an ``id`` column holding the key of the file each row came from.
    """
    selected = data_files(input_dir)
    if input_slice is not None:
        selected = {
            key: shp_path
            for key, shp_path in selected.items()
            if key in input_slice
        }

    frames = []
    for key, shp_path in selected.items():
        tile = load_dataset(shp_path, outer_geom)
        frames.append(tile.assign(id=key))

    combined = pd.concat(frames, ignore_index=True)
    return gpd.GeoDataFrame(combined)

In [None]:
# Geocode "London" and fetch its boundary geometries; the result is displayed
# as the cell output.
# NOTE(review): geocode_counties presumably comes from the lets_plot.geo_data
# star import and may need network access to a geocoding service - confirm.
london_gdf = geocode_counties("London").get_boundaries()
london_gdf

In [None]:
# Load only the "TL" and "TQ" files and keep the road links that intersect the
# geometry in the first row of london_gdf.
roads_gdf = get_gdf(INPUT_DIR, outer_geom=london_gdf.iloc[0].geometry, input_slice=["TL", "TQ"])
print(roads_gdf.shape)  # (rows, columns) of the filtered road table
roads_gdf.head()

In [None]:
# Draw the London boundary and overlay the first 10 rows of roads_gdf as paths.
(
    ggplot()
    + geom_map(map=london_gdf)
    + geom_path(map=roads_gdf.iloc[:10])
)