In [None]:
# Enable IPython's autoreload extension; mode 2 reloads imported modules before each
# cell execution so edits to the local `src` package are picked up without a kernel restart.
%load_ext autoreload
%autoreload 2

In [None]:
import pandas as pd
import geopandas as gpd
import numpy as np
from src.settings import *
from tqdm.auto import tqdm
import matplotlib.pyplot as plt
from pathlib import Path
import plotly.express as px
import matplotlib.pyplot as plt
import contextily as ctx
from keplergl import KeplerGl
import gzip
from src.tools.configs import DatasetGenerationConfig
from src.tools.feature_extraction import SpatialDataset
import pickle as pkl
from src.tools.feature_extraction import apply_feature_selection, apply_features_mapping, apply_features_assume
import seaborn as sns
from src.tools.vis_utils import visualize_kepler, save_config


# Register tqdm's pandas integration (adds the `progress_*` methods, e.g. `progress_apply`).
tqdm.pandas()

In [None]:
# Gzip-compressed pickle holding the generated SpatialDataset, looked up under FEATURES_DIR.
dataset_filename = "dataset_2022-11-01_11-39-37.pkl.gz"
ds_path = FEATURES_DIR / dataset_filename

# NOTE(review): unpickling can execute arbitrary code — only load dataset files produced by this project.
with gzip.open(ds_path, mode="rb") as dataset_file:
    ds: SpatialDataset = pkl.load(dataset_file)

# Display the fields declared on the loaded dataset object.
ds.__annotations__

In [None]:
# Expose the dataset's parts under short notebook-level names used by the cells below.
ds_config = ds.config
cities, hexagons = ds.cities, ds.hexagons
edges, edges_feature_selected = ds.edges, ds.edges_feature_selected
hex_agg, hex_agg_normalized = ds.hex_agg, ds.hex_agg_normalized

# Fixed seed kept alongside the data handles.
random_seed = 42

In [None]:
# edges_selected = apply_feature_selection(edges, ds_config.featureset_selection)
# edges_selected = apply_features_mapping(edges_selected, [{
#     "feature": "highway",
#     "mapping": {
#         "motorway_link": "motorway",
#         "primary_link": "primary",
#         "secondary_link": "secondary",
#         "tertiary_link": "tertiary",
#         "trunk_link": "trunk"
#     }
# }])
# edges = apply_features_mapping(edges, [{
#     "feature": "surface",
#     "mapping": {
#         "asphalt": "paved",
#         "paving_stones": "paved",
#         "concrete": "paved",
#         "concrete_prefabricated": "paved",
#         "compacted": "unpaved",
#         "ground": "unpaved"
#     }
# }])
# # edges_selected = apply_features_mapping(edges_selected, [{
# #     "feature": "highway",
# #     "mapping": {
# #         'secondary': 'not_residential', 
# #         'tertiary': 'not_residential', 
# #         'living_street': 'not_residential', 
# #         'primary': 'not_residential', 
# #         'unclassified': 'not_residential', 
# #         'motorway': 'not_residential', 
# #         'trunk': 'not_residential'
# #     }
# # }])
# edges = apply_features_assume(edges, {
#       "surface": "paved",
# })
# edges_selected

In [None]:
# Collapse the one-hot encoded edge features into one categorical column per feature
# ("long" format): each cell holds the name of the active one-hot column, or a missing
# value when none of that feature's columns is set for the edge.
feature_items = [
    (feature_name, [f"{feature_name}_{value}" for value in values])
    for feature_name, values in ds_config.featureset_selection["features"].items()
]
available_columns = set(edges_feature_selected.columns)

edges_long = gpd.GeoDataFrame()
pbar = tqdm(feature_items)
for feature_name, features_superset in pbar:
    pbar.set_description(feature_name)
    # Keep the configured column order (rather than arbitrary set order) so that
    # idxmax breaks ties between columns deterministically.
    features = [column for column in features_superset if column in available_columns]
    if not features:
        print(f"Skipping feature '{feature_name}': none of its columns are present in edges_feature_selected")
        continue
    feature_values = edges_feature_selected[features]
    has_value = feature_values.sum(axis=1) != 0
    try:
        # Mask all-zero rows with .where() instead of chained assignment
        # (`frame[col][mask] = None`), which is not guaranteed to write through
        # and is a no-op under pandas copy-on-write.
        edges_long[feature_name] = feature_values.idxmax(axis=1).where(has_value).astype("category")
    except (ValueError, TypeError) as err:
        # Best effort: report which feature failed and carry on with the remaining ones.
        print(f"Skipping feature '{feature_name}': {err}")
edges_long["geometry"] = edges["geometry"]
edges_long

In [None]:
config_name = "edges_raw"

# Edge layer: stringify every column, flatten the index and drop the h3_id column.
edges_keplergl = edges_long.astype(str).reset_index().drop(columns=["h3_id"])

# Hexagon layer: flatten the index, drop the coordinates/parent/children columns
# and prefix every hexagon id with "hex_".
hexagons_keplergl = (
    hexagons
    .reset_index()
    .drop(columns=["coordinates", "parent", "children"])
    .assign(h3_id=lambda frame: frame["h3_id"].map("hex_{}".format))
)

kepler_layers = {
    "edges": edges_keplergl,
    "hexagons": hexagons_keplergl,
}
m = visualize_kepler(data=kepler_layers, config_name=config_name)
m

In [None]:
# save_config(m, config_name=config_name)

In [None]:
# matplotlib 3.6 renamed the bundled seaborn styles to "seaborn-v0_8-*" and later
# releases dropped the old names; prefer the new name and fall back on older installs.
paper_style = "seaborn-v0_8-paper" if "seaborn-v0_8-paper" in plt.style.available else "seaborn-paper"
plt.style.use(paper_style)

# Geo counts


In [None]:
# Unique index combinations of the edges after stripping the innermost level (3),
# flattened into a plain frame with a fresh 0..n-1 index.
city_keys = edges.index.droplevel(3).unique()
df_index = city_keys.to_frame(index=False)

In [None]:
# Number of distinct index combinations left after dropping the innermost level.
len(df_index)

In [None]:
# Number of rows of df_index per continent, sorted ascending so the largest bar is drawn on top.
df_continents = df_index.groupby("continent")["country"].count().sort_values()
ax = df_continents.plot(kind="barh")
# Label the axes on the Axes object itself: which axis the `xlabel`/`ylabel` keywords
# of a horizontal bar plot end up on differs between pandas versions.
ax.set_xlabel("Count")
ax.set_ylabel("Continent")
# Annotate every bar with its value at the bar's end.
for i, v in enumerate(df_continents):
    ax.text(v, i, str(v), color='dimgray', va='center', fontweight='bold')
plt.tight_layout()
plt.savefig(FIGURES_DIR / "cities_in_continents_count.png")
ax

In [None]:
fig, ax = plt.subplots(1, 1, figsize=(6, 10))
# Number of rows of df_index per country; ties in the count are ordered by country name.
df_countries = df_index.groupby("country")["city"].count().reset_index().sort_values(["city", "country"])
ax = df_countries.plot(kind="barh", x="country", y="city", ax=ax, legend=False)
# Label the category axis explicitly: which axis the `xlabel` keyword of a horizontal
# bar plot ends up on differs between pandas versions.
ax.set_ylabel("Country")
# for i, v in enumerate(df_countries):
#     ax.text(v, i, " " + str(v), color='dimgray', va='center')
# Counts are integers, so use one tick per integer up to the largest count.
plt.xticks(ticks=np.arange(df_countries["city"].max() + 1))
plt.tight_layout()
plt.savefig(FIGURES_DIR / "cities_in_countries_count.png")
ax

# Edges

In [None]:
# Number of rows (road segments) in the long-format edge table.
len(edges_long)

In [None]:
# Non-null "u" entries per index level 0 (presumably the continent), as a bar chart.
edges_per_level0 = edges["u"].groupby(level=0).count()
edges_per_level0.plot.bar()

In [None]:
# Non-null "u" entries per index level 1 (the country level), as a bar chart.
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(10, 6))
edges_per_level1 = edges["u"].groupby(level=1).count()
edges_per_level1.plot.bar(ax=ax)

In [None]:
# Non-null "u" entries per index level 2 (presumably the city), as a bar chart.
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(18, 6))
# edges.xs("Poland", level=1, drop_level=False)["u"].groupby(level=2).count().plot(kind="bar", ax=ax)
edges_per_level2 = edges["u"].groupby(level=2).count()
edges_per_level2.plot.bar(ax=ax)

# Hexagons


In [None]:
# Number of rows in the hexagon table.
len(hexagons)

In [None]:
# Non-null "parent" entries per index level 0 (presumably the continent), as a bar chart.
hexagons_per_level0 = hexagons["parent"].groupby(level=0).count()
hexagons_per_level0.plot.bar()

In [None]:
# Non-null "parent" entries per index level 1 (the country level), as a bar chart.
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(10, 6))
hexagons_per_level1 = hexagons["parent"].groupby(level=1).count()
hexagons_per_level1.plot.bar(ax=ax)

In [None]:
fig, ax = plt.subplots(1, 1, figsize=(6, 18))
# hexagons.xs("Poland", level=1, drop_level=False)["parent"].groupby(level=2).count().plot(kind="bar", ax=ax)
# Non-null "parent" entries per index level 2, sorted ascending so the largest bar is drawn on top.
hexagons_in_cities_count = hexagons["parent"].groupby(level=2).count().sort_values()
hexagons_in_cities_count.plot(kind="barh", ax=ax)
# Label the category axis explicitly: which axis the `xlabel` keyword of a horizontal
# bar plot ends up on differs between pandas versions.
ax.set_ylabel("City")
plt.tight_layout()
plt.savefig(FIGURES_DIR / "hexagons_in_cities_count.png")

# Hex_agg


In [None]:
# Number of rows in the hexagon-aggregate table.
len(hex_agg)

In [None]:
# Non-null "oneway_True" entries per index level 0 (presumably the continent), as a bar chart.
hex_agg_per_level0 = hex_agg["oneway_True"].groupby(level=0).count()
hex_agg_per_level0.plot.bar()

In [None]:
# Non-null "oneway_True" entries per index level 1 (the country level), as a bar chart.
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(10, 6))
hex_agg_per_level1 = hex_agg["oneway_True"].groupby(level=1).count()
hex_agg_per_level1.plot.bar(ax=ax)

In [None]:
# Non-null "oneway_True" entries per index level 2 (presumably the city), as a bar chart.
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(18, 6))
# hex_agg.xs("Poland", level=1, drop_level=False)["oneway_True"].groupby(level=2).count().plot(kind="bar", ax=ax)
hex_agg_per_level2 = hex_agg["oneway_True"].groupby(level=2).count()
hex_agg_per_level2.plot.bar(ax=ax)

# Features

In [None]:
# Configure the default plotly template for every figure created below.
import plotly.io as pio
# List the templates that are available in this plotly installation.
print(pio.templates)
# "none" selects plotly's bare template instead of the library default.
pio.templates.default = "none"

In [None]:
# Share of road segments (in percent) for which each feature has a non-null value.
n = len(edges_long)
edges_feature_count = (
    edges_long
    .drop(columns="geometry")
    .notnull()
    .sum()
    .sort_values(ascending=False)
    .to_frame(name="count")
)
edges_feature_share = (100 * edges_feature_count / n).round(2).rename(columns={"count": "Share"})

fig = px.bar(
    edges_feature_share.reset_index(),
    x="index",
    y="Share",
    text="Share",
    color="Share",
    title=f"Feature occurrence in {n} road segments",
    width=800,
    height=400,
    # color_continuous_scale="Turbo",
)
fig.update_layout(
    xaxis={"title": "Feature"},
    # Extra head-room above 100 % so the labels placed outside the bars are not clipped.
    yaxis={"title": "Share", "range": [-1, 110]},
    margin={"l": 50, "r": 50, "t": 50, "b": 50},
    showlegend=False,
)
fig.update_traces(textposition='outside')
fig.show()
fig.write_image(FIGURES_DIR / "feature_shares.jpg")

In [None]:
# One bar chart per feature: percentage share of every configured value among the
# road segments that have any value for that feature.
for feature_name, features in feature_items:
    if feature_name not in edges_long.columns:
        # No long-format column was produced for this feature, so there is nothing to plot.
        print(f"Skipping feature '{feature_name}': column missing from edges_long")
        continue
    # Count every configured value, including those that never occur (count 0).
    feature_counts = edges_long[feature_name].value_counts().reindex(features, fill_value=0)
    total_count = int(feature_counts.sum())
    # Build the plotting frame with explicit column names: the name pandas gives to a
    # value_counts() result (and to its index after reset_index) changed in pandas 2.0,
    # so relying on it breaks across versions.
    feature_shares = pd.DataFrame({
        # Strip the "<feature>_" prefix so only the value itself is shown on the axis.
        "index": [column.split("_", 1)[1] for column in feature_counts.index],
        "Share": (100 * feature_counts / total_count).round(2).to_numpy(),
    })
    fig = px.bar(
        feature_shares,
        x="index",
        y="Share",
        text="Share",
        color="Share",
        width=800,
        height=400,
        title=(f"{feature_name.capitalize()} (n = {total_count})"),
        log_y=True,
    )
    fig.update_layout(
        xaxis = dict(
            # tickmode = 'linear',
            type="category",
            title = feature_name
        ),
        yaxis = dict(
            title = "Share",
            # On a log axis the range is given in log10 units: 10**-2 .. 10**2.3 percent.
            range = [-2, 2.3]
        ),
        margin=dict(l=50, r=50, t=50, b=100),
        showlegend=False
    )
    fig.update_traces( textposition='outside')
    fig.write_image(FIGURES_DIR / f"feature_{feature_name}_shares.jpg")
    fig.show()

In [None]:
# Heatmap of how often (in percent of a city's edges) each feature is filled in.
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(10, 20))

# Feature columns ordered from most to least frequently filled in.
feature_columns = edges_long.drop(columns=["geometry"])
sorted_columns_by_notnull_count = list(feature_columns.notnull().sum().sort_values(ascending=False).index)

# Group on the three outer index levels; the innermost level (3) is h3_id.
city_levels = [0, 1, 2]
edges_per_city = edges_long.reset_index(level=3)["h3_id"].groupby(level=city_levels).count()
city_feature_occurance = edges_long.groupby(level=city_levels).count().div(edges_per_city, axis=0) * 100

# Drop the outermost index level and the geometry column, then order the columns by fill rate.
city_feature_occurance = (
    city_feature_occurance
    .droplevel(level=0)
    .sort_index()
    .drop(columns="geometry")
    .reindex(columns=sorted_columns_by_notnull_count)
)

sns.heatmap(data=city_feature_occurance, ax=ax, cmap="coolwarm", cbar_kws={"shrink": 0.3})
plt.ylabel("Country-City")
plt.xlabel("Feature")
plt.tight_layout()
plt.savefig(FIGURES_DIR / "city_feature_occurance.svg")