In [1]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
import pandas as pd
import geopandas as gpd
from tqdm.auto import tqdm
from src.settings import *
from src.tools.osmnx_utils import get_place_dir_name
import json5 as json
from functional import seq

tqdm.pandas()

In [3]:
with open(RAW_DATA_DIR / "featureset_transformation_default.jsonc", "r") as f:
    featureset = json.load(f)
    features = list(featureset.keys())

In [4]:
cities = pd.read_csv(RAW_DATA_DIR / "cities.csv")
# cities = cities[(cities.kacper)]
# cities = cities[(cities["country"] == "Poland")]
# cities = cities[(cities["city"] == "Wrocław")]

cities

Unnamed: 0,city,country,continent,kacper,szymon,piotr,kamil,regions,to_fix
0,Tokyo,Japan,Asia,False,False,False,False,,True
1,Nur-Sultan,Kazakhstan,Asia,True,True,False,False,,False
2,Doha,Qatar,Asia,True,False,False,False,,True
3,Moscow,Russia,Asia,True,True,False,True,,False
4,St. Petersburg,Russia,Asia,True,False,False,False,,False
...,...,...,...,...,...,...,...,...,...
112,New York City,United States,North America,True,True,False,False,,True
113,Philadelphia,United States,North America,True,True,False,False,,True
114,San Francisco,United States,North America,True,True,False,False,,True
115,Washington D.C.,United States,North America,False,False,False,False,,True


In [5]:
network_type = "drive"

pbar = tqdm(cities.itertuples(), total=cities.shape[0])
edges_cities = []
for row in pbar:
    place_name = f"{row.city},{row.country}"
    place_dir_name = get_place_dir_name(place_name)
    pbar.set_description(place_name)

    try:
        edges_city = gpd.read_file(GENERATED_DATA_DIR / place_dir_name / f"graph_{network_type}.gpkg", layer="edges")
        edges_city["city"] = row.city
        edges_city["country"] = row.country
        edges_city["continent"] = row.continent
        edges_cities.append(edges_city)
    except Exception as e:
        print("\nFailed", place_name, e)
    
columns_superset = seq(edges_cities).map(lambda edges_city: set(edges_city.columns)).reduce(lambda a, b: a.union(b))
columns_superset

for edges_city in edges_cities:
    edges_city_cols = set(edges_city.columns)
    missing_columns = list(columns_superset - edges_city_cols)
    edges_city[missing_columns] = None

edges = pd.concat(edges_cities, ignore_index=True)
del edges_city, edges_cities

Sydney,Australia: 100%|██████████| 117/117 [02:41<00:00,  1.38s/it]              


In [6]:
edges.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 2849088 entries, 0 to 2849087
Data columns (total 30 columns):
 #   Column     Dtype   
---  ------     -----   
 0   u          int64   
 1   v          int64   
 2   key        int64   
 3   osmid      object  
 4   highway    object  
 5   oneway     object  
 6   length     float64 
 7   from       int64   
 8   to         int64   
 9   ref        object  
 10  name       object  
 11  bridge     object  
 12  tunnel     object  
 13  access     object  
 14  bicycle    object  
 15  maxspeed   object  
 16  lanes      object  
 17  surface    object  
 18  width      object  
 19  geometry   geometry
 20  city       object  
 21  country    object  
 22  continent  object  
 23  est_width  object  
 24  area       object  
 25  landuse    object  
 26  junction   object  
 27  service    object  
 28  lit        object  
 29  footway    object  
dtypes: float64(1), geometry(1), int64(5), object(23)
memory usage: 652.1+ MB


In [7]:
edges.columns

Index(['u', 'v', 'key', 'osmid', 'highway', 'oneway', 'length', 'from', 'to',
       'ref', 'name', 'bridge', 'tunnel', 'access', 'bicycle', 'maxspeed',
       'lanes', 'surface', 'width', 'geometry', 'city', 'country', 'continent',
       'est_width', 'area', 'landuse', 'junction', 'service', 'lit',
       'footway'],
      dtype='object')

In [8]:
from itertools import chain
from src.tools.feature_extraction import sanitize, normalize, sanitize_and_normalize
from IPython.display import clear_output
cols_bloat = set(['u', 'v', 'geometry', 'osmid', 'from', 'to', 'ref', 'name', 'key', 'city', 'country', 'continent'])

path = RAW_DATA_DIR / "features_counts"
path.mkdir(exist_ok=True, parents=True)

pbar = tqdm((set(columns_superset) - cols_bloat))
for f in pbar:
    pbar.set_description(f)

    edges_list = edges[f].apply(lambda x: eval(str(x)) if "[" in str(x) else [str(x)])
    sanitized = edges_list.apply(lambda x: [sanitize(y, f) for y in x])
    normalized = sanitized.apply(lambda x: [normalize(y, f) for y in x])

    counts = edges_list.explode().value_counts().sort_values(ascending=False)
    counts_sanitized = sanitized.explode().value_counts().sort_values(ascending=False)
    counts_normalized = normalized.explode().value_counts().sort_values(ascending=False)

    clear_output(wait=True)

    # edges_list.to_csv(path / f"{f}.csv")
    # sanitized.to_csv(path / f"{f}_sanitized.csv")
    # normalized.to_csv(path / f"{f}_normalized.csv")

    counts.to_csv(path / f"{f}_counts.csv")
    counts_sanitized.to_csv(path / f"{f}_sanitized_counts.csv")
    counts_normalized.to_csv(path / f"{f}_normalized_counts.csv")
    
    

access: 100%|██████████| 18/18 [03:02<00:00, 10.13s/it]
