## 1. Imports & Setup

In [None]:
import geopandas as gpd
import shapely
from shapely.geometry import shape
import fiona
import os

# Record the library versions this notebook was executed with.
print(f"GeoPandas: {gpd.__version__}")
print(f"Shapely: {shapely.__version__}")


GeoPandas: 1.1.2
Shapely: 2.1.2


## 2. Load a clean Dubai boundary polygon

In [None]:
# GADM 4.1 administrative boundaries of the UAE (level 1 = emirates).
gadm_url = "https://geodata.ucdavis.edu/gadm/gadm4.1/json/gadm41_ARE_1.json"
uae = gpd.read_file(gadm_url)

# Keep only the row whose NAME_1 is "Dubai", expressed in EPSG:4326.
dubai = uae.loc[uae["NAME_1"].eq("Dubai")].to_crs(4326)

# Display the selected boundary record.
dubai


Unnamed: 0,GID_1,GID_0,COUNTRY,NAME_1,VARNAME_1,NL_NAME_1,TYPE_1,ENGTYPE_1,CC_1,HASC_1,ISO_1,geometry
2,ARE.3_1,ARE,UnitedArabEmirates,Dubai,,دبي,Emirate,Emirate,,AE.DU,,"MULTIPOLYGON (((56.1637 24.7648, 56.1127 24.73..."


## 3. Define paths

In [6]:
# Relative paths used in this notebook resolve against the working directory,
# so print it together with its parent folder for orientation.
print(f"Current working directory: {os.getcwd()}")
# dirname() of the working directory is its parent folder, not the notebook's
# own location, so the value is labelled as the parent directory.
print(f"Parent directory: {os.path.dirname(os.getcwd())}")

Current working directory: /home/aleksei/Projects/telecom_analytics_4.0/ingestion/notebooks
Notebook location: /home/aleksei/Projects/telecom_analytics_4.0/ingestion


In [7]:
# Folder holding the OSM shapefiles. The trailing slash is required because
# every path below is built by plain string concatenation.
data_path = "../../data/gcc-states-251227-free.shp/"

# Individual OSM layers used in this notebook.
buildings_path = f"{data_path}gis_osm_buildings_a_free_1.shp"
roads_path = f"{data_path}gis_osm_roads_free_1.shp"
pois_path = f"{data_path}gis_osm_pois_a_free_1.shp"
landuse_path = f"{data_path}gis_osm_landuse_a_free_1.shp"

# Destination folder for the filtered extracts; created if it is missing.
output_dir = f"{data_path}filtered/"
os.makedirs(output_dir, exist_ok=True)


In [8]:
# Preview the buildings layer. Only the first five features are read (rows=5):
# the complete layer is loaded in section 4, so reading all of it here just to
# print head() would parse the whole shapefile twice.
data = gpd.read_file(buildings_path, rows=5).to_crs(epsg=4326)
print(data.head())
print(data.crs)

     osm_id  code    fclass                  name              type  \
0  10315513  1500  building           Concourse C  airport_terminal   
1  10316322  1500  building  Jumeirah Beach Hotel              None   
2  12700546  1500  building       برج العرب جميرا             hotel   
3  23978061  1500  building             بلدية دبي              None   
4  24897600  1500  building   Park House building              None   

                                            geometry  
0  POLYGON ((55.35051 25.25429, 55.35076 25.25467...  
1  POLYGON ((55.19005 25.14043, 55.19005 25.14055...  
2  POLYGON ((55.1848 25.14168, 55.18576 25.14146,...  
3  POLYGON ((55.31132 25.26445, 55.31168 25.2649,...  
4  POLYGON ((47.98943 29.37998, 47.98971 29.38011...  
EPSG:4326


## 4. Load OSM layers

In [9]:
# Read every OSM layer in full and express it in EPSG:4326.
buildings, roads, pois, landuse = (
    gpd.read_file(layer_path).to_crs(4326)
    for layer_path in (buildings_path, roads_path, pois_path, landuse_path)
)

# Feature counts in the order: buildings, roads, POIs, land use.
print(len(buildings), len(roads), len(pois), len(landuse))


1462714 1780573 45195 132340


## 5. Spatial filtering: keep only features intersecting Dubai

In [10]:
# Inner spatial join of each layer against the Dubai boundary: a feature is
# kept when its geometry intersects the boundary geometry.
buildings_dubai, roads_dubai, pois_dubai, landuse_dubai = (
    gpd.sjoin(layer, dubai, predicate="intersects", how="inner")
    for layer in (buildings, roads, pois, landuse)
)


## 6. Clean up join artifacts

In [11]:
def clean_join(gdf: gpd.GeoDataFrame) -> gpd.GeoDataFrame:
    """Return ``gdf`` without the columns whose name starts with ``index_``.

    Column labels that are not strings are never dropped. The input frame is
    left unchanged; the result is a new frame.
    """
    join_artifacts = []
    for column in gdf.columns:
        if isinstance(column, str) and column.startswith("index_"):
            join_artifacts.append(column)
    return gdf.drop(columns=join_artifacts, errors="ignore")

# Remove the join index columns from every Dubai layer.
buildings_dubai, roads_dubai, pois_dubai, landuse_dubai = map(
    clean_join,
    (buildings_dubai, roads_dubai, pois_dubai, landuse_dubai),
)


In [18]:
def normalize_columns(gdf: gpd.GeoDataFrame) -> gpd.GeoDataFrame:
    """Strip a trailing ``_left`` / ``_right`` suffix from column names.

    Only the suffix at the end of a name is removed; the rest of the name is
    left untouched (``"top_left_x_left"`` becomes ``"top_left_x"``). Column
    labels that are not strings are skipped. A column keeps its suffix when
    stripping it would give an empty name or a name that is already taken, so
    renaming never introduces duplicate column names (``name_left`` and
    ``name_right`` become ``name`` and ``name_right``).

    Args:
        gdf: Frame whose column names should be normalised. It is not modified.

    Returns:
        A new frame with the renamed columns.
    """
    # Names that are already in use, including names assigned by earlier renames.
    taken = set(gdf.columns)
    rename_map = {}
    for col in gdf.columns:
        if not isinstance(col, str):
            continue
        for suffix in ("_left", "_right"):
            if col.endswith(suffix):
                # Slice instead of str.replace so inner occurrences survive.
                base = col[: -len(suffix)]
                if base and base not in taken:
                    rename_map[col] = base
                    taken.add(base)
                break
    return gdf.rename(columns=rename_map)

# Normalise the column names of every Dubai layer.
buildings_dubai, roads_dubai, pois_dubai, landuse_dubai = map(
    normalize_columns,
    (buildings_dubai, roads_dubai, pois_dubai, landuse_dubai),
)


## 7. Validate row counts

In [19]:
# Report how many features each filtered layer contains, one layer per line.
print(
    f"Buildings: {len(buildings_dubai)}",
    f"Roads: {len(roads_dubai)}",
    f"POIs: {len(pois_dubai)}",
    f"Landuse: {len(landuse_dubai)}",
    sep="\n",
)


Buildings: 245188
Roads: 170505
POIs: 6690
Landuse: 9513


## 8. Save to GPKG

In [20]:
# Write each filtered layer to its own GeoPackage file under output_dir,
# stored as a single named layer.
for layer_gdf, gpkg_name, layer_name in (
    (buildings_dubai, "dubai_buildings.gpkg", "buildings"),
    (roads_dubai, "dubai_roads.gpkg", "roads"),
    (pois_dubai, "dubai_pois.gpkg", "pois"),
    (landuse_dubai, "dubai_landuse.gpkg", "landuse"),
):
    layer_gdf.to_file(output_dir + gpkg_name, driver="GPKG", layer=layer_name)


## 9. Sanity check: load back the saved files

In [21]:
# Read every exported file back and print its feature count.
for gpkg_name in (
    "dubai_buildings.gpkg",
    "dubai_roads.gpkg",
    "dubai_pois.gpkg",
    "dubai_landuse.gpkg",
):
    print(len(gpd.read_file(output_dir + gpkg_name)))


245188
170505
6690
9513


In [22]:
# List the layer names stored in each exported GeoPackage.
for gpkg_name in (
    "dubai_buildings.gpkg",
    "dubai_roads.gpkg",
    "dubai_pois.gpkg",
    "dubai_landuse.gpkg",
):
    print(fiona.listlayers(output_dir + gpkg_name))


['buildings']
['roads']
['pois']
['landuse']
