In [1]:
import os
os.getcwd()
os.chdir("G:/OneDrive/casa0010dissertation/00_06 jaipur code_mgwr")

In [2]:
import geopandas as gpd
from exactextract import exact_extract
import rasterio
import pandas as pd

In [3]:
# Load my hex grid
hex_grid = gpd.read_file('data/raw/pca/jaipur_hex500_core_0.5.geojson')

# Fix id column and rename to hex_id
hex_grid = hex_grid.rename(columns={"id": "hex_id"})
hex_grid["hex_id"] = pd.to_numeric(hex_grid["hex_id"], errors="raise").astype("int32")

# Ensure coverage ratio exists
if "cover" not in hex_grid.columns:
    hex_grid["cover"] = 1.0

print(f"Loaded {len(hex_grid)} hexagonal cells.")
print(f"hex_id column type: {hex_grid['hex_id'].dtype}")
print(f"Cover column exists: {'cover' in hex_grid.columns}")
print(hex_grid[['hex_id', 'cover']].head())

Loaded 1799 hexagonal cells.
hex_id column type: int32
Cover column exists: True
   hex_id     cover
0      23  0.925903
1      24  0.991823
2      25  0.998568
3      26  0.984131
4      27  0.794481


In [4]:
print(f"Loaded {len(hex_grid)} hexagonal cells.")

Loaded 1799 hexagonal cells.


In [5]:
hex_grid.columns

Index(['hex_id', 'left', 'top', 'right', 'bottom', 'row_index', 'col_index',
       'area_cell', 'area_clip', 'cover', 'geometry'],
      dtype='object')

# 1 Zonal Statistics Calculation for Raster Data

In [6]:
# Define paths to my raster files
pop_raster = 'population_and_id_100m_mgwr04.tif'
builtup_raster = 'data/raw/pca/Jaipur_GHSL_built_surface_2020.tif'
ndvi_raster = 'data/raw/pca/Jaipur_NDVI_mean_2024.tif'
ntl_raster = 'data/raw/pca/Jaipur_VIIRS_DNB_average_2023_2024.tif'

## 1.1 Population Raster

In [7]:
# pop use sum
pop_results = exact_extract(pop_raster, hex_grid, 'sum', include_cols='hex_id', output='pandas')
pop_results = pop_results[['hex_id', 'band_1_sum']]
pop_results.rename(columns={'band_1_sum': 'pop_sum'}, inplace=True)
print("Population results:")
print(pop_results.head())



Population results:
   hex_id     pop_sum
0      23  101.298784
1      24  344.787011
2      25   46.793371
3      26   99.160115
4      27  128.000000


## 1.2 Built-up Area Raster

In [8]:
# Open the built-up area raster file
with rasterio.open(builtup_raster) as src:
    # Display basic metadata
    print(f"Driver: {src.driver}")
    print(f"Width: {src.width}, Height: {src.height}")
    print(f"Number of bands: {src.count}")
    print(f"CRS: {src.crs}")
    print(f"Transform: {src.transform}")
    print(f"Data type: {src.dtypes}")

    # Display band-specific information
    for i in range(1, src.count + 1):
        band = src.read(i)
        print(f"\nBand {i} statistics:")
        print(f"  Min: {band.min()}")
        print(f"  Max: {band.max()}")
        print(f"  Mean: {band.mean()}")
        print(f"  No data value: {src.nodatavals[i-1]}")

        # Get band description if available
        try:
            band_desc = src.descriptions[i-1]
            if band_desc:
                print(f"  Description: {band_desc}")
        except:
            pass

Driver: GTiff
Width: 223, Height: 282
Number of bands: 1
CRS: EPSG:32643
Transform: | 100.00, 0.00, 568300.00|
| 0.00,-100.00, 2989600.00|
| 0.00, 0.00, 1.00|
Data type: ('uint16',)

Band 1 statistics:
  Min: 0
  Max: 9791
  Mean: 1704.946188340807
  No data value: None
  Description: built_surface


In [9]:
builtup_results = exact_extract(builtup_raster, hex_grid, 'sum', include_cols='hex_id', output='pandas')
builtup_results.rename(columns={'sum': 'builtup_sum'}, inplace=True)
print("\nBuilt-up results:")
print(builtup_results.head())


Built-up results:
   hex_id   builtup_sum
0      23  24964.369187
1      24  33964.572892
2      25   6843.779378
3      26  14021.257153
4      27  15053.141378


## 1.3 NDVI Raster

In [10]:
# Open the NDVI raster file
with rasterio.open(ndvi_raster) as src:
    # Display basic metadata
    print(f"Driver: {src.driver}")
    print(f"Width: {src.width}, Height: {src.height}")
    print(f"Number of bands: {src.count}")
    print(f"CRS: {src.crs}")
    print(f"Transform: {src.transform}")
    print(f"Data type: {src.dtypes}")

    # Display band-specific information
    for i in range(1, src.count + 1):
        band = src.read(i)
        print(f"\nBand {i} statistics:")
        print(f"  Min: {band.min()}")
        print(f"  Max: {band.max()}")
        print(f"  Mean: {band.mean()}")
        print(f"  No data value: {src.nodatavals[i-1]}")

        # Get band description if available
        try:
            band_desc = src.descriptions[i-1]
            if band_desc:
                print(f"  Description: {band_desc}")
        except:
            pass

Driver: GTiff
Width: 223, Height: 282
Number of bands: 1
CRS: EPSG:32643
Transform: | 100.00, 0.00, 568300.00|
| 0.00,-100.00, 2989600.00|
| 0.00, 0.00, 1.00|
Data type: ('float32',)

Band 1 statistics:
  Min: nan
  Max: nan
  Mean: nan
  No data value: None
  Description: NDVI


In [11]:
# NDVI use mean
ndvi_results = exact_extract(ndvi_raster, hex_grid, 'mean', include_cols='hex_id', output='pandas')
ndvi_results.rename(columns={'mean': 'ndvi_mean'}, inplace=True)
print("\nNDVI results:")
print(ndvi_results.head())


NDVI results:
   hex_id  ndvi_mean
0      23   0.304012
1      24   0.288126
2      25   0.312536
3      26   0.332531
4      27   0.311168


# 1.4 Nighttime Lights Raster

In [12]:
# Open the ntl area raster file
with rasterio.open(ntl_raster) as src:
    # Display basic metadata
    print(f"Driver: {src.driver}")
    print(f"Width: {src.width}, Height: {src.height}")
    print(f"Number of bands: {src.count}")
    print(f"CRS: {src.crs}")
    print(f"Transform: {src.transform}")
    print(f"Data type: {src.dtypes}")

    # Display band-specific information
    for i in range(1, src.count + 1):
        band = src.read(i)
        print(f"\nBand {i} statistics:")
        print(f"  Min: {band.min()}")
        print(f"  Max: {band.max()}")
        print(f"  Mean: {band.mean()}")
        print(f"  No data value: {src.nodatavals[i-1]}")

        # Get band description if available
        try:
            band_desc = src.descriptions[i-1]
            if band_desc:
                print(f"  Description: {band_desc}")
        except:
            pass

Driver: GTiff
Width: 223, Height: 282
Number of bands: 1
CRS: EPSG:32643
Transform: | 100.00, 0.00, 568300.00|
| 0.00,-100.00, 2989600.00|
| 0.00, 0.00, 1.00|
Data type: ('float32',)

Band 1 statistics:
  Min: nan
  Max: nan
  Mean: nan
  No data value: None
  Description: average


In [13]:
ntl_results = exact_extract(ntl_raster, hex_grid, 'mean', include_cols='hex_id', output='pandas')
ntl_results.rename(columns={'mean': 'ntl_mean'}, inplace=True)
print("\nNTL results:")
print(ntl_results.head())


NTL results:
   hex_id   ntl_mean
0      23  17.489318
1      24  20.017120
2      25  14.466437
3      26  13.698259
4      27  16.072894


# 1.5 Combine Results into a Single DataFrame

In [14]:
# Merge all results into a single DataFrame called raster_results
raster_results = pop_results.merge(builtup_results, on='hex_id', how='outer') \
                             .merge(ndvi_results, on='hex_id', how='outer') \
                             .merge(ntl_results, on='hex_id', how='outer')


raster_results['hex_id'] = raster_results['hex_id'].astype('int32')

# sort the results by hex_id
raster_results = raster_results.sort_values(by='hex_id').reset_index(drop=True)


print("Merged raster results:")
print(raster_results.head())
print(f"Total records: {len(raster_results)}")



Merged raster results:
   hex_id     pop_sum   builtup_sum  ndvi_mean   ntl_mean
0      23  101.298784  24964.369187   0.304012  17.489318
1      24  344.787011  33964.572892   0.288126  20.017120
2      25   46.793371   6843.779378   0.312536  14.466437
3      26   99.160115  14021.257153   0.332531  13.698259
4      27  128.000000  15053.141378   0.311168  16.072894
Total records: 1799


In [15]:
raster_results['pop_sum'].describe()

count     1799.000000
mean      1687.619662
std       2285.142491
min          0.000000
25%        306.405941
50%        954.198131
75%       2130.758175
max      20418.297154
Name: pop_sum, dtype: float64

In [16]:
# merge hex_grid to get geometry and cover
hex_minimal = hex_grid[['hex_id','area_clip', 'cover', 'geometry']].copy()

# merge raster and hex grid
hex_raster_final = hex_minimal.merge(raster_results, on='hex_id', how='left')


In [17]:
# Calculate population density and built-up area density
# Note: Not using cover adjustment, directly using pop_sum and builtup_sum (obtained through exact_extract using 'sum')
# Population density (people/km2)
hex_raster_final['pop_density_km2'] = hex_raster_final['pop_sum'] / (hex_raster_final['area_clip'] / 1000000)
# Built-up area density (built-up area ratio, between 0-1)
hex_raster_final['builtup_density'] = hex_raster_final['builtup_sum'] / (hex_raster_final['area_clip'] )

# Handle potential division by zero
import numpy as np
hex_raster_final['pop_density_km2'] = np.where(
    hex_raster_final['area_clip'] > 0,
    hex_raster_final['pop_density_km2'],
    0
)

hex_raster_final['builtup_density'] = np.where(
    hex_raster_final['area_clip'] > 0,
    hex_raster_final['builtup_density'],
    0
)

# For NDVI and NTL, no additional processing needed
# Their values are already means within the city boundary


print(f"\nFinal dataset shape: {hex_raster_final.shape}")
print("Columns:", list(hex_raster_final.columns))


Final dataset shape: (1799, 10)
Columns: ['hex_id', 'area_clip', 'cover', 'geometry', 'pop_sum', 'builtup_sum', 'ndvi_mean', 'ntl_mean', 'pop_density_km2', 'builtup_density']


In [18]:
hex_raster_final['pop_density_km2'].describe()

count     1799.000000
mean      7831.516479
std      10543.329464
min          0.000000
25%       1457.915917
50%       4491.935024
75%       9847.504207
max      94308.074868
Name: pop_density_km2, dtype: float64

In [19]:
# make NaN as 0
raster_vars = ['pop_density_km2', 'builtup_density', 'ndvi_mean', 'ntl_mean']
hex_raster_final[raster_vars] = hex_raster_final[raster_vars].fillna(0)


if hex_raster_final.crs != 'EPSG:32643':
    hex_raster_final = hex_raster_final.to_crs(32643)

print("=== Final Raster Dataset Summary ===")
print(f"Shape: {hex_raster_final.shape}")
print(f"CRS: {hex_raster_final.crs}")
print(f"Columns: {list(hex_raster_final.columns)}")


print("\n📈 Raster Variable Summary:")
for var in raster_vars:
    mean = hex_raster_final[var].mean()
    non_zero = (hex_raster_final[var] > 0).sum()
    print(f"  {var}: mean={mean:.1f}, non-zero hexes={non_zero}")

=== Final Raster Dataset Summary ===
Shape: (1799, 10)
CRS: EPSG:32643
Columns: ['hex_id', 'area_clip', 'cover', 'geometry', 'pop_sum', 'builtup_sum', 'ndvi_mean', 'ntl_mean', 'pop_density_km2', 'builtup_density']

📈 Raster Variable Summary:
  pop_density_km2: mean=7831.5, non-zero hexes=1633
  builtup_density: mean=0.3, non-zero hexes=1727
  ndvi_mean: mean=0.3, non-zero hexes=1799
  ntl_mean: mean=25.0, non-zero hexes=1799


In [20]:
# export GeoParquet
import pathlib
output_path = pathlib.Path("data/cleaned/pca/jaipur_hex_rasters.parquet")
output_path.parent.mkdir(parents=True, exist_ok=True)

hex_raster_final.to_parquet(output_path, index=False)

print(f"\n✓ Raster aggregation complete → {output_path}")
print(f"📊 Final dataset: {len(hex_raster_final)} hexes with {len(raster_vars)} raster variables")


✓ Raster aggregation complete → data\cleaned\pca\jaipur_hex_rasters.parquet
📊 Final dataset: 1799 hexes with 4 raster variables
