# PM2.5 Hexagon Data Enrichment

This notebook creates a unified dataset by:
1. Identifying all PM2.5 measurement hexagons at H3 resolution 7
2. Enriching them with traffic and weather data (local or nearest-neighbor)
3. Adding static features like terrain elevation
4. Creating a query interface for location-based lookups

## Phase 1: Setup and Data Loading

In [1]:
import pandas as pd
import numpy as np
import h3
import json
from datetime import datetime, timedelta
from tqdm import tqdm
import folium
from folium import plugins
import warnings
warnings.filterwarnings('ignore')

# Set display options
pd.set_option('display.max_columns', None)
pd.set_option('display.float_format', lambda x: '%.3f' % x)

print("Libraries loaded successfully")

Libraries loaded successfully


In [2]:
# Define data paths
DATA_PATH = '/Users/vojtech/Code/Bard89/Project-Data/data/processed/'

# H3 resolution for analysis
H3_RESOLUTION = 7  # 5.16 km² hexagons

print(f"Data path: {DATA_PATH}")
print(f"H3 Resolution: {H3_RESOLUTION} (approx 5.16 km² per hexagon)")

Data path: /Users/vojtech/Code/Bard89/Project-Data/data/processed/
H3 Resolution: 7 (approx 5.16 km² per hexagon)


## Phase 2: Create PM2.5 Hexagon Registry

In [3]:
# Load PM2.5 data
print("Loading PM2.5 air quality data...")
df_pm25 = pd.read_csv(f"{DATA_PATH}jp_openaq_processed_20230101_to_20231231.csv",
                       usecols=['h3_index_res8', 'h3_lat_res8', 'h3_lon_res8', 
                                'timestamp', 'pm25_ugm3_mean'])

print(f"Loaded {len(df_pm25):,} PM2.5 records")
print(f"Date range: {df_pm25['timestamp'].min()} to {df_pm25['timestamp'].max()}")
print(f"PM2.5 missing values: {df_pm25['pm25_ugm3_mean'].isna().mean():.1%}")

Loading PM2.5 air quality data...
Loaded 9,579,181 PM2.5 records
Date range: 2023-07-14 16:00:00+00:00 to 2025-07-26 05:00:00+00:00
PM2.5 missing values: 20.0%


In [4]:
# Create PM2.5 hexagon registry at resolution 7
print("\nCreating PM2.5 hexagon registry at H3 resolution 7...")

pm25_registry = {}

for _, row in tqdm(df_pm25[['h3_index_res8', 'h3_lat_res8', 'h3_lon_res8']].drop_duplicates().iterrows(), 
                   total=df_pm25['h3_index_res8'].nunique()):
    if pd.notna(row['h3_index_res8']):
        hex7 = h3.cell_to_parent(row['h3_index_res8'], H3_RESOLUTION)
        
        if hex7 not in pm25_registry:
            pm25_registry[hex7] = {
                'hex7_id': hex7,
                'center_lat': row['h3_lat_res8'],
                'center_lon': row['h3_lon_res8'],
                'res8_hexagons': [],
                'measurement_count': 0
            }
        
        pm25_registry[hex7]['res8_hexagons'].append(row['h3_index_res8'])

# Calculate measurement counts
hex7_counts = df_pm25.groupby(
    df_pm25['h3_index_res8'].apply(lambda x: h3.cell_to_parent(x, H3_RESOLUTION) if pd.notna(x) else None)
)['pm25_ugm3_mean'].count()

for hex7, count in hex7_counts.items():
    if hex7 in pm25_registry:
        pm25_registry[hex7]['measurement_count'] = count

print(f"\nCreated registry with {len(pm25_registry)} PM2.5 hexagons at resolution 7")

# Convert to DataFrame
pm25_hex_df = pd.DataFrame(pm25_registry.values())
pm25_hex_df = pm25_hex_df.sort_values('measurement_count', ascending=False)

print("\nTop 5 hexagons by measurement count:")
print(pm25_hex_df[['hex7_id', 'center_lat', 'center_lon', 'measurement_count']].head())


Creating PM2.5 hexagon registry at H3 resolution 7...


100%|██████████| 643/643 [00:00<00:00, 67847.48it/s]



Created registry with 634 PM2.5 hexagons at resolution 7

Top 5 hexagons by measurement count:
             hex7_id  center_lat  center_lon  measurement_count
466  872f5bc81ffffff      35.675     139.440              30977
427  872f5aaccffffff      35.607     139.722              30835
193  872e61ae1ffffff      34.401     135.304              22810
464  872f5bc24ffffff      35.465     139.470              18604
595  874b65d09ffffff      33.958     131.948              15910


## Phase 3: Load and Map Auxiliary Data

In [5]:
# Load traffic data
print("Loading traffic data...")
df_traffic = pd.read_csv(f"{DATA_PATH}jp_jartic_processed_20230101_to_20231231.csv",
                          usecols=['h3_index_res8', 'h3_lat_res8', 'h3_lon_res8',
                                   'timestamp', 'avg_traffic_volume'])

# Create traffic hexagon registry at resolution 7
traffic_registry = {}
for _, row in df_traffic[['h3_index_res8', 'h3_lat_res8', 'h3_lon_res8']].drop_duplicates().iterrows():
    if pd.notna(row['h3_index_res8']):
        hex7 = h3.cell_to_parent(row['h3_index_res8'], H3_RESOLUTION)
        if hex7 not in traffic_registry:
            traffic_registry[hex7] = {
                'hex7_id': hex7,
                'center_lat': row['h3_lat_res8'],
                'center_lon': row['h3_lon_res8']
            }

print(f"Found {len(traffic_registry)} traffic hexagons at resolution 7")

# Check overlap with PM2.5
pm25_with_traffic = set(pm25_registry.keys()) & set(traffic_registry.keys())
print(f"PM2.5 hexagons with local traffic data: {len(pm25_with_traffic)} ({len(pm25_with_traffic)/len(pm25_registry)*100:.1f}%)")

Loading traffic data...
Found 1017 traffic hexagons at resolution 7
PM2.5 hexagons with local traffic data: 26 (4.1%)


In [6]:
# Load weather data
print("Loading weather data...")
df_weather = pd.read_csv(f"{DATA_PATH}jp_openmeteo_processed_20230101_to_20231231.csv",
                          usecols=['h3_index_res8', 'h3_lat_res8', 'h3_lon_res8',
                                   'timestamp', 'temperature_c_mean', 'humidity_pct_mean',
                                   'precipitation_mm_mean'])

# Create weather hexagon registry at resolution 7
weather_registry = {}
for _, row in df_weather[['h3_index_res8', 'h3_lat_res8', 'h3_lon_res8']].drop_duplicates().iterrows():
    if pd.notna(row['h3_index_res8']):
        hex7 = h3.cell_to_parent(row['h3_index_res8'], H3_RESOLUTION)
        if hex7 not in weather_registry:
            weather_registry[hex7] = {
                'hex7_id': hex7,
                'center_lat': row['h3_lat_res8'],
                'center_lon': row['h3_lon_res8']
            }

print(f"Found {len(weather_registry)} weather hexagons at resolution 7")

# Check overlap with PM2.5
pm25_with_weather = set(pm25_registry.keys()) & set(weather_registry.keys())
print(f"PM2.5 hexagons with local weather data: {len(pm25_with_weather)} ({len(pm25_with_weather)/len(pm25_registry)*100:.1f}%)")

Loading weather data...
Found 536 weather hexagons at resolution 7
PM2.5 hexagons with local weather data: 8 (1.3%)


## Phase 4: Build Nearest Neighbor Lookup

In [7]:
def haversine_distance(lat1, lon1, lat2, lon2):
    """Calculate the great circle distance between two points on Earth"""
    from math import radians, cos, sin, asin, sqrt
    
    # Convert to radians
    lat1, lon1, lat2, lon2 = map(radians, [lat1, lon1, lat2, lon2])
    
    # Haversine formula
    dlat = lat2 - lat1
    dlon = lon2 - lon1
    a = sin(dlat/2)**2 + cos(lat1) * cos(lat2) * sin(dlon/2)**2
    c = 2 * asin(sqrt(a))
    r = 6371  # Radius of Earth in kilometers
    
    return c * r

def find_nearest_hexagon(target_hex, target_lat, target_lon, source_registry, max_distance_km=500):
    """Find nearest hexagon from source registry"""
    min_distance = float('inf')
    nearest_hex = None
    
    for source_hex, source_info in source_registry.items():
        if source_hex == target_hex:
            # Same hexagon - distance is 0
            return source_hex, 0.0
        
        distance = haversine_distance(target_lat, target_lon, 
                                       source_info['center_lat'], 
                                       source_info['center_lon'])
        
        if distance < min_distance and distance < max_distance_km:
            min_distance = distance
            nearest_hex = source_hex
    
    return nearest_hex, min_distance

In [8]:
# Build nearest neighbor lookup for each PM2.5 hexagon
print("Building nearest neighbor lookup table...")
print("This may take a few minutes...")

nearest_lookup = {}

for hex7, hex_info in tqdm(pm25_registry.items(), desc="Processing PM2.5 hexagons"):
    lat = hex_info['center_lat']
    lon = hex_info['center_lon']
    
    # Find nearest traffic hexagon
    nearest_traffic, traffic_distance = find_nearest_hexagon(hex7, lat, lon, traffic_registry)
    
    # Find nearest weather hexagon
    nearest_weather, weather_distance = find_nearest_hexagon(hex7, lat, lon, weather_registry)
    
    nearest_lookup[hex7] = {
        'nearest_traffic_hex': nearest_traffic,
        'traffic_distance_km': traffic_distance,
        'has_local_traffic': traffic_distance == 0,
        'nearest_weather_hex': nearest_weather,
        'weather_distance_km': weather_distance,
        'has_local_weather': weather_distance == 0
    }

# Convert to DataFrame for analysis
nearest_df = pd.DataFrame(nearest_lookup).T.reset_index()
nearest_df.rename(columns={'index': 'hex7_id'}, inplace=True)

print("\nNearest neighbor statistics:")
print(f"Hexagons with local traffic: {nearest_df['has_local_traffic'].sum()}")
print(f"Hexagons with local weather: {nearest_df['has_local_weather'].sum()}")
print(f"\nTraffic distance statistics (km):")
print(nearest_df[nearest_df['traffic_distance_km'] > 0]['traffic_distance_km'].describe())
print(f"\nWeather distance statistics (km):")
print(nearest_df[nearest_df['weather_distance_km'] > 0]['weather_distance_km'].describe())

Building nearest neighbor lookup table...
This may take a few minutes...


Processing PM2.5 hexagons: 100%|██████████| 634/634 [00:00<00:00, 1323.56it/s]


Nearest neighbor statistics:
Hexagons with local traffic: 26
Hexagons with local weather: 8

Traffic distance statistics (km):
count    608.000
unique   608.000
top        3.899
freq       1.000
Name: traffic_distance_km, dtype: float64

Weather distance statistics (km):
count    626.000
unique   626.000
top       32.302
freq       1.000
Name: weather_distance_km, dtype: float64





## Phase 5: Create Enriched Dataset

In [9]:
# Aggregate PM2.5 data to resolution 7
print("Aggregating PM2.5 data to resolution 7...")

df_pm25['hex7_id'] = df_pm25['h3_index_res8'].apply(
    lambda x: h3.cell_to_parent(x, H3_RESOLUTION) if pd.notna(x) else None
)

# Aggregate to hourly at hex7
df_pm25['timestamp'] = pd.to_datetime(df_pm25['timestamp'])
df_pm25['hour'] = df_pm25['timestamp'].dt.floor('H')

pm25_hourly = df_pm25.groupby(['hex7_id', 'hour']).agg({
    'pm25_ugm3_mean': 'mean',
    'h3_lat_res8': 'mean',
    'h3_lon_res8': 'mean'
}).reset_index()

pm25_hourly.rename(columns={
    'hour': 'timestamp',
    'h3_lat_res8': 'lat',
    'h3_lon_res8': 'lon'
}, inplace=True)

print(f"Created {len(pm25_hourly):,} hourly PM2.5 records for {pm25_hourly['hex7_id'].nunique()} hexagons")

Aggregating PM2.5 data to resolution 7...
Created 9,457,112 hourly PM2.5 records for 634 hexagons


In [10]:
# Prepare traffic data aggregation
print("\nAggregating traffic data to resolution 7...")

df_traffic['hex7_id'] = df_traffic['h3_index_res8'].apply(
    lambda x: h3.cell_to_parent(x, H3_RESOLUTION) if pd.notna(x) else None
)

df_traffic['timestamp'] = pd.to_datetime(df_traffic['timestamp'])
df_traffic['hour'] = df_traffic['timestamp'].dt.floor('H')

traffic_hourly = df_traffic.groupby(['hex7_id', 'hour']).agg({
    'avg_traffic_volume': 'mean'
}).reset_index()

traffic_hourly.rename(columns={'hour': 'timestamp'}, inplace=True)

print(f"Created {len(traffic_hourly):,} hourly traffic records for {traffic_hourly['hex7_id'].nunique()} hexagons")


Aggregating traffic data to resolution 7...
Created 8,677,621 hourly traffic records for 1017 hexagons


In [11]:
# Prepare weather data aggregation
print("\nAggregating weather data to resolution 7...")

df_weather['hex7_id'] = df_weather['h3_index_res8'].apply(
    lambda x: h3.cell_to_parent(x, H3_RESOLUTION) if pd.notna(x) else None
)

df_weather['timestamp'] = pd.to_datetime(df_weather['timestamp'])
df_weather['hour'] = df_weather['timestamp'].dt.floor('H')

weather_hourly = df_weather.groupby(['hex7_id', 'hour']).agg({
    'temperature_c_mean': 'mean',
    'humidity_pct_mean': 'mean',
    'precipitation_mm_mean': 'mean'
}).reset_index()

weather_hourly.rename(columns={'hour': 'timestamp'}, inplace=True)

print(f"Created {len(weather_hourly):,} hourly weather records for {weather_hourly['hex7_id'].nunique()} hexagons")


Aggregating weather data to resolution 7...
Created 3,578,760 hourly weather records for 536 hexagons


In [None]:
import os
import numpy as np
from tqdm.notebook import tqdm
from joblib import Parallel, delayed
import warnings
warnings.filterwarnings('ignore')

# Check if we already have saved enriched data
saved_file_csv = 'ml_data/pm25_enriched_hourly.csv'
saved_file_parquet = 'ml_data/pm25_enriched_hourly.parquet'

# Check for either CSV or Parquet
if os.path.exists(saved_file_parquet):
    print(f"Found existing enriched data file: {saved_file_parquet}")
    print("Loading saved data...")
    enriched_data = pd.read_parquet(saved_file_parquet)
    print(f"Loaded {len(enriched_data):,} records from saved file")
    print("Skipping enrichment process. Delete the file if you want to reprocess.")
elif os.path.exists(saved_file_csv):
    print(f"Found existing enriched data file: {saved_file_csv}")
    print("Loading saved data...")
    enriched_data = pd.read_csv(saved_file_csv)
    print(f"Loaded {len(enriched_data):,} records from saved file")
    print("Skipping enrichment process. Delete the file if you want to reprocess.")
else:
    # Merge data for PM2.5 hexagons with enrichment
    print("\nCreating enriched dataset...")
    
    # Start with PM2.5 data
    enriched_data = pm25_hourly.copy()
    
    # Add nearest neighbor information
    enriched_data = enriched_data.merge(nearest_df, on='hex7_id', how='left')
    
    # Create global lookups for faster access
    print("Creating lookup dictionaries for faster processing...")
    
    # Create traffic lookup: (hex_id, timestamp) -> data
    traffic_lookup = {}
    for _, row in traffic_hourly.iterrows():
        key = (row['hex7_id'], row['timestamp'])
        traffic_lookup[key] = row['avg_traffic_volume']
    
    # Create weather lookup: (hex_id, timestamp) -> data
    weather_lookup = {}
    for _, row in weather_hourly.iterrows():
        key = (row['hex7_id'], row['timestamp'])
        weather_lookup[key] = {
            'temperature_c_mean': row['temperature_c_mean'],
            'humidity_pct_mean': row['humidity_pct_mean'],
            'precipitation_mm_mean': row['precipitation_mm_mean']
        }
    
    print(f"Created lookups: {len(traffic_lookup):,} traffic, {len(weather_lookup):,} weather entries")
    
    # Function to process a single row
    def process_row(row_data):
        """Process a single row for both traffic and weather enrichment"""
        idx, row = row_data
        
        # Process traffic
        if row['has_local_traffic']:
            source_hex = row['hex7_id']
        else:
            source_hex = row['nearest_traffic_hex']
        
        traffic_result = {}
        if pd.notna(source_hex):
            key = (source_hex, row['timestamp'])
            if key in traffic_lookup:
                traffic_result['avg_traffic_volume'] = traffic_lookup[key]
                traffic_result['traffic_source'] = 'local' if row['has_local_traffic'] else 'nearest'
            else:
                traffic_result['avg_traffic_volume'] = np.nan
                traffic_result['traffic_source'] = 'missing'
        else:
            traffic_result['avg_traffic_volume'] = np.nan
            traffic_result['traffic_source'] = np.nan
        
        # Process weather
        if row['has_local_weather']:
            source_hex = row['hex7_id']
        else:
            source_hex = row['nearest_weather_hex']
        
        weather_result = {}
        if pd.notna(source_hex):
            key = (source_hex, row['timestamp'])
            if key in weather_lookup:
                weather_data = weather_lookup[key]
                weather_result.update(weather_data)
                weather_result['weather_source'] = 'local' if row['has_local_weather'] else 'nearest'
            else:
                weather_result['temperature_c_mean'] = np.nan
                weather_result['humidity_pct_mean'] = np.nan
                weather_result['precipitation_mm_mean'] = np.nan
                weather_result['weather_source'] = 'missing'
        else:
            weather_result['temperature_c_mean'] = np.nan
            weather_result['humidity_pct_mean'] = np.nan
            weather_result['precipitation_mm_mean'] = np.nan
            weather_result['weather_source'] = np.nan
        
        # Combine results
        combined = {**traffic_result, **weather_result}
        return idx, combined
    
    # Process full dataset or sample
    USE_SAMPLE = False  # Process FULL dataset
    SAMPLE_SIZE = 10000
    
    if USE_SAMPLE:
        print(f"Using sample of {SAMPLE_SIZE:,} records for faster processing...")
        enriched_data = enriched_data.head(SAMPLE_SIZE)
    else:
        print("Processing FULL dataset...")
    
    print("\nProcessing data with parallel enrichment...")
    total_records = len(enriched_data)
    print(f"Total records to process: {total_records:,}")
    
    # Determine number of jobs
    n_jobs = min(8, -1)  # Use up to 8 cores or all available
    print(f"Using parallel processing with {n_jobs} jobs")
    print("="*60)
    
    print("\n[PARALLEL] PROCESSING TRAFFIC AND WEATHER ENRICHMENT")
    print("-"*60)
    
    # Prepare data for processing
    row_data = list(enriched_data.iterrows())
    
    # Process in parallel with progress bar
    print("Processing records...")
    print("This may take several minutes for the full dataset...")
    
    results = Parallel(n_jobs=n_jobs, backend='threading')(
        delayed(process_row)(row) 
        for row in tqdm(row_data, desc="Enriching data")
    )
    
    # Sort results by index to maintain order
    results.sort(key=lambda x: x[0])
    
    # Extract enrichment data
    enrichment_data = [result[1] for result in results]
    enrichment_df = pd.DataFrame(enrichment_data)
    
    # Add enrichments to the main dataframe
    enriched_data = pd.concat([enriched_data, enrichment_df], axis=1)
    
    print(f"\n✓ Enrichment complete!")
    
    print("\n" + "="*60)
    print(f"✓ ENRICHMENT FINISHED: Created dataset with {len(enriched_data):,} records")
    print("="*60)
    
    # Create ml_data directory if it doesn't exist
    os.makedirs('ml_data', exist_ok=True)
    
    # Save as both Parquet (fast) and CSV (if needed)
    print(f"\nSaving enriched data...")
    
    # Save as Parquet first (much faster and smaller)
    print(f"Saving as Parquet to {saved_file_parquet}...")
    enriched_data.to_parquet(saved_file_parquet, index=False, compression='snappy')
    parquet_size_mb = os.path.getsize(saved_file_parquet) / 1024 / 1024
    print(f"✓ Parquet saved! File size: {parquet_size_mb:.1f} MB")
    
    # Optionally save as CSV (this will be slow for large datasets)
    SAVE_CSV = True  # Set to True if you really need CSV format
    
    if SAVE_CSV:
        print(f"\nSaving as CSV to {saved_file_csv}...")
        print("WARNING: Saving 9M+ records to CSV will take several minutes...")
        
        # Save in chunks for better performance and progress tracking
        chunk_size = 500000  # Save 500k rows at a time
        n_chunks = (len(enriched_data) + chunk_size - 1) // chunk_size
        
        for i in tqdm(range(n_chunks), desc="Saving CSV chunks"):
            start_idx = i * chunk_size
            end_idx = min((i + 1) * chunk_size, len(enriched_data))
            chunk = enriched_data.iloc[start_idx:end_idx]
            
            # Write header only for first chunk
            mode = 'w' if i == 0 else 'a'
            header = i == 0
            
            chunk.to_csv(saved_file_csv, mode=mode, header=header, index=False)
        
        csv_size_mb = os.path.getsize(saved_file_csv) / 1024 / 1024
        print(f"✓ CSV saved! File size: {csv_size_mb:.1f} MB")
    else:
        print("\nSkipping CSV save (set SAVE_CSV=True if needed)")
        print("Parquet format is recommended for large datasets (faster & smaller)")
    
    # Also save lookup table to ml_data folder
    if 'nearest_lookup' in locals():
        lookup_file = 'ml_data/hexagon_lookup_table.json'
        with open(lookup_file, 'w') as f:
            json.dump(nearest_lookup, f, indent=2)
        print(f"✓ Saved hexagon lookup table to {lookup_file}")
    
    # Final summary
    print("\n" + "="*60)
    print("PROCESSING COMPLETE!")
    print("="*60)
    print(f"  Total records processed: {total_records:,}")
    print(f"  Output file: {saved_file_parquet}")
    print(f"  File size: {parquet_size_mb:.1f} MB")
    print("="*60)


Creating enriched dataset...
Creating lookup dictionaries for faster processing...
Created lookups: 8,677,621 traffic, 3,578,760 weather entries
Processing FULL dataset...

Processing data with parallel enrichment...
Total records to process: 9,457,112
Using parallel processing with -1 jobs

[PARALLEL] PROCESSING TRAFFIC AND WEATHER ENRICHMENT
------------------------------------------------------------
Processing records...
This may take several minutes for the full dataset...


Enriching data:   0%|          | 0/9457112 [00:00<?, ?it/s]


✓ Enrichment complete!

✓ ENRICHMENT FINISHED: Created dataset with 9,457,112 records

Saving enriched data...
Saving as Parquet to ml_data/pm25_enriched_hourly.parquet...
✓ Parquet saved! File size: 31.9 MB

Saving as CSV to ml_data/pm25_enriched_hourly.csv...


Saving CSV chunks:   0%|          | 0/19 [00:00<?, ?it/s]

In [None]:
print("Adding temporal features...")

enriched_data['hour'] = enriched_data['timestamp'].dt.hour
enriched_data['day_of_week'] = enriched_data['timestamp'].dt.dayofweek
enriched_data['month'] = enriched_data['timestamp'].dt.month
enriched_data['is_weekend'] = (enriched_data['day_of_week'] >= 5).astype(int)

# Cyclical encoding
enriched_data['hour_sin'] = np.sin(2 * np.pi * enriched_data['hour'] / 24)
enriched_data['hour_cos'] = np.cos(2 * np.pi * enriched_data['hour'] / 24)
enriched_data['dow_sin'] = np.sin(2 * np.pi * enriched_data['day_of_week'] / 7)
enriched_data['dow_cos'] = np.cos(2 * np.pi * enriched_data['day_of_week'] / 7)
enriched_data['month_sin'] = np.sin(2 * np.pi * enriched_data['month'] / 12)
enriched_data['month_cos'] = np.cos(2 * np.pi * enriched_data['month'] / 12)

## Phase 6: Data Quality Analysis

In [None]:
print("Data Quality Analysis")
print("="*60)

print("\nPM2.5 Coverage:")
print(f"  Total hexagons: {enriched_data['hex7_id'].nunique()}")
print(f"  Total records: {len(enriched_data):,}")
print(f"  Missing PM2.5: {enriched_data['pm25_ugm3_mean'].isna().mean():.1%}")

print("\nTraffic Data Sources:")
if 'traffic_source' in enriched_data.columns:
    print(enriched_data['traffic_source'].value_counts())
    nearest_traffic = enriched_data[enriched_data['traffic_source']=='nearest']['traffic_distance_km']
    if len(nearest_traffic) > 0:
        # Use .values to get the actual mean value
        print(f"\nAverage distance to traffic data: {nearest_traffic.values.mean():.1f} km")
else:
    print("  Traffic source information not available")

print("\nWeather Data Sources:")
if 'weather_source' in enriched_data.columns:
    print(enriched_data['weather_source'].value_counts())
    nearest_weather = enriched_data[enriched_data['weather_source']=='nearest']['weather_distance_km']
    if len(nearest_weather) > 0:
        # Use .values to get the actual mean value
        print(f"\nAverage distance to weather data: {nearest_weather.values.mean():.1f} km")
else:
    print("  Weather source information not available")

print("\nFeature Completeness:")
for col in enriched_data.columns:
    if col not in ['hex7_id', 'timestamp', 'nearest_traffic_hex', 'nearest_weather_hex']:
        missing_pct = enriched_data[col].isna().mean() * 100
        if missing_pct > 0:
            print(f"  {col}: {100-missing_pct:.1f}% complete")

## Phase 7: Save Enriched Dataset

In [None]:
import os

print("Saved datasets in ml_data folder:")
print("="*60)

# Check Parquet file
parquet_file = 'ml_data/pm25_enriched_hourly.parquet'
if os.path.exists(parquet_file):
    size_mb = os.path.getsize(parquet_file) / 1024 / 1024
    print(f"✓ Parquet file: {parquet_file}")
    print(f"  Size: {size_mb:.1f} MB")
    # Quick load to check shape
    temp_df = pd.read_parquet(parquet_file)
    print(f"  Records: {len(temp_df):,}")
    print(f"  Columns: {len(temp_df.columns)}")
    del temp_df  # Free memory

# Check CSV file
csv_file = 'ml_data/pm25_enriched_hourly.csv'
if os.path.exists(csv_file):
    size_mb = os.path.getsize(csv_file) / 1024 / 1024
    print(f"\n✓ CSV file: {csv_file}")
    print(f"  Size: {size_mb:.1f} MB")

# Check lookup table
lookup_file = 'ml_data/hexagon_lookup_table.json'
if os.path.exists(lookup_file):
    size_kb = os.path.getsize(lookup_file) / 1024
    print(f"\n✓ Lookup table: {lookup_file}")
    print(f"  Size: {size_kb:.1f} KB")

print("\n" + "="*60)
print("Use pd.read_parquet('ml_data/pm25_enriched_hourly.parquet') to load the data")

## Phase 8: Query Interface

In [None]:
class PM25QueryInterface:
    def __init__(self, enriched_data, pm25_registry):
        self.data = enriched_data
        self.registry = pm25_registry
        self.hex_locations = {hex_id: (info['center_lat'], info['center_lon']) 
                              for hex_id, info in pm25_registry.items()}
    
    def find_nearest_pm25_hexagon(self, query_lat, query_lon):
        """Find the nearest PM2.5 hexagon to a query location"""
        min_distance = float('inf')
        nearest_hex = None
        
        for hex_id, (lat, lon) in self.hex_locations.items():
            distance = haversine_distance(query_lat, query_lon, lat, lon)
            if distance < min_distance:
                min_distance = distance
                nearest_hex = hex_id
        
        return nearest_hex, min_distance
    
    def get_confidence_score(self, distance_km):
        """Calculate confidence score based on distance"""
        if distance_km < 5:
            return 'high', 0.9
        elif distance_km < 20:
            return 'medium', 0.7
        elif distance_km < 50:
            return 'low', 0.5
        else:
            return 'very_low', 0.3
    
    def query_location(self, lat, lon, timestamp=None):
        """Query PM2.5 data for a specific location"""
        
        # Find nearest PM2.5 hexagon
        nearest_hex, distance_km = self.find_nearest_pm25_hexagon(lat, lon)
        
        # Get confidence score
        confidence_level, confidence_score = self.get_confidence_score(distance_km)
        
        # Get data for the hexagon
        hex_data = self.data[self.data['hex7_id'] == nearest_hex]
        
        if timestamp:
            # Get data for specific timestamp
            timestamp = pd.to_datetime(timestamp).floor('H')
            hex_data = hex_data[hex_data['timestamp'] == timestamp]
        
        result = {
            'query_location': {'lat': lat, 'lon': lon},
            'nearest_hexagon': nearest_hex,
            'distance_km': distance_km,
            'confidence_level': confidence_level,
            'confidence_score': confidence_score,
            'data_available': len(hex_data) > 0
        }
        
        if len(hex_data) > 0:
            if timestamp:
                result['pm25_value'] = hex_data['pm25_ugm3_mean'].iloc[0]
                result['traffic_volume'] = hex_data['avg_traffic_volume'].iloc[0]
                result['temperature'] = hex_data['temperature_c_mean'].iloc[0]
                result['humidity'] = hex_data['humidity_pct_mean'].iloc[0]
            else:
                result['pm25_mean'] = hex_data['pm25_ugm3_mean'].mean()
                result['pm25_std'] = hex_data['pm25_ugm3_mean'].std()
                result['data_points'] = len(hex_data)
        
        return result

# Create query interface
query_interface = PM25QueryInterface(enriched_data, pm25_registry)
print("Query interface created")

In [None]:
# Test the query interface
print("Testing query interface...\n")

# Test locations
test_locations = [
    (35.6762, 139.6503, "Tokyo Station"),
    (34.6937, 135.5023, "Osaka"),
    (43.0642, 141.3469, "Sapporo"),
    (35.0, 140.0, "Rural area")
]

for lat, lon, name in test_locations:
    result = query_interface.query_location(lat, lon)
    print(f"{name} ({lat:.2f}, {lon:.2f}):")
    print(f"  Nearest PM2.5 sensor: {result['distance_km']:.1f} km away")
    print(f"  Confidence: {result['confidence_level']} ({result['confidence_score']:.1f})")
    if result['data_available']:
        print(f"  Average PM2.5: {result.get('pm25_mean', 'N/A'):.1f} μg/m³")
    print()

## Phase 9: Visualization

In [None]:
# Create visualization of PM2.5 coverage and data enrichment
print("Creating coverage map...")

# Create base map
center_lat = enriched_data['lat'].mean()
center_lon = enriched_data['lon'].mean()

coverage_map = folium.Map(
    location=[center_lat, center_lon],
    zoom_start=6,
    tiles='OpenStreetMap'
)

# Add PM2.5 hexagons
for hex_id, info in list(pm25_registry.items())[:500]:  # Limit for performance
    # Get hexagon boundary
    try:
        boundary = h3.cell_to_boundary(hex_id)
        
        # Check data quality
        hex_lookup = nearest_lookup.get(hex_id, {})
        has_local_traffic = hex_lookup.get('has_local_traffic', False)
        has_local_weather = hex_lookup.get('has_local_weather', False)
        
        # Color based on data availability
        if has_local_traffic and has_local_weather:
            color = '#00FF00'  # Green - all local data
            fill_opacity = 0.6
        elif has_local_traffic or has_local_weather:
            color = '#FFFF00'  # Yellow - some local data
            fill_opacity = 0.5
        else:
            color = '#FF0000'  # Red - no local auxiliary data
            fill_opacity = 0.4
        
        folium.Polygon(
            locations=boundary,
            color=color,
            weight=1,
            fill=True,
            fillColor=color,
            fillOpacity=fill_opacity,
            popup=f"""Hexagon: {hex_id}<br>
                     Measurements: {info['measurement_count']}<br>
                     Local Traffic: {has_local_traffic}<br>
                     Local Weather: {has_local_weather}""",
            tooltip=f"PM2.5 Hexagon"
        ).add_to(coverage_map)
    except:
        continue

# Add legend
legend_html = '''
<div style="position: fixed; 
            bottom: 50px; right: 50px; width: 250px;
            background-color: white; z-index:9999; font-size:14px;
            border:2px solid grey; border-radius: 5px; padding: 10px">
<p style="margin: 0;"><b>PM2.5 Hexagon Data Quality</b></p>
<hr style="margin: 5px 0;">
<p style="margin: 5px 0;"><span style="color: #00FF00;">■</span> Green: Has local traffic & weather</p>
<p style="margin: 5px 0;"><span style="color: #FFFF00;">■</span> Yellow: Has some local data</p>
<p style="margin: 5px 0;"><span style="color: #FF0000;">■</span> Red: Uses nearest neighbor data</p>
</div>
'''

coverage_map.get_root().html.add_child(folium.Element(legend_html))

# Save map
coverage_map.save('pm25_enrichment_coverage_map.html')
print("Map saved as pm25_enrichment_coverage_map.html")

# Display map
coverage_map

## Summary

In [None]:
print("="*80)
print("PM2.5 HEXAGON ENRICHMENT COMPLETE")
print("="*80)
print(f"\nCreated enriched dataset with:")
print(f"  - {enriched_data['hex7_id'].nunique()} PM2.5 hexagons at H3 resolution 7")
print(f"  - {len(enriched_data):,} hourly records")
print(f"  - {len(pm25_with_traffic)} hexagons with local traffic data")
print(f"  - {len(pm25_with_weather)} hexagons with local weather data")
print(f"  - Nearest neighbor approximation for remaining hexagons")
print(f"\nOutput files:")
print(f"  - pm25_enriched_hourly.parquet: Enriched dataset")
print(f"  - hexagon_lookup_table.json: Nearest neighbor lookups")
print(f"  - pm25_enrichment_coverage_map.html: Coverage visualization")
print(f"\nQuery interface ready for location-based PM2.5 lookups")