# Kaggle Notebook: GIS-based Legality + YOLO Fine-tuning

This notebook helps you:
- Install dependencies on Kaggle
- Load a dataset from a private Kaggle Dataset, and optionally export results back through the Kaggle API
- Optionally fine-tune YOLO on your 5-class encroachment labels
- Visualize overlays and run batch inference

In [None]:
# 1) Set Up Environment and Dependencies
!pip -q install ultralytics shapely pyproj geopandas folium kaggle --upgrade

import os, sys, json, platform
import torch
print({
    'python': sys.version,
    'platform': platform.platform(),
    'cuda_available': torch.cuda.is_available(),
    'device_count': torch.cuda.device_count(),
})

# Capture requirements for reproducibility (optional)
!pip freeze | grep -E 'ultralytics|shapely|pyproj|geopandas|folium|torch'

In [None]:
# 2) Define Classes, Subclasses, and Config
from enum import Enum
from dataclasses import dataclass
from typing import Optional, Dict, Any
import yaml

class TopClass(str, Enum):
    """The five top-level labels a detected object can be assigned.

    Members also subclass ``str``, and each member's value is identical to its name.
    """
    # Declaration order matches the ids 0-4 listed in ``ontology_fallback['class_ids']``.
    Permanent_Legal = 'Permanent_Legal'
    Permanent_Illegal = 'Permanent_Illegal'
    Temporary_Legal = 'Temporary_Legal'
    Temporary_Illegal = 'Temporary_Illegal'
    Natural_Area = 'Natural_Area'

@dataclass
class Attributes:
    """Optional descriptive attributes of a detected object.

    Every field is an optional string that defaults to ``None``. The value sets in the
    inline comments are documentation only; nothing in this class enforces them.
    """
    permanence: Optional[str] = None  # 'permanent' | 'temporary'
    zone: Optional[str] = None        # 'river_buffer' | 'road_footpath' | 'vending_zone' | 'festival_zone' | 'none'
    permit_status: Optional[str] = None  # 'approved' | 'unapproved' | 'unknown'
    area_type: Optional[str] = None   # 'natural_area' | 'built'
    structure_type: Optional[str] = None  # no value set is documented for this field

# Location of the optional ontology rules file.
ONTOLOGY_PATH = '/kaggle/working/ontology.yaml'

# Built-in class-name -> integer-id table, used when the ontology file cannot be loaded.
ontology_fallback = {
    'class_ids': {
        'Permanent_Legal': 0,
        'Permanent_Illegal': 1,
        'Temporary_Legal': 2,
        'Temporary_Illegal': 3,
        'Natural_Area': 4,
    },
}

def load_ontology(path: str):
    """Load the ontology mapping from a YAML file, falling back to built-in defaults.

    Args:
        path: Location of the ontology YAML file. The file is optional.

    Returns:
        The parsed YAML content when the file can be read and its top level is a
        mapping; otherwise the module-level ``ontology_fallback`` dict.
    """
    try:
        with open(path, 'r') as f:
            loaded = yaml.safe_load(f)
    except FileNotFoundError:
        # The ontology file is optional, so a missing file is the expected quiet path.
        return ontology_fallback
    except Exception as exc:
        # Stay best-effort, but report unreadable/malformed files instead of hiding them.
        print(f'Could not load ontology from {path!r} ({exc}); using built-in fallback')
        return ontology_fallback

    # safe_load returns None for an empty file and can return a list or scalar;
    # callers use ``.get`` on the result, so only a mapping is usable.
    if not isinstance(loaded, dict):
        print(f'Ontology at {path!r} is not a mapping; using built-in fallback')
        return ontology_fallback
    return loaded

# Resolve the active ontology and report which class names it defines.
ontology = load_ontology(ONTOLOGY_PATH)
print('Ontology loaded:', [*ontology.get('class_ids', {}).keys()])

In [None]:
# 3) Load or Synthesize GIS Polygons
import geopandas as gpd
from shapely.geometry import Polygon

# Try to load from dataset if provided, else synthesize simple boxes
DATASET_DIR = '/kaggle/input/YOUR_DATASET_NAME/dataset'  # TODO: set your Kaggle dataset path
GEOJSON_PATH = os.path.join(DATASET_DIR, 'zones.geojson')

if os.path.exists(GEOJSON_PATH):
    zones_gdf = gpd.read_file(GEOJSON_PATH)
    assert zones_gdf.crs, 'zones.geojson missing CRS'
    # Later cells rename 'name' -> 'zone' and match it against the rule engine's zone names.
    assert 'name' in zones_gdf.columns, "zones.geojson must have a 'name' column holding the zone type"
    # Detections are built as lon/lat points in EPSG:4326, so bring the zones into the
    # same CRS; joining geometries in different CRSs gives meaningless results.
    zones_gdf = zones_gdf.to_crs('EPSG:4326')
else:
    # Synthesize example zones in EPSG:4326
    # NOTE: vending_zone and festival_zone lie inside river_buffer, so these zones overlap.
    zones = [
        ('river_buffer', Polygon([(75.75, 23.10), (75.80, 23.10), (75.80, 23.15), (75.75, 23.15)])),
        ('road_footpath', Polygon([(75.70, 23.08), (75.85, 23.08), (75.85, 23.09), (75.70, 23.09)])),
        ('vending_zone', Polygon([(75.77, 23.11), (75.78, 23.11), (75.78, 23.12), (75.77, 23.12)])),
        ('festival_zone', Polygon([(75.79, 23.12), (75.80, 23.12), (75.80, 23.13), (75.79, 23.13)])),
        ('natural_area', Polygon([(75.72, 23.13), (75.74, 23.13), (75.74, 23.16), (75.72, 23.16)])),
    ]
    zones_gdf = gpd.GeoDataFrame({'name':[n for n,_ in zones]}, geometry=[g for _,g in zones], crs='EPSG:4326')

zones_gdf.head()

In [None]:
# 4) Load or Synthesize Detected Objects with Geo Coordinates
import pandas as pd
from shapely.geometry import Point

DETECTIONS_CSV = os.path.join(DATASET_DIR, 'detections.csv')
if os.path.exists(DETECTIONS_CSV):
    df = pd.read_csv(DETECTIONS_CSV)
    # Fail early with a clear message; the point geometry below needs both coordinates.
    missing_cols = {'lon', 'lat'} - set(df.columns)
    assert not missing_cols, f'detections.csv is missing required columns: {sorted(missing_cols)}'
else:
    # No detections file: synthesize a reproducible sample around the example zones.
    import numpy as np
    rng = np.random.default_rng(42)
    N = 200
    lons = rng.uniform(75.70, 75.82, N)
    lats = rng.uniform(23.08, 23.16, N)
    # Despite its name, this array holds the permanence values stored in the 'permanence' column.
    structure_type = rng.choice(['permanent', 'temporary'], N)
    permit = rng.choice(['approved', 'unapproved', 'unknown'], N, p=[0.2, 0.4, 0.4])
    df = pd.DataFrame({'id': range(N), 'lon': lons, 'lat': lats, 'permanence': structure_type, 'permit_status': permit})

# points_from_xy builds the point geometries in one vectorized call (x = lon, y = lat).
points_gdf = gpd.GeoDataFrame(df, geometry=gpd.points_from_xy(df['lon'], df['lat']), crs='EPSG:4326')
points_gdf.head()

In [None]:
# 5) Geospatial Overlay and Rule Engine
# Priority: river_buffer -> road_footpath -> vending/festival -> natural_area -> none
# (vending_zone and festival_zone share a tier; the rule engine treats them identically.)
ZONE_PRIORITY = ['river_buffer', 'road_footpath', 'vending_zone', 'festival_zone', 'natural_area', 'none']

# De-duplication below keys on the point index, so it must identify each point uniquely.
assert points_gdf.index.is_unique, 'points_gdf index must be unique before the spatial join'

# Spatial join
joined = gpd.sjoin(points_gdf, zones_gdf.rename(columns={'name':'zone'}), how='left', predicate='within')
joined['zone'] = joined['zone'].fillna('none')

# A point inside several overlapping zones comes back once per matching zone.
# Keep only the highest-priority zone per point so each detection is labeled exactly once.
# Zone names that are not in ZONE_PRIORITY rank last.
_zone_rank = joined['zone'].map({z: i for i, z in enumerate(ZONE_PRIORITY)}).fillna(len(ZONE_PRIORITY))
joined = (
    joined.assign(_zone_rank=_zone_rank)
    .sort_values('_zone_rank', kind='stable')
    .loc[lambda d: ~d.index.duplicated(keep='first')]
    .drop(columns='_zone_rank')
    .sort_index()
)
assert len(joined) == len(points_gdf), 'each detection must appear exactly once after the overlay'

# Apply rule engine

def decide_legality(row):
    """Map one joined detection to the name of its top-level class.

    ``row`` must support ``row['zone']`` and ``row.get(key, default)`` (a pandas row or a
    plain dict both work). A missing ``permanence`` is read as ``'temporary'`` and a
    missing ``permit_status`` as ``'unknown'``.

    Returns:
        The string value of one ``TopClass`` member.
    """
    zone_name = row['zone']
    permanence = row.get('permanence', 'temporary')
    permit_status = row.get('permit_status', 'unknown')

    # Natural areas form their own class regardless of the other attributes.
    if zone_name == 'natural_area':
        return TopClass.Natural_Area.value

    # Nothing is legal in these zones; only the permanence decides the label.
    if zone_name in ('river_buffer', 'road_footpath'):
        if permanence == 'permanent':
            return TopClass.Permanent_Illegal.value
        return TopClass.Temporary_Illegal.value

    # Designated zones: an approved permit makes the structure legal.
    # Here anything that is not exactly 'temporary' is treated as permanent.
    if zone_name in ('vending_zone', 'festival_zone'):
        approved = permit_status == 'approved'
        if permanence == 'temporary':
            chosen = TopClass.Temporary_Legal if approved else TopClass.Temporary_Illegal
        else:
            chosen = TopClass.Permanent_Legal if approved else TopClass.Permanent_Illegal
        return chosen.value

    # Outside every zone: permanent structures follow their permit.
    if permanence == 'permanent':
        if permit_status == 'approved':
            return TopClass.Permanent_Legal.value
        return TopClass.Permanent_Illegal.value

    # NOTE(review): non-permanent structures outside all zones are labeled illegal even
    # with an approved permit - confirm this is the intended rule.
    return TopClass.Temporary_Illegal.value

# Label every joined row, then preview the rule inputs next to the resulting class.
joined['top_class'] = joined.apply(decide_legality, axis=1)
joined.loc[:, ['id', 'zone', 'permanence', 'permit_status', 'top_class']].head()

In [None]:
# 6) Label Encoding and Dataset Assembly
from sklearn.model_selection import train_test_split

# Class-name -> integer id; defaults to the TopClass declaration order when the ontology has none.
cls2id = ontology.get('class_ids', {c.value:i for i,c in enumerate(TopClass)})
joined['target_id'] = joined['top_class'].map(cls2id)

# .map() turns any class missing from cls2id into NaN; stop here rather than train on NaN targets.
unmapped = sorted(joined.loc[joined['target_id'].isna(), 'top_class'].unique())
assert not unmapped, f"ontology 'class_ids' has no id for: {unmapped}"

# Features: raw coordinates plus the one-hot encoded zone name.
features = joined[['lon','lat']].copy()
features['zone'] = joined['zone']
X = pd.get_dummies(features, columns=['zone'])
y = joined['target_id']

# Stratified splitting requires at least two samples of every class. Rare classes can
# easily fall below that, so fall back to a plain random split instead of raising.
stratify_by = y if y.value_counts().min() >= 2 else None
if stratify_by is None:
    print('Warning: some class has fewer than 2 samples; splitting without stratification')
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42, stratify=stratify_by)

# Persist the labeled points and the train/validation splits as parquet files.
out_dir = '/kaggle/working/data'
os.makedirs(out_dir, exist_ok=True)
joined.to_parquet(os.path.join(out_dir, 'labeled_points.parquet'))
X_train.to_parquet(os.path.join(out_dir, 'X_train.parquet'))
X_val.to_parquet(os.path.join(out_dir, 'X_val.parquet'))
y_train.to_frame('y').to_parquet(os.path.join(out_dir, 'y_train.parquet'))
y_val.to_frame('y').to_parquet(os.path.join(out_dir, 'y_val.parquet'))

print('Saved dataset to', out_dir)

In [None]:
# 7) Visualize Polygons and Labeled Objects on a Map
import folium

# Centre the map on the mean detection coordinate (folium expects [lat, lon]).
center = [joined['lat'].mean(), joined['lon'].mean()]
mp = folium.Map(location=center, zoom_start=13)

# One GeoJson layer per zone polygon, named after the zone when a name is available.
for _, zone_row in zones_gdf.iterrows():
    folium.GeoJson(
        zone_row['geometry'].__geo_interface__,
        name=zone_row.get('name','zone'),
    ).add_to(mp)

# Marker colour for each top-level class.
color_map = dict(
    Permanent_Legal='green',
    Permanent_Illegal='red',
    Temporary_Legal='blue',
    Temporary_Illegal='orange',
    Natural_Area='gray',
)

# Plot at most 200 detections (fixed seed) as small filled circles.
for _, pt in joined.sample(min(len(joined), 200), random_state=42).iterrows():
    folium.CircleMarker(
        location=[pt['lat'], pt['lon']],
        radius=3,
        color=color_map[pt['top_class']],
        fill=True,
        fill_opacity=0.7,
    ).add_to(mp)

folium.LayerControl().add_to(mp)
mp

In [None]:
# 8) Unit Tests for Overlay Logic (inline)
# Simple inline checks; for real projects, export to tests/ and run pytest.

# Boundary test: pick a point on the boundary of the first zone
bpoly = zones_gdf.iloc[0].geometry
bpt = bpoly.boundary.interpolate(0.5, normalized=True)
# The probe's coordinates come from zones_gdf, so it must carry the zones' CRS
# rather than a hard-coded one.
bdf = gpd.GeoDataFrame([{'lon': bpt.x, 'lat': bpt.y}], geometry=[bpt], crs=zones_gdf.crs)
# Join against the first zone only: a different, overlapping zone could legitimately
# contain this point and would make the assertion fail for the wrong reason.
res = gpd.sjoin(bdf, zones_gdf.iloc[[0]].rename(columns={'name':'zone'}), how='left', predicate='within')
assert res['zone'].isna().all(), 'Point on boundary should not be within by strict within() predicate'

# Rule checks on plain dicts (decide_legality only needs row['zone'] and row.get)
row = {'zone':'river_buffer','permanence':'temporary','permit_status':'unknown'}
assert decide_legality(row) == 'Temporary_Illegal'
assert decide_legality({'zone':'natural_area','permanence':'permanent','permit_status':'approved'}) == 'Natural_Area'
assert decide_legality({'zone':'vending_zone','permanence':'temporary','permit_status':'approved'}) == 'Temporary_Legal'
assert decide_legality({'zone':'none','permanence':'permanent','permit_status':'unapproved'}) == 'Permanent_Illegal'

print('Inline tests passed.')

In [None]:
# 9) Export Dataset and Code to Kaggle (API)
# Requires Kaggle credentials. On Kaggle, this is often not needed since we run within the kernel.

from pathlib import Path
# Best-effort authentication: import and credential failures are reported, not raised.
try:
    from kaggle.api.kaggle_api_extended import KaggleApi
    api = KaggleApi()
    api.authenticate()
except Exception as err:
    print('Kaggle API not available or not authenticated:', err)
else:
    print('Kaggle API authenticated')

# Example: create version if running locally with credentials (skip inside Kaggle kernel)
# DATASET_SLUG = 'encroachment-zones-labeled'
# owner_slug = '<your-username>'
# meta = {
#   'title': 'Encroachment GIS Labeled Points',
#   'id': f'{owner_slug}/{DATASET_SLUG}',
#   'licenses': [{ 'name': 'CC0-1.0' }]
# }
# with open('/kaggle/working/dataset-metadata.json', 'w') as f:
#     json.dump(meta, f, indent=2)
# api.dataset_create_version('/kaggle/working', 'Initial version', quiet=False)

In [None]:
# 10) Create and Push Kaggle Kernel (Template)
# Placeholder cell: nothing is created or pushed here. Kernel pushes are typically
# managed via the web UI or the local Kaggle API, so this cell only prints a reminder.
print('Kernel push is typically done from local with kaggle CLI. See README for steps.')

In [None]:
# 11) Optional: Fine-tune YOLO on Kaggle
from ultralytics import YOLO

DATA_YAML = os.path.join(DATASET_DIR, 'data.yaml')
print('Using data.yaml:', DATA_YAML)

# Start from a known state so the prediction step below can never pick up a model
# object left over from an earlier execution of this cell.
model = None
try:
    import glob, shutil
    model = YOLO('yolov8n.pt')  # or yolov8s.pt
    # Check up front so a missing dataset config is reported as a clear "skipped" message.
    if not os.path.isfile(DATA_YAML):
        raise FileNotFoundError(f'data.yaml not found at {DATA_YAML}')
    results = model.train(data=DATA_YAML, epochs=20, imgsz=640, batch=16, patience=10)
    model.val()
    os.makedirs('/kaggle/working/weights', exist_ok=True)
    # Ultralytics saves runs to runs/detect/train*/weights/best.pt
    bests = glob.glob('runs/detect/*/weights/best.pt')
    if bests:
        # Run folders are numbered train, train2, ..., train10; sorting by name would rank
        # train9 after train10, so pick the most recently written checkpoint instead.
        best = max(bests, key=os.path.getmtime)
        shutil.copy(best, '/kaggle/working/weights/best.pt')
        print('Saved /kaggle/working/weights/best.pt')
except Exception as e:
    print('YOLO training skipped or failed:', e)

# Show some predictions (if val images available)
VAL_DIR = os.path.join(DATASET_DIR, 'images', 'val')
if os.path.isdir(VAL_DIR):
    if model is None:
        print('Prediction step skipped: no model could be loaded')
    else:
        try:
            preds = model.predict(source=VAL_DIR, conf=0.25, save=True, max_det=200)
            print('Predictions saved under runs/detect/predict*')
        except Exception as e:
            print('Prediction step skipped:', e)

In [None]:
# 12) Batch Inference Pipeline (CLI-like)
import argparse

def classify_points_csv(input_csv: str, out_csv: str):
    """Label every detection in a CSV with its zone and top-level class.

    Reads ``input_csv`` (must contain ``lon`` and ``lat`` columns), joins the points
    against the notebook-level ``zones_gdf``, applies ``decide_legality`` to each row
    and writes the result, without the geometry column, to ``out_csv``.

    Args:
        input_csv: Path of the CSV holding the detections to classify.
        out_csv: Path the labeled CSV is written to.
    """
    df = pd.read_csv(input_csv)
    gdf = gpd.GeoDataFrame(df, geometry=gpd.points_from_xy(df['lon'], df['lat']), crs='EPSG:4326')

    # Join in a common CRS: the points are lon/lat, the zones may have been loaded in another CRS.
    zones = zones_gdf.rename(columns={'name':'zone'})
    if zones.crs is not None and zones.crs != gdf.crs:
        zones = zones.to_crs(gdf.crs)

    res = gpd.sjoin(gdf, zones, how='left', predicate='within')
    res['zone'] = res['zone'].fillna('none')

    # A point inside several overlapping zones is returned once per zone. Keep only the
    # highest-priority zone so every input row yields exactly one output row.
    # Zone names not listed here rank last.
    priority = ('river_buffer', 'road_footpath', 'vending_zone', 'festival_zone', 'natural_area', 'none')
    rank = res['zone'].map({z: i for i, z in enumerate(priority)}).fillna(len(priority))
    res = (
        res.assign(_zone_rank=rank)
        .sort_values('_zone_rank', kind='stable')
        .loc[lambda d: ~d.index.duplicated(keep='first')]
        .drop(columns='_zone_rank')
        .sort_index()
    )

    res['top_class'] = res.apply(decide_legality, axis=1)
    # Geometry is redundant with lon/lat in a flat CSV, so it is left out.
    res.drop(columns=['geometry']).to_csv(out_csv, index=False)
    print('Wrote', out_csv)

# Example usage (uncomment to run):
# classify_points_csv('/kaggle/input/some-new-detections.csv', '/kaggle/working/labeled.csv')