## Local embeddings modeling notebook

In [9]:
%load_ext autoreload
%autoreload 2

import geopandas as gpd
import duckdb
import os


from datetime import datetime
import json
import os

import geopandas as gpd
import ipyleaflet as ipyl
from IPython.display import display
import ipywidgets as ipyw
import numpy as np
import pandas as pd

import sys
import pathlib
sys.path.insert(0, 'src')

from ui import GeoLabeler

# Load the UI configuration. The encoding is given explicitly so the read does
# not depend on the platform's default text encoding.
with open('config/ui_config.json', 'r', encoding='utf-8') as f:
    config = json.load(f)

local_dir = config['local_dir']

# Path of the DuckDB embeddings database. An optional 'duckdb_path' entry in the
# config takes precedence over the machine-specific default below, so the
# notebook can be pointed at another database without editing this cell.
duckdb_path = config.get(
    'duckdb_path',
    '/Users/christopherren/geovibes/data/bali_test_cosine.db',
)
embeddings_con = duckdb.connect(duckdb_path)
valid_tile_dir = os.path.join(local_dir, 'tiles')

# Remaining settings read from the config file.
mgrs_ids = config['mgrs_ids']
start_date = config['start_date']
end_date = config['end_date']
imagery = config['imagery']



The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [10]:
from ui import GeoLabeler


# Locate the boundary GeoJSON relative to the current working directory and load it.
BOUNDARY_PATH = os.path.join(pathlib.Path().resolve(), "places/bali.geojson")
BOUNDARY = gpd.read_file(BOUNDARY_PATH)

# HTML attribution string handed to the labeler (MapTiler + OpenStreetMap).
maptiler_attribution = (
    '<a href="https://www.maptiler.com/copyright/" target="_blank">&copy; MapTiler</a>'
    ' <a href="https://www.openstreetmap.org/copyright" target="_blank">&copy; OpenStreetMap contributors</a>'
)

# Build the labeling UI from the boundary file, the configured search
# parameters and the open DuckDB connection.
labeler = GeoLabeler(
    geojson_path=BOUNDARY_PATH,
    mgrs_ids=mgrs_ids,
    start_date=start_date,
    end_date=end_date,
    imagery=imagery,
    duckdb_connection=embeddings_con,
    attribution=maptiler_attribution,
)

# Status label; its text is filled in by the mouse-move handler.
label = ipyw.Label()
display(label)

def handle_mouse_move(**kwargs):
    """
    Show the cursor position and the current labeling state in ``label``.

    Intended as a map interaction callback: the event fields arrive as keyword
    arguments. Events without a 'coordinates' entry are ignored.

    Args:
        **kwargs: Interaction event fields. 'coordinates' is read as a
            (lat, lon) pair.
    """
    coordinates = kwargs.get('coordinates')
    if not coordinates:
        # No position to report; unpacking a missing value would raise.
        return
    lat, lon = coordinates

    # select_val encodes the active labeling action: -100 erases, 0 marks
    # negatives, anything else is treated as positive.
    if labeler.select_val == -100:
        label_type = "Erase"
    elif labeler.select_val == 0:
        label_type = "Negative"
    else:
        label_type = "Positive"

    mode = "lasso" if labeler.lasso_mode else "single"
    label.value = f'Lat/lon: {lat:.4f}, {lon:.4f}. Mode: {mode}. Labeling: {label_type}'

# Register handle_mouse_move as an interaction callback on the labeler's map.
labeler.map.on_interaction(handle_mouse_move)

Initializing GeoLabeler...
Adding controls...


VBox(children=(Map(center=[-8.366372861538489, 115.11229209899712], controls=(ZoomControl(options=['position',…

Label(value='')

Updated query vector from 1 positive and 0 negative labels
Updated query vector from 1 positive and 0 negative labels
🔍 Searching for 1000 similar points...
                  id                                          embedding  \
2  25765131848729008  [2.6578941, 0.72800344, 2.9949596, -0.05947188...   
3  25759346512376255  [2.6777856, 1.438526, 3.3256977, 0.27770936, 1...   
4  25765078677449737  [2.4371445, 0.22077091, 2.8739295, 0.0785759, ...   
5  25765054555882018  [2.3249838, 0.3221346, 3.7210858, 0.11815513, ...   
6  25765079093888272  [2.9732482, 1.7507468, 3.4482102, 0.5824672, 0...   

                                       geometry_json  \
2  {"type":"Point","coordinates":[114.64081619475...   
3  {"type":"Point","coordinates":[115.24738563410...   
4  {"type":"Point","coordinates":[114.41286733583...   
5  {"type":"Point","coordinates":[114.11842455209...   
6  {"type":"Point","coordinates":[114.41735597155...   

                                    geometry_wkt   d

## Search
The first search may take a while, as the table is loaded into memory.

## Export

In [12]:
# Export the positives and negatives
# Select the labelled rows from the labeler's GeoDataFrame and tag each set
# with its class value: 1 for positives, 0 for negatives.
pos_export = labeler.gdf.loc[labeler.pos_ids].assign(label=1)
neg_export = labeler.gdf.loc[labeler.neg_ids].assign(label=0)




In [None]:
EXPORT_TYPE = "POSITIVE" # "FULL" or "POSITIVE"

if EXPORT_TYPE == "FULL":
    # Positives and negatives combined into one gdf
    export_gdf = pd.concat([pos_export, neg_export], ignore_index=True)
elif EXPORT_TYPE == "POSITIVE":
    # Positives only
    export_gdf = pos_export
else:
    # Fail loudly: otherwise export_gdf would be undefined, or left over from
    # an earlier run of this cell and silently written under the wrong name.
    raise ValueError(f'EXPORT_TYPE must be "FULL" or "POSITIVE", got {EXPORT_TYPE!r}')

# Export to a parquet file named after the (lower-cased) export type
export_path = os.path.join(local_dir, f'{EXPORT_TYPE}_labels.parquet'.lower())
export_gdf.to_parquet(export_path, index=False)

## Classifier
After export, please return to the README to follow the next steps through model training, inference, and post-processing.

# Load

## Option 1: Load a previously saved set of labels

In [5]:
# Helper function

def display_labels_on_labeler(labeler, labels_gdf):
    """
    Restore saved positive/negative labels onto a GeoLabeler and redraw it.

    Rows of ``labels_gdf`` with ``label == 1`` are treated as positives and
    rows with ``label == 0`` as negatives. They are matched to the labeler's
    own GeoDataFrame through the ``tile_id`` column, and the matching index
    values become ``labeler.pos_ids`` / ``labeler.neg_ids``.

    Args:
        labeler (GeoLabeler): The GeoLabeler instance to update.
        labels_gdf (GeoDataFrame): Saved labels with ``label`` and ``tile_id``
            columns, or None when there is nothing to show.
    """
    # Nothing to restore: report it and leave the labeler untouched.
    if labels_gdf is None:
        print("No labels to display.")
        return

    # Tile ids recorded for each class in the saved labels.
    is_positive = labels_gdf['label'] == 1
    is_negative = labels_gdf['label'] == 0
    saved_pos_tiles = labels_gdf.loc[is_positive, 'tile_id'].tolist()
    saved_neg_tiles = labels_gdf.loc[is_negative, 'tile_id'].tolist()

    # Translate tile ids into index values of the labeler's own GeoDataFrame.
    pos_mask = labeler.gdf['tile_id'].isin(saved_pos_tiles)
    neg_mask = labeler.gdf['tile_id'].isin(saved_neg_tiles)
    matched_pos = labeler.gdf.index[pos_mask].tolist()
    matched_neg = labeler.gdf.index[neg_mask].tolist()

    # Hand the ids to the labeler, then refresh the map layers.
    labeler.pos_ids = matched_pos
    labeler.neg_ids = matched_neg
    labeler.update_layers()
    print("Labels displayed on labeler.")



In [None]:
# Load previously exported labels

# 'full_labels.parquet' is the file the export cell writes when EXPORT_TYPE is "FULL".
labels_file_path = os.path.join(local_dir, 'full_labels.parquet')
if os.path.exists(labels_file_path):
    labels_gdf = gpd.read_parquet(labels_file_path)
    print(len(labels_gdf), "labels loaded")
    display_labels_on_labeler(labeler, labels_gdf)
else:
    # Say why nothing was loaded instead of finishing silently.
    print(f"No saved labels found at {labels_file_path}")

## Option 2: Load post-processed detections after model training and inference

In [7]:
# Add polygons from postprocess_detections.py

# Read the post-processed detection polygons from parquet.
dissolved = gpd.read_parquet(
    "/Users/ben/EarthGenome/data/costa_rica_pineapple/output/tile_classifier_predictions_1_costa_rica_posw1.0_prob_0.98_postprocess.parquet"
)

# Wrap the geometries in a GeoJSON layer (blue outline, light fill) and keep a
# reference to it on the labeler.
labeler.dissolve_layer = ipyl.GeoJSON(
    data=json.loads(dissolved.geometry.to_json()),
    style=dict(color='blue', opacity=0.5, weight=2, fillOpacity=0.1),
    name='Dissolved Polygons',
)

# Show the layer on the labeler's map.
labeler.map.add_layer(labeler.dissolve_layer)