diff --git a/.gitignore b/.gitignore index 77021954..b5f75436 100644 --- a/.gitignore +++ b/.gitignore @@ -63,4 +63,9 @@ docs/_build/ # Mac OS-specific storage files .DS_Store + datadisk/ +docs/rgbs/ +*.pkl +*.parquet +*.csv diff --git a/docs/explore_embeddings.ipynb b/docs/explore_embeddings.ipynb new file mode 100644 index 00000000..a3bfcf25 --- /dev/null +++ b/docs/explore_embeddings.ipynb @@ -0,0 +1,462 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Exploring embeddings with t-SNE\n", + "\n", + "This notebook is a quick exploration of the embeddings by using t-SNE to reduce the dimensionality. t-SNE can be thought of a mathematical way to reduce the dimensions of a space while preserving the distances according to some metric. In this case, we will use the cosine distance between the embeddings as the metric. We use the cosine distance since the length of the embedding vector is not important, only the direction.\n", + "\n", + "To ease the exploration, we also pull (on demand) lossy RGB rasters from Mapbox's tile server. We burn the perymeter of the polgyons into the raster to make it easier to see the shape of the polygon. (due to projections and raster edge effects, the polygin is not always a square).\n", + "\n", + "Limitations:\n", + "* T-SNE is a stochastic algorithm, so the results will be different each time you run it.\n", + "* T-SNE is a lossy algorithm, it will not fully preserve semantic clustering as with all the dimensions.\n", + "* t-SNE is a slow algorithm, you might need to use the sampling parameters to reduce the number of embeddings.\n", + "* The Mapbox tiles are are NOT using the same data as input file. You will se much higer resolution, and you might not see clouds and not at the same time as the input images.\n", + "* The Mapbox tiles are only RGB while the embeddings are made from 13 bands. Some things might look very different but have close similarity in non-RGB bands (like SAR or DEM).\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import re\n", + "from datetime import datetime\n", + "\n", + "import geopandas as gpd\n", + "import pandas as pd\n", + "from tqdm import tqdm" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def process_file(file_path):\n", + " # Extract metadata from filename and add it as columns to the dataframe\n", + " filename = os.path.basename(file_path)\n", + " match = re.match(r\"(\\w{5})_(\\d{8})_(\\d{8})_v(\\d{3})\\.gpq\", filename)\n", + " if not match:\n", + " print(f\"Filename {filename} does not match the expected pattern.\")\n", + " return\n", + " mgrs, mindate_str, maxdate_str, version_str = match.groups()\n", + " mindate = int(\n", + " datetime.strptime(mindate_str, \"%Y%m%d\").timestamp()\n", + " ) # Convert to int timestamp\n", + " maxdate = int(\n", + " datetime.strptime(maxdate_str, \"%Y%m%d\").timestamp()\n", + " ) # Convert to int timestamp\n", + " version = int(version_str)\n", + "\n", + " geodataframe = gpd.read_parquet(path=file_path)\n", + "\n", + " geodataframe[\"date\"] = geodataframe[\"date\"].apply(\n", + " lambda x: int(datetime.strptime(str(x), \"%Y-%m-%d\").timestamp())\n", + " )\n", + " geodataframe[\"embeddings\"] = geodataframe[\"embeddings\"].apply(\n", + " lambda x: [float(e) for e in x]\n", + " )\n", + "\n", + " geodataframe[\"mgrs\"] = mgrs\n", + " geodataframe[\"mindate\"] = mindate\n", + " geodataframe[\"maxdate\"] = maxdate\n", + " geodataframe[\"version\"] = version\n", + "\n", + " return geodataframe" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Read ALL the files and save it as a pickle.\n", + "\n", + "clay = gpd.GeoDataFrame()\n", + "DIRECTORY_PATH = \"/home/brunosan/data/Clay/embeddings_e2\"\n", + "files_to_process = [f for f in os.listdir(DIRECTORY_PATH) if f.endswith(\".gpq\")]\n", + "for filename in tqdm(files_to_process):\n", + " file_path = os.path.join(DIRECTORY_PATH, filename)\n", + " geodataframe = process_file(file_path)\n", + " clay = pd.concat([clay, geodataframe])\n", + "\n", + "clay.to_pickle(\"clay_embeddings.pkl\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Use saved file to avoid the above\n", + "clay = pd.read_pickle(\"clay_embeddings.pkl\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "clay.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import json\n", + "from urllib.parse import quote\n", + "\n", + "import requests\n", + "\n", + "\n", + "def reduce_precision(coordinates, precision=2):\n", + " \"\"\"\n", + " Reduce the precision of coordinates.\n", + " Needed for Mapbox API requests.\n", + " \"\"\"\n", + " if isinstance(coordinates[0], (list, tuple)):\n", + " return [reduce_precision(coord, precision) for coord in coordinates]\n", + " else:\n", + " return round(coordinates[0], precision), round(coordinates[1], precision)\n", + "\n", + "\n", + "def get_mapbox_image(polygon, access_token):\n", + " \"\"\"\n", + " Get a satellite image from Mapbox API.\n", + " polygon: shapely.geometry.Polygon\n", + " access_token: Mapbox access token\n", + " \"\"\"\n", + " try:\n", + " rounded_coords = reduce_precision(list(polygon.exterior.coords))\n", + " geojson = {\n", + " \"type\": \"FeatureCollection\",\n", + " \"features\": [\n", + " {\n", + " \"id\": \"0\",\n", + " \"type\": \"Feature\",\n", + " \"properties\": {\n", + " \"stroke\": \"#f00000\",\n", + " \"stroke-width\": 10,\n", + " \"stroke-opacity\": 1,\n", + " \"fill\": \"#f00000\",\n", + " \"fill-opacity\": 0,\n", + " },\n", + " \"geometry\": {\"type\": \"Polygon\", \"coordinates\": [rounded_coords]},\n", + " }\n", + " ],\n", + " }\n", + " encoded_geojson = quote(json.dumps(geojson))\n", + " min_lon, min_lat = min(rounded_coords, key=lambda x: (x[0], x[1]))\n", + " max_lon, max_lat = max(rounded_coords, key=lambda x: (x[0], x[1]))\n", + " mapbox_url = (\n", + " \"https://api.mapbox.com/styles/v1/mapbox/satellite-streets-v12/static/\"\n", + " + f\"geojson({encoded_geojson})\"\n", + " + f\"/[{min_lon},{min_lat},{max_lon},{max_lat}]/\"\n", + " + f\"512x512?access_token={access_token}\"\n", + " )\n", + "\n", + " response = requests.get(mapbox_url)\n", + "\n", + " all_ok = 200\n", + " if response.status_code == all_ok:\n", + " return response.content, None\n", + " else:\n", + " return None, f\"Error: {response.status_code}, {response.text}\"\n", + "\n", + " except Exception as e:\n", + " return None, f\"Error: {str(e)}\"\n", + "\n", + "\n", + "access_token = os.environ[\"MAPBOX_token\"]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from pathlib import Path\n", + "\n", + "from PIL import Image\n", + "\n", + "\n", + "def process_rgb(row):\n", + " \"\"\"\n", + " From a geoPandas Row,\n", + " return or create an the RGB local image\n", + " from Mapbox API and save it.\n", + " \"\"\"\n", + " stem = Path(row[\"source_url\"]).stem\n", + " geom = row[\"geometry\"]\n", + " rgb_file = Path(f\"rgbs/{stem}.jpg\")\n", + " if rgb_file.is_file():\n", + " print(f\"File {rgb_file} already exists.\")\n", + " return rgb_file\n", + " image_data, error = get_mapbox_image(geom, access_token)\n", + " # save the image\n", + " if error is None:\n", + " print(f\"Saving {rgb_file}\")\n", + " with open(rgb_file, \"wb\") as image_file:\n", + " image_file.write(image_data)\n", + " return rgb_file\n", + " else:\n", + " return f\"Error: {error}\"\n", + "\n", + "\n", + "rgb_file = process_rgb(clay.sample(1).to_crs(epsg=4327).iloc[0])\n", + "Image.open(rgb_file)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "clay[\"stem\"] = clay[\"source_url\"].apply(lambda x: Path(x).stem)\n", + "clay[\"rgb_file\"] = clay[\"source_url\"].apply(lambda x: \"rgbs/\" + Path(x).stem + \".jpg\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Dimensionality reduction with t-SNE\n", + "\n", + "We use openTSNE as it is faster.\n", + "We also use several tricks to speed up the computation, like annealing the perplexity startign with more neighbors and smaller sample size, then reducing the number of neighbors and increasing to the full size." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "import openTSNE\n", + "\n", + "# Adjust the number of samples to reduce the processing time\n", + "df = clay # .sample(100000)\n", + "df.reset_index(inplace=True)\n", + "\n", + "x = np.array(df[\"embeddings\"].tolist())\n", + "\n", + "# This is the straight forward way to do it, but it takes a lot of time\n", + "# t= openTSNE.TSNE(\n", + "# perplexity=100,\n", + "# metric='cosine',\n", + "# n_jobs=-1,\n", + "# verbose=3,\n", + "# n_iter=10000\n", + "# ).fit(x)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This code performs a t-SNE dimensionality reduction on a large Earth model dataset. \n", + "Steps:\n", + " 1. Randomly samples 10% of the data, computes pairwise affinities with high perplexity (500), and initializes with PCA.\n", + " 2. Optimizes a t-SNE embedding for the sample.\n", + " 3. Prepares and normalizes embedding for the remaining data based on the sample embedding.\n", + " 4. Performs full embedding of the dataset with a lower perplexity (50), optimizing through multiple stages with decreasing \"exageration\" to shift from making clusters separate, to making cluster inner structure more clear.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# kickstart the clustering with a large perplexity and 10% of the data\n", + "indices = np.random.permutation(list(range(x.shape[0]))) # noqa: NPY002\n", + "reverse = np.argsort(indices)\n", + "\n", + "print(\"Sampling 10% of the data to speed up the computation\")\n", + "limit = int(0.1 * x.shape[0])\n", + "x_sample, x_rest = x[indices[:limit]], x[indices[limit:]]\n", + "\n", + "sample_affinities = openTSNE.affinity.PerplexityBasedNN(\n", + " x_sample,\n", + " perplexity=500,\n", + " n_jobs=-1,\n", + " metric=\"cosine\",\n", + " verbose=True,\n", + ")\n", + "print(\"Using this affinity to compute the PCA of the sample...\")\n", + "sample_init = openTSNE.initialization.pca(x_sample, random_state=42)\n", + "print(\"Now run the optimization using the same random state\")\n", + "sample_embedding = openTSNE.TSNE(n_jobs=3 - 1, verbose=True).fit(\n", + " affinities=sample_affinities, initialization=sample_init\n", + ")\n", + "# Now we can embed the rest of the data using the same transformation\n", + "rest_init = sample_embedding.prepare_partial(x_rest, k=1, perplexity=1 / 3)\n", + "init_full = np.vstack((sample_embedding, rest_init))[reverse]\n", + "init_full = init_full / (np.std(init_full[:, 0]) * 10000)\n", + "\n", + "print(\"Now run the optimization using the same random state\")\n", + "aff50 = openTSNE.affinity.PerplexityBasedNN(\n", + " x, perplexity=50, n_jobs=-1, metric=\"cosine\", verbose=True\n", + ")\n", + "# Now we can embed the rest of the data using the same transformation\n", + "embedding = openTSNE.TSNEEmbedding(init_full, aff50, n_jobs=-1, verbose=True)\n", + "print(\"Optimizing embedding\")\n", + "embedding = embedding.optimize(n_iter=500, exaggeration=12)\n", + "embedding = embedding.optimize(n_iter=250, exaggeration=10)\n", + "embedding = embedding.optimize(n_iter=250, exaggeration=6)\n", + "embedding = embedding.optimize(n_iter=250, exaggeration=4)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "df[\"x\"] = embedding[:, 0]\n", + "df[\"y\"] = embedding[:, 1]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Cluster the embeddings.\n", + "# We do not cluster the embeddings with high dimensionality, since it is too slow.\n", + "# Instead, we cluster the embeddings with 2 dimensions.\n", + "from sklearn.cluster import KMeans\n", + "\n", + "for n_cluster in [10, 20, 100, 500]:\n", + " print(f\"Clustering with {n_cluster} clusters\")\n", + " k_means = KMeans(n_clusters=n_cluster, n_init=\"auto\")\n", + " df[str(n_cluster) + \"_clusters\"] = k_means.fit_predict(embedding)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# save the dataframe\n", + "df.to_pickle(\"clay_embeddings.pkl\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import geopandas as gpd\n", + "\n", + "df = gpd.read_pickle(\"clay_embeddings.pkl\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import plotly.graph_objs as go\n", + "from IPython.display import Image, clear_output, display\n", + "from ipywidgets import HBox, Output\n", + "\n", + "# only use a sample of the data\n", + "sample = df # .sample(10000)\n", + "# sample.reset_index(inplace=True)\n", + "\n", + "# Create the scatter plot with colored clusters and titles (tooltips) for each dot\n", + "scatter_plot = go.FigureWidget(\n", + " [\n", + " go.Scattergl(\n", + " x=sample[\"x\"],\n", + " y=sample[\"y\"],\n", + " mode=\"markers\",\n", + " marker={\"color\": sample[\"20_clusters\"], \"size\": 2, \"opacity\": 0.5},\n", + " text=sample[\"stem\"],\n", + " hoverinfo=\"text\",\n", + " )\n", + " ]\n", + ")\n", + "\n", + "# Create output widgets for the image and possibly additional data with fixed height\n", + "image_output = Output(layout={\"height\": \"512px\"})\n", + "\n", + "\n", + "# Function to handle click events\n", + "def display_image(trace, points, selector):\n", + " if points.point_inds:\n", + " idx = points.point_inds[0]\n", + " selected_row = sample.iloc[idx] # Get the row from the sample DataFrame\n", + " with image_output:\n", + " # Clear the previous image\n", + " clear_output(wait=True)\n", + " print(process_rgb(selected_row))\n", + " img_file = selected_row[\"rgb_file\"]\n", + " if Path(img_file).is_file():\n", + " img = Image(img_file)\n", + " display(img)\n", + " else:\n", + " print(\"No image available.\")\n", + "\n", + "\n", + "# Bind the click event to the display_image function\n", + "scatter = scatter_plot.data[0]\n", + "scatter.on_click(display_image)\n", + "\n", + "# Layout: side by side - scatter plot on the left, image on the right\n", + "container = HBox([scatter_plot, image_output])\n", + "display(container)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "claymodel", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.7" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/docs/model_embeddings.md b/docs/model_embeddings.md index 35b03ea3..b2c66666 100644 --- a/docs/model_embeddings.md +++ b/docs/model_embeddings.md @@ -142,3 +142,8 @@ Further reading: - https://guide.cloudnativegeo.org/geoparquet - https://cloudnativegeo.org/blog/2023/10/the-geoparquet-ecosystem-at-1.0.0 ``` + +## Exploring the embeddings + +Given the high dimensionality of the embeddings, it is often useful to reduce them to 2 dimensions. +We have created [this sample notebook](explore_embeddings.ipynb) that uses T-SNE and Mapbox to explore and visualize the embedding's space.