# Exploring embeddings with t-SNE

This notebook is a quick exploration of the embeddings, using t-SNE to reduce their dimensionality. t-SNE can be thought of as a mathematical way to reduce the dimensions of a space while preserving the distances according to some metric. In this case, we use the cosine distance between the embeddings as the metric. We use the cosine distance since the length of the embedding vector is not important, only its direction.

To ease the exploration, we also pull (on demand) lossy RGB rasters from Mapbox's tile server. We burn the perimeter of the polygons into the raster to make it easier to see the shape of the polygon (due to projections and raster edge effects, the polygon is not always a square).

Limitations:
* t-SNE is a stochastic algorithm, so the results will be different each time you run it.
* t-SNE is a lossy algorithm; it will not preserve semantic clustering as fully as using all the dimensions.
* t-SNE is a slow algorithm; you might need to use the sampling parameters to reduce the number of embeddings.
* The Mapbox tiles are NOT using the same data as the input file. You will see much higher resolution, you might not see clouds, and the imagery is not from the same time as the input images.
* The Mapbox tiles are only RGB, while the embeddings are made from 13 bands. Some things might look very different in RGB but be very similar in non-RGB bands (like SAR or DEM).



In [None]:
import os
import re
from datetime import datetime

import geopandas as gpd
import pandas as pd
from tqdm import tqdm


In [None]:
def process_file(file_path):
    """
    Load one embeddings GeoParquet file and attach its filename metadata.

    The file name must look like ``<mgrs>_<mindate>_<maxdate>_v<version>.gpq``
    (5 word characters, two 8-digit ``YYYYMMDD`` dates, a 3-digit version).

    Returns the GeoDataFrame with ``date`` converted to integer timestamps,
    ``embeddings`` converted to lists of floats, and the extra columns
    ``mgrs``, ``mindate``, ``maxdate`` and ``version``. Prints a message and
    returns ``None`` when the file name does not match the pattern.
    """
    def to_epoch(text, fmt):
        # The parsed datetime is naive, so .timestamp() reads it as local time.
        return int(datetime.strptime(text, fmt).timestamp())

    name = os.path.basename(file_path)
    parsed = re.match(r'(\w{5})_(\d{8})_(\d{8})_v(\d{3})\.gpq', name)
    if parsed is None:
        print(f"Filename {name} does not match the expected pattern.")
        return None

    tile, first_day, last_day, release = parsed.groups()
    # Built before the file is read, so a malformed date fails early.
    metadata = {
        'mgrs': tile,
        'mindate': to_epoch(first_day, '%Y%m%d'),
        'maxdate': to_epoch(last_day, '%Y%m%d'),
        'version': int(release),
    }

    frame = gpd.read_parquet(path=file_path)
    frame['date'] = frame['date'].apply(
        lambda day: to_epoch(str(day), '%Y-%m-%d')
    )
    frame['embeddings'] = frame['embeddings'].apply(
        lambda vector: [float(value) for value in vector]
    )
    # Scalars are broadcast to every row of the file.
    for column, value in metadata.items():
        frame[column] = value

    return frame



In [None]:
# Read ALL the files and save the result as a pickle.

DIRECTORY_PATH = '/home/brunosan/data/Clay/embeddings_e2'
files_to_process = [f for f in os.listdir(DIRECTORY_PATH) if f.endswith(".gpq")]

# Collect the per-file frames and concatenate once at the end: calling
# pd.concat inside the loop re-copies everything accumulated so far on every
# iteration.
frames = []
for filename in tqdm(files_to_process):
    file_path = os.path.join(DIRECTORY_PATH, filename)
    geodataframe = process_file(file_path)
    # process_file returns None when the file name does not match the pattern.
    if geodataframe is not None:
        frames.append(geodataframe)

# pd.concat raises on an empty list, so fall back to an empty GeoDataFrame
# when no file was loaded.
clay = pd.concat(frames) if frames else gpd.GeoDataFrame()

clay.to_pickle('clay_embeddings.pkl')

In [None]:
# Reload the frame pickled by the cell above, so the slow read of all the
# .gpq files can be skipped on later runs.
clay = pd.read_pickle('clay_embeddings.pkl')

In [None]:
# Show the first rows to check the loaded columns.
clay.head()

In [None]:
import requests
import json
from urllib.parse import quote


def reduce_precision(coordinates, precision=2):
    """
    Round coordinates to ``precision`` decimal places.

    ``coordinates`` is either a single coordinate (a sequence whose first two
    items are numbers) or an arbitrarily nested list/tuple of such
    coordinates. A single coordinate comes back as a 2-tuple (any further
    items are dropped); nested input comes back as nested lists.
    Used to shorten the coordinates sent in Mapbox API requests.
    """
    head = coordinates[0]
    if not isinstance(head, (tuple, list)):
        # Leaf: a single coordinate.
        first, second = coordinates[0], coordinates[1]
        return round(first, precision), round(second, precision)
    # Container: round every member recursively.
    return [reduce_precision(member, precision) for member in coordinates]

def get_mapbox_image(polygon, access_token, timeout=30):
    """
    Get a satellite image from the Mapbox Static Images API.

    The polygon outline is drawn on the image as a red GeoJSON overlay and
    the image is cropped to the polygon's bounding box.

    polygon: shapely.geometry.Polygon. Its exterior coordinates are rounded
        with reduce_precision and sent to Mapbox as-is.
        NOTE(review): this presumably requires longitude/latitude
        (EPSG:4326) coordinates - verify against the callers.
    access_token: Mapbox access token
    timeout: seconds to wait for the HTTP response before giving up

    Returns ``(image_bytes, None)`` on HTTP 200, otherwise
    ``(None, error_message)``. Never raises: any exception is reported
    through the error message.
    """
    try:
        rounded_coords = reduce_precision(list(polygon.exterior.coords))
        geojson = {
            "type": "FeatureCollection",
            "features": [{
                "id": "0",
                "type": "Feature",
                "properties": {
                    "stroke": "#f00000",
                    "stroke-width": 10,
                    "stroke-opacity": 1,
                    "fill": "#f00000",
                    "fill-opacity": 0
                },
                "geometry": {
                    "type": "Polygon",
                    "coordinates": [rounded_coords]
                }
            }]
        }
        encoded_geojson = quote(json.dumps(geojson))

        # Bounding box: each extreme is taken independently per axis.
        # min()/max() over whole points would return a single vertex, which
        # is only a bbox corner for axis-aligned rectangles.
        lons = [lon for lon, _ in rounded_coords]
        lats = [lat for _, lat in rounded_coords]
        min_lon, max_lon = min(lons), max(lons)
        min_lat, max_lat = min(lats), max(lats)

        mapbox_url = (
            "https://api.mapbox.com/styles/v1/mapbox/satellite-streets-v12/static/"
            f"geojson({encoded_geojson})"
            f"/[{min_lon},{min_lat},{max_lon},{max_lat}]/"
            f"512x512?access_token={access_token}"
        )

        # Without a timeout a stalled connection would block the notebook.
        response = requests.get(mapbox_url, timeout=timeout)

        if response.status_code == 200:
            return response.content, None
        return None, f"Error: {response.status_code}, {response.text}"

    except Exception as e:
        # Deliberate catch-all: callers expect an (image, error) tuple.
        return None, f"Error: {str(e)}"

# Mapbox token, read from the MAPBOX_token environment variable
# (raises KeyError if the variable is not set).
access_token = os.environ['MAPBOX_token']


In [None]:
from pathlib import Path

from PIL import Image


def process_rgb(row):
    """
    Return the local RGB image for a GeoDataFrame row, fetching it if needed.

    The image is cached as ``rgbs/<stem>.jpg``, where ``<stem>`` is the file
    stem of ``row['source_url']``. When the file is missing, it is requested
    from the Mapbox API for ``row['geometry']`` and written to disk; the
    ``rgbs`` directory is created on demand.

    Returns the ``Path`` of the image, or a string starting with
    ``"Error: "`` when the image could not be downloaded.
    """
    stem = Path(row['source_url']).stem
    geom = row['geometry']
    rgb_file = Path(f"rgbs/{stem}.jpg")
    if rgb_file.is_file():
        print(f"File {rgb_file} already exists.")
        return rgb_file

    image_data, error = get_mapbox_image(geom, access_token)
    if error is not None:
        return f"Error: {error}"

    # Save the image; on a fresh checkout the cache directory may not exist.
    rgb_file.parent.mkdir(parents=True, exist_ok=True)
    print(f"Saving {rgb_file}")
    with open(rgb_file, "wb") as image_file:
        image_file.write(image_data)
    return rgb_file


# Smoke test on one random row. Mapbox works with WGS 84 longitude/latitude,
# i.e. EPSG:4326 (EPSG:4327 is a different, deprecated 3D geographic CRS).
rgb_file = process_rgb(clay.sample(1).to_crs(epsg=4326).iloc[0])
Image.open(rgb_file)

In [None]:
# For every row: the stem of the source file name, and the path where its
# RGB image is cached.
clay['stem'] = clay['source_url'].apply(lambda url: Path(url).stem)
clay['rgb_file'] = clay['stem'].apply(lambda stem: f"rgbs/{stem}.jpg")


# Dimensionality reduction with t-SNE

We use openTSNE as it is faster.
We also use several tricks to speed up the computation, like annealing the perplexity: starting with more neighbors and a smaller sample size, then reducing the number of neighbors and increasing to the full size.

In [None]:
import numpy as np
import openTSNE
import pandas as pd
import plotly.express as px
import plotly.graph_objs as go
from plotly.subplots import make_subplots
from sklearn.cluster import KMeans

# Adjust the number of samples to reduce the processing time
# (uncomment the .sample(...) call to work on a random subset).
df=clay#.sample(100000)
# NOTE(review): without the .sample() call, df is the same object as clay, so
# this in-place reset also changes clay. The previous index is kept as a new
# column, so re-running the cell adds another one - confirm this is intended.
df.reset_index(inplace=True)

# Turn the per-row embedding lists into a single array
# (assumes all embeddings have the same length - TODO confirm).
x = np.array(df['embeddings'].tolist())

#This is the straight forward way to do it, but it takes a lot of time
# t= openTSNE.TSNE(
#     perplexity=100,
#     metric='cosine',
#     n_jobs=-1,
#     verbose=3,
#     n_iter=10000
# ).fit(x)


This code performs a t-SNE dimensionality reduction on a large Earth model dataset. 
Steps:
 1. Randomly samples 10% of the data, computes pairwise affinities with high perplexity (500), and initializes with PCA.
 2. Optimizes a t-SNE embedding for the sample.
 3. Prepares and normalizes embedding for the remaining data based on the sample embedding.
 4. Performs the full embedding of the dataset with a lower perplexity (50), optimizing through multiple stages with decreasing "exaggeration" to shift from separating the clusters to making each cluster's inner structure clearer.


In [None]:
# Kickstart the clustering with a large perplexity and 10% of the data.
# Shuffle the row positions; argsort of a permutation is its inverse, used
# further down to put the rows back in their original order.
# NOTE(review): the permutation is not seeded in this cell, so the sample
# changes between runs even though the PCA below uses random_state=42.
indices = np.random.permutation(list(range(x.shape[0])))
reverse = np.argsort(indices)

print("Sampling 10% of the data to speed up the computation")
limit = int(0.1 * x.shape[0])
x_sample, x_rest = x[indices[:limit]], x[indices[limit:]]

# Affinities of the sample only (cosine metric, perplexity 500).
sample_affinities = openTSNE.affinity.PerplexityBasedNN(
    x_sample,
    perplexity=500,
    n_jobs=-1,
    metric="cosine",
    verbose=True,
)
print("Using this affinity to compute the PCA of the sample...")
# PCA initialization computed from x_sample itself (the affinities are not
# an input of this call).
sample_init = openTSNE.initialization.pca(x_sample, random_state=42)
print("Now run the optimization using the same random state")
# NOTE(review): n_jobs=3-1 evaluates to 2, while the other calls in this cell
# use -1 - confirm which one is intended.
sample_embedding = openTSNE.TSNE(n_jobs=3-1, verbose=True)\
    .fit(affinities=sample_affinities, initialization=sample_init)
# Initial positions for the remaining points, derived from the fitted sample
# embedding (presumably each point is placed at its nearest sample point,
# given k=1 - see the openTSNE documentation).
rest_init = sample_embedding.prepare_partial(x_rest, k=1, perplexity=1/3)
# Stack sample + rest (both in shuffled order) and undo the shuffle.
init_full = np.vstack((sample_embedding, rest_init))[reverse]
# Rescale so that the first dimension has a standard deviation of 1e-4.
init_full = init_full / (np.std(init_full[:, 0]) * 10000)

print("Now run the optimization using the same random state")
# Affinities of the full dataset, with a lower perplexity (50).
aff50 = openTSNE.affinity.PerplexityBasedNN(
    x,
    perplexity=50,
    n_jobs=-1,
    metric="cosine",
    verbose=True
)
# Embedding of the full dataset: starts from init_full and uses aff50.
embedding = openTSNE.TSNEEmbedding(
    init_full,
    aff50,
    n_jobs=-1,
    verbose=True
)
print("Optimizing embedding")
# Four optimization stages with decreasing exaggeration (12, 10, 6, 4); the
# result of each stage replaces `embedding`.
embedding = embedding.optimize(n_iter=500, exaggeration=12)
embedding = embedding.optimize(n_iter=250, exaggeration=10)
embedding = embedding.optimize(n_iter=250, exaggeration=6)
embedding = embedding.optimize(n_iter=250, exaggeration=4)



In [None]:
# Store the two columns of the t-SNE embedding as the plot coordinates.
df['x'] = embedding[:,0]
df['y'] = embedding[:,1]

In [None]:
# Cluster the embeddings.
# We do not cluster the embeddings with high dimensionality, since it is too slow.
# Instead, we cluster the embeddings with 2 dimensions.
from sklearn.cluster import KMeans

# One labelling per granularity, stored in a "<n>_clusters" column.
# A fixed random_state makes the labels reproducible between runs.
for n_cluster in [10,20,100,500]:
    print(f"Clustering with {n_cluster} clusters")
    kmeans = KMeans(n_clusters=n_cluster, random_state=0, n_init='auto')
    df[f'{n_cluster}_clusters'] = kmeans.fit_predict(embedding)



In [None]:
# Save the dataframe, including the columns added by the previous cells.
# NOTE: this writes to the same file name as the loading cell at the top of
# the notebook, so that pickle is overwritten.
df.to_pickle('clay_embeddings.pkl')

In [None]:
import geopandas as gpd
import pandas as pd

# The file was written with DataFrame.to_pickle, so it is read back with
# pandas.read_pickle, like the earlier reload cell does. geopandas is still
# imported because the pickled object is a GeoDataFrame.
df = pd.read_pickle('clay_embeddings.pkl')

In [None]:
from pathlib import Path

import pandas as pd
import plotly.graph_objs as go
from IPython.display import Image, clear_output, display
from ipywidgets import HBox, Output, VBox

# Plot every row by default; use the commented alternatives to plot only a
# random subset of the data.
sample = df  # df.sample(10000)
# sample.reset_index(inplace=True)

# Scatter of the t-SNE coordinates: dots coloured by their 20-cluster label,
# with the source file stem as the hover text (tooltip) of each dot.
marker_style = {'color': sample['20_clusters'], 'size': 2, 'opacity': 0.5}
points_trace = go.Scattergl(
    x=sample['x'],
    y=sample['y'],
    mode='markers',
    marker=marker_style,
    text=sample['stem'],
    hoverinfo='text',
)
scatter_plot = go.FigureWidget([points_trace])

# Fixed-height output widget where the image of the clicked dot is shown.
image_output = Output(layout={'height': '512px'})

# Function to handle click events
def display_image(trace, points, selector):
    """
    Click handler: show the RGB image of the first clicked point.

    Does nothing when the click selected no point. Otherwise the previous
    content of ``image_output`` is cleared, the result of ``process_rgb`` for
    the clicked row is printed, and the row's ``rgb_file`` image is displayed
    (or a message is printed when that file does not exist).
    """
    clicked = points.point_inds
    if not clicked:
        return

    # Positional lookup: point indices follow the row order of `sample`.
    selected_row = sample.iloc[clicked[0]]
    with image_output:
        # Replace whatever the previous click displayed.
        clear_output(wait=True)
        # NOTE(review): the row geometry is passed on without reprojection -
        # confirm it is in the CRS that process_rgb expects.
        print(process_rgb(selected_row))
        img_file = selected_row['rgb_file']
        if not Path(img_file).is_file():
            print("No image available.")
            return
        display(Image(img_file))

# Bind the click event to the display_image function
# (the handler is attached to the first trace of the figure).
scatter = scatter_plot.data[0]
scatter.on_click(display_image)

# Layout: side by side - scatter plot on the left, image on the right
container = HBox([scatter_plot, image_output])
display(container)
