#### Clay Embedding Generator

In [1]:
# functions to generate embedding for participant convenience
import pystac_client
import stackstac
import torch
import yaml
import math
import rasterio
import warnings
import os
import joblib

import pandas as pd
import geopandas as gpd
import numpy as np

from shapely.geometry import Point, box
from torchvision import transforms as v2
from box import Box
from rasterio.enums import Resampling
from tqdm import tqdm

from src.model import ClayMAEModule

# Silence every warning for the rest of the session (this also hides
# deprecation notices from the geospatial stack).
warnings.filterwarnings("ignore")

# STAC endpoint queried by process_point.
STAC_API = "https://earth-search.aws.element84.com/v1"

# Load the model and metadata
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
ckpt = "https://clay-model-ckpt.s3.amazonaws.com/v0.5.7/mae_v0.5.7_epoch-13_val-loss-0.3098.ckpt"
# Tensors created without an explicit device will land on `device` from here on.
torch.set_default_device(device)

torch.cuda.empty_cache()  # Clear GPU cache

model = ClayMAEModule.load_from_checkpoint(
    ckpt, metadata_path="configs/metadata.yaml", shuffle=False, mask_ratio=0
)
model.eval()
model = model.to(device)

# Read the per-platform band statistics; the context manager closes the file
# handle instead of leaving it open for the lifetime of the kernel.
with open("configs/metadata.yaml") as metadata_file:
    metadata = Box(yaml.safe_load(metadata_file))

# Function to normalize timestamp
def normalize_timestamp(date):
    """Encode a datetime's ISO week and hour as (sin, cos) pairs.

    The ISO week number is mapped onto a circle of 52 steps and the hour onto
    a circle of 24 steps.

    Returns:
        ``((sin_week, cos_week), (sin_hour, cos_hour))``.
    """
    def _sin_cos(angle):
        return (math.sin(angle), math.cos(angle))

    week_angle = date.isocalendar().week * 2 * np.pi / 52
    hour_angle = date.hour * 2 * np.pi / 24
    return _sin_cos(week_angle), _sin_cos(hour_angle)

# Function to normalize lat/lon
def normalize_latlon(lat, lon):
    """Encode latitude and longitude (degrees) as (sin, cos) pairs.

    Both angles are converted from degrees to radians first.

    Returns:
        ``((sin_lat, cos_lat), (sin_lon, cos_lon))``.
    """
    lat_rad = lat * np.pi / 180
    lon_rad = lon * np.pi / 180
    lat_pair = (math.sin(lat_rad), math.cos(lat_rad))
    lon_pair = (math.sin(lon_rad), math.cos(lon_rad))
    return lat_pair, lon_pair

def to_device(data, device):
    """Recursively move every tensor inside ``data`` to ``device``.

    Dicts and lists are rebuilt with their contents converted; tensors are
    moved with ``Tensor.to``; anything else (strings, numbers, tuples, ...) is
    returned unchanged.
    """
    if isinstance(data, dict):
        return {key: to_device(value, device) for key, value in data.items()}
    if isinstance(data, list):
        return [to_device(element, device) for element in data]
    if isinstance(data, torch.Tensor):
        return data.to(device)
    return data

def _encode_batch(model, batch_datacube, device):
    """Run the encoder on one batch on ``device``; return ``[:, 0, :]`` of its first output as NumPy."""
    batch_datacube = to_device(batch_datacube, device)
    model.to(device)
    with torch.no_grad():
        unmsk_patch, _, _, _ = model.model.encoder(batch_datacube)
    return unmsk_patch[:, 0, :].cpu().numpy()


def process_point(lon, lat, model, metadata, start_date, end_date, COLLECTION, device):
    """Compute model embeddings for the image chip centred on one lon/lat point.

    Searches ``STAC_API`` for up to 10 items of ``COLLECTION`` that cover the
    point within ``start_date``/``end_date`` and report less than 80% cloud
    cover, keeps the item with the lowest ``eo:cloud_cover``, reads a
    256x256-pixel chip of its blue/green/red/nir assets around the point, and
    feeds it through ``model.model.encoder``.

    Args:
        lon, lat: Point coordinates in EPSG:4326 degrees.
        model: Object exposing ``.to(device)`` and ``.model.encoder``.
        metadata: Mapping of platform name to band ``mean``/``std``/``wavelength``.
        start_date, end_date: Bounds of the STAC datetime query.
        COLLECTION: STAC collection id. ``"landsat-c2-l2"`` additionally sets
            ``AWS_REQUEST_PAYER=requester`` in the process environment.
        device: Torch device to run on. After a CUDA out-of-memory error the
            failing batch and all remaining batches of this call run on CPU.

    Returns:
        A NumPy array with one embedding row per time step of the chip, or
        ``None`` when the search returns no items.

    Raises:
        RuntimeError: Any encoder failure other than "out of memory".
    """
    if COLLECTION == "landsat-c2-l2":
        os.environ['AWS_REQUEST_PAYER'] = 'requester'
    model.to(device)  # Ensure the model is on the correct device

    catalog = pystac_client.Client.open(STAC_API)
    search = catalog.search(
        collections=[COLLECTION],
        datetime=f"{start_date}/{end_date}",
        bbox=(lon - 1e-5, lat - 1e-5, lon + 1e-5, lat + 1e-5),
        max_items=10,
        query={"eo:cloud_cover": {"lt": 80}},
    )

    items = list(search.get_all_items())
    if not items:
        return None

    # Items without a cloud-cover property rank last; ties keep search order.
    lowest_cloud_item = min(
        items, key=lambda item: item.properties.get('eo:cloud_cover', float('inf'))
    )

    epsg = lowest_cloud_item.properties["proj:epsg"]

    # Reproject the point into the item's CRS so the chip bounds are in its units.
    poidf = gpd.GeoDataFrame(
        pd.DataFrame(),
        crs="EPSG:4326",
        geometry=[Point(lon, lat)],
    ).to_crs(epsg)
    coords = poidf.iloc[0].geometry.coords[0]

    size = 256
    gsd = 10 if COLLECTION == "sentinel-2-l2a" else 30
    half_extent = (size * gsd) // 2
    bounds = (
        coords[0] - half_extent,
        coords[1] - half_extent,
        coords[0] + half_extent,
        coords[1] + half_extent,
    )

    stack = stackstac.stack(
        lowest_cloud_item,
        bounds=bounds,
        snap_bounds=False,
        epsg=epsg,
        resolution=gsd,
        dtype="float32",
        rescale=False,
        fill_value=0,
        assets=["blue", "green", "red", "nir"],
        resampling=Resampling.nearest,
    )
    stack = stack.compute()

    # NOTE(review): every non-Sentinel collection is normalised with the
    # "landsat-c2l1" metadata entry, although the only other collection handled
    # above is "landsat-c2-l2" — confirm this is the intended statistics set.
    platform = "sentinel-2-l2a" if COLLECTION == "sentinel-2-l2a" else "landsat-c2l1"
    band_names = [str(band.values) for band in stack.band]
    band_meta = metadata[platform].bands
    mean = [band_meta.mean[name] for name in band_names]
    std = [band_meta.std[name] for name in band_names]
    waves = [band_meta.wavelength[name] for name in band_names]

    transform = v2.Compose([v2.Normalize(mean=mean, std=std)])

    datetimes = stack.time.values.astype("datetime64[s]").tolist()
    times = [normalize_timestamp(dat) for dat in datetimes]
    week_norm = [dat[0] for dat in times]
    hour_norm = [dat[1] for dat in times]

    # The location is the same for every time step of the chip.
    latlons = [normalize_latlon(lat, lon)] * len(times)
    lat_norm = [dat[0] for dat in latlons]
    lon_norm = [dat[1] for dat in latlons]

    # Loop-invariant feature tables: one row per time step.
    time_features = np.hstack((week_norm, hour_norm))
    latlon_features = np.hstack((lat_norm, lon_norm))

    pixels = torch.from_numpy(stack.data.astype(np.float32)).to(device)
    pixels = transform(pixels)

    n_samples = len(stack)
    batch_size = 16
    num_batches = math.ceil(n_samples / batch_size)

    embeddings_list = []
    for i in range(num_batches):
        start_idx = i * batch_size
        end_idx = min(start_idx + batch_size, n_samples)

        batch_datacube = {
            "platform": platform,
            "time": torch.tensor(time_features[start_idx:end_idx], dtype=torch.float32),
            "latlon": torch.tensor(latlon_features[start_idx:end_idx], dtype=torch.float32),
            "pixels": pixels[start_idx:end_idx],
            "gsd": torch.tensor(stack.gsd.values),
            "waves": torch.tensor(waves),
        }

        try:
            batch_embeddings = _encode_batch(model, batch_datacube, device)
        except RuntimeError as e:
            if "out of memory" not in str(e):
                raise
            print(f"GPU OOM for point ({lon}, {lat}), batch {i+1}/{num_batches}. Trying CPU...")
            # Stay on CPU for the rest of this point once the GPU has run out of memory.
            device = torch.device("cpu")
            batch_embeddings = _encode_batch(model, batch_datacube, device)
        embeddings_list.append(batch_embeddings)

    return np.concatenate(embeddings_list, axis=0)

def create_embeddings(start_date, end_date, grid, COLLECTION, start_date2=None, end_date2=None):
    """Compute embeddings for every point of ``grid``, optionally for two date ranges.

    Args:
        start_date, end_date: First date range passed to ``process_point``.
        grid: GeoDataFrame of point geometries; reprojected to EPSG:4326 here.
        COLLECTION: STAC collection id forwarded to ``process_point``.
        start_date2, end_date2: Optional second date range. When both are given
            an ``embeddings_new`` column is produced as well.

    Returns:
        A GeoDataFrame with columns ``lon``, ``lat``, ``embeddings`` (plus
        ``embeddings_new`` when a second range was requested) and point
        geometry built from lon/lat. Points with no imagery in the first range
        are omitted. A point with imagery in the first range but none in the
        second is kept with ``embeddings_new`` set to ``None``.
    """
    torch.cuda.empty_cache()  # Clear GPU cache

    # Run on the GPU when there is one, otherwise on CPU instead of failing.
    # The device is computed here because the module-level name `device` is
    # rebound to a TensorFlow device string later in this notebook.
    run_device = "cuda" if torch.cuda.is_available() else "cpu"

    # Assuming grid is a GeoDataFrame with the points, get a list of points in EPSG:4326
    points = grid.to_crs("EPSG:4326").geometry.apply(lambda x: (x.x, x.y)).tolist()

    compare_ranges = start_date2 is not None and end_date2 is not None

    results = []
    for lon, lat in tqdm(points):
        embeddings = process_point(
            lon, lat, model, metadata, start_date, end_date, COLLECTION, device=run_device
        )
        if embeddings is None:
            # The row would be dropped anyway, so skip the second query.
            continue

        if compare_ranges:
            embeddings_new = process_point(
                lon, lat, model, metadata, start_date2, end_date2, COLLECTION, device=run_device
            )
            # Always emit four fields so every row matches the declared columns.
            results.append((lon, lat, embeddings, embeddings_new))
        else:
            results.append((lon, lat, embeddings))

    columns = ["lon", "lat", "embeddings"]
    if compare_ranges:
        columns.append("embeddings_new")

    df = pd.DataFrame(results, columns=columns)

    return gpd.GeoDataFrame(df, geometry=gpd.points_from_xy(df.lon, df.lat))


  from .autonotebook import tqdm as notebook_tqdm
  _torch_pytree._register_pytree_node(


#### Utility Functions

In [2]:
import numpy as np
import tensorflow as tf
from sklearn.metrics import mean_squared_error, f1_score, accuracy_score
from keras.models import Sequential
from keras.layers import Dense, Dropout, Input
from keras.optimizers import Adam
import pyproj

# Pick the TensorFlow device string handed to NeuralNetworkModel: the first GPU
# when TensorFlow lists one, otherwise the CPU.
# NOTE(review): this rebinds the module-level name `device`, which the
# embedding cell earlier assigned a torch.device; from here on `device` is a
# TensorFlow device string.
device = '/GPU:0' if tf.config.list_physical_devices('GPU') else '/CPU:0'

class NeuralNetworkModel:
    """Small dense Keras network wrapped with train/evaluate/predict helpers.

    ``model_type`` selects the output head and, where relevant, the loss and
    metric:

    * ``'regression'``        - 1 linear unit, MSE loss, RMSE metric
    * ``'multi_regression'``  - 2 linear units, MSE loss, per-output RMSE
    * ``'binary_classifier'`` - 1 sigmoid unit, binary cross-entropy, accuracy
    * ``'multi_classifier'``  - ``num_classes`` softmax units, categorical
      cross-entropy, weighted F1
    """

    def __init__(self, model_type, input_shape, layers, dropout_rate, learning_rate, num_classes=None, device='/cpu:0'):
        """Store the hyper-parameters and build the (uncompiled) network.

        Args:
            model_type: One of the types listed on the class.
            input_shape: Number of input features.
            layers: Iterable of hidden-layer widths.
            dropout_rate: Dropout applied after every hidden layer.
            learning_rate: Adam learning rate used by ``compile_model``.
            num_classes: Output width for ``'multi_classifier'``.
            device: TensorFlow device string used for fit/evaluate/predict.
        """
        self.model_type = model_type
        self.input_shape = input_shape
        self.layers = layers
        self.dropout_rate = dropout_rate
        self.learning_rate = learning_rate
        self.num_classes = num_classes
        self.device = device
        self.model = self.build_model()

    def build_model(self):
        """Return a Sequential model: input, Dense+Dropout per hidden layer, then the output head.

        An unrecognised ``model_type`` yields a model without an output head.
        """
        model = Sequential()
        model.add(Input(shape=(self.input_shape,)))
        for layer_size in self.layers:
            model.add(Dense(layer_size, activation='relu'))
            model.add(Dropout(self.dropout_rate))

        if self.model_type == 'regression':
            model.add(Dense(1, activation='linear'))
        elif self.model_type == 'multi_regression':
            model.add(Dense(2, activation='linear'))
        elif self.model_type == 'binary_classifier':
            model.add(Dense(1, activation='sigmoid'))
        elif self.model_type == 'multi_classifier':
            model.add(Dense(self.num_classes, activation='softmax'))

        return model

    def compile_model(self):
        """Compile ``self.model`` with Adam and the loss/metric matching ``model_type``.

        An unrecognised ``model_type`` leaves the model uncompiled.
        """
        optimizer = Adam(learning_rate=self.learning_rate)
        # 'multi_regression' shares the regression loss; without this branch a
        # two-output model could be built but never compiled or trained.
        if self.model_type in ('regression', 'multi_regression'):
            self.model.compile(optimizer=optimizer, loss='mean_squared_error', metrics=['mae'])
        elif self.model_type == 'binary_classifier':
            self.model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])
        elif self.model_type == 'multi_classifier':
            self.model.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=['accuracy'])

    def train_model(self, X_train, y_train, epochs=20, batch_size=32, validation_split=0.2):
        """Fit the model on ``self.device`` and return the Keras history object."""
        with tf.device(self.device):
            history = self.model.fit(X_train, y_train, epochs=epochs, batch_size=batch_size, validation_split=validation_split)
        return history

    def evaluate_model(self, X_test, y_test):
        """Return ``self.model.evaluate(X_test, y_test)`` run on ``self.device``."""
        with tf.device(self.device):
            results = self.model.evaluate(X_test, y_test)
        return results

    def predict(self, X_test):
        """Predict on ``self.device`` and post-process by ``model_type``.

        Returns:
            ``binary_classifier``: flat int array thresholded at 0.5.
            ``multi_classifier``: argmax class index per row.
            Otherwise the raw model output.
        """
        with tf.device(self.device):
            predictions = self.model.predict(X_test)

        if self.model_type == 'binary_classifier':
            return (predictions > 0.5).astype(int).flatten()
        elif self.model_type == 'multi_classifier':
            return np.argmax(predictions, axis=1)
        return predictions

    def calculate_metrics(self, y_test, predictions):
        """Score ``predictions`` against ``y_test``.

        Returns:
            Regression types: array of per-output RMSE values.
            ``binary_classifier``: accuracy.
            ``multi_classifier``: weighted F1, with ``y_test`` one-hot encoded.
            ``None`` for an unrecognised ``model_type``.
        """
        # 'multi_regression' previously fell through and returned None.
        if self.model_type in ('regression', 'multi_regression'):
            return np.sqrt(mean_squared_error(y_test, predictions, multioutput='raw_values'))
        elif self.model_type == 'binary_classifier':
            return accuracy_score(y_test, predictions)
        elif self.model_type == 'multi_classifier':
            return f1_score(np.argmax(y_test, axis=1), predictions, average='weighted')

    def save_model(self, filename):
        """Save the underlying Keras model to ``filename``."""
        self.model.save(filename)

    @classmethod
    def load_model(cls, filename, model_type, input_shape, num_classes=None, device='/cpu:0'):
        """Alternate constructor: wrap a Keras model loaded from ``filename``.

        The instance is created with placeholder hyper-parameters (no hidden
        layers, zero dropout and learning rate); its freshly built network is
        then replaced by the loaded one.
        """
        loaded_model = tf.keras.models.load_model(filename)
        nn = cls(model_type, input_shape, [], 0, 0, num_classes, device)  # Dummy values for layers, dropout_rate, and learning_rate
        nn.model = loaded_model
        return nn

def get_utm_zone(longitude):
    """Return the UTM zone number (1-60) containing ``longitude`` in degrees.

    Zones are 6 degrees wide starting at -180. The index is floored rather
    than truncated and wrapped modulo 60, so +180 maps to zone 1 (not 61) and
    longitudes outside [-180, 180) fall into the zone they wrap onto.
    """
    return int((longitude + 180) // 6) % 60 + 1

def gdf_preprocess(filepath, COLLECTION):
    """Build a regular grid of points (with square cells) over a vector file's extent.

    The file is read, reprojected to the UTM zone derived from the minimum x
    of its first feature's bounds, and covered with points spaced 2560 units
    apart for ``"sentinel-2-l2a"`` or 7680 units for any other collection.

    Returns:
        A GeoDataFrame in the UTM CRS with the grid points as geometry and a
        ``bounding_box`` column holding a square of that width centred on
        each point.
    """
    if COLLECTION == "sentinel-2-l2a":
        embedding_width = 2560
    else:
        embedding_width = 7680
    half_width = embedding_width / 2

    gdf = gpd.read_file(filepath)

    # Only the minimum x of the first feature decides the UTM zone.
    minx, *_ = gdf.geometry.bounds.iloc[0]
    utm_zone = get_utm_zone(minx)

    # Resolve the zone to an EPSG code through pyproj and reproject.
    epsg_code = pyproj.CRS(f"+proj=utm +zone={utm_zone} +datum=WGS84").to_epsg()
    gdf = gdf.to_crs(epsg=epsg_code)

    # Lay out grid coordinates across the reprojected extent.
    left, bottom, right, top = gdf.total_bounds
    xs = np.arange(left, right, embedding_width)
    ys = np.arange(bottom, top, embedding_width)
    xx, yy = np.meshgrid(xs, ys)

    grid = gpd.GeoDataFrame(
        geometry=gpd.points_from_xy(xx.ravel(), yy.ravel(), crs=gdf.crs)
    )

    def _cell_around(point):
        # Square of side embedding_width centred on the grid point.
        return box(
            point.x - half_width,
            point.y - half_width,
            point.x + half_width,
            point.y + half_width,
        )

    grid['bounding_box'] = grid.geometry.apply(_cell_around)

    return grid

def final_grid(filepath, gsd, gdf_data):
    """Spread per-cell predictions onto a regular grid of points over a vector file's extent.

    Args:
        filepath: Vector file whose extent defines the output grid.
        gsd: Spacing of the output grid, in units of the UTM CRS the file is
            reprojected to.
        gdf_data: GeoDataFrame with a ``geometry`` column and one or more
            columns whose name contains ``pred``.

    Returns:
        A GeoDataFrame in EPSG:4326 with one row per grid-point/geometry match
        (unmatched points are kept with missing values) and the ``pred*``
        columns of the matching ``gdf_data`` geometry.
    """
    gdf = gpd.read_file(filepath)

    # Get the bounds of the geometry
    minx, miny, maxx, maxy = gdf.geometry.bounds.iloc[0]

    # Calculate UTM zone
    utm_zone = get_utm_zone(minx)

    # Get the corresponding EPSG code for the UTM zone using pyproj
    utm_crs = pyproj.CRS(f"+proj=utm +zone={utm_zone} +datum=WGS84")
    epsg_code = utm_crs.to_epsg()

    # Reproject the GeoDataFrame to the chosen EPSG code
    gdf = gdf.to_crs(epsg=epsg_code)

    # Create a grid of points `gsd` units apart across the reprojected extent
    x = np.arange(gdf.total_bounds[0], gdf.total_bounds[2], gsd)
    y = np.arange(gdf.total_bounds[1], gdf.total_bounds[3], gsd)
    xx, yy = np.meshgrid(x, y)
    points = np.vstack([xx.ravel(), yy.ravel()]).T

    grid = gpd.GeoDataFrame(geometry=gpd.points_from_xy(points[:, 0], points[:, 1], crs=gdf.crs))

    # Ensure both GeoDataFrames have the same CRS
    grid = grid.to_crs(4326)
    gdf_data = gdf_data.to_crs(4326)

    # Left spatial join: attach the 'pred*' columns of every gdf_data geometry
    # that intersects a grid point. `predicate=` replaces the `op=` keyword,
    # which geopandas deprecated and later removed.
    gdf_final = gpd.sjoin(
        grid,
        gdf_data[['geometry'] + gdf_data.filter(like='pred').columns.tolist()],
        how='left',
        predicate='intersects'
    )

    # Drop the right-hand index column that the join adds
    gdf_final = gdf_final.drop(columns=['index_right'])

    return gdf_final

def NN_preds(task_id, model_type, gdf, embedding_col='embeddings'):
    """Run the saved model for ``task_id`` over a frame's embeddings and attach predictions.

    Args:
        task_id: Selects ``models/task_{task_id}_model.h5`` and the optional
            ``models/{task_id}_scaler.joblib`` /
            ``models/{task_id}_label_encoder.joblib`` artifacts.
        model_type: Model type understood by ``NeuralNetworkModel``.
        gdf: Frame holding one embedding array per row; modified in place.
        embedding_col: Column to read the embeddings from. Defaults to
            ``'embeddings'``.

    Returns:
        ``gdf`` with a ``pred`` column, or ``pred_maize``/``pred_wheat`` for
        ``'multi_regression'`` when no label encoder exists.
    """
    # Load the trained network from disk
    loaded_nn = NeuralNetworkModel.load_model(f'models/task_{task_id}_model.h5', model_type=model_type, input_shape=768, num_classes=11, device=device)

    # Flatten each row's embedding into one feature vector. An explicit
    # reshape keeps the matrix 2-D even for a single-row frame, where
    # np.squeeze would collapse it to 1-D and yield one column per value.
    new_data = np.asarray(gdf[embedding_col].tolist()).reshape(len(gdf), -1)
    new_data = pd.DataFrame(new_data)  # Ensure the new data is in DataFrame format

    # Standardize the new data using the saved scaler.
    # NOTE: joblib artifacts are pickle-based; only load files from a trusted source.
    if os.path.exists(f'models/{task_id}_scaler.joblib'):
        scaler = joblib.load(f'models/{task_id}_scaler.joblib')
        new_data = scaler.transform(new_data)

    # Make predictions
    new_predictions = loaded_nn.predict(new_data)

    # Map class indices back to labels when a label encoder was saved
    if os.path.exists(f'models/{task_id}_label_encoder.joblib'):
        label_encoder = joblib.load(f'models/{task_id}_label_encoder.joblib')
        gdf['pred'] = label_encoder.inverse_transform(new_predictions)
    elif model_type == 'multi_regression':
        gdf[['pred_maize', 'pred_wheat']] = new_predictions
    else:
        gdf['pred'] = new_predictions
    return gdf

2024-08-18 18:55:46.107620: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-08-18 18:55:46.122478: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-08-18 18:55:46.126839: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-08-18 18:55:46.137342: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
I0000 00:00:1724007347.654806   16518 cuda_executor.c

#### Imports

In [3]:
import pandas as pd
import geopandas as gpd
import os

#### Submission 

In [4]:
# Collects the submission fields; the last cell writes this dict to output.csv.
submission = {}

# Participant data to fill
# NOTE(review): the meaning of eligibility code 2 is not defined in this notebook.
submission['eligibility'] = 2

#### Task: 1 Submission

In [5]:
COLLECTION="sentinel-2-l2a"

# Build the grid of embedding points (and their square cells) over the task's bounding box
geojson_path = 'test_data/challenge_1_bb.geojson'
grid = gdf_preprocess(geojson_path, COLLECTION)

# Create embeddings over the defined grid
gdf = create_embeddings(start_date="2022-01-01", end_date="2022-12-31", grid=grid, COLLECTION=COLLECTION)

# Run the saved task-1 model over the embeddings; adds the prediction column
gdf = NN_preds(task_id=1, model_type="multi_classifier", gdf=gdf)
# Replace the point geometry with the cell polygons from the grid.
# NOTE(review): this assignment presumably aligns on index, and create_embeddings
# omits points without imagery — confirm the rows still line up with `grid`.
gdf['geometry'] = grid['bounding_box']

# Spread the cell predictions onto an output grid with spacing gsd=10 and save as CSV
gdf_final = final_grid(geojson_path, gsd=10, gdf_data=gdf)

gdf_final.to_csv("task_1.csv")

# # # calculate score
# task_1_score = 0
# submission['task_1_score'] = task_1_score

100%|██████████| 336/336 [04:33<00:00,  1.23it/s]
I0000 00:00:1724007621.627897   16518 cuda_executor.cc:1015] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
I0000 00:00:1724007621.629836   16518 cuda_executor.cc:1015] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
I0000 00:00:1724007621.631278   16518 cuda_executor.cc:1015] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
I0000 00:00:1724007621.632682   16518 cuda_executo

[1m 1/11[0m [32m━[0m[37m━━━━━━━━━━━━━━━━━━━[0m [1m28s[0m 3s/step

I0000 00:00:1724007624.752530   16761 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 113ms/step


#### Task: 2 Submission

In [6]:
# Placeholder: no pipeline is run for task 2 in this notebook; a score of 0 is recorded.
task_2_score = 0
submission['task_2_score'] = task_2_score

#### Task: 3 Submission

In [7]:
COLLECTION="sentinel-2-l2a"

# Build the grid of embedding points (and their square cells) over the task's bounding box
geojson_path = 'test_data/challenge_3_bb.geojson'
grid = gdf_preprocess(geojson_path, COLLECTION=COLLECTION)

# Create embeddings over the defined grid
gdf = create_embeddings(start_date="2022-01-01", end_date="2022-12-31", grid=grid, COLLECTION=COLLECTION)

# Run the saved task-3 regression model over the embeddings; adds the prediction column
gdf = NN_preds(task_id=3, model_type="regression", gdf=gdf)
# Replace the point geometry with the cell polygons from the grid.
# NOTE(review): this assignment presumably aligns on index, and create_embeddings
# omits points without imagery — confirm the rows still line up with `grid`.
gdf['geometry'] = grid['bounding_box']

# Spread the cell predictions onto an output grid with spacing gsd=10 and save as CSV
gdf_final = final_grid(geojson_path, gsd=10, gdf_data=gdf)

gdf_final.to_csv("task_3.csv")

# task_3_score = 0
# submission['task_3_score'] = task_3_score

  0%|          | 0/336 [00:00<?, ?it/s]

100%|██████████| 336/336 [04:48<00:00,  1.17it/s]


[1m 1/11[0m [32m━[0m[37m━━━━━━━━━━━━━━━━━━━[0m [1m9s[0m 985ms/step




[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 116ms/step


#### Task: 4 Submission

In [8]:
COLLECTION="landsat-c2-l2"

# Build the grid of embedding points (and their square cells) over the task's bounding box
geojson_path = 'test_data/challenge_4_bb.geojson'
grid = gdf_preprocess(geojson_path, COLLECTION=COLLECTION)

# Create embeddings over the defined grid
gdf = create_embeddings(start_date="2013-01-01", end_date="2013-12-31", grid=grid, COLLECTION=COLLECTION)

# Run the saved task-4 model; for "multi_regression" NN_preds writes two
# prediction columns (pred_maize, pred_wheat) unless a label encoder file exists
gdf = NN_preds(task_id=4, model_type="multi_regression", gdf=gdf)
# Replace the point geometry with the cell polygons from the grid.
# NOTE(review): this assignment presumably aligns on index, and create_embeddings
# omits points without imagery — confirm the rows still line up with `grid`.
gdf['geometry'] = grid['bounding_box']

# Spread the cell predictions onto an output grid and save as CSV.
# NOTE(review): gsd=10 is used here although the Landsat chips are read at 30 —
# confirm the intended output spacing.
gdf_final = final_grid(geojson_path, gsd=10, gdf_data=gdf)

gdf_final.to_csv("task_4.csv")

# task_4_score = 0
# submission['task_4_score'] = task_4_score

  0%|          | 0/30 [00:00<?, ?it/s]

100%|██████████| 30/30 [00:28<00:00,  1.06it/s]


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2s/step


#### Task: 5 Submission

In [9]:
COLLECTION="sentinel-2-l2a"

# Build the grid of embedding points (and their square cells) over the task's bounding box
geojson_path = 'test_data/challenge_5_bb.geojson'
grid = gdf_preprocess(geojson_path, COLLECTION=COLLECTION)

# Create embeddings over the defined grid for two date ranges (2018 and 2019)
gdf = create_embeddings(start_date="2018-08-01", end_date="2018-09-30", start_date2="2019-08-01", end_date2="2019-09-30", grid=grid, COLLECTION=COLLECTION) 
# Per-point change between the two ranges.
# NOTE(review): the NN_preds call below reads gdf['embeddings'], so this delta
# column is not what the model sees — confirm intent.
gdf['embeddings_delta'] = gdf['embeddings_new'] - gdf['embeddings']

# Run the saved task-5 binary classifier; adds the prediction column
gdf = NN_preds(task_id=5, model_type="binary_classifier", gdf=gdf)
# Replace the point geometry with the cell polygons from the grid.
# NOTE(review): this assignment presumably aligns on index, and create_embeddings
# omits points without imagery — confirm the rows still line up with `grid`.
gdf['geometry'] = grid['bounding_box']

# Spread the cell predictions onto an output grid with spacing gsd=10 and save as CSV
gdf_final = final_grid(geojson_path, gsd=10, gdf_data=gdf)

gdf_final.to_csv("task_5.csv")

# task_5_score = 0
# submission['task_5_score'] = task_5_score

  0%|          | 0/399 [00:00<?, ?it/s]

100%|██████████| 399/399 [11:55<00:00,  1.79s/it]


[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 107ms/step


#### Task: 6 Submission

In [11]:
COLLECTION="sentinel-2-l2a"

# Build the grid of embedding points (and their square cells) over the task's bounding box
geojson_path = 'test_data/challenge_6_bb.geojson'
grid = gdf_preprocess(geojson_path, COLLECTION=COLLECTION)

# Create embeddings over the defined grid for two date ranges (2019 and 2020)
gdf = create_embeddings(start_date="2019-01-01", end_date="2019-12-31", start_date2="2020-01-01", end_date2="2020-12-31", grid=grid, COLLECTION=COLLECTION) 
# Per-point change between the two ranges.
# NOTE(review): the NN_preds call below reads gdf['embeddings'], so this delta
# column is not what the model sees — confirm intent.
gdf['embeddings_delta'] = gdf['embeddings_new'] - gdf['embeddings']

# Run the saved task-6 regression model; adds the prediction column
gdf = NN_preds(task_id=6, model_type="regression", gdf=gdf)
# Replace the point geometry with the cell polygons from the grid.
# NOTE(review): this assignment presumably aligns on index, and create_embeddings
# omits points without imagery — confirm the rows still line up with `grid`.
gdf['geometry'] = grid['bounding_box']

# Spread the cell predictions onto an output grid with spacing gsd=10 and save as CSV
gdf_final = final_grid(geojson_path, gsd=10, gdf_data=gdf)

gdf_final.to_csv("task_6.csv")

# task_6_score = 0
# submission['task_6_score'] = task_6_score

  0%|          | 0/378 [00:00<?, ?it/s]

100%|██████████| 378/378 [12:23<00:00,  1.97s/it]


[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 97ms/step


#### Task: 7 Submission

In [None]:
# Placeholder: no pipeline is run for task 7 in this notebook; a score of 0 is recorded.
task_7_score = 0
submission['task_7_score'] = task_7_score

#### Output submissions

In [5]:
# Write the collected submission fields as a single-row CSV (no index column).
pd.DataFrame([submission]).to_csv('output.csv', index=False)