In [14]:
import geopandas as gpd

# Load the GeoJSON file
geojson_path = 'train_data/challenge_6_bb_TRAIN.geojson'
gdf = gpd.read_file(geojson_path)
gdf

Unnamed: 0,geometry
0,"POLYGON ((-124.54615 40.86785, -124.54615 38.1..."


In [15]:
import pyproj

def get_utm_zone(longitude):
    return int((longitude + 180) / 6) + 1

# Get the bounds of the geometry
minx, miny, maxx, maxy = gdf.geometry.bounds.iloc[0]

# Calculate UTM zone
utm_zone = get_utm_zone(minx)

# Check for a suitable projection using pyproj
proj = pyproj.Proj(proj='utm', zone=utm_zone, ellps='WGS84')

# Get the corresponding EPSG code for the UTM zone using pyproj
utm_crs = pyproj.CRS(f"+proj=utm +zone={utm_zone} +datum=WGS84")
epsg_code = utm_crs.to_epsg()

# Reproject the GeoDataFrame to the chosen EPSG code
gdf = gdf.to_crs(epsg=epsg_code)
gdf

Unnamed: 0,geometry
0,"POLYGON ((369704.958 4525237.457, 364581.501 4..."


In [16]:
import numpy as np

# Create a grid of points 5120m apart
x = np.arange(gdf.total_bounds[0], gdf.total_bounds[2], 2560)
y = np.arange(gdf.total_bounds[1], gdf.total_bounds[3], 2560)
xx, yy = np.meshgrid(x, y)
points = np.vstack([xx.ravel(), yy.ravel()]).T

grid = gpd.GeoDataFrame(geometry=gpd.points_from_xy(points[:, 0], points[:, 1], crs=gdf.crs))
grid

Unnamed: 0,geometry
0,POINT (364581.501 4226949.476)
1,POINT (367141.501 4226949.476)
2,POINT (369701.501 4226949.476)
3,POINT (372261.501 4226949.476)
4,POINT (374821.501 4226949.476)
...,...
14620,POINT (671781.501 4523909.476)
14621,POINT (674341.501 4523909.476)
14622,POINT (676901.501 4523909.476)
14623,POINT (679461.501 4523909.476)


In [17]:
import pandas as pd
import geopandas as gpd
from shapely.geometry import Point
import pystac_client
import stackstac
import torch
from torchvision import transforms as v2
from box import Box
import yaml
import math
from rasterio.enums import Resampling
from tqdm import tqdm
import rasterio
import warnings
import os
import numpy as np
import rioxarray  # Make sure to import rioxarray to extend xarray

from src.model import ClayMAEModule

warnings.filterwarnings("ignore")

STAC_API = "https://earth-search.aws.element84.com/v1"
COLLECTION = "sentinel-2-l2a"

# Load the model and metadata
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
ckpt = "https://clay-model-ckpt.s3.amazonaws.com/v0.5.7/mae_v0.5.7_epoch-13_val-loss-0.3098.ckpt"
torch.set_default_device(device)

torch.cuda.empty_cache()  # Clear GPU cache

# Assuming grid is a GeoDataFrame with the points
points = grid.to_crs("EPSG:4326").geometry.apply(lambda x: (x.x, x.y)).tolist()

model = ClayMAEModule.load_from_checkpoint(
    ckpt, metadata_path="configs/metadata.yaml", shuffle=False, mask_ratio=0
)
model.eval()
model = model.to(device)

metadata = Box(yaml.safe_load(open("configs/metadata.yaml")))

# Function to normalize timestamp
def normalize_timestamp(date):
    week = date.isocalendar().week * 2 * np.pi / 52
    hour = date.hour * 2 * np.pi / 24
    return (math.sin(week), math.cos(week)), (math.sin(hour), math.cos(hour))

# Function to normalize lat/lon
def normalize_latlon(lat, lon):
    lat = lat * np.pi / 180
    lon = lon * np.pi / 180
    return (math.sin(lat), math.cos(lat)), (math.sin(lon), math.cos(lon))

def to_device(data, device):
    if isinstance(data, torch.Tensor):
        return data.to(device)
    elif isinstance(data, dict):
        return {k: to_device(v, device) for k, v in data.items()}
    elif isinstance(data, list):
        return [to_device(v, device) for v in data]
    return data

def process_point(lon, lat, model, metadata, year, device, j):
    model.to(device)  # Ensure the model is on the correct device
    catalog = pystac_client.Client.open(STAC_API)
    search = catalog.search(
        collections=[COLLECTION],
        datetime=f"{year}-01-01/{year}-12-31",
        bbox=(lon - 1e-5, lat - 1e-5, lon + 1e-5, lat + 1e-5),
        max_items=10,
        query={"eo:cloud_cover": {"lt": 80}},
    )

    all_items = search.get_all_items()
    items = list(all_items)
    if not items:
        return None
    
    items = sorted(items, key=lambda x: x.properties.get('eo:cloud_cover', float('inf')))
    lowest_cloud_item = items[0]

    epsg = lowest_cloud_item.properties["proj:epsg"]

    poidf = gpd.GeoDataFrame(
        pd.DataFrame(),
        crs="EPSG:4326",
        geometry=[Point(lon, lat)],
    ).to_crs(epsg)

    coords = poidf.iloc[0].geometry.coords[0]

    size = 256
    gsd = 10
    bounds = (
        coords[0] - (size * gsd) // 2,
        coords[1] - (size * gsd) // 2,
        coords[0] + (size * gsd) // 2,
        coords[1] + (size * gsd) // 2,
    )

    stack = stackstac.stack(
        lowest_cloud_item,
        bounds=bounds,
        snap_bounds=False,
        epsg=epsg,
        resolution=gsd,
        dtype="float32",
        rescale=False,
        fill_value=0,
        assets=["blue", "green", "red", "nir"],
        resampling=Resampling.nearest,
    )

    stack = stack.compute()

    items = []
    dates = []
    for item in all_items:
        if item.datetime.date() not in dates:
            items.append(item)
            dates.append(item.datetime.date())

    platform = "sentinel-2-l2a"
    mean = []
    std = []
    waves = []
    for band in stack.band:
        mean.append(metadata[platform].bands.mean[str(band.values)])
        std.append(metadata[platform].bands.std[str(band.values)])
        waves.append(metadata[platform].bands.wavelength[str(band.values)])

    transform = v2.Compose([v2.Normalize(mean=mean, std=std)])

    datetimes = stack.time.values.astype("datetime64[s]").tolist()
    times = [normalize_timestamp(dat) for dat in datetimes]
    week_norm = [dat[0] for dat in times]
    hour_norm = [dat[1] for dat in times]

    latlons = [normalize_latlon(lat, lon)] * len(times)
    lat_norm = [dat[0] for dat in latlons]
    lon_norm = [dat[1] for dat in latlons]

    pixels = torch.from_numpy(stack.data.astype(np.float32)).to(device)
    pixels = transform(pixels)

    batch_size = 16
    num_batches = math.ceil(len(stack) / batch_size)
    
    embeddings_list = []
    for i in range(num_batches):
        start_idx = i * batch_size
        end_idx = min((i + 1) * batch_size, len(stack))
        
        batch_pixels = pixels[start_idx:end_idx].to(device)
        batch_time = torch.tensor(np.hstack((week_norm, hour_norm))[start_idx:end_idx], dtype=torch.float32).to(device)
        batch_latlon = torch.tensor(np.hstack((lat_norm, lon_norm))[start_idx:end_idx], dtype=torch.float32).to(device)
        
        batch_datacube = {
            "platform": platform,
            "time": batch_time,
            "latlon": batch_latlon,
            "pixels": batch_pixels,
            "gsd": torch.tensor(stack.gsd.values).to(device),
            "waves": torch.tensor(waves).to(device),
        }

        batch_datacube = to_device(batch_datacube, device)

        try:
            model = model.to(device)

            with torch.no_grad():
                unmsk_patch, _, _, _ = model.model.encoder(batch_datacube)
            batch_embeddings = unmsk_patch[:, 0, :].cpu().numpy()
            embeddings_list.append(batch_embeddings)
        except RuntimeError as e:
            if "out of memory" in str(e):
                print(f"GPU OOM for point ({lon}, {lat}), batch {i+1}/{num_batches}. Trying CPU...")
                device = torch.device("cpu")
                batch_datacube = to_device(batch_datacube, device)
                model = model.to(device)
                with torch.no_grad():
                    unmsk_patch, _, _, _ = model.model.encoder(batch_datacube)
                batch_embeddings = unmsk_patch[:, 0, :].numpy()
                embeddings_list.append(batch_embeddings)
                device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
            else:
                raise e

    embeddings = np.concatenate(embeddings_list, axis=0)
    return embeddings

# Initialize an empty dictionary to store results for both years
results_dict = {"lon": [], "lat": [], "embeddings_2019": [], "embeddings_2020": []}

# Specify the years for the datetime range in the search
years = [2019, 2020]

# Iterate through the points and process each one for both years
for i, point in enumerate(tqdm(points)):
    lon, lat = point
    results_dict["lon"].append(lon)
    results_dict["lat"].append(lat)
    
    for year in years:
        embeddings = process_point(lon, lat, model, metadata, year, device, i)
        if embeddings is not None:
            results_dict[f"embeddings_{year}"].append(embeddings)
        else:
            results_dict[f"embeddings_{year}"].append(None)

# Create a DataFrame from the results
df = pd.DataFrame(results_dict)

# Convert to a GeoDataFrame
gdf_results = gpd.GeoDataFrame(df, geometry=gpd.points_from_xy(df.lon, df.lat))

# Output the resulting GeoDataFrame
gdf_results.head()


100%|██████████| 14625/14625 [8:02:38<00:00,  1.98s/it]  


Unnamed: 0,lon,lat,embeddings_2019,embeddings_2020,geometry
0,-124.546147,38.180299,"[[0.1155745, -0.016954176, 0.0016706283, -0.00...","[[0.1273245, -0.0030486751, 0.0007928024, 0.01...",POINT (-124.54615 38.18030)
1,-124.516928,38.18068,"[[0.11867357, -0.017182661, 0.0028632905, -0.0...","[[0.12737001, -0.0030908359, 0.000729561, 0.01...",POINT (-124.51693 38.18068)
2,-124.487708,38.181054,"[[0.11585856, -0.016491126, 0.005964425, -0.00...","[[0.12743236, -0.003174662, 0.00089427165, 0.0...",POINT (-124.48771 38.18105)
3,-124.458488,38.181421,"[[0.11883765, -0.01688553, 0.0024151083, -0.00...","[[0.12739776, -0.0031370702, 0.0007280128, 0.0...",POINT (-124.45849 38.18142)
4,-124.429267,38.18178,"[[0.1158724, -0.016283177, 0.0042494186, -0.00...","[[0.12756799, -0.0031793532, 0.00082945044, 0....",POINT (-124.42927 38.18178)


In [18]:
gdf_copy = gdf_results.copy()
gdf_copy["embeddings_2019"] = [embedding.flatten() if embedding is not None and embedding.size > 0 else None for embedding in gdf_results["embeddings_2019"]]
gdf_copy["embeddings_2020"] = [embedding.flatten() if embedding is not None and embedding.size > 0 else None for embedding in gdf_results["embeddings_2020"]]

gdf_copy.to_parquet("train_data/challenge_6.parquet")

In [19]:
gdf = gpd.read_parquet("train_data/challenge_6.parquet")

In [34]:
import ee
import geopandas as gpd
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor, as_completed
import os
from shapely.geometry import box, mapping
from shapely.ops import transform
import pyproj

# Initialize Earth Engine
ee.Initialize()

# Load the USFS/GTAC/MTBS annual burn severity mosaics dataset
burn_severity = ee.ImageCollection('USFS/GTAC/MTBS/annual_burn_severity_mosaics/v1').filterDate('2020-01-01', '2020-12-31').mosaic()

def create_bbox_around_point(point, size=2560):
    # Create a bounding box around a point with the given size (meters)
    half_size = size / 2.0
    
    # Define the projections
    wgs84 = pyproj.CRS('EPSG:4326')
    utm_zone = pyproj.CRS(epsg_code)  # Replace with appropriate UTM zone based on your location

    # Project the point to UTM
    project_to_utm = pyproj.Transformer.from_crs(wgs84, utm_zone, always_xy=True).transform
    point_utm = transform(project_to_utm, point)

    # Create the bounding box in UTM
    bbox_utm = box(point_utm.x - half_size, point_utm.y - half_size, point_utm.x + half_size, point_utm.y + half_size)

    # Project the bounding box back to WGS84
    project_to_wgs84 = pyproj.Transformer.from_crs(utm_zone, wgs84, always_xy=True).transform
    bbox_wgs84 = transform(project_to_wgs84, bbox_utm)
    
    return bbox_wgs84


# def create_bbox_around_point(point, size=2560):
#     # Create a bounding box around a point with the given size (meters)
#     half_size = size / 2.0
#     bbox = box(point.x - half_size, point.y - half_size, point.x + half_size, point.y + half_size)
    
#     # Define the projections
#     wgs84 = pyproj.CRS('EPSG:4326')
#     utm = pyproj.CRS(epsg_code)  # Use the appropriate UTM zone for your data

#     project_to_wgs84 = pyproj.Transformer.from_crs(utm, wgs84, always_xy=True).transform

#     # Create the bounding box in UTM
#     expanded_bbox = box(
#         bbox.bounds[0] - half_size, bbox.bounds[1] - half_size,
#         bbox.bounds[2] + half_size, bbox.bounds[3] + half_size
#     )

#     # Project the bounding box back to WGS84
#     bbox_wgs84 = transform(project_to_wgs84, expanded_bbox)
    
#     return bbox_wgs84

def get_severity(geometry):
    if geometry.geom_type == 'Point':
        # Create a bounding box around the point
        bbox = create_bbox_around_point(geometry)
    else:
        bbox = geometry
    
    # Convert the GeoPandas geometry to an Earth Engine geometry
    ee_geometry = ee.Geometry(mapping(bbox))

    # Get the mean severity value within the geometry
    severity_value = burn_severity.reduceRegion(
        reducer=ee.Reducer.mean(),
        geometry=ee_geometry,
        scale=30,
        maxPixels=1e9
    ).get('Severity')  # Adjust the band name if necessary
    
    # Return the result
    return severity_value.getInfo() if severity_value is not None else None

def process_geometries(combined_gdf):
    # Get the total number of rows in the GeoDataFrame
    total_rows = len(combined_gdf)

    # Determine the number of threads to use
    max_threads = 10  # Adjust this based on your system and Earth Engine quota
    num_threads = min(total_rows, max_threads)

    results = []

    with ThreadPoolExecutor(max_workers=num_threads) as executor:
        # Submit all tasks
        future_to_index = {executor.submit(get_severity, row.geometry): index 
                        for index, row in combined_gdf.iterrows()}
        
        # Process as they complete with a progress bar
        with tqdm(total=total_rows, desc="Processing geometries") as pbar:
            for future in as_completed(future_to_index):
                index = future_to_index[future]
                try:
                    result = future.result()
                except Exception as exc:
                    print(f'Generated an exception: {exc}')
                    result = None
                results.append((index, result))
                pbar.update(1)

    # Sort results by index and extract only the values
    sorted_results = [r[1] for r in sorted(results, key=lambda x: x[0])]
    
    return sorted_results

if __name__ == '__main__':
    # Process the geometries
    results = process_geometries(gdf)

    # Add the results as a new column to the GeoDataFrame
    gdf['mean_severity'] = results

    # Save as Parquet file
    gdf[['geometry', 'mean_severity', 'embeddings_2019', 'embeddings_2020']].to_parquet("train_data/challenge_6.parquet")


Processing geometries: 100%|██████████| 14625/14625 [03:15<00:00, 74.82it/s]


In [42]:
gdf['mean_severity'] = gdf['mean_severity'].fillna(0)
gdf['embeddings_delta'] = gdf['embeddings_2020'] - gdf['embeddings_2019']

In [46]:
import pandas as pd
import numpy as np
import optuna
import tensorflow as tf
import joblib
import os

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
from keras.models import Sequential
from keras.layers import Dense, Dropout, Input
from keras.optimizers import Adam

class NeuralNetwork:
    def __init__(self, input_shape, layers, dropout_rate, learning_rate, device):
        self.input_shape = input_shape
        self.layers = layers
        self.dropout_rate = dropout_rate
        self.learning_rate = learning_rate
        self.device = device
        self.model = self.build_model()

    def build_model(self):
        model = Sequential()
        model.add(Input(shape=(self.input_shape,)))
        for layer_size in self.layers:
            model.add(Dense(layer_size, activation='relu'))
            model.add(Dropout(self.dropout_rate))
        model.add(Dense(1, activation='linear'))  # Output layer for regression
        return model

    def compile_model(self):
        optimizer = Adam(learning_rate=self.learning_rate)
        self.model.compile(optimizer=optimizer, loss='mean_squared_error', metrics=['mae'])  # Use MSE for regression

    def train_model(self, X_train, y_train, epochs=20, batch_size=32, validation_split=0.2):
        with tf.device(self.device):
            history = self.model.fit(X_train, y_train, epochs=epochs, batch_size=batch_size, validation_split=validation_split)
        return history

    def evaluate_model(self, X_test, y_test):
        with tf.device(self.device):
            loss, mae = self.model.evaluate(X_test, y_test)
        return loss, mae

    def predict(self, X_test):
        with tf.device(self.device):
            predictions = self.model.predict(X_test)
        return predictions.flatten()

    def calculate_rmse(self, y_test, predictions):
        return np.sqrt(mean_squared_error(y_test, predictions))
    
    def save_model(self, filename):
        self.model.save(filename)

    @classmethod
    def load_model(cls, filename, input_shape, device):
        loaded_model = tf.keras.models.load_model(filename)
        nn = cls(input_shape, [], 0, 0, device)  # Dummy values for layers, dropout_rate, and learning_rate
        nn.model = loaded_model
        return nn

def objective(trial):
    layers = []
    for i in range(trial.suggest_int('n_layers', 1, 3)):
        layers.append(trial.suggest_int(f'n_units_l{i}', 64, 512))
    
    dropout_rate = trial.suggest_float('dropout_rate', 0.2, 0.5)
    learning_rate = trial.suggest_float('learning_rate', 1e-5, 1e-2)
    
    nn = NeuralNetwork(input_shape=X_train.shape[1], device=device, layers=layers, dropout_rate=dropout_rate, learning_rate=learning_rate)
    nn.compile_model()
    
    nn.train_model(X_train, y_train, epochs=20, batch_size=32, validation_split=0.2)
    
    predictions = nn.predict(X_test)
    rmse = nn.calculate_rmse(y_test, predictions)
    
    return rmse

# Assuming combined_gdf is already loaded
combined_gdf_filtered = gdf.dropna()

# Flatten the nested arrays
def flatten_embeddings(embedding):
    if isinstance(embedding, np.ndarray) and embedding.ndim > 0:
        return np.concatenate(embedding) if embedding.ndim > 1 else embedding.flatten()
    else:
        return np.array([])  # Handle cases where embedding might be zero-dimensional or empty

X = np.array([flatten_embeddings(x) for x in combined_gdf_filtered['embeddings_delta'] if flatten_embeddings(x).size > 0])
y = combined_gdf_filtered['mean_severity']

# Ensure that X and y have matching lengths after filtering
X = np.array([x for i, x in enumerate(X) if flatten_embeddings(combined_gdf_filtered['embeddings_delta'].iloc[i]).size > 0])
y = y.iloc[:X.shape[0]]  # Adjust y to have the same number of samples as X

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale the features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)


device = '/GPU:0' if tf.config.list_physical_devices('GPU') else '/CPU:0'

# Optimize the hyperparameters
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=5)

# Print the best hyperparameters
print(study.best_params)

# Example usage with the best hyperparameters
best_params = study.best_params
layers = [best_params[f'n_units_l{i}'] for i in range(best_params['n_layers'])]
dropout_rate = best_params['dropout_rate']
learning_rate = best_params['learning_rate']

nn = NeuralNetwork(input_shape=X_train.shape[1], device=device, layers=layers, dropout_rate=dropout_rate, learning_rate=learning_rate)
nn.compile_model()
history = nn.train_model(X_train, y_train, epochs=20, batch_size=32, validation_split=0.2)
loss, mae = nn.evaluate_model(X_test, y_test)
print(f'Test MAE: {mae:.4f}')
predictions = nn.predict(X_test)
rmse = nn.calculate_rmse(y_test, predictions)
print(f'Test RMSE: {rmse:.4f}')

# Save the model
nn.save_model('models/task_6_model.h5')

# Save scaler
joblib.dump(scaler, 'models/6_scaler.joblib')


I0000 00:00:1723214395.824347   26101 cuda_executor.cc:1015] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
[I 2024-08-09 14:39:55,830] A new study created in memory with name: no-name-9e3ce90c-52da-4d37-aa59-89427335b8af
I0000 00:00:1723214395.827125   26101 cuda_executor.cc:1015] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
I0000 00:00:1723214395.828550   26101 cuda_executor.cc:1015] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-

Epoch 1/20


I0000 00:00:1723214396.869066   30931 service.cc:146] XLA service 0x7f73dc003ec0 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1723214396.869107   30931 service.cc:154]   StreamExecutor device (0): NVIDIA A10G, Compute Capability 8.6
2024-08-09 14:39:56.896130: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:268] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
2024-08-09 14:39:56.985108: I external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:531] Loaded cuDNN version 8905


[1m162/293[0m [32m━━━━━━━━━━━[0m[37m━━━━━━━━━[0m [1m0s[0m 939us/step - loss: 1.9309 - mae: 0.9004

I0000 00:00:1723214398.013950   30931 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


[1m293/293[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 15ms/step - loss: 1.5356 - mae: 0.7853 - val_loss: 0.2175 - val_mae: 0.2504
Epoch 2/20
[1m293/293[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 0.2898 - mae: 0.3251 - val_loss: 0.1583 - val_mae: 0.1761
Epoch 3/20
[1m293/293[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 0.1824 - mae: 0.2409 - val_loss: 0.1598 - val_mae: 0.1664
Epoch 4/20
[1m293/293[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 0.1529 - mae: 0.2027 - val_loss: 0.1879 - val_mae: 0.1593
Epoch 5/20
[1m293/293[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 0.1549 - mae: 0.1889 - val_loss: 0.1391 - val_mae: 0.1386
Epoch 6/20
[1m293/293[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 0.1354 - mae: 0.1721 - val_loss: 0.1899 - val_mae: 0.1651
Epoch 7/20
[1m293/293[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 0.14

[I 2024-08-09 14:40:11,332] Trial 0 finished with value: 0.3553612408345228 and parameters: {'n_layers': 2, 'n_units_l0': 272, 'n_units_l1': 360, 'dropout_rate': 0.43104565373724735, 'learning_rate': 0.00096074398667019}. Best is trial 0 with value: 0.3553612408345228.


Epoch 1/20
[1m293/293[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 13ms/step - loss: 8.6150 - mae: 0.9547 - val_loss: 0.2493 - val_mae: 0.2151
Epoch 2/20
[1m293/293[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 0.2339 - mae: 0.2199 - val_loss: 0.1564 - val_mae: 0.1795
Epoch 3/20
[1m293/293[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 0.1928 - mae: 0.1876 - val_loss: 0.1488 - val_mae: 0.1862
Epoch 4/20
[1m293/293[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 0.1825 - mae: 0.1852 - val_loss: 0.1653 - val_mae: 0.1632
Epoch 5/20
[1m293/293[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 0.1839 - mae: 0.1800 - val_loss: 0.4237 - val_mae: 0.2490
Epoch 6/20
[1m293/293[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 0.2507 - mae: 0.2114 - val_loss: 0.3138 - val_mae: 0.2084
Epoch 7/20
[1m293/293[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step -

[I 2024-08-09 14:40:25,655] Trial 1 finished with value: 0.825847716483011 and parameters: {'n_layers': 2, 'n_units_l0': 243, 'n_units_l1': 350, 'dropout_rate': 0.21831573997296683, 'learning_rate': 0.009020270673316445}. Best is trial 0 with value: 0.3553612408345228.


Epoch 1/20
[1m293/293[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step - loss: 2.3180 - mae: 0.9660 - val_loss: 0.2761 - val_mae: 0.3271
Epoch 2/20
[1m293/293[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 0.3937 - mae: 0.4221 - val_loss: 0.1739 - val_mae: 0.2462
Epoch 3/20
[1m293/293[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 0.1981 - mae: 0.2883 - val_loss: 0.1478 - val_mae: 0.2149
Epoch 4/20
[1m293/293[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 0.1483 - mae: 0.2398 - val_loss: 0.1301 - val_mae: 0.2001
Epoch 5/20
[1m293/293[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 0.1435 - mae: 0.2220 - val_loss: 0.1292 - val_mae: 0.2055
Epoch 6/20
[1m293/293[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 0.1338 - mae: 0.2122 - val_loss: 0.1242 - val_mae: 0.1863
Epoch 7/20
[1m293/293[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - 

[I 2024-08-09 14:40:35,457] Trial 2 finished with value: 0.40885032133663946 and parameters: {'n_layers': 1, 'n_units_l0': 178, 'dropout_rate': 0.41237914883175186, 'learning_rate': 0.0012502864203174304}. Best is trial 0 with value: 0.3553612408345228.


Epoch 1/20
[1m293/293[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 19ms/step - loss: 6.6570 - mae: 0.9055 - val_loss: 0.3542 - val_mae: 0.2861
Epoch 2/20
[1m293/293[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 0.4769 - mae: 0.3276 - val_loss: 0.6578 - val_mae: 0.4981
Epoch 3/20
[1m293/293[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 0.6306 - mae: 0.4473 - val_loss: 0.6573 - val_mae: 0.4746
Epoch 4/20
[1m293/293[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 0.6554 - mae: 0.4619 - val_loss: 0.6525 - val_mae: 0.4379
Epoch 5/20
[1m293/293[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 0.6571 - mae: 0.4407 - val_loss: 0.6582 - val_mae: 0.4618
Epoch 6/20
[1m293/293[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 0.5874 - mae: 0.4338 - val_loss: 0.6576 - val_mae: 0.4688
Epoch 7/20
[1m293/293[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step -




[1m92/92[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 18ms/step


[I 2024-08-09 14:40:53,471] Trial 3 finished with value: 0.8248667771971087 and parameters: {'n_layers': 3, 'n_units_l0': 294, 'n_units_l1': 248, 'n_units_l2': 380, 'dropout_rate': 0.38461843556139164, 'learning_rate': 0.00867389510916457}. Best is trial 0 with value: 0.3553612408345228.


Epoch 1/20
[1m293/293[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step - loss: 4.3271 - mae: 1.1284 - val_loss: 0.2056 - val_mae: 0.2822
Epoch 2/20
[1m293/293[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 0.2592 - mae: 0.3385 - val_loss: 0.1586 - val_mae: 0.2493
Epoch 3/20
[1m293/293[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 0.1621 - mae: 0.2574 - val_loss: 0.1324 - val_mae: 0.2079
Epoch 4/20
[1m293/293[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 0.1236 - mae: 0.2143 - val_loss: 0.1254 - val_mae: 0.1907
Epoch 5/20
[1m293/293[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 0.1216 - mae: 0.2092 - val_loss: 0.1468 - val_mae: 0.2398
Epoch 6/20
[1m293/293[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 0.1175 - mae: 0.2043 - val_loss: 0.1274 - val_mae: 0.1973
Epoch 7/20
[1m293/293[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - 

[I 2024-08-09 14:41:03,341] Trial 4 finished with value: 0.39372504782110795 and parameters: {'n_layers': 1, 'n_units_l0': 484, 'dropout_rate': 0.23582210463105585, 'learning_rate': 0.001756710464128349}. Best is trial 0 with value: 0.3553612408345228.


{'n_layers': 2, 'n_units_l0': 272, 'n_units_l1': 360, 'dropout_rate': 0.43104565373724735, 'learning_rate': 0.00096074398667019}
Epoch 1/20
[1m293/293[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 7ms/step - loss: 1.3326 - mae: 0.7342 - val_loss: 0.2772 - val_mae: 0.2391
Epoch 2/20
[1m293/293[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 0.2552 - mae: 0.3003 - val_loss: 0.1689 - val_mae: 0.1890
Epoch 3/20
[1m293/293[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 0.1904 - mae: 0.2341 - val_loss: 0.1410 - val_mae: 0.1550
Epoch 4/20
[1m293/293[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 0.1690 - mae: 0.2102 - val_loss: 0.1208 - val_mae: 0.1393
Epoch 5/20
[1m293/293[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 0.1478 - mae: 0.1879 - val_loss: 0.1627 - val_mae: 0.1433
Epoch 6/20
[1m293/293[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 0.1456 - mae: 0.1



Test RMSE: 0.4056


['models/6_scaler.joblib']

In [54]:
import geopandas as gpd

year = 2022

# Load the GeoJSON file
geojson_path = 'test_data/challenge_6_bb.geojson'
gdf = gpd.read_file(geojson_path)
gdf

Unnamed: 0,geometry
0,"POLYGON ((-115.57766 40.70403, -115.57766 42.0..."


In [55]:
import pyproj

def get_utm_zone(longitude):
    return int((longitude + 180) / 6) + 1

# Get the bounds of the geometry
minx, miny, maxx, maxy = gdf.geometry.bounds.iloc[0]

# Calculate UTM zone
utm_zone = get_utm_zone(minx)

# Check for a suitable projection using pyproj
proj = pyproj.Proj(proj='utm', zone=utm_zone, ellps='WGS84')

# Get the corresponding EPSG code for the UTM zone using pyproj
utm_crs = pyproj.CRS(f"+proj=utm +zone={utm_zone} +datum=WGS84")
epsg_code = utm_crs.to_epsg()

# Reproject the GeoDataFrame to the chosen EPSG code
gdf = gdf.to_crs(epsg=epsg_code)
gdf

Unnamed: 0,geometry
0,"POLYGON ((620156.362 4506875.086, 617625.674 4..."


In [56]:
import numpy as np

# Create a grid of points 5120m apart
x = np.arange(gdf.total_bounds[0], gdf.total_bounds[2], 2560)
y = np.arange(gdf.total_bounds[1], gdf.total_bounds[3], 2560)
xx, yy = np.meshgrid(x, y)
points = np.vstack([xx.ravel(), yy.ravel()]).T

grid = gpd.GeoDataFrame(geometry=gpd.points_from_xy(points[:, 0], points[:, 1], crs=gdf.crs))
grid

Unnamed: 0,geometry
0,POINT (444829.369 4506107.413)
1,POINT (447389.369 4506107.413)
2,POINT (449949.369 4506107.413)
3,POINT (452509.369 4506107.413)
4,POINT (455069.369 4506107.413)
...,...
4204,POINT (608669.369 4659707.413)
4205,POINT (611229.369 4659707.413)
4206,POINT (613789.369 4659707.413)
4207,POINT (616349.369 4659707.413)


In [57]:
import pandas as pd
import geopandas as gpd
from shapely.geometry import Point
import pystac_client
import stackstac
import torch
from torchvision import transforms as v2
from box import Box
import yaml
import math
from rasterio.enums import Resampling
from tqdm import tqdm
import rasterio
import warnings
import os
import numpy as np
import rioxarray  # Make sure to import rioxarray to extend xarray

from src.model import ClayMAEModule

warnings.filterwarnings("ignore")

STAC_API = "https://earth-search.aws.element84.com/v1"
COLLECTION = "sentinel-2-l2a"

# Load the model and metadata
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
ckpt = "https://clay-model-ckpt.s3.amazonaws.com/v0.5.7/mae_v0.5.7_epoch-13_val-loss-0.3098.ckpt"
torch.set_default_device(device)

torch.cuda.empty_cache()  # Clear GPU cache

# Assuming grid is a GeoDataFrame with the points
points = grid.to_crs("EPSG:4326").geometry.apply(lambda x: (x.x, x.y)).tolist()

model = ClayMAEModule.load_from_checkpoint(
    ckpt, metadata_path="configs/metadata.yaml", shuffle=False, mask_ratio=0
)
model.eval()
model = model.to(device)

metadata = Box(yaml.safe_load(open("configs/metadata.yaml")))

# Function to normalize timestamp
def normalize_timestamp(date):
    week = date.isocalendar().week * 2 * np.pi / 52
    hour = date.hour * 2 * np.pi / 24
    return (math.sin(week), math.cos(week)), (math.sin(hour), math.cos(hour))

# Function to normalize lat/lon
def normalize_latlon(lat, lon):
    lat = lat * np.pi / 180
    lon = lon * np.pi / 180
    return (math.sin(lat), math.cos(lat)), (math.sin(lon), math.cos(lon))

def to_device(data, device):
    if isinstance(data, torch.Tensor):
        return data.to(device)
    elif isinstance(data, dict):
        return {k: to_device(v, device) for k, v in data.items()}
    elif isinstance(data, list):
        return [to_device(v, device) for v in data]
    return data

def process_point(lon, lat, model, metadata, year, device, j):
    model.to(device)  # Ensure the model is on the correct device
    catalog = pystac_client.Client.open(STAC_API)
    search = catalog.search(
        collections=[COLLECTION],
        datetime=f"{year}-01-01/{year}-12-31",
        bbox=(lon - 1e-5, lat - 1e-5, lon + 1e-5, lat + 1e-5),
        max_items=10,
        query={"eo:cloud_cover": {"lt": 80}},
    )

    all_items = search.get_all_items()
    items = list(all_items)
    if not items:
        return None
    
    items = sorted(items, key=lambda x: x.properties.get('eo:cloud_cover', float('inf')))
    lowest_cloud_item = items[0]

    epsg = lowest_cloud_item.properties["proj:epsg"]

    poidf = gpd.GeoDataFrame(
        pd.DataFrame(),
        crs="EPSG:4326",
        geometry=[Point(lon, lat)],
    ).to_crs(epsg)

    coords = poidf.iloc[0].geometry.coords[0]

    size = 256
    gsd = 10
    bounds = (
        coords[0] - (size * gsd) // 2,
        coords[1] - (size * gsd) // 2,
        coords[0] + (size * gsd) // 2,
        coords[1] + (size * gsd) // 2,
    )

    stack = stackstac.stack(
        lowest_cloud_item,
        bounds=bounds,
        snap_bounds=False,
        epsg=epsg,
        resolution=gsd,
        dtype="float32",
        rescale=False,
        fill_value=0,
        assets=["blue", "green", "red", "nir"],
        resampling=Resampling.nearest,
    )

    stack = stack.compute()

    items = []
    dates = []
    for item in all_items:
        if item.datetime.date() not in dates:
            items.append(item)
            dates.append(item.datetime.date())

    platform = "sentinel-2-l2a"
    mean = []
    std = []
    waves = []
    for band in stack.band:
        mean.append(metadata[platform].bands.mean[str(band.values)])
        std.append(metadata[platform].bands.std[str(band.values)])
        waves.append(metadata[platform].bands.wavelength[str(band.values)])

    transform = v2.Compose([v2.Normalize(mean=mean, std=std)])

    datetimes = stack.time.values.astype("datetime64[s]").tolist()
    times = [normalize_timestamp(dat) for dat in datetimes]
    week_norm = [dat[0] for dat in times]
    hour_norm = [dat[1] for dat in times]

    latlons = [normalize_latlon(lat, lon)] * len(times)
    lat_norm = [dat[0] for dat in latlons]
    lon_norm = [dat[1] for dat in latlons]

    pixels = torch.from_numpy(stack.data.astype(np.float32)).to(device)
    pixels = transform(pixels)

    batch_size = 16
    num_batches = math.ceil(len(stack) / batch_size)
    
    embeddings_list = []
    for i in range(num_batches):
        start_idx = i * batch_size
        end_idx = min((i + 1) * batch_size, len(stack))
        
        batch_pixels = pixels[start_idx:end_idx].to(device)
        batch_time = torch.tensor(np.hstack((week_norm, hour_norm))[start_idx:end_idx], dtype=torch.float32).to(device)
        batch_latlon = torch.tensor(np.hstack((lat_norm, lon_norm))[start_idx:end_idx], dtype=torch.float32).to(device)
        
        batch_datacube = {
            "platform": platform,
            "time": batch_time,
            "latlon": batch_latlon,
            "pixels": batch_pixels,
            "gsd": torch.tensor(stack.gsd.values).to(device),
            "waves": torch.tensor(waves).to(device),
        }

        batch_datacube = to_device(batch_datacube, device)

        try:
            model = model.to(device)

            with torch.no_grad():
                unmsk_patch, _, _, _ = model.model.encoder(batch_datacube)
            batch_embeddings = unmsk_patch[:, 0, :].cpu().numpy()
            embeddings_list.append(batch_embeddings)
        except RuntimeError as e:
            if "out of memory" in str(e):
                print(f"GPU OOM for point ({lon}, {lat}), batch {i+1}/{num_batches}. Trying CPU...")
                device = torch.device("cpu")
                batch_datacube = to_device(batch_datacube, device)
                model = model.to(device)
                with torch.no_grad():
                    unmsk_patch, _, _, _ = model.model.encoder(batch_datacube)
                batch_embeddings = unmsk_patch[:, 0, :].numpy()
                embeddings_list.append(batch_embeddings)
                device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
            else:
                raise e

    embeddings = np.concatenate(embeddings_list, axis=0)
    return embeddings

# Initialize an empty dictionary to store results for both years
results_dict = {"lon": [], "lat": [], "embeddings_2017": [], "embeddings_2018": []}

# Specify the years for the datetime range in the search
years = [2017, 2018]

# Iterate through the points and process each one for both years
for i, point in enumerate(tqdm(points)):
    lon, lat = point
    results_dict["lon"].append(lon)
    results_dict["lat"].append(lat)
    
    for year in years:
        embeddings = process_point(lon, lat, model, metadata, year, device, i)
        if embeddings is not None:
            results_dict[f"embeddings_{year}"].append(embeddings)
        else:
            results_dict[f"embeddings_{year}"].append(None)

# Create a DataFrame from the results
df = pd.DataFrame(results_dict)

# Convert to a GeoDataFrame
gdf_results = gpd.GeoDataFrame(df, geometry=gpd.points_from_xy(df.lon, df.lat))

# Output the resulting GeoDataFrame
gdf_results.head()


100%|██████████| 4209/4209 [2:28:18<00:00,  2.11s/it]  


Unnamed: 0,lon,lat,embeddings_2017,embeddings_2018,geometry
0,-117.653086,40.704029,"[[-0.093266055, -0.011566908, 0.034409165, 0.1...","[[-0.07646129, -0.047068905, -0.016069224, 0.1...",POINT (-117.65309 40.70403)
1,-117.622783,40.704197,"[[-0.07906168, 0.0015968415, -0.034547463, 0.1...","[[-0.049535986, 0.06324335, -0.05352257, 0.183...",POINT (-117.62278 40.70420)
2,-117.59248,40.704356,"[[0.051406052, -0.02938476, 0.11883815, 0.1303...","[[-0.06238199, 0.06399791, 0.0098732365, 0.160...",POINT (-117.59248 40.70436)
3,-117.562178,40.704508,"[[-0.05635409, -0.039113358, -0.010543512, 0.0...","[[-0.057337593, 0.04036558, 0.036062606, 0.146...",POINT (-117.56218 40.70451)
4,-117.531874,40.704651,"[[-0.06445381, -0.041924402, -0.04184392, 0.12...","[[-0.07720847, 0.021437781, -0.05698099, 0.148...",POINT (-117.53187 40.70465)


In [59]:
gdf_copy = gdf_results.copy()
gdf_copy["embeddings_2017"] = [embedding.flatten() if embedding is not None and embedding.size > 0 else None for embedding in gdf_results["embeddings_2017"]]
gdf_copy["embeddings_2018"] = [embedding.flatten() if embedding is not None and embedding.size > 0 else None for embedding in gdf_results["embeddings_2018"]]

gdf_copy.to_parquet("test_data/challenge_6.parquet")

In [61]:
import ee
import geopandas as gpd
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor, as_completed
import os
from shapely.geometry import box, mapping
from shapely.ops import transform
import pyproj

# Initialize Earth Engine
ee.Initialize()

# Load the USFS/GTAC/MTBS annual burn severity mosaics dataset
burn_severity = ee.ImageCollection('USFS/GTAC/MTBS/annual_burn_severity_mosaics/v1').filterDate('2018-01-01', '2018-12-31').mosaic()

def create_bbox_around_point(point, size=2560):
    # Create a bounding box around a point with the given size (meters)
    half_size = size / 2.0
    
    # Define the projections
    wgs84 = pyproj.CRS('EPSG:4326')
    utm_zone = pyproj.CRS(epsg_code)  # Replace with appropriate UTM zone based on your location

    # Project the point to UTM
    project_to_utm = pyproj.Transformer.from_crs(wgs84, utm_zone, always_xy=True).transform
    point_utm = transform(project_to_utm, point)

    # Create the bounding box in UTM
    bbox_utm = box(point_utm.x - half_size, point_utm.y - half_size, point_utm.x + half_size, point_utm.y + half_size)

    # Project the bounding box back to WGS84
    project_to_wgs84 = pyproj.Transformer.from_crs(utm_zone, wgs84, always_xy=True).transform
    bbox_wgs84 = transform(project_to_wgs84, bbox_utm)
    
    return bbox_wgs84

def get_severity(geometry):
    if geometry.geom_type == 'Point':
        # Create a bounding box around the point
        bbox = create_bbox_around_point(geometry)
    else:
        bbox = geometry
    
    # Convert the GeoPandas geometry to an Earth Engine geometry
    ee_geometry = ee.Geometry(mapping(bbox))

    # Get the mean severity value within the geometry
    severity_value = burn_severity.reduceRegion(
        reducer=ee.Reducer.mean(),
        geometry=ee_geometry,
        scale=30,
        maxPixels=1e9
    ).get('Severity')  # Adjust the band name if necessary
    
    # Return the result
    return severity_value.getInfo() if severity_value is not None else None

def process_geometries(combined_gdf):
    # Get the total number of rows in the GeoDataFrame
    total_rows = len(combined_gdf)

    # Determine the number of threads to use
    max_threads = 10  # Adjust this based on your system and Earth Engine quota
    num_threads = min(total_rows, max_threads)

    results = []

    with ThreadPoolExecutor(max_workers=num_threads) as executor:
        # Submit all tasks
        future_to_index = {executor.submit(get_severity, row.geometry): index 
                        for index, row in combined_gdf.iterrows()}
        
        # Process as they complete with a progress bar
        with tqdm(total=total_rows, desc="Processing geometries") as pbar:
            for future in as_completed(future_to_index):
                index = future_to_index[future]
                try:
                    result = future.result()
                except Exception as exc:
                    print(f'Generated an exception: {exc}')
                    result = None
                results.append((index, result))
                pbar.update(1)

    # Sort results by index and extract only the values
    sorted_results = [r[1] for r in sorted(results, key=lambda x: x[0])]
    
    return sorted_results

if __name__ == '__main__':
    # Process the geometries
    results = process_geometries(gdf_copy)

    # Add the results as a new column to the GeoDataFrame
    gdf_copy['mean_severity'] = results

    # Save as Parquet file
    gdf_copy[['geometry', 'mean_severity', 'embeddings_2017', 'embeddings_2018']].to_parquet("test_data/challenge_6.parquet")


Processing geometries: 100%|██████████| 4209/4209 [00:54<00:00, 77.58it/s]


In [63]:
gdf_copy['mean_severity'] = gdf_copy['mean_severity'].fillna(0)
gdf_copy['embeddings_delta'] = gdf_copy['embeddings_2018'] - gdf_copy['embeddings_2017']

In [67]:
import tensorflow as tf

# Detect if GPU is available
device = '/GPU:0' if tf.config.list_physical_devices('GPU') else '/CPU:0'

# Load the model
loaded_nn = NeuralNetwork.load_model('models/agb_regression_model.h5', input_shape=768, device=device)

# Prepare your new data (assuming it's in the same format as your training data)
new_data = np.squeeze(gdf_copy['embeddings_delta'].tolist())
new_data = pd.DataFrame(new_data)  # Ensure the new data is in DataFrame format

# Standardize the new data using the saved scaler
scaler = joblib.load('models/scaler.joblib')
new_data_scaled = scaler.transform(new_data)

# Make predictions
new_predictions = loaded_nn.predict(new_data_scaled)

gdf_copy['pred_severity'] = new_predictions



[1m132/132[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step


In [68]:
print("Test set RMSE:", np.sqrt(np.mean((gdf_copy['pred_severity']-gdf_copy['mean_severity'])**2)))
print("Test set corr:", np.corrcoef(gdf_copy['pred_severity'],gdf_copy['mean_severity'])[0][1])

Test set RMSE: 1.842589182391211
Test set corr: 0.05585754431984979


In [70]:
gdf_copy[['geometry', 'mean_severity', 'pred_severity']].to_file("~/severity.geojson", driver="GeoJSON")