## Task 1: landcover classification

### Retrieve S2 embeddings from an S3 bucket over California

In [1]:
import geopandas as gpd
import s3fs
import pandas as pd
import os

from tqdm import tqdm


file_path = 'models/land_cover_model.h5'

if os.path.exists(file_path):
    print("Model already exists")
else:
    # S3 bucket and prefix
    s3_bucket = 'clay-worker-bucket-dev-small-tasks'
    s3_prefix = '_data/gpq/87/'#51/' #2022 S2 Cali embeddings

    # Initialize s3fs filesystem
    fs = s3fs.S3FileSystem()

    # List all the GeoParquet files in the specified S3 directory
    geo_parquet_files = fs.glob(f's3://{s3_bucket}/{s3_prefix}*.gpq')

    # List to store GeoDataFrames
    gdfs = []

    # Load each GeoParquet file into a GeoDataFrame and append to the list
    for file in tqdm(geo_parquet_files):
        gdf = gpd.read_parquet("s3://"+file, storage_options={"anon": False, "client_kwargs": {"region_name": "us-west-2"}})
        gdfs.append(gdf)

    # Concatenate all GeoDataFrames into a single GeoDataFrame
    combined_gdf = gpd.GeoDataFrame(pd.concat(gdfs, ignore_index=True))

    # Display the concatenated GeoDataFrame
    combined_gdf


Model already exists


### Join most common land use value that corresponds to each chip

In [2]:
import ee

from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor, as_completed

if os.path.exists(file_path):
    print("Model already exists")
else:
    # Initialize Earth Engine
    ee.Initialize()

    # Load the ESRI Global LULC dataset
    lulc = ee.ImageCollection("projects/sat-io/open-datasets/landcover/ESRI_Global-LULC_10m_TS").mosaic()

    def get_most_common_lulc(geometry):
        # Convert the GeoPandas geometry to an Earth Engine geometry
        ee_geometry = ee.Geometry.Rectangle(geometry.bounds)
        
        # Get the LULC values within the bounding box
        lulc_values = lulc.reduceRegion(
            reducer=ee.Reducer.frequencyHistogram(),
            geometry=ee_geometry,
            scale=10,
            maxPixels=1e9
        ).get('b1')
        
        # Find the most common LULC value
        lulc_dict = ee.Dictionary(lulc_values)
        most_common = lulc_dict.keys().sort(lulc_dict.values()).get(-1)
        
        # Return the result
        return int(most_common.getInfo())

    def process_geometries(combined_gdf):
        # Get the total number of rows in the GeoDataFrame
        total_rows = len(combined_gdf)

        # Determine the number of threads to use
        max_threads = 10  # Adjust this based on your system and Earth Engine quota
        num_threads = min(total_rows, max_threads)

        results = []

        with ThreadPoolExecutor(max_workers=num_threads) as executor:
            # Submit all tasks
            future_to_index = {executor.submit(get_most_common_lulc, row.geometry): index 
                            for index, row in combined_gdf.iterrows()}
            
            # Process as they complete with a progress bar
            with tqdm(total=total_rows, desc="Processing bounding boxes") as pbar:
                for future in as_completed(future_to_index):
                    index = future_to_index[future]
                    try:
                        result = future.result()
                    except Exception as exc:
                        print(f'Generated an exception: {exc}')
                        result = None
                    results.append((index, result))
                    pbar.update(1)

        # Sort results by index and extract only the values
        sorted_results = [r[1] for r in sorted(results, key=lambda x: x[0])]
        
        return sorted_results

    if __name__ == '__main__':
        # Process the geometries
        results = process_geometries(combined_gdf)

        # Add the results as a new column to the GeoDataFrame
        combined_gdf['most_common_lulc'] = results

        # Save as GeoJSON
        combined_gdf[['geometry', 'most_common_lulc']].to_file("test_lulc.geojson", driver="GeoJSON")

Model already exists


### Tune hyperparameters for land use classifier using Optuna

In [3]:
import pandas as pd
import numpy as np
import optuna
import tensorflow as tf
import joblib

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import f1_score
from keras.models import Sequential
from keras.layers import Dense, Dropout, Input
from keras.optimizers import Adam
from keras.utils import to_categorical

class NeuralNetwork:
    def __init__(self, input_shape, layers, dropout_rate, learning_rate, num_classes, device):
        self.input_shape = input_shape
        self.layers = layers
        self.dropout_rate = dropout_rate
        self.learning_rate = learning_rate
        self.num_classes = num_classes
        self.device = device
        self.model = self.build_model()

    def build_model(self):
        model = Sequential()
        model.add(Input(shape=(self.input_shape,)))
        for layer_size in self.layers:
            model.add(Dense(layer_size, activation='relu'))
            model.add(Dropout(self.dropout_rate))
        model.add(Dense(self.num_classes, activation='softmax'))
        return model

    def compile_model(self):
        optimizer = Adam(learning_rate=self.learning_rate)
        self.model.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=['accuracy'])

    def train_model(self, X_train, y_train, epochs=20, batch_size=32, validation_split=0.2):
        with tf.device(self.device):
            history = self.model.fit(X_train, y_train, epochs=epochs, batch_size=batch_size, validation_split=validation_split)
        return history

    def evaluate_model(self, X_test, y_test):
        with tf.device(self.device):
            loss, accuracy = self.model.evaluate(X_test, y_test)
        return loss, accuracy

    def predict(self, X_test):
        with tf.device(self.device):
            predictions = self.model.predict(X_test)
        return np.argmax(predictions, axis=1)

    def calculate_f1(self, y_test, predictions):
        return f1_score(np.argmax(y_test, axis=1), predictions, average='weighted')
    
    def save_model(self, filename):
        self.model.save(filename)

    @classmethod
    def load_model(cls, filename, input_shape, num_classes, device):
        loaded_model = tf.keras.models.load_model(filename)
        nn = cls(input_shape, [], 0, 0, num_classes, device)  # Dummy values for layers, dropout_rate, and learning_rate
        nn.model = loaded_model
        return nn


def objective(trial):
    layers = []
    for i in range(trial.suggest_int('n_layers', 1, 3)):
        layers.append(trial.suggest_int(f'n_units_l{i}', 64, 512))
    
    dropout_rate = trial.suggest_float('dropout_rate', 0.2, 0.5)
    learning_rate = trial.suggest_float('learning_rate', 1e-5, 1e-2)
    
    nn = NeuralNetwork(input_shape=X_train.shape[1], device=device, layers=layers, dropout_rate=dropout_rate, learning_rate=learning_rate, num_classes=num_classes)
    nn.compile_model()
    
    nn.train_model(X_train, y_train, epochs=20, batch_size=32, validation_split=0.2)
    
    predictions = nn.predict(X_test)
    f1 = nn.calculate_f1(y_test, predictions)
    
    return f1

if os.path.exists(file_path):
    print("Model already exists")
else:
    # Assuming combined_gdf is already loaded
    X = combined_gdf['embeddings'].tolist()
    X = pd.DataFrame(X)
    y = combined_gdf['most_common_lulc']

    # Encode the target variable
    label_encoder = LabelEncoder()
    y_encoded = label_encoder.fit_transform(y)
    num_classes = len(label_encoder.classes_)

    # Convert to one-hot encoding
    y_onehot = to_categorical(y_encoded)

    # Split data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y_onehot, test_size=0.2, random_state=42)

    # Scale the features
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    device = '/GPU:0' if tf.config.list_physical_devices('GPU') else '/CPU:0'

    # Optimize the hyperparameters
    study = optuna.create_study(direction='maximize')
    study.optimize(objective, n_trials=20)

    # Print the best hyperparameters
    print(study.best_params)

    # Example usage with the best hyperparameters
    best_params = study.best_params
    layers = [best_params[f'n_units_l{i}'] for i in range(best_params['n_layers'])]
    dropout_rate = best_params['dropout_rate']
    learning_rate = best_params['learning_rate']

    nn = NeuralNetwork(input_shape=X_train.shape[1], device=device, layers=layers, dropout_rate=dropout_rate, learning_rate=learning_rate, num_classes=num_classes)
    nn.compile_model()
    history = nn.train_model(X_train, y_train, epochs=20, batch_size=32, validation_split=0.2)
    loss, accuracy = nn.evaluate_model(X_test, y_test)
    print(f'Test Accuracy: {accuracy:.4f}')
    predictions = nn.predict(X_test)
    f1 = nn.calculate_f1(y_test, predictions)
    print(f'Test F1 Score: {f1:.4f}')

    # Save the model
    nn.save_model('models/land_cover_model.h5')

    # Save scaler and label_encoder
    joblib.dump(scaler, 'models/scaler.joblib')
    joblib.dump(label_encoder, 'models/label_encoder.joblib')

    # If you want to get the actual class labels
    predicted_classes = label_encoder.inverse_transform(predictions)

  from .autonotebook import tqdm as notebook_tqdm
2024-08-06 22:51:35.185053: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-08-06 22:51:35.410486: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-08-06 22:51:35.485016: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-08-06 22:51:35.980377: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Model already exists


### Predict on test set

#### Load in test set geojson and turn into grid of points

In [24]:
import geopandas as gpd

year = 2022

# Load the GeoJSON file
geojson_path = 'test_data/challenge_1_bb.geojson'
gdf = gpd.read_file(geojson_path)
gdf

Unnamed: 0,geometry
0,"POLYGON ((-106.08092 35.78627, -106.08092 35.4..."


In [25]:
import pyproj

def get_utm_zone(longitude):
    return int((longitude + 180) / 6) + 1

# Get the bounds of the geometry
minx, miny, maxx, maxy = gdf.geometry.bounds.iloc[0]

# Calculate UTM zone
utm_zone = get_utm_zone(minx)

# Check for a suitable projection using pyproj
proj = pyproj.Proj(proj='utm', zone=utm_zone, ellps='WGS84')

# Get the corresponding EPSG code for the UTM zone using pyproj
utm_crs = pyproj.CRS(f"+proj=utm +zone={utm_zone} +datum=WGS84")
epsg_code = utm_crs.to_epsg()

# Reproject the GeoDataFrame to the chosen EPSG code
gdf = gdf.to_crs(epsg=epsg_code)
gdf

Unnamed: 0,geometry
0,"POLYGON ((402315.263 3960781.699, 401878.759 3..."


In [26]:
import numpy as np

# Create a grid of points 5120m apart
x = np.arange(gdf.total_bounds[0], gdf.total_bounds[2], 2560)
y = np.arange(gdf.total_bounds[1], gdf.total_bounds[3], 2560)
xx, yy = np.meshgrid(x, y)
points = np.vstack([xx.ravel(), yy.ravel()]).T

grid = gpd.GeoDataFrame(geometry=gpd.points_from_xy(points[:, 0], points[:, 1], crs=gdf.crs))
grid

Unnamed: 0,geometry
0,POINT (401878.759 3920624.607)
1,POINT (404438.759 3920624.607)
2,POINT (406998.759 3920624.607)
3,POINT (409558.759 3920624.607)
4,POINT (412118.759 3920624.607)
...,...
331,POINT (442838.759 3959024.607)
332,POINT (445398.759 3959024.607)
333,POINT (447958.759 3959024.607)
334,POINT (450518.759 3959024.607)


#### Download S2 data for the grid defined above

In [27]:
# import stackstac
# import rasterio
# import pystac_client
# import rioxarray

# import warnings
# warnings.filterwarnings("ignore")

# from shapely.geometry import Point

# def download_stack_images(start_date, end_date, grid, output_directory):
#     os.makedirs(output_directory, exist_ok=True)
    
#     # Band groups updated with new bands
#     BAND_GROUPS = {
#         "rgb": ["red", "green", "blue"],
#         "rededge": ["rededge1", "rededge2", "rededge3", "nir08"],
#         "nir": ["nir"],
#         "swir": ["swir16", "swir22"],
#         "sar": ["vv", "vh"],
#     }

#     # STAC API and Collection details
#     STAC_API = "https://earth-search.aws.element84.com/v1"
#     COLLECTION = "sentinel-2-l2a"

#     # Initialize STAC client
#     catalog = pystac_client.Client.open(STAC_API)

#     # Points of Interest as a list of tuples (longitude, latitude)
#     points = grid.to_crs("EPSG:4326").geometry.apply(lambda x: (x.x, x.y)).tolist()

#     # Iterate over each point of interest
#     cnt = 0
#     indices_for_join = []
#     for lon, lat in tqdm(points):
#         bbox = [lon - 1e-5, lat - 1e-5, lon + 1e-5, lat + 1e-5]

#         # Search for items with low cloud cover, without sorting by cloud cover
#         search = catalog.search(
#             collections=[COLLECTION],
#             datetime=f"{start_date}/{end_date}",
#             bbox=bbox,
#             max_items=10,
#             query={"eo:cloud_cover": {"lt": 20}}  # Assuming 'eo:cloud_cover' is valid for filtering
#         )

#         items = list(search.get_items())
#         if not items:
#             print("No items found with low cloud cover.")
#             continue

#         # Optionally, sort items manually by cloud cover if it's available in the properties
#         items = sorted(items, key=lambda x: x.properties.get('eo:cloud_cover', float('inf')))

#         # Select the item with the lowest cloud cover
#         lowest_cloud_item = items[0]

#         # Convert point into the image projection of the selected item
#         epsg = lowest_cloud_item.properties["proj:epsg"]
#         point_gdf = gpd.GeoDataFrame(
#             [{'geometry': Point(lon, lat)}], 
#             crs='EPSG:4326'
#         ).to_crs(epsg=epsg)
#         coords = list(point_gdf.iloc[0].geometry.coords)[0]

#         bounds = (
#             coords[0] - 1280, coords[1] - 1280,
#             coords[0] + 1280, coords[1] + 1280,
#         )

#         # Process and visualize the selected image
#         stack = stackstac.stack(
#             [lowest_cloud_item],
#             bounds=bounds,
#             snap_bounds=False,
#             epsg=epsg,
#             resolution=10,
#             dtype="float32",
#             rescale=False,
#             fill_value=np.nan,
#             assets=BAND_GROUPS["rgb"] + BAND_GROUPS["nir"],  # TODO: Add more/all of the Sentinel 2 bands to see if embeddings improve
#         ).compute()

#         # Assuming 'stack' is your xarray DataArray loaded with stackstac
#         stack = stack.rio.write_crs("EPSG:4326")  # You can change to your specific EPSG code

#         mgrs = str(stack.coords["grid:code"].values).split("-")[1]
#         date = str(stack.time.values)[2:11]

#         output_path = os.path.join(output_directory, f"stack_{lon}_{lat}_{cnt}.tif")
#         indices_for_join.append(cnt)

#         if 'time' in stack.dims:
#             stack = stack.isel(time=0)

#         # Write the stack to a TIFF file
#         with rasterio.open(
#                 output_path, 'w',
#                 driver='GTiff',
#                 height=stack.shape[1],
#                 width=stack.shape[2],
#                 count=len(stack.band),  # Number of bands
#                 dtype=str(stack.dtype),
#                 crs=epsg, # TODO: Confirm this
#                 transform=stack.rio.transform()
#             ) as tif:
#             for i, band in enumerate(stack.band, start=1):
#                 tif.write(stack.sel(band=band).values, i)

#         # Reopen the file to add metadata
#         with rasterio.open(output_path, "r+") as rst:
#             rst.update_tags(date=date)

#         cnt += 1

#     grid["join_index"] = indices_for_join

In [28]:
# download_stack_images(f"{year}-01-01", f"{year}-12-31", grid, "test_data/embeddings/challenge_1")

#### Join LULC data on grid

In [29]:
# Initialize Earth Engine
ee.Initialize()

# Load the ESRI Global LULC dataset
lulc = ee.ImageCollection("projects/sat-io/open-datasets/landcover/ESRI_Global-LULC_10m_TS").mosaic()

def get_most_common_lulc(geometry):
    # Convert the GeoPandas geometry to an Earth Engine geometry
    ee_geometry = ee.Geometry.Rectangle(geometry.bounds)
    
    # Get the LULC values within the bounding box
    lulc_values = lulc.reduceRegion(
        reducer=ee.Reducer.frequencyHistogram(),
        geometry=ee_geometry,
        scale=10,
        maxPixels=1e9
    ).get('b1')
    
    # Find the most common LULC value
    lulc_dict = ee.Dictionary(lulc_values)
    most_common = lulc_dict.keys().sort(lulc_dict.values()).get(-1)
    
    # Return the result
    return int(most_common.getInfo())

def process_geometries(combined_gdf):
    # Get the total number of rows in the GeoDataFrame
    total_rows = len(combined_gdf)

    # Determine the number of threads to use
    max_threads = 10  # Adjust this based on your system and Earth Engine quota
    num_threads = min(total_rows, max_threads)

    results = []

    with ThreadPoolExecutor(max_workers=num_threads) as executor:
        # Submit all tasks
        future_to_index = {executor.submit(get_most_common_lulc, row.geometry): index 
                        for index, row in combined_gdf.iterrows()}
        
        # Process as they complete with a progress bar
        with tqdm(total=total_rows, desc="Processing bounding boxes") as pbar:
            for future in as_completed(future_to_index):
                index = future_to_index[future]
                try:
                    result = future.result()
                except Exception as exc:
                    print(f'Generated an exception: {exc}')
                    result = None
                results.append((index, result))
                pbar.update(1)

    # Sort results by index and extract only the values
    sorted_results = [r[1] for r in sorted(results, key=lambda x: x[0])]
    
    return sorted_results

if __name__ == '__main__':
    # Process the geometries
    results = process_geometries(grid.to_crs("EPSG:4326"))

    # Add the results as a new column to the GeoDataFrame
    grid['most_common_lulc'] = results

    # Save as GeoJSON
    grid[['geometry', 'most_common_lulc']].to_file("test_lulc_v2.geojson", driver="GeoJSON")

Processing bounding boxes: 100%|██████████| 336/336 [00:05<00:00, 66.64it/s]


#### Generate embeddings for the images

In [10]:
import geopandas as gpd

year = 2022

# Load the GeoJSON file
geojson_path = 'train_data/challenge_1_bb_TRAIN.geojson'
gdf = gpd.read_file(geojson_path)
gdf

Unnamed: 0,geometry
0,"POLYGON ((-123.28134 47.75348, -123.28134 46.2..."


In [11]:
import pyproj

def get_utm_zone(longitude):
    return int((longitude + 180) / 6) + 1

# Get the bounds of the geometry
minx, miny, maxx, maxy = gdf.geometry.bounds.iloc[0]

# Calculate UTM zone
utm_zone = get_utm_zone(minx)

# Check for a suitable projection using pyproj
proj = pyproj.Proj(proj='utm', zone=utm_zone, ellps='WGS84')

# Get the corresponding EPSG code for the UTM zone using pyproj
utm_crs = pyproj.CRS(f"+proj=utm +zone={utm_zone} +datum=WGS84")
epsg_code = utm_crs.to_epsg()

# Reproject the GeoDataFrame to the chosen EPSG code
gdf = gdf.to_crs(epsg=epsg_code)
gdf

Unnamed: 0,geometry
0,"POLYGON ((478913.685 5288938.981, 478311.614 5..."


In [12]:
import numpy as np

# Create a grid of points 5120m apart
x = np.arange(gdf.total_bounds[0], gdf.total_bounds[2], 2560)
y = np.arange(gdf.total_bounds[1], gdf.total_bounds[3], 2560)
xx, yy = np.meshgrid(x, y)
points = np.vstack([xx.ravel(), yy.ravel()]).T

grid = gpd.GeoDataFrame(geometry=gpd.points_from_xy(points[:, 0], points[:, 1], crs=gdf.crs))
grid

Unnamed: 0,geometry
0,POINT (478311.614 5121278.860)
1,POINT (480871.614 5121278.860)
2,POINT (483431.614 5121278.860)
3,POINT (485991.614 5121278.860)
4,POINT (488551.614 5121278.860)
...,...
6659,POINT (716391.614 5292798.860)
6660,POINT (718951.614 5292798.860)
6661,POINT (721511.614 5292798.860)
6662,POINT (724071.614 5292798.860)


In [15]:
# Initialize Earth Engine
ee.Initialize()

# Load the ESRI Global LULC dataset
lulc = ee.ImageCollection("projects/sat-io/open-datasets/landcover/ESRI_Global-LULC_10m_TS").mosaic()

def get_most_common_lulc(geometry):
    # Convert the GeoPandas geometry to an Earth Engine geometry
    ee_geometry = ee.Geometry.Rectangle(geometry.bounds)
    
    # Get the LULC values within the bounding box
    lulc_values = lulc.reduceRegion(
        reducer=ee.Reducer.frequencyHistogram(),
        geometry=ee_geometry,
        scale=10,
        maxPixels=1e9
    ).get('b1')
    
    # Find the most common LULC value
    lulc_dict = ee.Dictionary(lulc_values)
    most_common = lulc_dict.keys().sort(lulc_dict.values()).get(-1)
    
    # Return the result
    return int(most_common.getInfo())

def process_geometries(combined_gdf):
    # Get the total number of rows in the GeoDataFrame
    total_rows = len(combined_gdf)

    # Determine the number of threads to use
    max_threads = 10  # Adjust this based on your system and Earth Engine quota
    num_threads = min(total_rows, max_threads)

    results = []

    with ThreadPoolExecutor(max_workers=num_threads) as executor:
        # Submit all tasks
        future_to_index = {executor.submit(get_most_common_lulc, row.geometry): index 
                        for index, row in combined_gdf.iterrows()}
        
        # Process as they complete with a progress bar
        with tqdm(total=total_rows, desc="Processing bounding boxes") as pbar:
            for future in as_completed(future_to_index):
                index = future_to_index[future]
                try:
                    result = future.result()
                except Exception as exc:
                    print(f'Generated an exception: {exc}')
                    result = None
                results.append((index, result))
                pbar.update(1)

    # Sort results by index and extract only the values
    sorted_results = [r[1] for r in sorted(results, key=lambda x: x[0])]
    
    return sorted_results

if __name__ == '__main__':
    # Process the geometries
    results = process_geometries(grid.to_crs("EPSG:4326"))

    # Add the results as a new column to the GeoDataFrame
    grid['most_common_lulc'] = results

    # Save as GeoJSON
    grid[['geometry', 'most_common_lulc']].to_file("TRAIN_lulc.geojson", driver="GeoJSON")

Processing bounding boxes: 100%|██████████| 6664/6664 [00:54<00:00, 123.33it/s]


In [16]:
import pandas as pd
import geopandas as gpd
from shapely.geometry import Point
import pystac_client
import stackstac
import torch
from torchvision import transforms as v2
from box import Box
import yaml
import math
from rasterio.enums import Resampling
from tqdm import tqdm
import rasterio
import warnings
import os
import numpy as np
import rioxarray  # Make sure to import rioxarray to extend xarray

from src.model import ClayMAEModule

warnings.filterwarnings("ignore")

STAC_API = "https://earth-search.aws.element84.com/v1"
COLLECTION = "sentinel-2-l2a"

# Load the model and metadata
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
ckpt = "https://clay-model-ckpt.s3.amazonaws.com/v0.5.7/mae_v0.5.7_epoch-13_val-loss-0.3098.ckpt"
torch.set_default_device(device)

torch.cuda.empty_cache()  # Clear GPU cache

# Assuming grid is a GeoDataFrame with the points
points = grid.to_crs("EPSG:4326").geometry.apply(lambda x: (x.x, x.y)).tolist()

model = ClayMAEModule.load_from_checkpoint(
    ckpt, metadata_path="configs/metadata.yaml", shuffle=False, mask_ratio=0
)
model.eval()
model = model.to(device)

metadata = Box(yaml.safe_load(open("configs/metadata.yaml")))

# Function to normalize timestamp
def normalize_timestamp(date):
    week = date.isocalendar().week * 2 * np.pi / 52
    hour = date.hour * 2 * np.pi / 24
    return (math.sin(week), math.cos(week)), (math.sin(hour), math.cos(hour))

# Function to normalize lat/lon
def normalize_latlon(lat, lon):
    lat = lat * np.pi / 180
    lon = lon * np.pi / 180
    return (math.sin(lat), math.cos(lat)), (math.sin(lon), math.cos(lon))

def to_device(data, device):
    if isinstance(data, torch.Tensor):
        return data.to(device)
    elif isinstance(data, dict):
        return {k: to_device(v, device) for k, v in data.items()}
    elif isinstance(data, list):
        return [to_device(v, device) for v in data]
    return data

def process_point(lon, lat, model, metadata, year, device, j):
    model.to(device)  # Ensure the model is on the correct device
    catalog = pystac_client.Client.open(STAC_API)
    search = catalog.search(
        collections=[COLLECTION],
        datetime=f"{year}-01-01/{year}-12-31",
        bbox=(lon - 1e-5, lat - 1e-5, lon + 1e-5, lat + 1e-5),
        max_items=10,
        query={"eo:cloud_cover": {"lt": 80}},
    )

    all_items = search.get_all_items()
    items = list(all_items)
    if not items:
        return None
    
    items = sorted(items, key=lambda x: x.properties.get('eo:cloud_cover', float('inf')))
    lowest_cloud_item = items[0]

    epsg = lowest_cloud_item.properties["proj:epsg"]

    poidf = gpd.GeoDataFrame(
        pd.DataFrame(),
        crs="EPSG:4326",
        geometry=[Point(lon, lat)],
    ).to_crs(epsg)

    coords = poidf.iloc[0].geometry.coords[0]

    size = 256
    gsd = 10
    bounds = (
        coords[0] - (size * gsd) // 2,
        coords[1] - (size * gsd) // 2,
        coords[0] + (size * gsd) // 2,
        coords[1] + (size * gsd) // 2,
    )

    stack = stackstac.stack(
        lowest_cloud_item,
        bounds=bounds,
        snap_bounds=False,
        epsg=epsg,
        resolution=gsd,
        dtype="float32",
        rescale=False,
        fill_value=0,
        assets=["blue", "green", "red", "nir"],
        resampling=Resampling.nearest,
    )

    stack = stack.compute()

    items = []
    dates = []
    for item in all_items:
        if item.datetime.date() not in dates:
            items.append(item)
            dates.append(item.datetime.date())

    # date = str(stack.time.values)[2:11]

    # output_path = os.path.join("train_data/embeddings/challenge_1/", f"stack_{lon}_{lat}_{j}.tif")
    
    # # Write the stack to a TIFF file
    # with rasterio.open(
    #         output_path, 'w',
    #         driver='GTiff',
    #         height=stack.shape[2],
    #         width=stack.shape[3],
    #         count=len(stack.band),  # Number of bands
    #         dtype=str(stack.dtype),
    #         crs=epsg,
    #         transform=stack.rio.transform()
    #     ) as tif:
    #     for i, band in enumerate(stack.band, start=1):
    #         tif.write(np.squeeze(stack.sel(band=band).values), i)

    # # Reopen the file to add metadata
    # with rasterio.open(output_path, "r+") as rst:
    #     rst.update_tags(date=date)

    platform = "sentinel-2-l2a"
    mean = []
    std = []
    waves = []
    for band in stack.band:
        mean.append(metadata[platform].bands.mean[str(band.values)])
        std.append(metadata[platform].bands.std[str(band.values)])
        waves.append(metadata[platform].bands.wavelength[str(band.values)])

    transform = v2.Compose([v2.Normalize(mean=mean, std=std)])

    datetimes = stack.time.values.astype("datetime64[s]").tolist()
    times = [normalize_timestamp(dat) for dat in datetimes]
    week_norm = [dat[0] for dat in times]
    hour_norm = [dat[1] for dat in times]

    latlons = [normalize_latlon(lat, lon)] * len(times)
    lat_norm = [dat[0] for dat in latlons]
    lon_norm = [dat[1] for dat in latlons]

    pixels = torch.from_numpy(stack.data.astype(np.float32)).to(device)
    pixels = transform(pixels)

    batch_size = 16
    num_batches = math.ceil(len(stack) / batch_size)
    
    embeddings_list = []
    for i in range(num_batches):
        start_idx = i * batch_size
        end_idx = min((i + 1) * batch_size, len(stack))
        
        batch_pixels = pixels[start_idx:end_idx].to(device)
        batch_time = torch.tensor(np.hstack((week_norm, hour_norm))[start_idx:end_idx], dtype=torch.float32).to(device)
        batch_latlon = torch.tensor(np.hstack((lat_norm, lon_norm))[start_idx:end_idx], dtype=torch.float32).to(device)
        
        batch_datacube = {
            "platform": platform,
            "time": batch_time,
            "latlon": batch_latlon,
            "pixels": batch_pixels,
            "gsd": torch.tensor(stack.gsd.values).to(device),
            "waves": torch.tensor(waves).to(device),
        }

        batch_datacube = to_device(batch_datacube, device)

        try:
            model = model.to(device)

            with torch.no_grad():
                unmsk_patch, _, _, _ = model.model.encoder(batch_datacube)
            batch_embeddings = unmsk_patch[:, 0, :].cpu().numpy()
            embeddings_list.append(batch_embeddings)
        except RuntimeError as e:
            if "out of memory" in str(e):
                print(f"GPU OOM for point ({lon}, {lat}), batch {i+1}/{num_batches}. Trying CPU...")
                device = torch.device("cpu")
                batch_datacube = to_device(batch_datacube, device)
                model = model.to(device)
                with torch.no_grad():
                    unmsk_patch, _, _, _ = model.model.encoder(batch_datacube)
                batch_embeddings = unmsk_patch[:, 0, :].numpy()
                embeddings_list.append(batch_embeddings)
                device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
            else:
                raise e

    embeddings = np.concatenate(embeddings_list, axis=0)
    return embeddings

# Specify the year for the datetime range in the search
year = 2022

# Store results in a list
results = []

# Iterate through the points and process each one
for i, point in enumerate(tqdm(points)):
    lon, lat = point
    embeddings = process_point(lon, lat, model, metadata, year, device, i)
    if embeddings is not None:
        results.append((lon, lat, embeddings, grid.loc[i, 'most_common_lulc']))

# Create a DataFrame from the results
df = pd.DataFrame(results, columns=["lon", "lat", "embeddings", "most_common_lulc"])

# Convert to a GeoDataFrame
gdf_results = gpd.GeoDataFrame(df, geometry=gpd.points_from_xy(df.lon, df.lat))

# Output the resulting GeoDataFrame
gdf_results.head()

100%|██████████| 6664/6664 [1:29:34<00:00,  1.24it/s]


Unnamed: 0,lon,lat,embeddings,most_common_lulc,geometry
0,-123.281338,46.24474,"[[0.0954321, -0.0801714, -0.029764883, -0.0657...",2,POINT (-123.28134 46.24474)
1,-123.24813,46.244817,"[[0.07401827, -0.089014255, -0.008051335, -0.0...",2,POINT (-123.24813 46.24482)
2,-123.214923,46.244884,"[[0.053021867, -0.047577165, -0.024503708, -0....",2,POINT (-123.21492 46.24488)
3,-123.181715,46.244942,"[[0.026906254, -0.064052366, -0.030531697, -0....",2,POINT (-123.18171 46.24494)
4,-123.148507,46.24499,"[[0.05992042, -0.044143558, -0.01674647, -0.01...",11,POINT (-123.14851 46.24499)


In [22]:
import pandas as pd
import numpy as np
import optuna
import tensorflow as tf
import joblib

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import f1_score
from keras.models import Sequential
from keras.layers import Dense, Dropout, Input
from keras.optimizers import Adam
from keras.utils import to_categorical

class NeuralNetwork:
    def __init__(self, input_shape, layers, dropout_rate, learning_rate, num_classes, device):
        self.input_shape = input_shape
        self.layers = layers
        self.dropout_rate = dropout_rate
        self.learning_rate = learning_rate
        self.num_classes = num_classes
        self.device = device
        self.model = self.build_model()

    def build_model(self):
        model = Sequential()
        model.add(Input(shape=(self.input_shape,)))
        for layer_size in self.layers:
            model.add(Dense(layer_size, activation='relu'))
            model.add(Dropout(self.dropout_rate))
        model.add(Dense(self.num_classes, activation='softmax'))
        return model

    def compile_model(self):
        optimizer = Adam(learning_rate=self.learning_rate)
        self.model.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=['accuracy'])

    def train_model(self, X_train, y_train, epochs=20, batch_size=32, validation_split=0.2):
        with tf.device(self.device):
            history = self.model.fit(X_train, y_train, epochs=epochs, batch_size=batch_size, validation_split=validation_split)
        return history

    def evaluate_model(self, X_test, y_test):
        with tf.device(self.device):
            loss, accuracy = self.model.evaluate(X_test, y_test)
        return loss, accuracy

    def predict(self, X_test):
        with tf.device(self.device):
            predictions = self.model.predict(X_test)
        return np.argmax(predictions, axis=1)

    def calculate_f1(self, y_test, predictions):
        return f1_score(np.argmax(y_test, axis=1), predictions, average='weighted')
    
    def save_model(self, filename):
        self.model.save(filename)

    @classmethod
    def load_model(cls, filename, input_shape, num_classes, device):
        loaded_model = tf.keras.models.load_model(filename)
        nn = cls(input_shape, [], 0, 0, num_classes, device)  # Dummy values for layers, dropout_rate, and learning_rate
        nn.model = loaded_model
        return nn


def objective(trial):
    layers = []
    for i in range(trial.suggest_int('n_layers', 1, 3)):
        layers.append(trial.suggest_int(f'n_units_l{i}', 64, 512))
    
    dropout_rate = trial.suggest_float('dropout_rate', 0.2, 0.5)
    learning_rate = trial.suggest_float('learning_rate', 1e-5, 1e-2)
    
    nn = NeuralNetwork(input_shape=X_train.shape[1], device=device, layers=layers, dropout_rate=dropout_rate, learning_rate=learning_rate, num_classes=num_classes)
    nn.compile_model()
    
    nn.train_model(X_train, y_train, epochs=20, batch_size=32, validation_split=0.2)
    
    predictions = nn.predict(X_test)
    f1 = nn.calculate_f1(y_test, predictions)
    
    return f1

if os.path.exists(file_path):
    print("Model already exists")
else:
# Assuming combined_gdf is already loaded
    X = gdf_results['embeddings'].tolist()
    X = pd.DataFrame(np.squeeze(X))
    y = gdf_results['most_common_lulc']

    # Encode the target variable
    label_encoder = LabelEncoder()
    y_encoded = label_encoder.fit_transform(y)
    num_classes = len(label_encoder.classes_)

    # Convert to one-hot encoding
    y_onehot = to_categorical(y_encoded)

    # Split data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y_onehot, test_size=0.2, random_state=42)

    # Scale the features
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    device = '/GPU:0' if tf.config.list_physical_devices('GPU') else '/CPU:0'

    # Optimize the hyperparameters
    study = optuna.create_study(direction='maximize')
    study.optimize(objective, n_trials=20)

    # Print the best hyperparameters
    print(study.best_params)

    # Example usage with the best hyperparameters
    best_params = study.best_params
    layers = [best_params[f'n_units_l{i}'] for i in range(best_params['n_layers'])]
    dropout_rate = best_params['dropout_rate']
    learning_rate = best_params['learning_rate']

    nn = NeuralNetwork(input_shape=X_train.shape[1], device=device, layers=layers, dropout_rate=dropout_rate, learning_rate=learning_rate, num_classes=num_classes)
    nn.compile_model()
    history = nn.train_model(X_train, y_train, epochs=20, batch_size=32, validation_split=0.2)
    loss, accuracy = nn.evaluate_model(X_test, y_test)
    print(f'Test Accuracy: {accuracy:.4f}')
    predictions = nn.predict(X_test)
    f1 = nn.calculate_f1(y_test, predictions)
    print(f'Test F1 Score: {f1:.4f}')

    # Save the model
    nn.save_model('models/land_cover_model.h5')

    # Save scaler and label_encoder
    joblib.dump(scaler, 'models/scaler.joblib')
    joblib.dump(label_encoder, 'models/label_encoder.joblib')

    # If you want to get the actual class labels
    predicted_classes = label_encoder.inverse_transform(predictions)

I0000 00:00:1722999059.663877    3069 cuda_executor.cc:1015] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
[I 2024-08-07 02:50:59,682] A new study created in memory with name: no-name-53fd32fe-de38-462b-aaa6-d3745f900a31
I0000 00:00:1722999059.678687    3069 cuda_executor.cc:1015] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
I0000 00:00:1722999059.680160    3069 cuda_executor.cc:1015] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-

Epoch 1/20


I0000 00:00:1722999060.585835    3894 service.cc:146] XLA service 0x7f81d000a840 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1722999060.585884    3894 service.cc:154]   StreamExecutor device (0): NVIDIA A10G, Compute Capability 8.6
2024-08-07 02:51:00.675019: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:268] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
2024-08-07 02:51:00.847610: I external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:531] Loaded cuDNN version 8905


[1m117/134[0m [32m━━━━━━━━━━━━━━━━━[0m[37m━━━[0m [1m0s[0m 868us/step - accuracy: 0.6615 - loss: 3.7767

I0000 00:00:1722999062.203440    3894 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


[1m134/134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - accuracy: 0.6666 - loss: 3.6759  




[1m134/134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 29ms/step - accuracy: 0.6669 - loss: 3.6704 - val_accuracy: 0.6935 - val_loss: 1.9502
Epoch 2/20
[1m134/134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.7244 - loss: 2.0777 - val_accuracy: 0.7301 - val_loss: 2.8380
Epoch 3/20
[1m134/134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.7555 - loss: 2.0405 - val_accuracy: 0.7554 - val_loss: 2.6633
Epoch 4/20
[1m134/134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.7274 - loss: 2.8781 - val_accuracy: 0.7067 - val_loss: 2.4844
Epoch 5/20
[1m134/134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.7324 - loss: 2.9547 - val_accuracy: 0.7348 - val_loss: 2.7956
Epoch 6/20
[1m134/134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.7326 - loss: 2.6526 - val_accuracy: 0.7301 - val_loss: 2.5576
Epoch 7/20
[1m134/134[0m [32m━━━━━━

[I 2024-08-07 02:51:11,034] Trial 0 finished with value: 0.7130169484692862 and parameters: {'n_layers': 1, 'n_units_l0': 477, 'dropout_rate': 0.29073284384135306, 'learning_rate': 0.007900485170528578}. Best is trial 0 with value: 0.7130169484692862.


Epoch 1/20
[1m134/134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step - accuracy: 0.5974 - loss: 1.5838 




[1m134/134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 54ms/step - accuracy: 0.5975 - loss: 1.5821 - val_accuracy: 0.6439 - val_loss: 1.0441
Epoch 2/20
[1m134/134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.6125 - loss: 1.1442 - val_accuracy: 0.5858 - val_loss: 1.2095
Epoch 3/20
[1m134/134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.5631 - loss: 1.3083 - val_accuracy: 0.5351 - val_loss: 1.2581
Epoch 4/20
[1m134/134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.5770 - loss: 1.1962 - val_accuracy: 0.5351 - val_loss: 1.2991
Epoch 5/20
[1m134/134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.5747 - loss: 1.2110 - val_accuracy: 0.5351 - val_loss: 1.2867
Epoch 6/20
[1m134/134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.5579 - loss: 1.2285 - val_accuracy: 0.5351 - val_loss: 1.3024
Epoch 7/20
[1m134/134[0m [32m━━━━━





[1m42/42[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 63ms/step 


[I 2024-08-07 02:51:28,934] Trial 1 finished with value: 0.3928212599307391 and parameters: {'n_layers': 3, 'n_units_l0': 309, 'n_units_l1': 83, 'n_units_l2': 511, 'dropout_rate': 0.3653050205603694, 'learning_rate': 0.007842042668730675}. Best is trial 0 with value: 0.7130169484692862.


Epoch 1/20
[1m134/134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 33ms/step - accuracy: 0.6655 - loss: 1.7158 - val_accuracy: 0.7835 - val_loss: 0.7106
Epoch 2/20
[1m134/134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.7536 - loss: 0.8385 - val_accuracy: 0.7854 - val_loss: 0.7006
Epoch 3/20
[1m134/134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.7619 - loss: 0.8323 - val_accuracy: 0.7619 - val_loss: 0.7352
Epoch 4/20
[1m134/134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.7443 - loss: 0.8285 - val_accuracy: 0.7498 - val_loss: 0.7704
Epoch 5/20
[1m134/134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.7402 - loss: 0.8320 - val_accuracy: 0.7432 - val_loss: 0.7714
Epoch 6/20
[1m134/134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.7399 - loss: 0.8101 - val_accuracy: 0.7301 - val_loss: 0.8616
Epoch 7/20
[1m134/134[0m 

[I 2024-08-07 02:51:41,278] Trial 2 finished with value: 0.5816451673780522 and parameters: {'n_layers': 2, 'n_units_l0': 235, 'n_units_l1': 444, 'dropout_rate': 0.2807943200848955, 'learning_rate': 0.00788201241617743}. Best is trial 0 with value: 0.7130169484692862.


Epoch 1/20
[1m134/134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 56ms/step - accuracy: 0.5755 - loss: 2.0652 - val_accuracy: 0.7338 - val_loss: 0.8257
Epoch 2/20
[1m134/134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.6803 - loss: 1.0904 - val_accuracy: 0.5886 - val_loss: 0.9488
Epoch 3/20
[1m134/134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.6467 - loss: 1.1744 - val_accuracy: 0.6560 - val_loss: 0.8998
Epoch 4/20
[1m134/134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.6265 - loss: 1.1334 - val_accuracy: 0.6326 - val_loss: 1.0785
Epoch 5/20
[1m134/134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.6341 - loss: 1.1603 - val_accuracy: 0.5764 - val_loss: 1.1946
Epoch 6/20
[1m134/134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.6071 - loss: 1.2068 - val_accuracy: 0.5661 - val_loss: 1.2450
Epoch 7/20
[1m134/134[0m






[1m42/42[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 82ms/step


[I 2024-08-07 02:52:00,603] Trial 3 finished with value: 0.3928212599307391 and parameters: {'n_layers': 3, 'n_units_l0': 369, 'n_units_l1': 359, 'n_units_l2': 131, 'dropout_rate': 0.48547006827504047, 'learning_rate': 0.007051529295035562}. Best is trial 0 with value: 0.7130169484692862.


Epoch 1/20
[1m134/134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 24ms/step - accuracy: 0.6906 - loss: 2.3912 - val_accuracy: 0.7132 - val_loss: 1.2524
Epoch 2/20
[1m134/134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.7407 - loss: 1.1035 - val_accuracy: 0.8107 - val_loss: 0.9237
Epoch 3/20
[1m134/134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.7533 - loss: 1.0289 - val_accuracy: 0.7460 - val_loss: 1.0348
Epoch 4/20
[1m134/134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.7480 - loss: 1.3651 - val_accuracy: 0.7226 - val_loss: 1.8208
Epoch 5/20
[1m134/134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.7314 - loss: 1.4317 - val_accuracy: 0.7769 - val_loss: 1.2932
Epoch 6/20
[1m134/134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.7485 - loss: 1.4722 - val_accuracy: 0.7582 - val_loss: 1.9281
Epoch 7/20
[1m134/134[0m 

[I 2024-08-07 02:52:10,692] Trial 4 finished with value: 0.7669589469504712 and parameters: {'n_layers': 1, 'n_units_l0': 384, 'dropout_rate': 0.2316262031414833, 'learning_rate': 0.006428799974731112}. Best is trial 4 with value: 0.7669589469504712.


Epoch 1/20
[1m134/134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 40ms/step - accuracy: 0.6605 - loss: 1.9300 - val_accuracy: 0.8013 - val_loss: 0.7324
Epoch 2/20
[1m134/134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.7525 - loss: 0.8042 - val_accuracy: 0.7479 - val_loss: 0.7247
Epoch 3/20
[1m134/134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.7590 - loss: 0.7341 - val_accuracy: 0.7873 - val_loss: 0.6911
Epoch 4/20
[1m134/134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.7583 - loss: 0.8431 - val_accuracy: 0.7619 - val_loss: 0.8154
Epoch 5/20
[1m134/134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.7411 - loss: 0.8210 - val_accuracy: 0.7732 - val_loss: 0.9244
Epoch 6/20
[1m134/134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.7360 - loss: 0.8589 - val_accuracy: 0.7048 - val_loss: 0.9254
Epoch 7/20
[1m134/134[0m 

[I 2024-08-07 02:52:24,911] Trial 5 finished with value: 0.5942317622249029 and parameters: {'n_layers': 2, 'n_units_l0': 346, 'n_units_l1': 213, 'dropout_rate': 0.23717394051247603, 'learning_rate': 0.008961900686427363}. Best is trial 4 with value: 0.7669589469504712.


Epoch 1/20
[1m134/134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 40ms/step - accuracy: 0.5889 - loss: 1.2650 - val_accuracy: 0.8079 - val_loss: 0.5694
Epoch 2/20
[1m134/134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.7602 - loss: 0.7039 - val_accuracy: 0.8144 - val_loss: 0.5532
Epoch 3/20
[1m134/134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.7989 - loss: 0.6023 - val_accuracy: 0.8154 - val_loss: 0.5279
Epoch 4/20
[1m134/134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.7932 - loss: 0.5908 - val_accuracy: 0.8257 - val_loss: 0.5185
Epoch 5/20
[1m134/134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.8074 - loss: 0.5384 - val_accuracy: 0.8247 - val_loss: 0.5202
Epoch 6/20
[1m134/134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.7980 - loss: 0.5436 - val_accuracy: 0.8238 - val_loss: 0.5172
Epoch 7/20
[1m134/134[0m 

[I 2024-08-07 02:52:39,060] Trial 6 finished with value: 0.7871345736449633 and parameters: {'n_layers': 2, 'n_units_l0': 462, 'n_units_l1': 271, 'dropout_rate': 0.391496239477795, 'learning_rate': 0.00014162268637782355}. Best is trial 6 with value: 0.7871345736449633.


Epoch 1/20
[1m134/134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 35ms/step - accuracy: 0.6783 - loss: 1.3778 - val_accuracy: 0.7948 - val_loss: 0.6426
Epoch 2/20
[1m134/134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.7660 - loss: 0.7152 - val_accuracy: 0.7788 - val_loss: 0.6256
Epoch 3/20
[1m134/134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.7704 - loss: 0.6699 - val_accuracy: 0.7629 - val_loss: 0.6669
Epoch 4/20
[1m134/134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.7695 - loss: 0.7156 - val_accuracy: 0.7957 - val_loss: 0.5950
Epoch 5/20
[1m134/134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.7804 - loss: 0.6622 - val_accuracy: 0.7666 - val_loss: 0.6243
Epoch 6/20
[1m134/134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.7830 - loss: 0.6445 - val_accuracy: 0.7957 - val_loss: 0.5977
Epoch 7/20
[1m134/134[0m 

[I 2024-08-07 02:52:51,985] Trial 7 finished with value: 0.7517940033158698 and parameters: {'n_layers': 2, 'n_units_l0': 282, 'n_units_l1': 286, 'dropout_rate': 0.3354047338926447, 'learning_rate': 0.003984781653061122}. Best is trial 6 with value: 0.7871345736449633.


Epoch 1/20
[1m134/134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 27ms/step - accuracy: 0.6983 - loss: 1.1617 - val_accuracy: 0.7919 - val_loss: 0.6829
Epoch 2/20
[1m134/134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.7879 - loss: 0.6591 - val_accuracy: 0.8051 - val_loss: 0.6338
Epoch 3/20
[1m134/134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.8022 - loss: 0.5792 - val_accuracy: 0.7994 - val_loss: 0.5852
Epoch 4/20
[1m134/134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.8108 - loss: 0.5361 - val_accuracy: 0.8088 - val_loss: 0.6135
Epoch 5/20
[1m134/134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.8066 - loss: 0.5653 - val_accuracy: 0.8154 - val_loss: 0.5751
Epoch 6/20
[1m134/134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.8183 - loss: 0.4936 - val_accuracy: 0.8013 - val_loss: 0.5907
Epoch 7/20
[1m134/134[0m 

[I 2024-08-07 02:53:02,455] Trial 8 finished with value: 0.7791397706257758 and parameters: {'n_layers': 1, 'n_units_l0': 270, 'dropout_rate': 0.3587958401036683, 'learning_rate': 0.0019058143286411963}. Best is trial 6 with value: 0.7871345736449633.


Epoch 1/20
[1m134/134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 41ms/step - accuracy: 0.6290 - loss: 1.3881 - val_accuracy: 0.7694 - val_loss: 0.7675
Epoch 2/20
[1m134/134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.7087 - loss: 1.0266 - val_accuracy: 0.7310 - val_loss: 0.8105
Epoch 3/20
[1m134/134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.7136 - loss: 0.9799 - val_accuracy: 0.7329 - val_loss: 0.7768
Epoch 4/20
[1m134/134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.6990 - loss: 0.9787 - val_accuracy: 0.7685 - val_loss: 0.7923
Epoch 5/20
[1m134/134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.7302 - loss: 0.8998 - val_accuracy: 0.7470 - val_loss: 0.8232
Epoch 6/20
[1m134/134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.7065 - loss: 0.9477 - val_accuracy: 0.6813 - val_loss: 0.9135
Epoch 7/20
[1m134/134[0m 

[I 2024-08-07 02:53:17,274] Trial 9 finished with value: 0.3928212599307391 and parameters: {'n_layers': 3, 'n_units_l0': 115, 'n_units_l1': 135, 'n_units_l2': 259, 'dropout_rate': 0.35783171296437455, 'learning_rate': 0.008067179084199756}. Best is trial 6 with value: 0.7871345736449633.


Epoch 1/20
[1m134/134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step - accuracy: 0.6304 - loss: 1.1930 




[1m134/134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 34ms/step - accuracy: 0.6310 - loss: 1.1908 - val_accuracy: 0.7891 - val_loss: 0.5824
Epoch 2/20
[1m134/134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.7838 - loss: 0.6572 - val_accuracy: 0.8079 - val_loss: 0.5441
Epoch 3/20
[1m134/134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.7825 - loss: 0.6110 - val_accuracy: 0.7985 - val_loss: 0.5469
Epoch 4/20
[1m134/134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.8051 - loss: 0.5624 - val_accuracy: 0.8191 - val_loss: 0.5139
Epoch 5/20
[1m134/134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.8081 - loss: 0.5265 - val_accuracy: 0.8201 - val_loss: 0.5193
Epoch 6/20
[1m134/134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.8242 - loss: 0.4802 - val_accuracy: 0.8201 - val_loss: 0.5181
Epoch 7/20
[1m134/134[0m [32m━━━━━━

[I 2024-08-07 02:53:29,803] Trial 10 finished with value: 0.7941815048681673 and parameters: {'n_layers': 2, 'n_units_l0': 512, 'n_units_l1': 474, 'dropout_rate': 0.4468147287490691, 'learning_rate': 0.00026551980594566923}. Best is trial 10 with value: 0.7941815048681673.


Epoch 1/20
[1m134/134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step - accuracy: 0.5594 - loss: 1.3519 





[1m134/134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 38ms/step - accuracy: 0.5602 - loss: 1.3493 - val_accuracy: 0.7938 - val_loss: 0.5966
Epoch 2/20
[1m134/134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.7661 - loss: 0.6934 - val_accuracy: 0.8116 - val_loss: 0.5496
Epoch 3/20
[1m134/134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.7787 - loss: 0.6402 - val_accuracy: 0.8144 - val_loss: 0.5302
Epoch 4/20
[1m134/134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.7931 - loss: 0.5770 - val_accuracy: 0.8163 - val_loss: 0.5148
Epoch 5/20
[1m134/134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.8035 - loss: 0.5662 - val_accuracy: 0.8191 - val_loss: 0.5125
Epoch 6/20
[1m134/134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.8010 - loss: 0.5511 - val_accuracy: 0.8257 - val_loss: 0.5054
Epoch 7/20
[1m134/134[0m [32m━━━━━━





[1m42/42[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 33ms/step 


[I 2024-08-07 02:53:43,590] Trial 11 finished with value: 0.7896297468667736 and parameters: {'n_layers': 2, 'n_units_l0': 503, 'n_units_l1': 493, 'dropout_rate': 0.44818899473839335, 'learning_rate': 0.00012162431199714695}. Best is trial 10 with value: 0.7941815048681673.


Epoch 1/20
[1m134/134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step - accuracy: 0.6606 - loss: 1.1031 





[1m134/134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 39ms/step - accuracy: 0.6611 - loss: 1.1014 - val_accuracy: 0.8097 - val_loss: 0.5496
Epoch 2/20
[1m134/134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.7756 - loss: 0.6797 - val_accuracy: 0.8163 - val_loss: 0.5523
Epoch 3/20
[1m134/134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.7858 - loss: 0.6129 - val_accuracy: 0.8069 - val_loss: 0.5370
Epoch 4/20
[1m134/134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.8092 - loss: 0.5435 - val_accuracy: 0.8172 - val_loss: 0.5302
Epoch 5/20
[1m134/134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.8044 - loss: 0.5629 - val_accuracy: 0.8201 - val_loss: 0.5240
Epoch 6/20
[1m134/134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.8110 - loss: 0.5200 - val_accuracy: 0.8135 - val_loss: 0.5265
Epoch 7/20
[1m134/134[0m [32m━━━━━━





[1m42/42[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 35ms/step 


[I 2024-08-07 02:53:57,330] Trial 12 finished with value: 0.7961813683060185 and parameters: {'n_layers': 2, 'n_units_l0': 511, 'n_units_l1': 505, 'dropout_rate': 0.4639565318456878, 'learning_rate': 0.0004228409406485573}. Best is trial 12 with value: 0.7961813683060185.


Epoch 1/20
[1m134/134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 37ms/step - accuracy: 0.6553 - loss: 1.4463 - val_accuracy: 0.7985 - val_loss: 0.6439
Epoch 2/20
[1m134/134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.7722 - loss: 0.7132 - val_accuracy: 0.8060 - val_loss: 0.6122
Epoch 3/20
[1m134/134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.7748 - loss: 0.6739 - val_accuracy: 0.7994 - val_loss: 0.6013
Epoch 4/20
[1m134/134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.7816 - loss: 0.6488 - val_accuracy: 0.7938 - val_loss: 0.6319
Epoch 5/20
[1m134/134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.7757 - loss: 0.6568 - val_accuracy: 0.7985 - val_loss: 0.5764
Epoch 6/20
[1m134/134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.7847 - loss: 0.6627 - val_accuracy: 0.8126 - val_loss: 0.6185
Epoch 7/20
[1m134/134[0m 




[1m42/42[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 36ms/step 


[I 2024-08-07 02:54:10,515] Trial 13 finished with value: 0.7378198296700478 and parameters: {'n_layers': 2, 'n_units_l0': 429, 'n_units_l1': 406, 'dropout_rate': 0.4359508188415538, 'learning_rate': 0.002896529424542867}. Best is trial 12 with value: 0.7961813683060185.


Epoch 1/20
[1m134/134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 23ms/step - accuracy: 0.6766 - loss: 1.2171 - val_accuracy: 0.7976 - val_loss: 0.6267
Epoch 2/20
[1m134/134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.7666 - loss: 0.7266 - val_accuracy: 0.8116 - val_loss: 0.5922
Epoch 3/20
[1m134/134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.7977 - loss: 0.5753 - val_accuracy: 0.8088 - val_loss: 0.5541
Epoch 4/20
[1m134/134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.8097 - loss: 0.5545 - val_accuracy: 0.8116 - val_loss: 0.5192
Epoch 5/20
[1m134/134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.8067 - loss: 0.5408 - val_accuracy: 0.8032 - val_loss: 0.5715
Epoch 6/20
[1m134/134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.8110 - loss: 0.5257 - val_accuracy: 0.8116 - val_loss: 0.5409
Epoch 7/20
[1m134/134[0m 

[I 2024-08-07 02:54:20,214] Trial 14 finished with value: 0.7956795160966945 and parameters: {'n_layers': 1, 'n_units_l0': 191, 'dropout_rate': 0.4979503490692606, 'learning_rate': 0.0015983288561742088}. Best is trial 12 with value: 0.7961813683060185.


Epoch 1/20
[1m134/134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 22ms/step - accuracy: 0.6656 - loss: 1.3724 - val_accuracy: 0.8097 - val_loss: 0.6466
Epoch 2/20
[1m134/134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.7703 - loss: 0.7226 - val_accuracy: 0.8107 - val_loss: 0.5633
Epoch 3/20
[1m134/134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.7936 - loss: 0.5896 - val_accuracy: 0.7948 - val_loss: 0.5560
Epoch 4/20
[1m134/134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.7948 - loss: 0.5578 - val_accuracy: 0.8079 - val_loss: 0.5715
Epoch 5/20
[1m134/134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.8180 - loss: 0.5359 - val_accuracy: 0.7882 - val_loss: 0.6061
Epoch 6/20
[1m134/134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.8194 - loss: 0.5087 - val_accuracy: 0.8247 - val_loss: 0.5436
Epoch 7/20
[1m134/134[0m 

[I 2024-08-07 02:54:29,580] Trial 15 finished with value: 0.7848788969824153 and parameters: {'n_layers': 1, 'n_units_l0': 178, 'dropout_rate': 0.49357739532310296, 'learning_rate': 0.001976978759382274}. Best is trial 12 with value: 0.7961813683060185.


Epoch 1/20
[1m134/134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 19ms/step - accuracy: 0.6666 - loss: 1.4074 - val_accuracy: 0.7619 - val_loss: 0.7255
Epoch 2/20
[1m134/134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.7680 - loss: 0.6751 - val_accuracy: 0.8107 - val_loss: 0.5810
Epoch 3/20
[1m134/134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.7772 - loss: 0.6381 - val_accuracy: 0.8069 - val_loss: 0.5243
Epoch 4/20
[1m134/134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.7770 - loss: 0.6343 - val_accuracy: 0.7985 - val_loss: 0.6200
Epoch 5/20
[1m134/134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.7917 - loss: 0.6017 - val_accuracy: 0.8088 - val_loss: 0.5983
Epoch 6/20
[1m134/134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.8057 - loss: 0.5642 - val_accuracy: 0.8088 - val_loss: 0.5704
Epoch 7/20
[1m134/134[0m 

[I 2024-08-07 02:54:38,319] Trial 16 finished with value: 0.7633267435076767 and parameters: {'n_layers': 1, 'n_units_l0': 71, 'dropout_rate': 0.4199948135611185, 'learning_rate': 0.0050907310362635055}. Best is trial 12 with value: 0.7961813683060185.


Epoch 1/20
[1m134/134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 22ms/step - accuracy: 0.6832 - loss: 1.2286 - val_accuracy: 0.8060 - val_loss: 0.6111
Epoch 2/20
[1m134/134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.7744 - loss: 0.6778 - val_accuracy: 0.7882 - val_loss: 0.6263
Epoch 3/20
[1m134/134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.7958 - loss: 0.5805 - val_accuracy: 0.8154 - val_loss: 0.5707
Epoch 4/20
[1m134/134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.8172 - loss: 0.5251 - val_accuracy: 0.7994 - val_loss: 0.5837
Epoch 5/20
[1m134/134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.7983 - loss: 0.5703 - val_accuracy: 0.8191 - val_loss: 0.5542
Epoch 6/20
[1m134/134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.8271 - loss: 0.4848 - val_accuracy: 0.8266 - val_loss: 0.5371
Epoch 7/20
[1m134/134[0m 

[I 2024-08-07 02:54:47,672] Trial 17 finished with value: 0.7853987521094159 and parameters: {'n_layers': 1, 'n_units_l0': 182, 'dropout_rate': 0.4738943213518657, 'learning_rate': 0.0016217646407937147}. Best is trial 12 with value: 0.7961813683060185.


Epoch 1/20
[1m134/134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 47ms/step - accuracy: 0.6512 - loss: 1.2626 - val_accuracy: 0.7994 - val_loss: 0.6519
Epoch 2/20
[1m134/134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.7533 - loss: 0.7977 - val_accuracy: 0.7985 - val_loss: 0.6209
Epoch 3/20
[1m134/134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.7651 - loss: 0.7594 - val_accuracy: 0.7685 - val_loss: 0.7074
Epoch 4/20
[1m134/134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.7517 - loss: 0.7783 - val_accuracy: 0.8079 - val_loss: 0.6881
Epoch 5/20
[1m134/134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.7510 - loss: 0.7864 - val_accuracy: 0.7826 - val_loss: 0.7533
Epoch 6/20
[1m134/134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.7706 - loss: 0.7835 - val_accuracy: 0.7498 - val_loss: 0.7649
Epoch 7/20
[1m134/134[0m 






[1m42/42[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 48ms/step


[I 2024-08-07 02:55:04,049] Trial 18 finished with value: 0.7267600849095486 and parameters: {'n_layers': 3, 'n_units_l0': 204, 'n_units_l1': 383, 'n_units_l2': 511, 'dropout_rate': 0.40580632576895614, 'learning_rate': 0.0034580856439597497}. Best is trial 12 with value: 0.7961813683060185.


Epoch 1/20
[1m134/134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 16ms/step - accuracy: 0.6631 - loss: 1.7285 - val_accuracy: 0.8135 - val_loss: 0.7503
Epoch 2/20
[1m134/134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.7637 - loss: 0.7854 - val_accuracy: 0.8041 - val_loss: 0.6198
Epoch 3/20
[1m134/134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.7686 - loss: 0.6798 - val_accuracy: 0.7938 - val_loss: 0.6679
Epoch 4/20
[1m134/134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.7828 - loss: 0.6681 - val_accuracy: 0.8088 - val_loss: 0.6604
Epoch 5/20
[1m134/134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.7790 - loss: 0.6549 - val_accuracy: 0.7966 - val_loss: 0.6028
Epoch 6/20
[1m134/134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.7794 - loss: 0.6437 - val_accuracy: 0.8154 - val_loss: 0.5883
Epoch 7/20
[1m134/134[0m 

[I 2024-08-07 02:55:11,931] Trial 19 finished with value: 0.7743571467374116 and parameters: {'n_layers': 1, 'n_units_l0': 131, 'dropout_rate': 0.4608146201791311, 'learning_rate': 0.004806784092794364}. Best is trial 12 with value: 0.7961813683060185.


{'n_layers': 2, 'n_units_l0': 511, 'n_units_l1': 505, 'dropout_rate': 0.4639565318456878, 'learning_rate': 0.0004228409406485573}
Epoch 1/20
[1m134/134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 22ms/step - accuracy: 0.6513 - loss: 1.1039 - val_accuracy: 0.8004 - val_loss: 0.5610
Epoch 2/20
[1m134/134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.7728 - loss: 0.6734 - val_accuracy: 0.8144 - val_loss: 0.5343
Epoch 3/20
[1m134/134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.7959 - loss: 0.5881 - val_accuracy: 0.7938 - val_loss: 0.5542
Epoch 4/20
[1m134/134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.8081 - loss: 0.5458 - val_accuracy: 0.8163 - val_loss: 0.5225
Epoch 5/20
[1m134/134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.8145 - loss: 0.5169 - val_accuracy: 0.8116 - val_loss: 0.5206
Epoch 6/20
[1m134/134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[3



Test F1 Score: 0.7863


In [30]:
import pandas as pd
import geopandas as gpd
from shapely.geometry import Point
import pystac_client
import stackstac
import torch
from torchvision import transforms as v2
from box import Box
import yaml
import math
from rasterio.enums import Resampling
from tqdm import tqdm
import rasterio
import warnings
import os
import numpy as np
import rioxarray  # Make sure to import rioxarray to extend xarray

from src.model import ClayMAEModule

warnings.filterwarnings("ignore")

STAC_API = "https://earth-search.aws.element84.com/v1"
COLLECTION = "sentinel-2-l2a"

# Load the model and metadata
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
ckpt = "https://clay-model-ckpt.s3.amazonaws.com/v0.5.7/mae_v0.5.7_epoch-13_val-loss-0.3098.ckpt"
torch.set_default_device(device)

torch.cuda.empty_cache()  # Clear GPU cache

# Assuming grid is a GeoDataFrame with the points
points = grid.to_crs("EPSG:4326").geometry.apply(lambda x: (x.x, x.y)).tolist()

model = ClayMAEModule.load_from_checkpoint(
    ckpt, metadata_path="configs/metadata.yaml", shuffle=False, mask_ratio=0
)
model.eval()
model = model.to(device)

metadata = Box(yaml.safe_load(open("configs/metadata.yaml")))

# Function to normalize timestamp
def normalize_timestamp(date):
    week = date.isocalendar().week * 2 * np.pi / 52
    hour = date.hour * 2 * np.pi / 24
    return (math.sin(week), math.cos(week)), (math.sin(hour), math.cos(hour))

# Function to normalize lat/lon
def normalize_latlon(lat, lon):
    lat = lat * np.pi / 180
    lon = lon * np.pi / 180
    return (math.sin(lat), math.cos(lat)), (math.sin(lon), math.cos(lon))

def to_device(data, device):
    if isinstance(data, torch.Tensor):
        return data.to(device)
    elif isinstance(data, dict):
        return {k: to_device(v, device) for k, v in data.items()}
    elif isinstance(data, list):
        return [to_device(v, device) for v in data]
    return data

def process_point(lon, lat, model, metadata, year, device, j):
    model.to(device)  # Ensure the model is on the correct device
    catalog = pystac_client.Client.open(STAC_API)
    search = catalog.search(
        collections=[COLLECTION],
        datetime=f"{year}-01-01/{year}-12-31",
        bbox=(lon - 1e-5, lat - 1e-5, lon + 1e-5, lat + 1e-5),
        max_items=10,
        query={"eo:cloud_cover": {"lt": 80}},
    )

    all_items = search.get_all_items()
    items = list(all_items)
    if not items:
        return None
    
    items = sorted(items, key=lambda x: x.properties.get('eo:cloud_cover', float('inf')))
    lowest_cloud_item = items[0]

    epsg = lowest_cloud_item.properties["proj:epsg"]

    poidf = gpd.GeoDataFrame(
        pd.DataFrame(),
        crs="EPSG:4326",
        geometry=[Point(lon, lat)],
    ).to_crs(epsg)

    coords = poidf.iloc[0].geometry.coords[0]

    size = 256
    gsd = 10
    bounds = (
        coords[0] - (size * gsd) // 2,
        coords[1] - (size * gsd) // 2,
        coords[0] + (size * gsd) // 2,
        coords[1] + (size * gsd) // 2,
    )

    stack = stackstac.stack(
        lowest_cloud_item,
        bounds=bounds,
        snap_bounds=False,
        epsg=epsg,
        resolution=gsd,
        dtype="float32",
        rescale=False,
        fill_value=0,
        assets=["blue", "green", "red", "nir"],
        resampling=Resampling.nearest,
    )

    stack = stack.compute()

    items = []
    dates = []
    for item in all_items:
        if item.datetime.date() not in dates:
            items.append(item)
            dates.append(item.datetime.date())

    date = str(stack.time.values)[2:11]

    output_path = os.path.join("test_data/embeddings/challenge_1/", f"stack_{lon}_{lat}_{j}.tif")
    
    # Write the stack to a TIFF file
    with rasterio.open(
            output_path, 'w',
            driver='GTiff',
            height=stack.shape[2],
            width=stack.shape[3],
            count=len(stack.band),  # Number of bands
            dtype=str(stack.dtype),
            crs=epsg,
            transform=stack.rio.transform()
        ) as tif:
        for i, band in enumerate(stack.band, start=1):
            tif.write(np.squeeze(stack.sel(band=band).values), i)

    # Reopen the file to add metadata
    with rasterio.open(output_path, "r+") as rst:
        rst.update_tags(date=date)

    platform = "sentinel-2-l2a"
    mean = []
    std = []
    waves = []
    for band in stack.band:
        mean.append(metadata[platform].bands.mean[str(band.values)])
        std.append(metadata[platform].bands.std[str(band.values)])
        waves.append(metadata[platform].bands.wavelength[str(band.values)])

    transform = v2.Compose([v2.Normalize(mean=mean, std=std)])

    datetimes = stack.time.values.astype("datetime64[s]").tolist()
    times = [normalize_timestamp(dat) for dat in datetimes]
    week_norm = [dat[0] for dat in times]
    hour_norm = [dat[1] for dat in times]

    latlons = [normalize_latlon(lat, lon)] * len(times)
    lat_norm = [dat[0] for dat in latlons]
    lon_norm = [dat[1] for dat in latlons]

    pixels = torch.from_numpy(stack.data.astype(np.float32)).to(device)
    pixels = transform(pixels)

    batch_size = 16
    num_batches = math.ceil(len(stack) / batch_size)
    
    embeddings_list = []
    for i in range(num_batches):
        start_idx = i * batch_size
        end_idx = min((i + 1) * batch_size, len(stack))
        
        batch_pixels = pixels[start_idx:end_idx].to(device)
        batch_time = torch.tensor(np.hstack((week_norm, hour_norm))[start_idx:end_idx], dtype=torch.float32).to(device)
        batch_latlon = torch.tensor(np.hstack((lat_norm, lon_norm))[start_idx:end_idx], dtype=torch.float32).to(device)
        
        batch_datacube = {
            "platform": platform,
            "time": batch_time,
            "latlon": batch_latlon,
            "pixels": batch_pixels,
            "gsd": torch.tensor(stack.gsd.values).to(device),
            "waves": torch.tensor(waves).to(device),
        }

        batch_datacube = to_device(batch_datacube, device)

        try:
            model = model.to(device)

            with torch.no_grad():
                unmsk_patch, _, _, _ = model.model.encoder(batch_datacube)
            batch_embeddings = unmsk_patch[:, 0, :].cpu().numpy()
            embeddings_list.append(batch_embeddings)
        except RuntimeError as e:
            if "out of memory" in str(e):
                print(f"GPU OOM for point ({lon}, {lat}), batch {i+1}/{num_batches}. Trying CPU...")
                device = torch.device("cpu")
                batch_datacube = to_device(batch_datacube, device)
                model = model.to(device)
                with torch.no_grad():
                    unmsk_patch, _, _, _ = model.model.encoder(batch_datacube)
                batch_embeddings = unmsk_patch[:, 0, :].numpy()
                embeddings_list.append(batch_embeddings)
                device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
            else:
                raise e

    embeddings = np.concatenate(embeddings_list, axis=0)
    return embeddings

# Specify the year for the datetime range in the search
year = 2022

# Store results in a list
results = []

# Iterate through the points and process each one
for i, point in enumerate(tqdm(points)):
    lon, lat = point
    embeddings = process_point(lon, lat, model, metadata, year, device, i)
    if embeddings is not None:
        results.append((lon, lat, embeddings, grid.loc[i, 'most_common_lulc']))

# Create a DataFrame from the results
df = pd.DataFrame(results, columns=["lon", "lat", "embeddings", "most_common_lulc"])

# Convert to a GeoDataFrame
gdf_results = gpd.GeoDataFrame(df, geometry=gpd.points_from_xy(df.lon, df.lat))

# Output the resulting GeoDataFrame
gdf_results.head()


100%|██████████| 336/336 [04:35<00:00,  1.22it/s]


Unnamed: 0,lon,lat,embeddings,most_common_lulc,geometry
0,-106.080869,35.424211,"[[0.040777754, -0.022916876, 0.07173511, 0.078...",11,POINT (-106.08087 35.42421)
1,-106.052673,35.42446,"[[0.03793185, 0.009665264, 0.10715726, 0.04229...",11,POINT (-106.05267 35.42446)
2,-106.024477,35.424703,"[[0.025163846, 0.013742316, 0.124567054, 0.047...",11,POINT (-106.02448 35.42470)
3,-105.99628,35.424939,"[[0.0464423, 0.015486966, 0.08057992, 0.064075...",11,POINT (-105.99628 35.42494)
4,-105.968084,35.425168,"[[0.0056447657, -0.0030673319, 0.074692115, 0....",11,POINT (-105.96808 35.42517)


In [31]:
# Detect if GPU is available
device = '/GPU:0' if tf.config.list_physical_devices('GPU') else '/CPU:0'

# Load the model
loaded_nn = NeuralNetwork.load_model('models/land_cover_model.h5', input_shape=768, num_classes=11, device=device)

# Prepare your new data (assuming it's in the same format as your training data)
new_data = np.squeeze(gdf_results['embeddings'].tolist())
new_data = pd.DataFrame(new_data)  # Ensure the new data is in DataFrame format

# Standardize the new data using the saved scaler
scaler = joblib.load('models/scaler.joblib')
new_data_scaled = scaler.transform(new_data)

# Make predictions
new_predictions = loaded_nn.predict(new_data_scaled)

# Load the label encoder
label_encoder = joblib.load('models/label_encoder.joblib')

# If you want to convert the predictions back to the original class labels
gdf_results['pred_lulc'] = label_encoder.inverse_transform(new_predictions)



[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 141ms/step


In [32]:
print("Test set accuracy:", np.mean(gdf_results['pred_lulc']==gdf_results['most_common_lulc']))

Test set accuracy: 0.7232142857142857


In [24]:
gdf_results['pred_lulc']

0      1.0
1      1.0
2      1.0
3      1.0
4      1.0
      ... 
331    1.0
332    1.0
333    1.0
334    1.0
335    1.0
Name: pred_lulc, Length: 336, dtype: float64

In [None]:
# Ignore everything after here

#### Make predictions for the given grid

In [None]:
import geopandas as gpd
import s3fs
import pandas as pd
import os

from tqdm import tqdm



# S3 bucket and prefix
s3_bucket = 'clay-worker-bucket-dev-small-tasks'
s3_prefix = '_data/gpq/86/' #2022 S2 embeddings over test set

# Initialize s3fs filesystem
fs = s3fs.S3FileSystem()

# List all the GeoParquet files in the specified S3 directory
geo_parquet_files = fs.glob(f's3://{s3_bucket}/{s3_prefix}*.gpq')

# List to store GeoDataFrames
gdfs = []

# Load each GeoParquet file into a GeoDataFrame and append to the list
for file in tqdm(geo_parquet_files):
    gdf = gpd.read_parquet("s3://"+file, storage_options={"anon": False, "client_kwargs": {"region_name": "us-west-2"}})
    gdfs.append(gdf)

# Concatenate all GeoDataFrames into a single GeoDataFrame
gdf_test_embed = gpd.GeoDataFrame(pd.concat(gdfs, ignore_index=True))

# Display the concatenated GeoDataFrame
gdf_test_embed


100%|██████████| 1/1 [00:01<00:00,  1.08s/it]


Unnamed: 0,embeddings,geometry
0,"[-0.10595816, 0.049139716, -0.013616198, 0.103...","POLYGON ((-106.06225 35.59601, -106.06225 35.6..."
1,"[-0.11944625, 0.04765699, 0.0012937918, 0.0989...","POLYGON ((-106.04359 35.59601, -106.04359 35.6..."
2,"[-0.1352622, 0.03729467, 0.025111463, 0.089445...","POLYGON ((-106.06225 35.61468, -106.06225 35.6..."
3,"[-0.11955425, 0.047935113, 0.012492391, 0.0986...","POLYGON ((-106.04359 35.63335, -106.04359 35.6..."
4,"[-0.110916734, 0.043892846, 0.00046651432, 0.1...","POLYGON ((-106.04359 35.61468, -106.04359 35.6..."
...,...,...
635,"[-0.10154818, 0.04700841, -0.058602504, 0.1337...","POLYGON ((-105.48359 35.76401, -105.48359 35.7..."
636,"[-0.10063921, 0.0487299, -0.056069173, 0.12934...","POLYGON ((-105.48359 35.46535, -105.48359 35.4..."
637,"[-0.119740516, 0.050605718, -0.021452673, 0.12...","POLYGON ((-105.48359 35.78268, -105.48359 35.8..."
638,"[-0.10471448, 0.04887444, -0.05049323, 0.11969...","POLYGON ((-105.48359 35.44668, -105.48359 35.4..."


In [None]:
# Initialize Earth Engine
ee.Initialize()

# Load the ESRI Global LULC dataset
lulc = ee.ImageCollection("projects/sat-io/open-datasets/landcover/ESRI_Global-LULC_10m_TS").mosaic()

def get_most_common_lulc(geometry):
    # Convert the GeoPandas geometry to an Earth Engine geometry
    ee_geometry = ee.Geometry.Rectangle(geometry.bounds)
    
    # Get the LULC values within the bounding box
    lulc_values = lulc.reduceRegion(
        reducer=ee.Reducer.frequencyHistogram(),
        geometry=ee_geometry,
        scale=10,
        maxPixels=1e9
    ).get('b1')
    
    # Find the most common LULC value
    lulc_dict = ee.Dictionary(lulc_values)
    most_common = lulc_dict.keys().sort(lulc_dict.values()).get(-1)
    
    # Return the result
    return int(most_common.getInfo())

def process_geometries(combined_gdf):
    # Get the total number of rows in the GeoDataFrame
    total_rows = len(combined_gdf)

    # Determine the number of threads to use
    max_threads = 10  # Adjust this based on your system and Earth Engine quota
    num_threads = min(total_rows, max_threads)

    results = []

    with ThreadPoolExecutor(max_workers=num_threads) as executor:
        # Submit all tasks
        future_to_index = {executor.submit(get_most_common_lulc, row.geometry): index 
                        for index, row in combined_gdf.iterrows()}
        
        # Process as they complete with a progress bar
        with tqdm(total=total_rows, desc="Processing bounding boxes") as pbar:
            for future in as_completed(future_to_index):
                index = future_to_index[future]
                try:
                    result = future.result()
                except Exception as exc:
                    print(f'Generated an exception: {exc}')
                    result = None
                results.append((index, result))
                pbar.update(1)

    # Sort results by index and extract only the values
    sorted_results = [r[1] for r in sorted(results, key=lambda x: x[0])]
    
    return sorted_results

if __name__ == '__main__':
    # Process the geometries
    results = process_geometries(gdf_test_embed.to_crs("EPSG:4326"))

    # Add the results as a new column to the GeoDataFrame
    gdf_test_embed['most_common_lulc'] = results

    # Save as GeoJSON
    gdf_test_embed[['geometry', 'most_common_lulc']].to_file("test_lulc_v2.geojson", driver="GeoJSON")

Processing bounding boxes: 100%|██████████| 640/640 [00:16<00:00, 38.75it/s]


In [None]:
# Detect if GPU is available
device = '/GPU:0' if tf.config.list_physical_devices('GPU') else '/CPU:0'

# Load the model
loaded_nn = NeuralNetwork.load_model('models/land_cover_model.h5', input_shape=768, num_classes=11, device=device)

# Prepare your new data (assuming it's in the same format as your training data)
new_data = gdf_test_embed['embeddings'].tolist()
new_data = pd.DataFrame(new_data)  # Ensure the new data is in DataFrame format

# Standardize the new data using the saved scaler
scaler = joblib.load('models/scaler.joblib')
new_data_scaled = scaler.transform(new_data)

# Make predictions
new_predictions = loaded_nn.predict(new_data_scaled)

# Load the label encoder
label_encoder = joblib.load('models/label_encoder.joblib')

# If you want to convert the predictions back to the original class labels
gdf_test_embed['pred_lulc'] = label_encoder.inverse_transform(new_predictions)



[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 758us/step


In [None]:
print("Test set accuracy:", np.mean(gdf_test_embed['pred_lulc']==gdf_test_embed['most_common_lulc']))

Test set accuracy: 0.6546875


## Task 3: Above ground stock regression

### Retrieve S2 embeddings from an S3 bucket over California

In [None]:
import geopandas as gpd
import s3fs
import pandas as pd
import os

from tqdm import tqdm


file_path = 'models/agb_regression_model.h5'

if os.path.exists(file_path):
    print("Model already exists")
else:
    # S3 bucket and prefix
    s3_bucket = 'clay-worker-bucket-dev-small-tasks'
    s3_prefix = '_data/gpq/51/' #2022 S2 Cali embeddings

    # Initialize s3fs filesystem
    fs = s3fs.S3FileSystem()

    # List all the GeoParquet files in the specified S3 directory
    geo_parquet_files = fs.glob(f's3://{s3_bucket}/{s3_prefix}*.gpq')

    # List to store GeoDataFrames
    gdfs = []

    # Load each GeoParquet file into a GeoDataFrame and append to the list
    for file in tqdm(geo_parquet_files):
        gdf = gpd.read_parquet("s3://"+file, storage_options={"anon": False, "client_kwargs": {"region_name": "us-west-2"}})
        gdfs.append(gdf)

    # Concatenate all GeoDataFrames into a single GeoDataFrame
    combined_gdf = gpd.GeoDataFrame(pd.concat(gdfs, ignore_index=True))

    # Display the concatenated GeoDataFrame
    combined_gdf


Model already exists


### Join above ground biomass for each chip

In [None]:
import ee
import geopandas as gpd
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor, as_completed
import os

if os.path.exists(file_path):
    print("Model already exists")
    combined_gdf = gpd.read_file("agb.geojson")
else:
    # Initialize Earth Engine
    ee.Initialize()

    # Load the NASA/ORNL biomass carbon density dataset
    biomass = ee.ImageCollection("NASA/ORNL/biomass_carbon_density/v1").mosaic()

    def get_agb(geometry):
        # Convert the GeoPandas geometry to an Earth Engine geometry
        ee_geometry = ee.Geometry.Polygon(list(geometry.exterior.coords))

        # Get the mean AGB value within the geometry
        agb_value = biomass.reduceRegion(
            reducer=ee.Reducer.mean(),
            geometry=ee_geometry,
            scale=300,
            maxPixels=1e9
        ).get('agb')
        
        # Return the result
        return agb_value.getInfo() if agb_value is not None else None

    def process_geometries(combined_gdf):
        # Get the total number of rows in the GeoDataFrame
        total_rows = len(combined_gdf)

        # Determine the number of threads to use
        max_threads = 10  # Adjust this based on your system and Earth Engine quota
        num_threads = min(total_rows, max_threads)

        results = []

        with ThreadPoolExecutor(max_workers=num_threads) as executor:
            # Submit all tasks
            future_to_index = {executor.submit(get_agb, row.geometry): index 
                            for index, row in combined_gdf.iterrows()}
            
            # Process as they complete with a progress bar
            with tqdm(total=total_rows, desc="Processing geometries") as pbar:
                for future in as_completed(future_to_index):
                    index = future_to_index[future]
                    try:
                        result = future.result()
                    except Exception as exc:
                        print(f'Generated an exception: {exc}')
                        result = None
                    results.append((index, result))
                    pbar.update(1)

        # Sort results by index and extract only the values
        sorted_results = [r[1] for r in sorted(results, key=lambda x: x[0])]
        
        return sorted_results

    if __name__ == '__main__':
        # Process the geometries
        results = process_geometries(combined_gdf)

        # Add the results as a new column to the GeoDataFrame
        combined_gdf['mean_agb'] = results

        # Save as GeoJSON
        combined_gdf[['geometry', 'mean_agb']].to_file("agb.geojson", driver="GeoJSON")


Model already exists


### Tune hyperparameters for above ground biomass regressor using Optuna

In [None]:
import pandas as pd
import numpy as np
import optuna
import tensorflow as tf
import joblib
import os

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
from keras.models import Sequential
from keras.layers import Dense, Dropout, Input
from keras.optimizers import Adam

class NeuralNetwork:
    def __init__(self, input_shape, layers, dropout_rate, learning_rate, device):
        self.input_shape = input_shape
        self.layers = layers
        self.dropout_rate = dropout_rate
        self.learning_rate = learning_rate
        self.device = device
        self.model = self.build_model()

    def build_model(self):
        model = Sequential()
        model.add(Input(shape=(self.input_shape,)))
        for layer_size in self.layers:
            model.add(Dense(layer_size, activation='relu'))
            model.add(Dropout(self.dropout_rate))
        model.add(Dense(1, activation='linear'))  # Output layer for regression
        return model

    def compile_model(self):
        optimizer = Adam(learning_rate=self.learning_rate)
        self.model.compile(optimizer=optimizer, loss='mean_squared_error', metrics=['mae'])  # Use MSE for regression

    def train_model(self, X_train, y_train, epochs=20, batch_size=32, validation_split=0.2):
        with tf.device(self.device):
            history = self.model.fit(X_train, y_train, epochs=epochs, batch_size=batch_size, validation_split=validation_split)
        return history

    def evaluate_model(self, X_test, y_test):
        with tf.device(self.device):
            loss, mae = self.model.evaluate(X_test, y_test)
        return loss, mae

    def predict(self, X_test):
        with tf.device(self.device):
            predictions = self.model.predict(X_test)
        return predictions.flatten()

    def calculate_rmse(self, y_test, predictions):
        return np.sqrt(mean_squared_error(y_test, predictions))
    
    def save_model(self, filename):
        self.model.save(filename)

    @classmethod
    def load_model(cls, filename, input_shape, device):
        loaded_model = tf.keras.models.load_model(filename)
        nn = cls(input_shape, [], 0, 0, device)  # Dummy values for layers, dropout_rate, and learning_rate
        nn.model = loaded_model
        return nn

def objective(trial):
    layers = []
    for i in range(trial.suggest_int('n_layers', 1, 3)):
        layers.append(trial.suggest_int(f'n_units_l{i}', 64, 512))
    
    dropout_rate = trial.suggest_float('dropout_rate', 0.2, 0.5)
    learning_rate = trial.suggest_float('learning_rate', 1e-5, 1e-2)
    
    nn = NeuralNetwork(input_shape=X_train.shape[1], device=device, layers=layers, dropout_rate=dropout_rate, learning_rate=learning_rate)
    nn.compile_model()
    
    nn.train_model(X_train, y_train, epochs=20, batch_size=32, validation_split=0.2)
    
    predictions = nn.predict(X_test)
    predictions_exp = np.exp(predictions)  # Inverse log transform predictions
    rmse = nn.calculate_rmse(y_test_exp, predictions_exp)
    
    return rmse

if os.path.exists('models/agb_regression_model.h5'):
    print("Model already exists")
else:
    # Assuming combined_gdf is already loaded
    combined_gdf_filtered = combined_gdf.dropna()
    X = combined_gdf_filtered['embeddings'].tolist()
    X = pd.DataFrame(X)
    y = combined_gdf_filtered['mean_agb']

    # Log transform the target variable
    y_log = np.log(y + 1)  # Adding 1 to avoid log(0)

    # Split data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y_log, test_size=0.2, random_state=42)

    # Scale the features
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    # Inverse transform for evaluation
    y_test_exp = np.exp(y_test) - 1  # Subtract 1 to get the original scale

    device = '/GPU:0' if tf.config.list_physical_devices('GPU') else '/CPU:0'

    # Optimize the hyperparameters
    study = optuna.create_study(direction='minimize')
    study.optimize(objective, n_trials=5)

    # Print the best hyperparameters
    print(study.best_params)

    # Example usage with the best hyperparameters
    best_params = study.best_params
    layers = [best_params[f'n_units_l{i}'] for i in range(best_params['n_layers'])]
    dropout_rate = best_params['dropout_rate']
    learning_rate = best_params['learning_rate']

    nn = NeuralNetwork(input_shape=X_train.shape[1], device=device, layers=layers, dropout_rate=dropout_rate, learning_rate=learning_rate)
    nn.compile_model()
    history = nn.train_model(X_train, y_train, epochs=20, batch_size=32, validation_split=0.2)
    loss, mae = nn.evaluate_model(X_test, y_test)
    print(f'Test MAE: {mae:.4f}')
    predictions = nn.predict(X_test)
    predictions_exp = np.exp(predictions) - 1  # Inverse log transform
    rmse = nn.calculate_rmse(y_test_exp, predictions_exp)
    print(f'Test RMSE: {rmse:.4f}')

    # Save the model
    nn.save_model('models/agb_regression_model.h5')

    # Save scaler
    joblib.dump(scaler, 'models/3_scaler.joblib')


  from .autonotebook import tqdm as notebook_tqdm
2024-07-26 15:29:41.264463: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-07-26 15:29:41.530251: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-07-26 15:29:41.603764: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-07-26 15:29:42.194784: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Model already exists


In [None]:
combined_gdf_filtered

Unnamed: 0,embeddings,geometry,mean_agb
0,"[-0.1340062, 0.077923074, -0.0016371261, 0.083...","POLYGON ((-116.13800 34.50819, -116.13800 34.5...",0.336093
1,"[-0.12408799, 0.06986915, -0.0014471954, 0.080...","POLYGON ((-116.15667 34.50819, -116.15667 34.5...",0.315462
2,"[-0.13270076, 0.067874126, -0.0058731856, 0.08...","POLYGON ((-116.13800 34.48952, -116.13800 34.5...",0.308832
3,"[-0.11633471, 0.07077362, -0.009755048, 0.0875...","POLYGON ((-116.13800 34.47086, -116.13800 34.4...",0.263030
4,"[-0.1335075, 0.0719392, -0.0011160049, 0.08034...","POLYGON ((-116.11934 34.48952, -116.11934 34.5...",0.281872
...,...,...,...
125141,"[-0.09712396, 0.06625347, -0.056792237, 0.1275...","POLYGON ((-123.32467 40.51886, -123.32467 40.5...",95.744820
125142,"[-0.09359632, 0.06506389, -0.053921893, 0.1259...","POLYGON ((-123.34334 40.53752, -123.34334 40.5...",90.395423
125143,"[-0.093709834, 0.06074024, -0.07075437, 0.1278...","POLYGON ((-123.23134 39.32419, -123.23134 39.3...",95.926339
125144,"[-0.09378435, 0.059857186, -0.055076838, 0.128...","POLYGON ((-123.45534 39.10019, -123.45534 39.1...",77.902087


In [None]:
import datetime
import glob
import math
import os
import random
import requests

import geopandas as gpd
import lancedb
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pystac_client
import shapely
from shapely.geometry import box, Polygon
import torch
import yaml
from box import Box
from pyproj import Transformer
from rasterio.io import MemoryFile
from stacchip.chipper import Chipper
from stacchip.indexer import Sentinel2Indexer
from stacchip.processors.prechip import normalize_timestamp
from torchvision.transforms import v2

from src.model import ClayMAEModule

  from .autonotebook import tqdm as notebook_tqdm
  _torch_pytree._register_pytree_node(


In [None]:

# Optimize GDAL settings for cloud optimized reading
os.environ["GDAL_DISABLE_READDIR_ON_OPEN"] = "EMPTY_DIR"
os.environ["AWS_REQUEST_PAYER"] = "requester"

STAC_API = "https://earth-search.aws.element84.com/v1"
COLLECTION = "sentinel-2-l2a"
YEAR = 2022

# Search the catalogue
catalog = pystac_client.Client.open(STAC_API)
search = catalog.search(
    collections=[COLLECTION],
    datetime=f"{YEAR}-01-01T00:00:00Z/{YEAR+1}-01-01T00:00:00Z",
    bbox=[-123.30442043188332, 46.875515943543945, -121.65107564263492, 48.012480708758716],
    max_items=100,
    query={"eo:cloud_cover": {"lt": 80}},
)

all_items = search.get_all_items()

# Reduce to one per date (there might be some duplicates
# based on the location)
items = []
dates = []
for item in all_items:
    if item.datetime.date() not in dates:
        items.append(item)
        dates.append(item.datetime.date())

print(f"Found {len(items)} items")



Found 23 items


In [None]:
chips = []
datetimes = []
bboxs = []
chip_ids = []
item_ids = []

for item in items:
    print(f"Working on {item}")

    # Index the chips in the item
    indexer = Sentinel2Indexer(item)

    # Instanciate the chipper
    chipper = Chipper(indexer, assets=["red", "green", "blue", "nir", "scl"])

    # Get first chip for the "image" asset key
    for idx, (x, y, chip) in enumerate(chipper):
        if idx > 2:
            break
        del chip["scl"]
        chips.append(chip)
        datetimes.append(item.datetime)
        bboxs.append(indexer.get_chip_bbox(x, y))
        chip_ids.append((x, y))
        item_ids.append(item.id)

Working on <Item id=S2B_10TDS_20221231_0_L2A>
Working on <Item id=S2B_10TDT_20221228_0_L2A>
Working on <Item id=S2A_10TFS_20221226_0_L2A>
Working on <Item id=S2B_10TDS_20221221_0_L2A>
Working on <Item id=S2B_10TFS_20221218_0_L2A>
Working on <Item id=S2A_10TDS_20221216_0_L2A>
Working on <Item id=S2A_10TES_20221213_0_L2A>
Working on <Item id=S2B_10TDT_20221211_0_L2A>
Working on <Item id=S2A_10TFS_20221206_0_L2A>
Working on <Item id=S2A_10TES_20221203_0_L2A>
Working on <Item id=S2B_10TDT_20221201_0_L2A>
Working on <Item id=S2B_10TDS_20221128_0_L2A>
Working on <Item id=S2A_10TDS_20221126_0_L2A>
Working on <Item id=S2A_10TDS_20221123_0_L2A>
Working on <Item id=S2B_10TDS_20221121_0_L2A>
Working on <Item id=S2B_10TDS_20221118_0_L2A>
Working on <Item id=S2A_10TDS_20221116_0_L2A>
Working on <Item id=S2A_10TFS_20221113_0_L2A>
Working on <Item id=S2B_10TFS_20221111_0_L2A>
Working on <Item id=S2B_10TDS_20221108_0_L2A>
Working on <Item id=S2A_10TDT_20221106_0_L2A>
Working on <Item id=S2B_10TFS_2022

In [None]:
pixels = np.array([np.array(list(chip.values())).squeeze() for chip in chips])
pixels.shape

(69, 4, 224, 224)

In [None]:
# Extract mean, std, and wavelengths from metadata
platform = "sentinel-2-l2a"
# Retrieve the file content from the URL

url = (
    "https://raw.githubusercontent.com/Clay-foundation/model/main/configs/metadata.yaml"
)
response = requests.get(url, allow_redirects=True)

# Convert bytes to string
content = response.content.decode("utf-8")

# Load the yaml
content = yaml.safe_load(content)

metadata = Box(content)
mean = []
std = []
waves = []
# Use the band names to get the correct values in the correct order.
for band in chips[0].keys():
    mean.append(metadata[platform].bands.mean[band])
    std.append(metadata[platform].bands.std[band])
    waves.append(metadata[platform].bands.wavelength[band])

# Prepare the normalization transform function using the mean and std values.
transform = v2.Compose(
    [
        v2.Normalize(mean=mean, std=std),
    ]
)