In [1]:
import pandas as pd
import numpy as np

import rasterio
from skimage.transform import resize
from skimage.transform import rotate
import os

import torch
from torch.utils.data import Dataset, DataLoader

import torch.nn as nn
import torch.nn.functional as F

from sklearn.model_selection import KFold
from sklearn.preprocessing import MinMaxScaler
from tqdm import tqdm
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

from sklearn.model_selection import train_test_split

from datetime import timedelta
from skimage.draw import polygon
import matplotlib.pyplot as plt

from shapely.geometry import Polygon

from utils import process_yield_data
from pathlib import Path

#### Import Yield Data

In [26]:
YIELD_DATA_PATH = Path("./combined_yield_data.csv")
yield_data_weekly = process_yield_data(YIELD_DATA_PATH)

            Volume (Pounds)  Cumulative Volumne (Pounds)  Pounds/Acre
Date                                                                 
2012-01-02          23400.0                      23400.0          2.0
2012-01-03          26064.0                      49464.0          3.0
2012-01-04          32382.0                      81846.0          3.0
2012-01-05          69804.0                     151650.0          7.0
2012-01-06          18000.0                     169650.0          2.0

Number of Yield Data Points:  3970

Column Names: Index(['Volume (Pounds)', 'Cumulative Volumne (Pounds)', 'Pounds/Acre'], dtype='object')
Number of Yield Data Points: 2879
Yield data with time features:
            Volume (Pounds)  Cumulative Volumne (Pounds)  Pounds/Acre  \
Date                                                                    
2012-03-04         525753.0                    1785843.0    18.333333   
2012-03-11        2949534.0                    4735377.0    51.666667   
2012-03-18   

#### Define the Model

In [3]:
target_shape = (512, 512)
device = torch.device("cuda" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu")
print(f"Using {device} device")

Using cuda device


### Old Model

In [4]:
class CNNFeatureExtractor(nn.Module):
    def __init__(self):
        super(CNNFeatureExtractor, self).__init__()
        self.conv1 = nn.Conv2d(1, 32, 3, padding=1)
        self.bn1 = nn.BatchNorm2d(32)
        self.conv2 = nn.Conv2d(32, 64, 3, padding=1)
        self.bn2 = nn.BatchNorm2d(64)
        self.conv3 = nn.Conv2d(64, 128, 3, padding=1)
        self.bn3 = nn.BatchNorm2d(128)
        self.conv4 = nn.Conv2d(128, 256, 3, padding=1)
        self.bn4 = nn.BatchNorm2d(256)
        self.pool = nn.MaxPool2d(2, 2)
        self.dropout = nn.Dropout(0.5)
        self.flattened_size = self._get_conv_output((1, *target_shape))
        self.fc1 = nn.Linear(self.flattened_size, 512)

    def _get_conv_output(self, shape):
        x = torch.rand(1, *shape)
        x = self.pool(F.relu(self.bn1(self.conv1(x))))
        x = self.pool(F.relu(self.bn2(self.conv2(x))))
        x = self.pool(F.relu(self.bn3(self.conv3(x))))
        x = self.pool(F.relu(self.bn4(self.conv4(x))))
        n_size = x.view(1, -1).size(1)
        return n_size

    def forward(self, x):
        x = self.pool(F.relu(self.bn1(self.conv1(x))))
        x = self.pool(F.relu(self.bn2(self.conv2(x))))
        x = self.pool(F.relu(self.bn3(self.conv3(x))))
        x = self.pool(F.relu(self.bn4(self.conv4(x))))
        x = self.dropout(x)
        x = x.view(-1, self.flattened_size)
        x = F.relu(self.fc1(x))
        return x
    
class HybridModel(nn.Module):
    def __init__(self, cnn_feature_extractor, lstm_hidden_size=64, lstm_layers=1):
        super(HybridModel, self).__init__()
        self.cnn = cnn_feature_extractor
        self.lstm = nn.LSTM(input_size=512, hidden_size=lstm_hidden_size, num_layers=lstm_layers, batch_first=True)
        self.fc1 = nn.Linear(lstm_hidden_size + 4, 64)
        self.fc2 = nn.Linear(64, target_shape[0] * target_shape[1])  # Predict a value per pixel
        self.target_shape = target_shape

    def forward(self, x, time_features):
        batch_size, time_steps, C, H, W = x.size()
        c_in = x.view(batch_size * time_steps, C, H, W)
        c_out = self.cnn(c_in)
        r_in = c_out.view(batch_size, time_steps, -1)
        r_out, (h_n, c_n) = self.lstm(r_in)
        r_out = r_out[:, -1, :]
        x = torch.cat((r_out, time_features), dim=1)  # Concatenate LSTM output with time features
        x = F.relu(self.fc1(x))
        x = self.fc2(x)
        x = x.view(batch_size, *self.target_shape)  # Reshape to the target shape
        return x

#### Initialize Function

In [5]:
def weights_init(m):
    if isinstance(m, nn.Conv2d) or isinstance(m, nn.Linear):
        nn.init.kaiming_normal_(m.weight)
        if m.bias is not None:
            nn.init.constant_(m.bias, 0)

# # Instantiate model with weight decay regularization
# cnn_feature_extractor = CNNFeatureExtractor()
# model = HybridModel(cnn_feature_extractor)
# model.apply(weights_init)
# model.to(device)

batch_size = 16
epochs = 50

# criterion = nn.MSELoss()
# optimizer = torch.optim.Adam(model.parameters(), lr=0.001, weight_decay=0.01)
# scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=3, verbose=True)

### Functions for prediction

In [6]:
from inference_utils import (
    preprocess_image,
    compute_mean_std,
    load_evi_data_and_prepare_features,
    find_closest_date,
    find_closest_date_in_df,
    mask_evi_data,
    predict,
    predict_weekly_yield,
    augment_image,
    prepare_dataset,
    train_and_evaluate,
    sync_evi_yield_data
)


In [7]:

# Load EVI data and prepare time features
evi_data_dir = "./landsat_evi_monterey_masked"
train_loader, val_loader, mean, std = prepare_dataset(evi_data_dir, yield_data_weekly, target_shape, augment=True)

Processed file 1/83 in 4.595581s
Processed file 2/83 in 4.526565s
Processed file 3/83 in 3.917106s
Processed file 4/83 in 3.764339s
Processed file 5/83 in 3.676140s
Processed file 6/83 in 4.567551s
Processed file 7/83 in 3.621552s
Processed file 8/83 in 3.832261s
Processed file 9/83 in 4.168375s
Processed file 10/83 in 4.570174s
Processed file 11/83 in 4.064863s
Processed file 12/83 in 3.764896s
Processed file 13/83 in 4.694772s
Processed file 14/83 in 4.377803s
Processed file 15/83 in 4.709882s
Processed file 16/83 in 3.200550s
Processed file 17/83 in 3.936213s
Processed file 18/83 in 3.715976s
Processed file 19/83 in 4.364548s
Processed file 20/83 in 4.520379s
Processed file 21/83 in 4.114959s
Processed file 22/83 in 3.151981s
Processed file 23/83 in 3.616206s
Processed file 24/83 in 4.097888s
Processed file 25/83 in 4.182319s
Processed file 26/83 in 3.813179s
Processed file 27/83 in 3.882825s
Processed file 28/83 in 4.213806s
Processed file 29/83 in 4.289558s
Processed file 30/83 in

### Model Evaluation (Cross Validation)

In [None]:

# this needs to be fixed. The loaders below are not using the time series split folds for cross-validation
tscv = TimeSeriesSplit(n_splits=5)

mse_scores = []
rmse_scores = []
mae_scores = []
r2_scores = []

epochs = 50

for fold, (train_index, val_index) in enumerate(tscv.split(yield_data_weekly)):
    print(f"Fold {fold + 1}")
    print(f"       Train Index = {train_index[0]}, ..., {train_index[-1]} len={len(train_index)}")
    print(f"       Valid Index = {val_index[0]}, ..., {val_index[-1]} len={len(val_index)}")


    fold_train_subset = torch.utils.data.Subset(train_loader.dataset, train_index)
    fold_val_subset = torch.utils.data.Subset(val_loader.dataset, val_index)

    fold_train_loader = DataLoader(fold_train_subset, batch_size=4, shuffle=True)
    fold_val_loader = DataLoader(fold_val_subset, batch_size=4, shuffle=False)


    # Instantiate a new model for each fold
    model = HybridModel(CNNFeatureExtractor())
    model.apply(weights_init)
    model.to(device)

    # Set up the optimizer, scheduler, and loss function
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001, weight_decay=0.0001)
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.1)
    criterion = nn.MSELoss()

    # Train and evaluate the model
    # val_loss = train_and_evaluate(model, train_loader, val_loader, optimizer, scheduler, criterion, epochs, device)
    val_loss = train_and_evaluate(model, fold_train_loader, fold_val_loader, optimizer, scheduler, criterion, epochs, device)
    
    # Model evaluation on the validation set
    model.eval()
    with torch.no_grad():
        outputs_val = []
        labels_val = []
        for evi_batch, label_batch, time_features_batch in val_loader:
            evi_batch, label_batch, time_features_batch = evi_batch.to(device), label_batch.to(device), time_features_batch.to(device)
            outputs_batch = model(evi_batch, time_features_batch) # lbs/pixel
            outputs_val.extend(outputs_batch.cpu().numpy().flatten())
            label_batch = label_batch.unsqueeze(1).unsqueeze(2).expand(-1, target_shape[0], target_shape[1])
            labels_val.extend(label_batch.cpu().numpy().flatten())

    # Flatten the outputs and labels
    outputs_val = np.array(outputs_val)
    labels_val = np.array(labels_val)

    # Calculate val metrics
    mse = mean_squared_error(labels_val, outputs_val)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(labels_val, outputs_val)
    r2 = r2_score(labels_val, outputs_val)

    mse_scores.append(mse)
    rmse_scores.append(rmse)
    mae_scores.append(mae)
    r2_scores.append(r2)

# Print results
print(f"Average MSE: {np.mean(mse_scores)}")
print(f"Average RMSE: {np.mean(rmse_scores)}")
print(f"Average MAE: {np.mean(mae_scores)}")
print(f"Average R-squared: {np.mean(r2_scores)}")

# Train on full dataset

In [8]:
# Instantiate a new model for each fold
model = HybridModel(CNNFeatureExtractor())
model.apply(weights_init)
model.to(device)

# Set up the optimizer, scheduler, and loss function
optimizer = torch.optim.Adam(model.parameters(), lr=0.001, weight_decay=0.0001)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.1)
criterion = nn.MSELoss()

# Train and evaluate the model
val_loss = train_and_evaluate(model, train_loader, val_loader, optimizer, scheduler, criterion, epochs, device)

torch.save(model.state_dict(), "./trained-full-dataset-yield-density-no-leakage.pt")


# of samples - Training   - 510
# of samples - Validation - 128


  evi_sequence = torch.tensor(evi_sequence, dtype=torch.float32).unsqueeze(1)
100%|██████████| 128/128 [01:03<00:00,  2.01it/s]


Epoch 1, Loss: 0.02303801325973609




Validation Loss: 0.13716325513087213


100%|██████████| 128/128 [01:00<00:00,  2.11it/s]


Epoch 2, Loss: 0.00021912637054555262




Validation Loss: 0.13683857419528067


100%|██████████| 128/128 [01:00<00:00,  2.11it/s]


Epoch 3, Loss: 4.714069107225605e-05




Validation Loss: 0.13679931915248744


100%|██████████| 128/128 [01:00<00:00,  2.11it/s]


Epoch 4, Loss: 1.2139776693148585e-05




Validation Loss: 0.13678972469642758


100%|██████████| 128/128 [01:00<00:00,  2.11it/s]


Epoch 5, Loss: 2.449644118566064e-06




Validation Loss: 0.13678752846317366


100%|██████████| 128/128 [01:00<00:00,  2.11it/s]


Epoch 6, Loss: 7.677901634311471e-07




Validation Loss: 0.13678681483725086


100%|██████████| 128/128 [01:00<00:00,  2.10it/s]


Epoch 7, Loss: 3.032403498042413e-07




Validation Loss: 0.13678655843250453


100%|██████████| 128/128 [01:01<00:00,  2.10it/s]


Epoch 8, Loss: 1.6920563471189976e-07




Validation Loss: 0.13678646262269467


100%|██████████| 128/128 [01:00<00:00,  2.10it/s]


Epoch 9, Loss: 1.0260573640036297e-07




Validation Loss: 0.13678642455488443


100%|██████████| 128/128 [01:00<00:00,  2.11it/s]


Epoch 10, Loss: 5.5787742407435725e-08




Validation Loss: 0.1367864032217767


100%|██████████| 128/128 [01:00<00:00,  2.11it/s]


Epoch 11, Loss: 2.9276606201833477e-08




Validation Loss: 0.13678638232522644


100%|██████████| 128/128 [01:00<00:00,  2.11it/s]


Epoch 12, Loss: 1.2652823795644697e-08




Validation Loss: 0.1367863641353324


100%|██████████| 128/128 [01:00<00:00,  2.11it/s]


Epoch 13, Loss: 4.771049414917239e-09




Validation Loss: 0.13678636751137674


100%|██████████| 128/128 [01:00<00:00,  2.12it/s]


Epoch 14, Loss: 1.5382479281230571e-09




Validation Loss: 0.13678635086398572


100%|██████████| 128/128 [01:00<00:00,  2.12it/s]


Epoch 15, Loss: 4.786846412416397e-10




Validation Loss: 0.13678636238910258


100%|██████████| 128/128 [01:00<00:00,  2.11it/s]


Epoch 16, Loss: 1.3792824101279723e-10




Validation Loss: 0.1367863685300108


100%|██████████| 128/128 [01:00<00:00,  2.12it/s]


Epoch 17, Loss: 3.7930990881324296e-11




Validation Loss: 0.1367863482737448


100%|██████████| 128/128 [01:00<00:00,  2.11it/s]


Epoch 18, Loss: 1.030097532846283e-11




Validation Loss: 0.13678636791883036


100%|██████████| 128/128 [01:00<00:00,  2.11it/s]


Epoch 19, Loss: 3.4419171131595274e-12




Validation Loss: 0.1367863736813888


100%|██████████| 128/128 [01:00<00:00,  2.11it/s]


Epoch 20, Loss: 1.862709280804205e-12




Validation Loss: 0.13678635843098164


100%|██████████| 128/128 [01:00<00:00,  2.12it/s]


Epoch 21, Loss: 1.5365051628925621e-12




Validation Loss: 0.1367863556370139


100%|██████████| 128/128 [01:00<00:00,  2.12it/s]


Epoch 22, Loss: 1.483005749043814e-12
Validation Loss: 0.1367863569757901
Early stopping!




# Inference

In [9]:
import joblib

# load in model from file
# inf_model_weights = torch.load("trained-full-dataset.pt", weights_only=True)
inf_model_weights = torch.load("trained-full-dataset-yield-density-no-leakage.pt", weights_only=True)
inf_model = HybridModel(CNNFeatureExtractor())
inf_model.load_state_dict(inf_model_weights)
inf_model.to(device)
inf_model.eval()

scaler = joblib.load("yield_scaler.save")

In [10]:

# inf_output = inf_model(evi_val, time_features_val)

# print(f"{evi_val.shape = }")
# print(f"{time_features_val.shape = }")
# print(f"{inf_output.shape = }")

In [11]:
yield_data_weekly.iloc[0].name

Timestamp('2012-03-04 00:00:00')

In [12]:

evi_data_dir = "./landsat_evi_monterey_masked"
dataset_loader, _, mean, std = prepare_dataset(evi_data_dir, yield_data_weekly, target_shape, augment=True, full=True)

Processed file 1/83 in 4.125449s
Processed file 2/83 in 4.246559s
Processed file 3/83 in 4.898093s
Processed file 4/83 in 4.672359s
Processed file 5/83 in 5.460909s
Processed file 6/83 in 4.038937s
Processed file 7/83 in 3.490030s
Processed file 8/83 in 4.127463s
Processed file 9/83 in 3.800272s
Processed file 10/83 in 3.523895s
Processed file 11/83 in 3.932964s
Processed file 12/83 in 3.734574s
Processed file 13/83 in 4.148887s
Processed file 14/83 in 4.548136s
Processed file 15/83 in 3.748950s
Processed file 16/83 in 3.109021s
Processed file 17/83 in 4.015578s
Processed file 18/83 in 4.144617s
Processed file 19/83 in 3.917441s
Processed file 20/83 in 4.372727s
Processed file 21/83 in 4.537723s
Processed file 22/83 in 3.373776s
Processed file 23/83 in 3.701019s
Processed file 24/83 in 3.277262s
Processed file 25/83 in 3.982489s
Processed file 26/83 in 4.063423s
Processed file 27/83 in 3.969857s
Processed file 28/83 in 3.823087s
Processed file 29/83 in 4.417579s
Processed file 30/83 in

In [13]:
timestamps = torch.Tensor()
yield_labels = torch.Tensor()
predictions = torch.Tensor()

for idx, (inputs, labels, time_features, timestamp) in enumerate(dataset_loader):
    print(f"Running inference... {idx/len(dataset_loader)*100:.2f}%", end='\r')
    inputs, labels, time_features = inputs.to(device), labels.to(device), time_features.to(device)
    outputs = inf_model(inputs, time_features)
    summed_outputs = outputs.sum(dim=(1,2))

    if idx >0:
        break
    timestamps = torch.cat((timestamps, timestamp))
    yield_labels = torch.cat((yield_labels, labels.to("cpu")))
    predictions = torch.cat((predictions, summed_outputs.to("cpu")))

    # loss = criterion(outputs, labels)
    # val_loss += loss.item()

# val_loss /= len(val_loader)
# print(f'Validation Loss: {val_loss}')

Running inference... 0.62%

In [14]:
yield_labels.reshape(-1,1)

tensor([0.4385, 0.8434, 0.4935, 0.0000])

In [19]:
scaler.inverse_transform(yield_labels.reshape(-1, 1))

array([[20429036.3072927 ],
       [39287413.27969694],
       [22990931.39602876],
       [       0.        ]])

In [22]:
scaler.inverse_transform(predictions.detach().numpy().reshape(-1,1))

array([[1023663.94],
       [1020795.2 ],
       [1033442.56],
       [ 977205.5 ]], dtype=float32)

In [23]:
yield_labels

tensor([0.4385, 0.8434, 0.4935, 0.0000])

In [24]:
yield_data_weekly

Unnamed: 0_level_0,Volume (Pounds),Cumulative Volumne (Pounds),Pounds/Acre,month_sin,month_cos,day_of_year_sin,day_of_year_cos
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2012-03-04,0.011286,1785843.0,18.333333,1.000000e+00,6.123234e-17,0.891981,0.452072
2012-03-11,0.063317,4735377.0,51.666667,1.000000e+00,6.123234e-17,0.939856,0.341571
2012-03-18,0.102446,9507645.0,83.500000,1.000000e+00,6.123234e-17,0.974100,0.226116
2012-03-25,0.067456,12649959.0,55.000000,1.000000e+00,6.123234e-17,0.994218,0.107381
2012-04-01,0.134627,18921357.0,93.857143,8.660254e-01,-5.000000e-01,0.999917,-0.012910
...,...,...,...,...,...,...,...
2024-05-12,0.767907,682790517.0,305.285714,5.000000e-01,-8.660254e-01,0.752667,-0.658402
2024-05-19,0.787426,682790517.0,365.166667,5.000000e-01,-8.660254e-01,0.668064,-0.744104
2024-05-26,0.827681,682790517.0,329.285714,5.000000e-01,-8.660254e-01,0.573772,-0.819015
2024-06-02,0.796377,682790517.0,316.571429,1.224647e-16,-1.000000e+00,0.471160,-0.882048


In [None]:
timestamps, yield_labels, predictions

In [None]:
yield_data_weekly

In [None]:
out_df = pd.DataFrame(data={"timestamp":timestamps.to_numpy(), "prediction":predictions.to_numpy(), "truth":yield_labels.to_numpy()})
out_df.to_csv("out.csv")