# Model training 🏋
[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/nasaharvest/openmapflow/blob/main/crop-mask-example/notebooks/train.ipynb)

**Description:** Standalone notebook for training crop-mask models.

# 1. Setup

If you don't already have one, obtain a GitHub Personal Access Token using the steps [here](https://docs.github.com/en/authentication/keeping-your-account-and-data-secure/creating-a-personal-access-token). Save this token somewhere private.

In [None]:
email = input("Github email: ")
username = input("Github username: ")

!git config --global user.email $username
!git config --global user.name $email

from getpass import getpass
token = getpass('Github Personal Access Token:')

# TODO: Generate below two lines from config
!git clone https://$username:$token@github.com/nasaharvest/openmapflow.git
!cd openmapflow && pip install -r requirements.txt -q
%cd openmapflow/crop-mask-example

In [None]:
from google.colab import auth
from tqdm.notebook import tqdm
from torch.utils.data import DataLoader
from torch import nn

import torch
import wandb
import sys
sys.path.append("..")

from openmapflow.config import RELATIVE_PATHS, FULL_PATHS
from openmapflow import PyTorchDataset
from openmapflow.config import SUBSET

from datasets import datasets

# 2. Download latest data

In [None]:
for path_key in tqdm(["models", "processed", "compressed_features"]):
    !dvc pull {RELATIVE_PATHS[path_key]} -q

!tar -xzf {RELATIVE_PATHS["compressed_features"]} -C data

In [None]:
# Currently available models: sorted stems of the *.pt files under the models path
sorted(model_path.stem for model_path in FULL_PATHS["models"].glob('*.pt'))

In [None]:
# Available datasets for training and evaluation
# (prints the contents of data/datasets.txt to the cell output)
!cat data/datasets.txt

# 3. Train model

In [None]:
# Name for the new model; it is reused at the end of the notebook as the git
# branch name and in the commit message.
model_name = input("Model name: ")

In [None]:
# Load the labels of the first dataset declared in `datasets`; only this
# dataset is used below.
# NOTE(review): presumably a DataFrame with a SUBSET column - confirm against load_labels().
df = datasets[0].load_labels()

In [None]:
class LSTMClassifier(torch.nn.Module):
    """Binary classifier: a single-layer LSTM followed by a linear head.

    Args:
        input_size: Number of features per time step.
        hidden_size: Size of the LSTM hidden state.
    """

    def __init__(self, input_size=18, hidden_size=128):
        super().__init__()
        self.lstm = nn.LSTM(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=1,
            batch_first=True,
        )
        self.linear = nn.Linear(hidden_size, 1)

    def forward(self, x):
        """Return one probability per sequence in the batch.

        `x` is batch-first (the LSTM is built with `batch_first=True`). Only
        the LSTM output at the last time step is passed to the linear head.
        """
        sequence_output, _ = self.lstm(x)
        last_step = sequence_output[:, -1]
        logits = self.linear(last_step).squeeze(dim=1)
        return torch.sigmoid(logits)

In [None]:
# ------------ Model -----------------------------------------
# Use the first CUDA device when one is available, otherwise the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model = LSTMClassifier()
model = model.to(device)

# ------------ Optimizer -------------------------------------
lr = 0.001
params_to_update = model.parameters()
optimizer = torch.optim.SGD(params_to_update, lr=lr, momentum=0.9)
# BCELoss expects probabilities, which matches the sigmoid output of the model.
criterion = torch.nn.BCELoss()


# ------------ Dataloaders -------------------------------------
batch_size = 64
# Rows marked "training" in the SUBSET column train the model; every other row
# is used for evaluation.
train_data = PyTorchDataset(df=df[df[SUBSET] == "training"], subset="training")
test_data = PyTorchDataset(df=df[df[SUBSET] != "training"], subset="test")
dataloaders = {
    "train": DataLoader(train_data, batch_size=batch_size, shuffle=True),
    "test": DataLoader(test_data, batch_size=batch_size, shuffle=False)
}
# Number of batches per phase, used as the progress-bar total. Taken from the
# loaders themselves: `1 + len(data) // batch_size` reports one batch too many
# whenever the dataset size is an exact multiple of `batch_size`.
batch_amount = {phase: len(loader) for phase, loader in dataloaders.items()}

# Train

In [None]:
#%%wandb
num_epochs = 5
run = wandb.init(project="openmapflow-crop-mask-example", config={
    "batch_size": batch_size,
    "num_epochs": num_epochs,
    "lr": lr,
    "optimizer": "SGD"
})
try:
    for epoch in range(num_epochs):

        # Each epoch runs a training pass followed by an evaluation pass.
        for phase in ['train', 'test']:
            is_training = phase == 'train'
            dataset_size = len(dataloaders[phase].dataset)
            if dataset_size == 0:
                # No samples in this subset, so there is nothing to average.
                continue

            # Set model to training mode or evaluate mode
            model.train(is_training)

            running_loss = 0.0
            running_corrects = 0

            # Iterate over data.
            for x in tqdm(dataloaders[phase], total=batch_amount[phase], desc=phase, leave=False):
                inputs, labels = x[0].to(device), x[1].to(device)

                # zero the parameter gradients
                optimizer.zero_grad()

                # forward; gradients are only tracked while training
                with torch.set_grad_enabled(is_training):
                    # Get model outputs and calculate loss
                    outputs = model(inputs)
                    loss = criterion(outputs, labels)
                    # Threshold the predicted probabilities at 0.5
                    preds = outputs > 0.5

                    # backward + optimize only if in training phase
                    if is_training:
                        loss.backward()
                        optimizer.step()

                # statistics
                # `criterion` averages over the batch, so log that mean as the
                # per-step loss: it stays comparable across batch sizes and
                # does not dip on the final, smaller batch.
                batch_loss = loss.item()
                if is_training:
                    wandb.log({"train_loss": batch_loss})

                # Re-weight by batch size so the epoch loss is a per-sample mean.
                running_loss += batch_loss * inputs.size(0)
                running_corrects += int(torch.sum(preds == labels))

            epoch_loss = running_loss / dataset_size
            epoch_acc = running_corrects / dataset_size

            wandb.log({
                f"{phase}_epoch_loss": epoch_loss,
                f"{phase}_epoch_acc": epoch_acc,
                "epoch": epoch,
            })
finally:
    # Close the wandb run even if training is interrupted or raises.
    run.finish()

In [None]:
# TODO: Save models

In [None]:
# Newly available models: sorted stems of the *.pt files under the models path
sorted(model_path.stem for model_path in FULL_PATHS["models"].glob('*.pt'))

# 4. Pushing the model to the repository

In [None]:
# Pull the models path from the DVC remote first, then push the local
# DVC-tracked data back to the remote.
!dvc pull {RELATIVE_PATHS["models"]}
!dvc push

In [None]:
# Push changes to github
# Create a branch named after the model, commit everything in the working
# directory, and publish the branch to `origin` with upstream tracking.
# `$model_name` is the value entered at the "Model name: " prompt earlier.
!git checkout -b'$model_name'
!git add .
!git commit -m 'Trained new: $model_name'
!git push --set-upstream origin "$model_name"

Create a Pull Request for the new branch so that the trained model can be merged into the main branch.