### Setup environment ###

Here we import all modules necessary for our model, including our own Python modules.

In [None]:
# %pip (unlike !pip) installs into the environment of the running kernel
%pip install rasterio



In [None]:
import os
import re
import csv
import glob
import torch
import torchvision
import numpy as np
import rasterio as rio
import torch.nn.functional as F

from datetime import datetime
from torch import nn, optim
from sklearn.metrics import classification_report
from tqdm import tqdm
from torch.utils import data
from torch import nn

# init deterministic seed
# seed_value is reused further down to seed the CUDA generator as well.
seed_value = 1234
np.random.seed(seed_value)  # set numpy seed
torch.manual_seed(seed_value)  # set pytorch seed CPU

<torch._C.Generator at 0x7f4ca76e24d0>

## Define directory paths ##

We define the directories in which the models are stored, where the raw data can be found, and where the submission data is stored.

In [None]:
%%capture
# -nc: skip the download when the archive is already present
!wget -nc https://madm.dfki.de/files/sentinel/EuroSATallBands.zip
# wget stores the archive in the working directory, so extract it from there instead of a
# hard-coded absolute path; -n never overwrites existing files, so re-runs do not prompt
!unzip -n EuroSATallBands.zip
# -p: do not fail when the directory already exists
!mkdir -p models

In [None]:
# Directory into which model checkpoints are saved during training and from which they are loaded again
MODELS_PATH = "models"
# Root folder of the training GeoTIFFs; files are collected as <DATA_PATH>/<subfolder>/*.tif
# NOTE(review): presumably the folder layout created by unzipping EuroSATallBands.zip - confirm
DATA_PATH = "ds/images/remote_sensing/otherDatasets/sentinel_2/tif"

## Data Loading ##

For the data loading we use a custom torch dataset which loads data into memory. We also integrate normalization (mean 0, std 1) into the data loading.

In [None]:
# Class names listed in label-id order; both lookup tables are derived from this
# single tuple so that they can never get out of sync.
_class_names = (
    "AnnualCrop",
    "Forest",
    "HerbaceousVegetation",
    "Highway",
    "Industrial",
    "Pasture",
    "PermanentCrop",
    "Residential",
    "River",
    "SeaLake",
)

# Helper mappings between classes and integers
classes_to_int = {name: idx for idx, name in enumerate(_class_names)}  # class name -> integer id
classes_to_label = dict(enumerate(_class_names))  # integer id -> class name

# Create normalizer for 12 bands with precomputed means and standard deviations across all bands
# Each tuple holds one value per band, in the order in which InMemoryDataset reads the bands.
# NOTE(review): the values were presumably produced by
# create_normalizer_for_dataset(..., verbose=True) on the training data - confirm
means_tuple = (
    1353.7269257269966,
    1117.2022923538773,
    1041.8847248444733,
    946.5542548737702,
    1199.1886644965277,
    2003.0067999222367,
    2374.008444688585,
    2301.2204385489003,
    732.1819500777633,
    1820.6963775318286,
    1118.2027229275175,
    2599.7829373281975,
)
stds_tuple = (
    65.29657739037496,
    153.77375864458085,
    187.69931299271406,
    278.1246366855392,
    227.92409611864002,
    355.9331571735718,
    455.13290021052626,
    530.7795614455541,
    98.92998227431653,
    378.16138952053035,
    303.10651348740964,
    502.16376466306053
)
# Default normalizer of InMemoryDataset: (value - mean) / std, applied per band
train_normalizer = torchvision.transforms.Normalize(means_tuple, stds_tuple)


#In-memory dataset
class InMemoryDataset(torch.utils.data.Dataset):
    """
    Dataset which eagerly loads all training samples into memory as normalized tensors.

    Every sample is a raster file from which 12 bands are read (bands 1-9 and 11-13;
    band 10 is skipped). The class label is taken from the file name, i.e. the part
    in front of the first underscore, and translated to its integer id via
    ``classes_to_int``.
    """

    def __init__(self, samples, normalizer=train_normalizer):
        """
        :param samples: iterable of paths to the raster files to load
        :param normalizer: callable applied to the float tensor of each image
        :raises KeyError: if the label parsed from a file name is not in ``classes_to_int``
        """
        self.x = []
        self.y = []
        for sample in tqdm(samples, desc="Loading training samples"):
            # Extract bands; the dataset handle is closed again right after reading
            with rio.open(sample, "r") as d:
                img = d.read([1, 2, 3, 4, 5, 6, 7, 8, 9, 11, 12, 13])
            tens = torch.tensor(img.astype(int))

            # Normalize
            tens = normalizer(tens.float())

            # Extract label from the file name. os.path.basename is used instead of
            # splitting on "/" so that this also works with Windows path separators.
            label = os.path.basename(sample).split("_")[0]
            label_id = classes_to_int[label]
            self.x.append(tens)
            self.y.append(label_id)

    def __len__(self):
        """Return the number of loaded samples."""
        return len(self.x)

    def __getitem__(self, index):
        """Return the (normalized image tensor, integer label) pair at ``index``."""
        return self.x[index], self.y[index]

In [None]:
def create_normalizer_for_dataset(dataset, verbose: bool = False) -> torchvision.transforms.transforms.Normalize:
    """
    Build a normalizer from per-band statistics of a dataset.

    For every image in ``dataset.x`` the mean and standard deviation of each band are
    computed over dims 1 and 2; these per-image values are then averaged over all images.
    The images have to be unnormalized for the statistics to be meaningful.

    :param dataset: Dataset whose ``x`` attribute holds the image tensors
    :param verbose: set True if you want to print the mean and std vectors
    :return: normalizer built from the averaged means and standard deviations
    """
    sample_count = len(dataset.x)

    # Running sums of the per-image statistics, keyed by band index
    mean_totals = {}
    std_totals = {}
    for image in dataset.x:
        pixels = image.float()

        for band, band_mean in enumerate(torch.mean(pixels, dim=(1, 2))):
            mean_totals[band] = mean_totals.get(band, 0) + float(band_mean)

        for band, band_std in enumerate(torch.std(pixels, dim=(1, 2))):
            std_totals[band] = std_totals.get(band, 0) + float(band_std)

    # Turn the sums into averages over all images
    means_tuple = tuple(total / sample_count for total in mean_totals.values())
    stds_tuple = tuple(total / sample_count for total in std_totals.values())

    if verbose:
        print(means_tuple)
        print(stds_tuple)

    return torchvision.transforms.Normalize(means_tuple, stds_tuple)

In [None]:
# Get a list of all available files for training.
# glob returns the files in an arbitrary, filesystem-dependent order; sorting keeps the
# sample order (and with it the seeded shuffling of the data loader) reproducible.
samples = sorted(glob.glob(os.path.join(DATA_PATH, "*", "*.tif")))
print(len(samples))
# Fail early with a clear message instead of continuing with an empty dataset
assert samples, f"No .tif files found below {DATA_PATH!r}"

# Load data into custom torch data set
dataset = InMemoryDataset(samples)
# Uncomment to create normalizer from scratch
# dataset = InMemoryDataset(samples, lambda x:x)
# normalizer = create_normalizer_for_dataset(dataset, verbose=True)
# del dataset
# dataset = InMemoryDataset(samples, normalizer)

### Load Training Data ###

In [None]:
print(f"length of dataset: {len(dataset)}")

# Training loader: mini-batches of `batch_size` samples, reshuffled for every pass
batch_size = 128
train_dataloader = data.DataLoader(dataset, batch_size=batch_size, shuffle=True)

In [None]:
# Check shape of sample
# Draws one mini-batch from the training loader and displays the shape of its input tensor
next(iter(train_dataloader))[0].shape

## Define Model ##

In this section we define our model for the challenge.

In [None]:
class Net(nn.Module):
    """
    Model used for ML-Challenge

    Two stages of 5x5 convolution + ReLU + 2x2 max pooling, followed by three fully
    connected layers. The first fully connected layer expects a 13x13 feature map
    after the second pooling stage, which is what 64x64 inputs produce
    (64 -> 60 -> 30 -> 26 -> 13).
    """
    def __init__(self, in_channels: int = 12, num_classes: int = 10):
        """
        Model definition
        :param in_channels: number of bands (channels) of the input images
        :param num_classes: number of output scores, one per class
        """
        super().__init__()
        # Layers are created in a fixed order so that seeded weight initialization
        # stays reproducible; attribute names define the state-dict keys.
        self.conv1 = nn.Conv2d(in_channels, 24, 5)
        self.pool = nn.MaxPool2d(2, 2)
        self.conv2 = nn.Conv2d(24, 72, 5)

        self.fc1 = nn.Linear(72 * 13 * 13, 512)
        self.fc2 = nn.Linear(512, 124)
        self.fc3 = nn.Linear(124, num_classes)

    def forward(self, x):
        """
        Model forward pass
        :param x: batch of image samples, channels first
        :return: raw (unnormalized) class scores, one row per sample
        """
        x = self.pool(F.relu(self.conv1(x)))
        x = self.pool(F.relu(self.conv2(x)))
        x = torch.flatten(x, 1)  # flatten all dimensions except batch
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)  # no activation: the loss function works on raw scores
        return x

In [None]:
net = Net()

# Total number of scalar entries over all parameter tensors of the model
num_params = sum(param.numel() for param in net.parameters())

print("Number of to be trained model parameters: {}.".format(num_params))

Number of to be trained model parameters: 6345886.


In [None]:
# Name of the compute device: "cuda" when a GPU is visible to PyTorch, otherwise "cpu"
device = "cuda" if torch.cuda.is_available() else "cpu"

# Seed the CUDA random number generator with the notebook-wide seed
torch.cuda.manual_seed(seed_value)

# Report which device type was selected
print('[LOG] notebook with {} computation enabled'.format(str(device)))

[LOG] notebook with cuda computation enabled


In [None]:
# Show the GPU(s) visible to this runtime together with driver and memory information
!nvidia-smi

Sat May 21 14:17:13 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   43C    P8     9W /  70W |      3MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
# Move the model to the selected device before the optimizer is created from its parameters
net = net.to(device)

## Train Model ##

In this section we train our model with a cross-entropy loss using stochastic gradient descent.

In [None]:
# Define optimization
# Cross-entropy loss on the raw class scores of the network, minimized with SGD plus momentum
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(net.parameters(), lr=0.001, momentum=0.9)

In [None]:
# Keep the loss module on the same device as the model and the batches
criterion = criterion.to(device)

In [None]:
# Mean training loss of every epoch
train_epoch_losses = []
# Kept for parity with the original cell; nothing is appended to it in this loop
validation_epoch_losses = []

# Make sure the checkpoint directory exists before the first save
os.makedirs(MODELS_PATH, exist_ok=True)

epochs = 21
# Explicitly put the model into training mode
net.train()
for epoch in range(epochs):  # loop over the dataset multiple times

    # init collection of mini-batch losses
    train_mini_batch_losses = []
    for inputs, labels in train_dataloader:
        inputs = inputs.to(device)
        labels = labels.to(device)

        # zero the parameter gradients
        optimizer.zero_grad()

        # forward + backward + optimize
        outputs = net(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        # collect the mini-batch loss as a plain Python float
        train_mini_batch_losses.append(loss.item())

    # Per epoch store the training...
    train_epoch_loss = np.mean(train_mini_batch_losses)
    train_epoch_losses.append(train_epoch_loss)

    # ...print statistics, ...
    now = datetime.utcnow().strftime("%Y%m%d-%H:%M:%S")
    print(f"[LOG {now}] epoch: {epoch+1} train-loss: {train_epoch_loss}")
    # ...and save the model every 10 epochs.
    # `epoch` is zero-based here, so new_model_10.pth is written after the 11th epoch
    # and new_model_20.pth after the 21st (hence epochs = 21).
    if epoch % 10 == 0 and epoch != 0:
        print("Saving model")
        torch.save(
            net.state_dict(), os.path.join(MODELS_PATH, f"new_model_{epoch}.pth")
        )
print("Finished Training")

[LOG 20220521-14:17:31] epoch: 1 train-loss: 1.2495655044560183
[LOG 20220521-14:17:38] epoch: 2 train-loss: 0.7225492377699269
[LOG 20220521-14:17:45] epoch: 3 train-loss: 0.5199851150761283
[LOG 20220521-14:17:51] epoch: 4 train-loss: 0.4070551106573846
[LOG 20220521-14:17:58] epoch: 5 train-loss: 0.35387418210788923
[LOG 20220521-14:18:04] epoch: 6 train-loss: 0.31791704674185167
[LOG 20220521-14:18:11] epoch: 7 train-loss: 0.2868622368757759
[LOG 20220521-14:18:17] epoch: 8 train-loss: 0.26557582051833095
[LOG 20220521-14:18:24] epoch: 9 train-loss: 0.23800654451570238
[LOG 20220521-14:18:31] epoch: 10 train-loss: 0.2165242787362275
[LOG 20220521-14:18:37] epoch: 11 train-loss: 0.19528956457054444
Saving model
[LOG 20220521-14:18:44] epoch: 12 train-loss: 0.18068273457305692
[LOG 20220521-14:18:51] epoch: 13 train-loss: 0.16692306932891715
[LOG 20220521-14:18:58] epoch: 14 train-loss: 0.16431763231471816
[LOG 20220521-14:19:04] epoch: 15 train-loss: 0.1416901783638091
[LOG 20220521

## Create submission ##

#### Load Model

In [None]:
# List the saved checkpoints; {MODELS_PATH} is expanded by IPython, so the directory
# name is taken from the same constant that is used for saving and loading
!ls {MODELS_PATH}

new_model_10.pth  new_model_20.pth


In [None]:
# Checkpoint file (inside MODELS_PATH) to create the submission with
selected_model_path = "new_model_20.pth"
# Fresh model instance; it lives on the CPU
net = Net()
# The checkpoint may have been saved from a GPU model. map_location="cpu" loads its
# tensors onto the CPU, so loading also works on machines without CUDA.
net.load_state_dict(
    torch.load(os.path.join(MODELS_PATH, selected_model_path), map_location="cpu")
)

<All keys matched successfully>

#### Load submission data

In [None]:
# Fetch the submission data; the repository is cloned into ./submission_dataset
!git clone https://github.com/rbngz/submission_dataset


Cloning into 'submission_dataset'...
remote: Enumerating objects: 4235, done.[K
remote: Total 4235 (delta 0), reused 0 (delta 0), pack-reused 4235[K
Receiving objects: 100% (4235/4235), 301.48 MiB | 12.05 MiB/s, done.
Checking out files: 100% (4232/4232), done.


In [None]:
# Folder inside the cloned repository that is searched for the *.npy submission samples
SUBMISSION_DATA_PATH = "submission_dataset/testset"

In [None]:
# Per-band means used to normalize the submission images (one value per band).
# These differ from the statistics of the training normalizer.
# NOTE(review): presumably computed on the submission set with
# create_normalizer_for_dataset(..., verbose=True) - confirm
submission_means_tuple = (
    380.17328711583616,
    400.1497676971955,
    628.8646132355601,
    578.870857455104,
    943.4272711885449,
    1826.2433534560898,
    2116.6662455740857,
    2205.972884006897,
    2266.934157142567,
    1487.6910683644517,
    959.236167229867,
    2281.1860589241937
)
# Per-band standard deviations, in the same band order as the means above
submission_stds_tuple = (
    115.17434877174112,
    209.14842754591166,
    241.20653977105658,
    301.1056228200069,
    269.5139533673432,
    420.2497496130561,
    503.8183661547185,
    598.040304209199,
    403.93781724898935,
    398.143166872752,
    342.44079144555366,
    529.4133153492427
)
# Default normalizer of SubmissionDataset: (value - mean) / std, applied per band
submission_normalizer = torchvision.transforms.Normalize(
    submission_means_tuple, submission_stds_tuple
)

class SubmissionDataset(torch.utils.data.Dataset):
    """
    Dataset which eagerly loads all submission samples (``.npy`` files) into memory.

    Samples are ordered by the first number found in their file name, so that the
    position in the dataset corresponds to the numeric order of the files. The
    arrays are read channels-last, the band at index 8 is moved to the end, and the
    result is converted to a channels-first, normalized float tensor.
    """

    def __init__(self, submission_samples, normalizer=submission_normalizer):
        """
        :param submission_samples: iterable of paths to the ``.npy`` files
        :param normalizer: callable applied to the float tensor of each image
        :raises IndexError: if a file name does not contain a number
        """
        self.x = []

        # Sort files by index. The number is searched in the file name only (not in
        # the whole path), so digits in a directory name cannot be picked up by
        # mistake; if two files share a number, the later one wins.
        indexed_samples = {}
        for submission_sample in submission_samples:
            file_name = os.path.basename(submission_sample)
            sample_index = int(re.findall(r"\d+", file_name)[0])
            indexed_samples[sample_index] = submission_sample

        for _, submission_sample in tqdm(
            sorted(indexed_samples.items()),
            desc="Loading submission samples"
        ):
            # Extract bands
            img = np.load(submission_sample)

            # SWAP BANDS: take the band at index 8 out and re-insert it at index 11,
            # i.e. as the last of the 12 bands the normalizer expects
            tmp = img[:, :, 8].copy()
            img = np.delete(img, 8, axis=2)
            img = np.insert(img, 11, tmp, axis=2)

            tens = torch.from_numpy(img.astype(int))
            # Channels-last -> channels-first.
            # NOTE(review): permute(2, 1, 0) also swaps the two spatial axes (a plain
            # channels-first conversion would be permute(2, 0, 1)); confirm that this
            # matches the orientation of the training images.
            tens = tens.permute(2, 1, 0)

            # Normalize
            tens = normalizer(tens.float())
            self.x.append(tens)

    def __getitem__(self, index):
        """Return the normalized image tensor at ``index`` (there are no labels)."""
        return self.x[index]

    def __len__(self):
        """Return the number of loaded samples."""
        return len(self.x)

In [None]:
# Collect every .npy file of the submission test set and load it into memory
npy_pattern = os.path.join(SUBMISSION_DATA_PATH, "*.npy")
submission_testset_samples = glob.glob(npy_pattern)
submission_dataset = SubmissionDataset(submission_testset_samples)
# Uncomment to create normalizer from scratch
# submission_dataset = SubmissionDataset(submission_testset_samples,lambda x:x)
# sub_normalizer = create_normalizer_for_dataset(submission_dataset, verbose=True)
# del submission_dataset
# submission_dataset = SubmissionDataset(submission_testset_samples, sub_normalizer)

# One sample per batch and no shuffling, so predictions come out in dataset order
submission_dataloader = data.DataLoader(submission_dataset, batch_size=1, shuffle=False)

Loading submission samples: 100%|██████████| 4232/4232 [00:04<00:00, 970.15it/s]


#### Create submission ####

In [None]:
def create_submission(net, submission_dataloader, filename: str = "submission.csv", label_map=None):
    """
    Helper method which creates a Kaggle submission file from a given model and data loader.

    The model is run on every batch, the class with the highest score is looked up
    in ``label_map``, and one ``test_id,label`` row per sample is written to a CSV
    file. ``test_id`` is the running sample number, starting at 0, in the order in
    which the loader yields the samples. Batches of any size are supported.

    :param net: model used for the predictions; it is switched to eval mode for
        inference and restored to its previous mode afterwards
    :param submission_dataloader: iterable yielding batches of input tensors (no labels)
    :param filename: path of the CSV file to write
    :param label_map: mapping from class index to label name; defaults to the
        notebook-level ``classes_to_label``
    :return: None
    """
    if label_map is None:
        label_map = classes_to_label

    # Run inference on the device the model lives on (if it has any parameters)
    first_param = next(net.parameters(), None)
    device = first_param.device if first_param is not None else None

    submission_results = []

    was_training = net.training
    net.eval()
    try:
        with torch.no_grad():
            for images in submission_dataloader:
                if device is not None:
                    images = images.to(device)
                outputs = net(images)
                _, predicted = torch.max(outputs, 1)
                # One row per sample of the batch, not only for the first one
                for class_id in predicted.tolist():
                    index = len(submission_results)
                    label = label_map[class_id]
                    # Print predicted sample every once in a while
                    if index % 1000 == 0:
                        print(f"Predicted: {label}")
                    submission_results.append([index, label])
    finally:
        # Leave the model in the mode the caller handed it over in
        net.train(was_training)

    # field names
    fields = ["test_id", "label"]

    # writing to csv file; newline="" lets the csv module control the line endings
    # (otherwise blank lines appear between rows on Windows)
    with open(filename, "w", newline="") as csvfile:
        # creating a csv writer object
        csv_writer = csv.writer(csvfile)

        # writing the fields
        csv_writer.writerow(fields)

        # writing the data rows
        csv_writer.writerows(submission_results)
    print(f"Submission was written to ./{filename}")

In [None]:
# Predict the submission samples with the loaded model; writes the default file submission.csv
create_submission(net, submission_dataloader)

Predicted: Highway
Predicted: Highway
Predicted: River
Predicted: Pasture
Predicted: SeaLake
Submission was written to ./submission.csv


In [None]:
# Sanity check: read the written submission back and count the predictions per class
import pandas as pd
df = pd.read_csv("submission.csv")
df["label"].value_counts()

SeaLake                 1012
PermanentCrop            714
Highway                  572
River                    408
AnnualCrop               393
HerbaceousVegetation     365
Pasture                  335
Industrial               180
Forest                   149
Residential              104
Name: label, dtype: int64