# Data Loading/Fetching

In [None]:
import subprocess
import sys
from pathlib import Path

In [None]:
# Directory the notebook kernel is currently running in.
BASE_DIR = Path.cwd()
# Folder under BASE_DIR named after the Kaggle competition.
REPO_DIR = BASE_DIR.joinpath('planet-understanding-the-amazon-from-space')

In [None]:
# Fetch the competition archive only when the data folder is not present yet.
if not REPO_DIR.is_dir():
    # Every CLI token must be its own list element: without shell=True,
    # subprocess treats a single string as the name of the executable and
    # fails to find it.
    # REPO_DIR is named after the competition, so its name doubles as the slug.
    # check=True raises CalledProcessError when the download fails instead of
    # letting the notebook continue without data.
    # NOTE(review): the CLI drops a .zip into the working directory; this guard
    # only turns false once the data has been unpacked into REPO_DIR — confirm.
    subprocess.run(
        ["kaggle", "competitions", "download", "-c", REPO_DIR.name],
        check=True,
    )

In [5]:
# Download the competition archive with the Kaggle command-line client.
# Expecting ~/.kaggle/kaggle.json (the API credentials file) to exist.

!kaggle competitions download -c "planet-understanding-the-amazon-from-space"

Downloading planet-understanding-the-amazon-from-space.zip to /mnt/c/Users/bills/Desktop/Amazon_Rainforest
100%|██████████████████████████████████████| 2.94M/2.94M [00:01<00:00, 3.21MB/s]
100%|██████████████████████████████████████| 2.94M/2.94M [00:01<00:00, 2.94MB/s]


In [None]:
import sys

# Add the project's img directory to the module search path.
# The membership check keeps the cell idempotent: re-running it no longer
# appends the same entry to sys.path again.
img_module_dir = '/content/planet-amazon-deforestation/img'
if img_module_dir not in sys.path:
    sys.path.append(img_module_dir)

In [None]:
import os
import cv2 as cv
import gc
import numpy as np
import pandas as pd
import seaborn as sns
import tensorflow as tf
import matplotlib.pyplot as plt

# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Ask the inline backend for high-resolution ('retina') figures.
%config InlineBackend.figure_format = 'retina'

Import the files "train_v2.csv", "test_v2_file_mapping.csv", and "sample_submission_v2.csv".

In [None]:
# Locations of the CSV files that come with the Kaggle competition data.
# NOTE(review): absolute '/content' paths — presumably a Colab runtime; confirm.
train_path = "/content/train_v2.csv"
test_path = "/content/test_v2_file_mapping.csv"
sample_path = "/content/sample_submission_v2.csv"

In [None]:
# Read the training CSV into a DataFrame and preview its first five rows.
labels_tr_df = pd.read_csv(filepath_or_buffer=train_path)
labels_tr_df.head(n=5)

In [None]:
# Read the test file-mapping CSV into a DataFrame and preview its first five rows.
labels_tst_df = pd.read_csv(filepath_or_buffer=test_path)
labels_tst_df.head(n=5)

In [None]:
# Print all unique tags
from itertools import chain

# Split each image's space-separated tag string once; the result is reused
# below for exact tag matching.
tags_per_image = [tags.split(" ") for tags in labels_tr_df['tags'].values]
labels_list = list(chain.from_iterable(tags_per_image))
labels_set = set(labels_list)
print("There is {} unique labels including {}".format(len(labels_set), labels_set))

# One example file name per label, in the iteration order of labels_set.
# Membership in the split tag list is an exact match; a substring test such as
# str.contains would also select rows whose tags merely contain the label text
# (and would interpret the label as a regular expression).
images_title = []
for i, label in enumerate(labels_set):
    has_label = [label in tags for tags in tags_per_image]
    candidates = labels_tr_df.loc[has_label, 'image_name']
    # Keep picking the i-th matching row (this spreads the examples over
    # different images), but fall back to the last match when a label has
    # fewer than i + 1 images instead of raising IndexError.
    row = min(i, len(candidates) - 1)
    images_title.append(candidates.iloc[row] + '.jpg')
     

In [None]:
# Histogram of label instances
# value_counts() returns the labels sorted by decreasing number of occurrences.
labels_s = pd.Series(labels_list).value_counts()
fig, ax = plt.subplots(figsize=(16, 8))
# Draw on the axes created above explicitly instead of relying on the
# implicit "current axes", and label the figure so it can be read on its own.
sns.barplot(x=labels_s, y=labels_s.index, orient='h', ax=ax)
ax.set_title("Label occurrences in the training set")
ax.set_xlabel("Occurrences")
ax.set_ylabel("Label")
plt.show()

As expected, some classes are heavily represented whereas others are barely present in this dataset. There is a risk that our model will barely learn the rare classes, or that they will even be excluded from the training data when splitting between training and validation sets.

In [None]:
# Training hyperparameters.
batch_size = 128
epochs = 20
# Fraction of the training data held out for validation.
validation_split_size = 0.2
# New image size required for the Xception model.
# NOTE(review): the transforms later in this notebook resize to 224 for
# ResNet18 — confirm whether this value is still used.
img_resize = (74, 74)

# Data Preprocessing

In [None]:
import cv2
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from torchvision import transforms as T, models
from torch.optim import Adam
from torch.optim.lr_scheduler import StepLR
!pip install -q torchsummary --user
from torchsummary import summary


# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"device: {device}")

## Class visualisation

Let's now observe each label individually. Each image is mapped to a list of labels, with a total of 17 different labels. All 17 labels are "almost" independent, meaning that, for example, primary can be found along with slash burn. I say "almost" because cloudy affects visibility, so no other label can be found in the same image.

For the sake of clarity, I only displayed one label per image in the figure below. When images are associated with multiple labels, I displayed them multiple times (e.g. primary and haze).

In [None]:
# Extract train-jpg.tar.7z into the current directory (7z `e` extracts files
# without recreating directory paths).
# NOTE(review): presumably this yields the inner train-jpg.tar — confirm.
!7z e train-jpg.tar.7z

In [None]:
# Switch the notebook's working directory to /content/.
%cd /content/
# Stream the archive's contents to stdout (-so) and unpack that tar stream
# into /content/ without writing an intermediate .tar file.
!7z x -so train-jpg.tar.7z | tar xf - -C /content/

In [None]:
# Directory the training archive was unpacked into.
path = '/content/'
# Folder holding the training JPEG images.
path_train_jpg = os.path.join(path, 'train-jpg')

In [None]:
# Show one example image per tag in a grid that is filled column by column.
all_tags = list(labels_set)
N_tags = len(all_tags)
n_rows = 4
# Ceiling division: just enough columns to hold every tag (at least one).
n_cols = max(1, -(-N_tags // n_rows))
# squeeze=False keeps `axes` two-dimensional even for a single column.
fig, axes = plt.subplots(n_rows, n_cols, figsize=(20, 20), squeeze=False)

# Split the space-separated tag strings once so tags can be matched exactly;
# a substring test would also pick images whose tags merely contain the text.
tags_per_image = [tags.split(" ") for tags in labels_tr_df.tags.values]

for idx, tag in enumerate(all_tags):
    has_tag = [tag in tags for tags in tags_per_image]
    # First image in the training table that carries this tag.
    filename = labels_tr_df.loc[has_tag].image_name.values[0]
    img_path = os.path.join(path_train_jpg, filename + ".jpg")
    img = cv2.imread(img_path)
    # cv2.imread returns None instead of raising when the file cannot be read.
    if img is None:
        raise FileNotFoundError(f"Could not read image: {img_path}")
    # OpenCV loads images as BGR; matplotlib expects RGB.
    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    idx_col = idx // n_rows
    idx_row = idx % n_rows
    axes[idx_row][idx_col].set_title(tag)
    axes[idx_row][idx_col].imshow(img)

# Remove every grid cell that did not receive an image, whatever the tag count.
for unused in range(N_tags, n_rows * n_cols):
    axes[unused % n_rows][unused // n_rows].remove()

We can make a few remarks here:

- Some labels like "water" or "road" are challenging to differentiate.
- Some rare labels like selective logging and blooming are also hard to discriminate, and are barely visible at all.
- Strong correlations can be expected between labels like habitation, road and cultivation.

## Transformations

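ResNet18 needs input shapes that are multiples of 32, and in our case we have inputs of size 256. Although 256 is itself a multiple of 32, we use 224 (also a multiple of 32), which is the input size the pretrained ResNet weights were trained with.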

Therefore, we rescale our input data to this size, and we also normalize our dataset using the mean and standard deviation intensity values of the pretrained ResNet. ToTensor() is used to scale our image values from the 0-255 range to the 0-1 range.

In [None]:
def get_transforms():
    """Build the preprocessing pipelines for training and validation images.

    Both pipelines apply the same steps: convert the input to a PIL image,
    resize it to 224, convert it to a tensor, and normalize each channel with
    the mean / standard deviation used for the pretrained ResNet.

    Returns:
        A ``(transform_train, transform_val)`` tuple of two separate
        ``T.Compose`` objects.
    """
    resnet_mean = (0.485, 0.456, 0.406)
    resnet_std = (0.229, 0.224, 0.225)

    def build_pipeline():
        # A fresh Compose (with its own mean/std lists) is created on every call
        # so the train and validation pipelines stay independent objects.
        steps = [
            T.ToPILImage(),
            T.Resize(224),
            T.ToTensor(),
            T.Normalize(mean=list(resnet_mean), std=list(resnet_std)),
        ]
        return T.Compose(steps)

    return build_pipeline(), build_pipeline()