### We can construct a mosaic of nearby tiles using this method: https://www.kaggle.com/c/planet-understanding-the-amazon-from-space/discussion/36738

# Import Necessary Libraries

In [81]:
import numpy as np
import os
import matplotlib
from matplotlib import pyplot as plt
import pandas as pd
import tensorflow as tf
from tensorflow import keras as K
from tensorflow.keras.layers import Conv2D, MaxPooling2D, GlobalAveragePooling2D, Dropout, Input, Dense, Activation, BatchNormalization, Flatten
from tensorflow.keras.models import Sequential, Model
import math
import re
from PIL import Image

# Preprocess data
---

### Obtain Labels

In [2]:
train_data = pd.read_csv('data/train_v2.csv')

# Give every distinct tag an integer id in order of first appearance.
# (No `multihot` placeholder is created here: that name belongs to the
# encoder function, and rebinding it to a dict would break the function
# whenever this cell is re-run after the function has been defined.)
curr_count = 0
unique_labels = {}
for line in train_data['tags'].values:
    for label in line.split():
        if label not in unique_labels:
            unique_labels[label] = curr_count
            curr_count += 1

n_labels = len(unique_labels)

# mapping[tag] is the one-hot row for that tag (1 at the tag's id, 0 elsewhere).
one_hot_rows = np.eye(n_labels)
mapping = {name: one_hot_rows[tag_id] for name, tag_id in unique_labels.items()}

# Inverse lookup: integer id -> tag name.
label2name = {v: k for k, v in unique_labels.items()}

print(label2name)

{0: 'haze', 1: 'primary', 2: 'agriculture', 3: 'clear', 4: 'water', 5: 'habitation', 6: 'road', 7: 'cultivation', 8: 'slash_burn', 9: 'cloudy', 10: 'partly_cloudy', 11: 'conventional_mine', 12: 'bare_ground', 13: 'artisinal_mine', 14: 'blooming', 15: 'selective_logging', 16: 'blow_down'}


### View Head of dataset

In [3]:
# Preview the first ten rows of the label table.
train_data.head(10)

Unnamed: 0,image_name,tags
0,train_0,haze primary
1,train_1,agriculture clear primary water
2,train_2,clear primary
3,train_3,clear primary
4,train_4,agriculture clear habitation primary road
5,train_5,haze primary water
6,train_6,agriculture clear cultivation primary water
7,train_7,haze primary
8,train_8,agriculture clear cultivation primary
9,train_9,agriculture clear cultivation primary road


### Auxiliary function for multi-hot encoding the tags

In [4]:
def multihot(label):
    """Encode a whitespace-separated tag string as a multi-hot vector.

    Args:
        label: tag string such as 'haze primary'; tags may be separated by
            any run of whitespace, and leading/trailing whitespace is ignored.

    Returns:
        numpy array of length n_labels with 1 at the position of every tag
        present in `label` and 0 everywhere else.

    Raises:
        KeyError: if a tag is not a key of `mapping`.
    """
    z = np.zeros(n_labels)
    # split() with no argument matches how the tag vocabulary was tokenized and
    # never yields empty tokens, unlike split(' ') on repeated or edge spaces.
    for token in label.split():
        # Element-wise maximum keeps entries in {0, 1} even if a tag repeats.
        np.maximum(z, mapping[token], out=z)

    return z

In [96]:
# first pass, construct a list of image strips
train_path = 'data/train-jpg/'
dataset_length = len(os.listdir(train_path))

# shuffle = False keeps the file listing in a deterministic order.
images_ds = tf.data.Dataset.list_files(f"{train_path}*", shuffle = False)
steps = math.floor(0.05 * dataset_length)

# Every image file needs a row in the label table; fail loudly otherwise.
assert len(train_data) >= dataset_length, "more image files than label rows"

# y[i] is the multi-hot encoding of row i of train_data. The loop variable is
# deliberately not called `iter`, so the builtin stays usable in later cells.
y = np.zeros(shape = (dataset_length, n_labels))
for row_idx, tags in enumerate(train_data['tags'].values[:dataset_length]):
    y[row_idx] = multihot(tags)

### Split into a test and train set

In [100]:
# The first 80% of the file listing is used for training, the rest is held out.
train_length = math.floor(dataset_length * 0.8)
X_train = images_ds.take(train_length)
X_test = images_ds.skip(train_length)

print(f"{len(images_ds) = }, {len(X_train) = }, {len(X_test) = }")

len(images_ds) = 40479, len(X_train) = 32383, len(X_test) = 8096


In [102]:
# Spot-check the first three training paths; the listing was created with
# shuffle = False, so this also shows the order in which files are consumed.
for path in X_train.take(3):
    print(f'{path = }')

path = <tf.Tensor: shape=(), dtype=string, numpy=b'data\\train-jpg\\train_0.jpg'>
path = <tf.Tensor: shape=(), dtype=string, numpy=b'data\\train-jpg\\train_1.jpg'>
path = <tf.Tensor: shape=(), dtype=string, numpy=b'data\\train-jpg\\train_10.jpg'>


### Getting the first file for testing purposes

In [103]:
test_file = None

# Take a single path tensor, echo its raw bytes, and keep the decoded string
# around for ad-hoc experiments.
for path_tensor in X_train.take(1):
    raw_path = path_tensor.numpy()
    print(raw_path)
    test_file = raw_path.decode("utf-8")

b'data\\train-jpg\\train_0.jpg'


### Function to grab the label from the filename

In [104]:
def get_label(file_path):
    """Look up the label row for an image path.

    Args:
        file_path: path to an image file, as str or UTF-8 encoded bytes.
            Both backslash and forward-slash separators are accepted.

    Returns:
        y[N], where N is the first run of digits in the file's base name
        (for example 'train_12.jpg' gives y[12]).

    Raises:
        IndexError: if the base name contains no digits.
    """
    if isinstance(file_path, bytes):
        file_path = file_path.decode("utf-8")

    # Split on either separator so that digits in a directory name are never
    # mistaken for the image number on non-Windows paths.
    file = re.split(r'[\\/]', file_path)[-1]
    idx = int(re.findall(r'\d+', file)[0])

    return y[idx]

### Function to build an (image, label) pair from a file name

In [105]:
def parse_function(filename, resize = (256, 256)):
    """Load one image from disk and pair it with its label.

    The image is opened with PIL, so `filename` has to be something
    `Image.open` can read (a plain path string), not a symbolic tensor.

    Args:
        filename: string representing path to image
        resize: (height, width) passed to tf.image.resize

    Returns:
        Tuple (img, label): `img` is the RGB image divided by 255 and resized
        to `resize`; `label` is whatever get_label(filename) returns.
    """
    label = get_label(filename)

    # The context manager releases the file handle as soon as the pixel data
    # has been copied into a numpy array.
    with Image.open(filename) as raw:
        pixels = np.asarray(raw.convert("RGB")) / 255
    img = tf.convert_to_tensor(pixels)
    img = tf.image.resize(img, resize)

    return img, label

In [106]:
def create_dataset(filenames, labels = y, is_training=True, batch_size=512):
    """Build a batched tf.data.Dataset of (image, label) pairs.

    Args:
        filenames: list of image paths, or a tf.data.Dataset of path strings
        labels: accepted so existing calls keep working; it is not read,
            because parse_function derives each label from the file name
        is_training: boolean to indicate training mode (currently no effect)
        batch_size: number of examples per batch

    Returns:
        tf.data.Dataset yielding (images, labels) batches, both float32.
    """
    # A Dataset of paths is used as-is; anything else is sliced into one.
    if isinstance(filenames, tf.data.Dataset):
        paths_ds = filenames
    else:
        paths_ds = tf.data.Dataset.from_tensor_slices(filenames)

    def _load_eagerly(path):
        # Runs eagerly inside tf.py_function, so the path tensor can be turned
        # into the plain string that the PIL-based parser needs.
        image, label = parse_function(path.numpy().decode("utf-8"))
        return tf.cast(image, tf.float32), tf.cast(label, tf.float32)

    def _load(path):
        image, label = tf.py_function(_load_eagerly, [path], [tf.float32, tf.float32])
        # py_function drops static shapes; restore them so Keras can build.
        # 256x256 is parse_function's default resize, 3 comes from the RGB conversion.
        image.set_shape([256, 256, 3])
        label.set_shape([n_labels])
        return image, label

    dataset = paths_ds.map(_load)

    # Batch the data for multiple steps
    dataset = dataset.batch(batch_size)

    return dataset

In [None]:
# Build the training and validation pipelines from the two path splits.
train_ds, val_ds = create_dataset(X_train), create_dataset(X_test)

### Display a target image

In [None]:
def show_image(idx, X, y):
    """Plot image X[idx], titled with the tag names that are set in y[idx].

    Args:
        idx: position of the example inside X and y
        X: indexable collection of images accepted by plt.imshow
        y: indexable collection of multi-hot label vectors aligned with X
    """
    img = X[idx]

    # Translate the positions of the non-zero label entries back to tag names.
    active = np.flatnonzero(np.asarray(y[idx]) > 0)
    tags = ' '.join(label2name[int(k)] for k in active)

    plt.imshow(img)
    plt.title(tags)
    plt.show()

In [None]:
i = 0
batch = None

# Only the first batch is needed, so restrict the iteration to one element.
for e in train_ds.take(1):
    batch = e

# show_image(0, batch[0], batch[1])
batch[0][240]

In [None]:
# Dump the first element of the batch tuple for inspection.
print(f"{batch[0] = }")

### Define evaluation function

In [None]:
def macro_f1(y, y_hat, thresh=0.5):
    """Compute the macro F1-score on a batch of observations (average F1 across labels)

    Args:
        y (Tensor): 0/1 labels array of shape (BATCH_SIZE, N_LABELS); any
            numeric dtype, it is cast to float32 internally
        y_hat (float Tensor): probability matrix from forward propagation of shape (BATCH_SIZE, N_LABELS)
        thresh: probability value above which we predict positive

    Returns:
        scalar float32 Tensor: value of macro F1 for the batch
    """
    # Labels may arrive as int32 or float64; the products below need the same
    # dtype as y_pred, otherwise TensorFlow raises a dtype mismatch.
    y = tf.cast(y, tf.float32)
    y_pred = tf.cast(tf.greater(y_hat, thresh), tf.float32)

    # Per-label counts, reduced over the batch axis.
    tp = tf.cast(tf.math.count_nonzero(y_pred * y, axis=0), tf.float32)
    fp = tf.cast(tf.math.count_nonzero(y_pred * (1 - y), axis=0), tf.float32)
    fn = tf.cast(tf.math.count_nonzero((1 - y_pred) * y, axis=0), tf.float32)

    # The small constant avoids 0/0 for labels with no positives at all.
    f1 = 2*tp / (2*tp + fn + fp + 1e-16)
    return tf.reduce_mean(f1)

# Construct model
---

In [None]:
# Three convolution + max-pooling stages feed a two-layer dense head; the final
# sigmoid layer emits one independent probability per label.
ds_model = Sequential([
    Conv2D(28, (3, 3),
        padding = 'Same',
        activation = 'relu',
        input_shape = (256, 256, 3)),
    MaxPooling2D(pool_size = (2, 2)),

    Conv2D(28, (3, 3), activation = 'relu'),
    MaxPooling2D(pool_size = (2, 2)),

    Conv2D(28, (3, 3), activation = 'relu'),
    MaxPooling2D(pool_size = (2, 2)),

    Flatten(),

    Dense(200, activation = 'relu'),
    Dropout(0.2),

    Dense(100, activation = 'relu'),
    Dropout(0.1),

    Dense(n_labels, activation = 'sigmoid'),
])

### Compile the model

In [None]:
# Adam optimizer with an explicit learning rate of 0.01.
opt = K.optimizers.Adam(learning_rate=0.01)

# Binary cross-entropy scores each label's sigmoid output separately;
# macro_f1 is reported as the metric.
ds_model.compile(optimizer=opt,
    loss = 'binary_crossentropy',
    metrics=[macro_f1])

# Train model
---

In [None]:
# `batchsize` is kept only as a named value: train_ds and val_ds are already
# batched by create_dataset, and Keras rejects an explicit batch_size when the
# input is a tf.data.Dataset.
batchsize, epochs = 32, 60

ds_history = ds_model.fit(train_ds,
    epochs = epochs,
    validation_data = val_ds,
    verbose = 1)

# View results
---

In [None]:
i = 0
batch1 = None

# Pull exactly one batch out of the training pipeline.
for batch in train_ds.take(1):
    batch1 = batch
    i += 1

# NOTE: batch1 is a TUPLE, not a tensor.
# Element 0 holds the features, of dimension 512x256x256x3:
# 512 examples per batch, each a 256x256x3 image.
# Element 1 holds the matching multihot encodings.

print(f"{batch1 = }")

In [None]:
# Predicted per-label probabilities for the images in the sampled batch.
y_hat_probs = ds_model.predict(batch1[0])

In [None]:
# Predicted probabilities for the example at position 240 of the batch.
y_hat_probs[240]

In [None]:
# Ground truth for the same example. It is read from the batch's own labels:
# position 240 in the batch is not image number 240, because the file listing
# is ordered lexicographically (train_0, train_1, train_10, ...).
batch1[1][240]