### We can construct a mosaic of nearby tiles using this method: https://www.kaggle.com/c/planet-understanding-the-amazon-from-space/discussion/36738

# Import Necessary Libraries

In [2]:
import numpy as np
import os
import matplotlib
from matplotlib import pyplot as plt
import pandas as pd
import tensorflow as tf
from tensorflow import keras as K
from tensorflow.keras.layers import Conv2D, MaxPooling2D, GlobalAveragePooling2D, Dropout, Input, Dense, Activation, BatchNormalization, Flatten
from tensorflow.keras.models import Sequential, Model
import math
import re
from PIL import Image

In [3]:
np.version.version

'1.22.1'

# Defining Constants

In [4]:
DS_BATCH_SIZE = 512
MODEL_BATCH_SIZE = 32
KERNEL_SIZE = 3
IMG_DIMS = 256
EPOCHS = 32
DATA_PATH = 'data/train-jpg/'

# Preprocess data
---

### Obtain Labels

In [5]:
train_data = pd.read_csv('data/train_v2.csv')

curr_count = 0
unique_labels = {}
multihot = {}
for line in train_data['tags'].values:
    for label in line.split():
        if label not in unique_labels:
            unique_labels[label] = curr_count
            curr_count += 1

mapping = {}

for k, v in unique_labels.items():
    mapping[k] = np.zeros(len(unique_labels))
    mapping[k][v] = 1

n_labels = len(mapping)
label2name = {v: k for k, v in unique_labels.items()}

print(label2name)

{0: 'haze', 1: 'primary', 2: 'agriculture', 3: 'clear', 4: 'water', 5: 'habitation', 6: 'road', 7: 'cultivation', 8: 'slash_burn', 9: 'cloudy', 10: 'partly_cloudy', 11: 'conventional_mine', 12: 'bare_ground', 13: 'artisinal_mine', 14: 'blooming', 15: 'selective_logging', 16: 'blow_down'}


### View Head of dataset

In [6]:
train_data.head(n = 11)

Unnamed: 0,image_name,tags
0,train_0,haze primary
1,train_1,agriculture clear primary water
2,train_2,clear primary
3,train_3,clear primary
4,train_4,agriculture clear habitation primary road
5,train_5,haze primary water
6,train_6,agriculture clear cultivation primary water
7,train_7,haze primary
8,train_8,agriculture clear cultivation primary
9,train_9,agriculture clear cultivation primary road


In [7]:
mapping

{'haze': array([1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]),
 'primary': array([0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]),
 'agriculture': array([0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]),
 'clear': array([0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]),
 'water': array([0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]),
 'habitation': array([0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]),
 'road': array([0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]),
 'cultivation': array([0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.]),
 'slash_burn': array([0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.]),
 'cloudy': array([0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.]),
 'partly_cloudy': array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0.]),
 'convention

### Auxiliary Function for multi-hotting

In [8]:
def multihot(label, iter):
    z = np.zeros(n_labels)
    tokens = label.split(' ')

    for k in range(len(tokens)):
        z += mapping[tokens[k]]

    return z

### Function to grab the label from the filename

In [9]:
def get_label(file_path):
    x = file_path.split('\\')
    file = x[-1]
    idx = int(re.findall(r'\d+', file)[0])

    return train_data['tags'][idx]

In [27]:
def readImage(filename, resize = [IMG_DIMS, IMG_DIMS]):
    full_path = DATA_PATH + filename.decode("utf-8") + '.jpg'
    print(f"{full_path = }")
    img = Image.open(full_path).convert("RGB")
    img = np.asarray(img) / 255
    img = tf.convert_to_tensor(img)
    img = tf.image.resize(img, resize)

    print(f"{img = }")

    return img

def parse_function(filename, label):
    """Function that returns a tuple of normalized image array and labels array.
    Args:
        filename: string representing path to image
        label: 0/1 one-dimensional array of size N_LABELS
    """

    # print(f"{DATA_PATH + filename + '.jpg'}")

    # image_string = tf.io.read_file(DATA_PATH + filename + '.jpg')
    # image_decoded = tf.image.decode_jpeg(image_string)
    # image_resized = tf.image.resize(image_decoded, resize)
    # image_normalized = image_resized / 255.0

    # print(f"{image_normalized = }")

    img = tf.numpy_function(readImage, [filename], tf.float32)

    # # print(f"{img = }")

    return img, label

In [28]:
# first pass, construct a list of image strips

file_paths = train_data['image_name'].values
labels_string = train_data['tags'].values
ds_train = tf.data.Dataset.from_tensor_slices((file_paths, labels_string))
ds_train = ds_train.map(parse_function)

for i in ds_train.take(2):
    print(f"{i[0][0, 0] = }")

# dataset_length = len(os.listdir(DATA_PATH))

# images_ds = tf.data.Dataset.list_files(f"{DATA_PATH}*", shuffle = False)
# steps = math.floor(0.05 * dataset_length)

# X = []
# y = np.zeros(shape = (dataset_length, n_labels))
# for iter, filename in enumerate(os.listdir(DATA_PATH)[:dataset_length]):
#     label = get_label(filename)
#     y[iter] = multihot(label, iter)
#     X.append(DATA_PATH + filename)

full_path = 'data/train-jpg/train_0.jpg'
img = <tf.Tensor: shape=(256, 256, 3), dtype=float32, numpy=
array([[[0.38039216, 0.4392157 , 0.4117647 ],
        [0.36862746, 0.4392157 , 0.4       ],
        [0.36862746, 0.4392157 , 0.39607844],
        ...,
        [0.38039216, 0.4392157 , 0.40392157],
        [0.38039216, 0.43529412, 0.40392157],
        [0.37254903, 0.42745098, 0.40392157]],

       [[0.36862746, 0.42745098, 0.4       ],
        [0.36078432, 0.42745098, 0.39215687],
        [0.36078432, 0.43137255, 0.39215687],
        ...,
        [0.37254903, 0.43137255, 0.4       ],
        [0.37254903, 0.42745098, 0.40392157],
        [0.36862746, 0.42352942, 0.40392157]],

       [[0.3529412 , 0.41568628, 0.39215687],
        [0.34901962, 0.41568628, 0.3882353 ],
        [0.3529412 , 0.42352942, 0.3882353 ],
        ...,
        [0.3647059 , 0.41960785, 0.39607844],
        [0.36078432, 0.41960785, 0.4       ],
        [0.35686275, 0.41960785, 0.4       ]],

       ...,

       [[0.3

### Split into a test and train set

In [18]:
train_length = math.floor(0.8 * dataset_length)
X_train, X_test = X[:train_length], X[train_length:]
y_train_bin, y_test_bin = y[:train_length], y[train_length:]
# X_train, X_test = images_ds.take(train_length), images_ds.skip(train_length)

print(f"{len(images_ds) = }, {len(X_train) = }, {len(X_test) = }")

len(images_ds) = 40479, len(X_train) = 32383, len(X_test) = 8096


### Getting the first file for testing purposes

In [19]:
test_file = None

for filename in X_train[:1]:
    print(f"{filename = }")
    test_file = filename

filename = 'data/train-jpg/train_0.jpg'


### Function to encode a 2-tuple of tensors from the name of the file

In [30]:
def parse_function(filename, label, resize = [IMG_DIMS, IMG_DIMS]):
    """Function that returns a tuple of normalized image array and labels array.
    Args:
        filename: string representing path to image
        label: 0/1 one-dimensional array of size N_LABELS
    """

    # image_string = tf.io.read_file(filename)
    # image_decoded = tf.image.decode_jpeg(image_string)
    # image_resized = tf.image.resize(image_decoded, [256, 256])
    # image_normalized = image_resized / 255.0

    print(f"{tf.convert_to_tensor(filename).numpy() = }")
    img = Image.open(filename).convert("RGB")
    img = np.asarray(img) / 255
    img = tf.convert_to_tensor(img)
    img = tf.image.resize(img, resize)

    print(f"{img = }")

    return img, label

In [21]:
def create_dataset(filenames, labels):
    """Load and parse dataset.
    Args:
        filenames: list of image paths
        labels: numpy array of shape (BATCH_SIZE, N_LABELS)
        is_training: boolean to indicate training mode
    """

    # Create a first dataset of file paths and labels
    dataset = tf.data.Dataset.from_tensor_slices((filenames, labels))

    # Parse and preprocess observations in parallel
    dataset = dataset.map(parse_function)
        
    # Batch the data for multiple steps
    dataset = dataset.batch(DS_BATCH_SIZE)
    
    return dataset

In [31]:
train_ds = create_dataset(X_train, y_train_bin)
val_ds = create_dataset(X_test, y_test_bin)

AttributeError: in user code:

    File "C:\Users\notda\AppData\Local\Temp/ipykernel_29584/1512394455.py", line 13, in parse_function  *
        print(f"{tf.convert_to_tensor(filename).numpy() = }")

    AttributeError: 'Tensor' object has no attribute 'numpy'


In [56]:
for x, y in train_ds:
    print(f"{x, y = }")
    break

x, y = (<tf.Tensor: shape=(512, 256, 256, 3), dtype=float32, numpy=
array([[[[0., 0., 0.],
         [0., 0., 0.],
         [0., 0., 0.],
         ...,
         [0., 0., 0.],
         [0., 0., 0.],
         [0., 0., 0.]],

        [[0., 0., 0.],
         [0., 0., 0.],
         [0., 0., 0.],
         ...,
         [0., 0., 0.],
         [0., 0., 0.],
         [0., 0., 0.]],

        [[0., 0., 0.],
         [0., 0., 0.],
         [0., 0., 0.],
         ...,
         [0., 0., 0.],
         [0., 0., 0.],
         [0., 0., 0.]],

        ...,

        [[0., 0., 0.],
         [0., 0., 0.],
         [0., 0., 0.],
         ...,
         [0., 0., 0.],
         [0., 0., 0.],
         [0., 0., 0.]],

        [[0., 0., 0.],
         [0., 0., 0.],
         [0., 0., 0.],
         ...,
         [0., 0., 0.],
         [0., 0., 0.],
         [0., 0., 0.]],

        [[0., 0., 0.],
         [0., 0., 0.],
         [0., 0., 0.],
         ...,
         [0., 0., 0.],
         [0., 0., 0.],
         [0., 0., 0

### Display a target image

In [None]:
def show_image(idx, X, y):
    img = X[idx]
    print(f"{img = }")
    plt.imshow(img)
    plt.title('rinkydinky')
    plt.show()

In [10]:
batch = None

for e in X_train:
    batch = e
    break

# show_image(0, batch[0], batch[1])
batch[0][240]

InvalidArgumentError: Index out of range using input dim 0; input has only 0 dims [Op:StridedSlice] name: strided_slice/

In [None]:
print(f"{batch[0] = }")

### Define evaluation function

In [None]:
def macro_f1(y, y_hat, thresh=0.5):
    """Compute the macro F1-score on a batch of observations (average F1 across labels)
    
    Args:
        y (int32 Tensor): labels array of shape (BATCH_SIZE, N_LABELS)
        y_hat (float32 Tensor): probability matrix from forward propagation of shape (BATCH_SIZE, N_LABELS)
        thresh: probability value above which we predict positive
        
    Returns:
        macro_f1 (scalar Tensor): value of macro F1 for the batch
    """
    y_pred = tf.cast(tf.greater(y_hat, thresh), tf.float32)
    tp = tf.cast(tf.math.count_nonzero(y_pred * y, axis=0), tf.float32)
    fp = tf.cast(tf.math.count_nonzero(y_pred * (1 - y), axis=0), tf.float32)
    fn = tf.cast(tf.math.count_nonzero((1 - y_pred) * y, axis=0), tf.float32)
    f1 = 2*tp / (2*tp + fn + fp + 1e-16)
    macro_f1 = tf.reduce_mean(f1)
    return macro_f1

# Construct model
---

In [None]:
ds_model = Sequential()

ds_model.add(Conv2D(filters = 28,
    kernel_size = (KERNEL_SIZE, KERNEL_SIZE),
    input_shape = (IMG_DIMS, IMG_DIMS, 3),
    activation='relu',
    padding = 'Same'))
ds_model.add(MaxPooling2D(pool_size = (2, 2)))

ds_model.add(Conv2D(filters = 28,
    kernel_size = (KERNEL_SIZE, KERNEL_SIZE),
    activation='relu'))
ds_model.add(MaxPooling2D(pool_size = (2, 2)))

ds_model.add(Conv2D(filters = 28,
    kernel_size = (KERNEL_SIZE, KERNEL_SIZE),
    activation='relu'))
ds_model.add(MaxPooling2D(pool_size = (2, 2)))

ds_model.add(Flatten())

ds_model.add(Dense(200, activation = 'relu'))
ds_model.add(Dropout(0.2))

ds_model.add(Dense(100, activation = 'relu'))
ds_model.add(Dropout(0.1))

ds_model.add(Dense(n_labels, activation = 'sigmoid'))

### Compile the model

In [None]:
opt = K.optimizers.Adam(learning_rate=0.01)

ds_model.compile(optimizer=opt,
    loss = 'binary_crossentropy',
    metrics=[macro_f1])

# Train model
---

In [None]:
ds_history = ds_model.fit(train_ds,
    epochs = EPOCHS,
    batch_size = MODEL_BATCH_SIZE,
    validation_data = val_ds,
    verbose = 1)

# View results
---

In [None]:
i = 0
batch1 = None
for batch in train_ds:
    batch1 = batch
    i += 1

    if i == 1:
        break

# NOTE: Batch1 is a TUPLE, not a tensor.
# It's comprised of two separate tensors, where the first
# element is the set of feature tensors of dimension 512x256x256x3
# because each batch is comprised of 512 elements, each being
# 256x256x3 images.
# The second element in the tuple is the set of multihot encodings

print(f"{batch1 = }")

In [None]:
y_hat_probs = ds_model.predict(batch1[0])

In [None]:
y_hat_probs[240]

In [None]:
y[240]