### We can construct a mosaic of nearby tiles using this method: https://www.kaggle.com/c/planet-understanding-the-amazon-from-space/discussion/36738

# Construct image tile mosaic
---

In [73]:
# first pass, construct a list of image strips
import matplotlib
from matplotlib import pyplot as plt
import os
import numpy as np

train_path = 'data/train-jpg/'

num_images = len(os.listdir(train_path))

image_strip_list = []

num_jpgs = 5000
for iter, file in enumerate(os.listdir(train_path)[:num_jpgs]):
    # print(f"iter = {iter}")
    # load image as numpy array

    # NOTE: POSSIBLE BOTTLENECK BELOW. READING IMAGE ON EACH ITER
    image = matplotlib.image.imread(train_path + file).astype(int)

    # drop alpha channel
    image = image[:, :, :3]

    # 1) Get vectors of all edges of images. So 4 vectors of size 3*256 for each image.
    left = image[:, 0, :]
    right = image[:, 255, :]
    top = image[0, :, :]
    bottom = image[255, :, :]

    # append to list containing 4-tuples of image strips: (left, right, top, bottom)
    image_strip_list.append( (left, right, top, bottom) )

    if iter % (0.05 * num_jpgs) == 0:
        print(f"{(100 * iter / num_jpgs):.2f} % complete")

0.00 % complete
5.00 % complete
10.00 % complete
15.00 % complete
20.00 % complete
25.00 % complete
30.00 % complete
35.00 % complete
40.00 % complete
45.00 % complete
50.00 % complete
55.00 % complete
60.00 % complete
65.00 % complete
70.00 % complete
75.00 % complete
80.00 % complete
85.00 % complete
90.00 % complete
95.00 % complete


### Utility Function for taking difference between two images

In [79]:
# second pass, find neighboring images based on their strips

# In this algorithm, we assume that images have not been rotated

# 2) For each image find difference between top vector and bottom vector of all other images. Take the minimum. It will be the TOP neighbour. Do the same for BOTTOM, LEFT and RIGHT.

# my idea: construct a dictionary where: 
#   * key is an integer representing the image name
#   * value is a 4-list of 2-lists
#       * 4-list: [ [left, diff.], [right, diff.], [top, diff.], [bottom, diff.] ]
# The 4-list represents the current best neighboring images and their
# difference (after subtracting two image strips)

INIT_DIFF = float('inf')

image_mosaic_dict = {i:[[None, INIT_DIFF], [None, INIT_DIFF], [None, INIT_DIFF], [None, INIT_DIFF]] for i in range(num_images)}

for image_index, image in enumerate(image_strip_list):

    # iterate over image strip sides. Note:
    # strip_indices are defined as follows:
    # 0: left
    # 1: right
    # 2: top
    # 3: bottom
    # check to see if strip already has a neighbor
    difference = image_mosaic_dict[image_index][0][1]

    (strip_left, strip_right, strip_top, strip_bottom) = image_strip_list[image_index]

    for possible_index, (possible_left, possible_right, possible_top, possible_bottom) in enumerate(image_strip_list):

        # image cannot match with itself
        if image_index == possible_index:
            continue

        # Comparing left side of current image with right side of possible neighbor
        difference = image_mosaic_dict[image_index][0][1]
        new_difference = np.abs(np.sum(strip_left - possible_right))
        if new_difference < difference:
            # update current strip info
            image_mosaic_dict[image_index][0][0] = possible_index
            image_mosaic_dict[image_index][0][1] = new_difference

        # Comparing right side of current image with left side of possible neighbor
        difference = image_mosaic_dict[image_index][1][1]
        new_difference = np.abs(np.sum(strip_right - possible_left))
        if new_difference < difference:
            # update current strip info
            image_mosaic_dict[image_index][1][0] = possible_index
            image_mosaic_dict[image_index][1][1] = new_difference
        
        # Comparing top side of current image with bottom side of possible neighbor
        difference = image_mosaic_dict[image_index][2][1]
        new_difference = np.abs(np.sum(strip_top - possible_bottom))
        if new_difference < difference:
            # update current strip info
            image_mosaic_dict[image_index][2][0] = possible_index
            image_mosaic_dict[image_index][2][1] = new_difference

        # Comparing bottom side of current image with top side of possible neighbor
        difference = image_mosaic_dict[image_index][3][1]
        new_difference = np.abs(np.sum(strip_bottom - possible_top))
        if new_difference < difference:
            # update current strip info
            image_mosaic_dict[image_index][3][0] = possible_index
            image_mosaic_dict[image_index][3][1] = new_difference
    
    if image_index % (0.05 * num_jpgs) == 0 and image_index != 0:
            print(f"{(100 * image_index / num_jpgs)} % complete")

5.0 % complete
10.0 % complete
15.0 % complete
20.0 % complete
25.0 % complete
30.0 % complete
35.0 % complete
40.0 % complete
45.0 % complete
50.0 % complete
55.0 % complete
60.0 % complete
65.0 % complete
70.0 % complete
75.0 % complete
80.0 % complete
85.0 % complete
90.0 % complete
95.0 % complete


KeyboardInterrupt: 

In [8]:
# PEOPLE ON THE KAGGLE FORUMS SAY THAT, (in general), THE .tiff IMAGES WERE NOT 
# USEFUL. That might be the same case here.

# 3) Using the same algorithm get neighbours for all TIFF images.

# 4) Remove neighbours which is different for JPG and TIFF images.

# 5) Use diagonals to additional improvements. For example check that diagonal element have same neighbours as central one without conflicts and so on…

# 6) Use CNN averaged predictions for 4 or 8 neighbours as features for central element

curr_min = float('inf')
curr = curr_neighbor = 0
for index in image_mosaic_dict:
    neighbor_index = image_mosaic_dict[index][0][0]
    if image_mosaic_dict[neighbor_index][1][0] == index and image_mosaic_dict[index][0][1] < curr_min:
        print(f"found left: curr = {index}, neighbor = {neighbor_index} w/ diff = {image_mosaic_dict[index][0][1]}")
        curr_min = image_mosaic_dict[index][0][1]
        curr, curr_neighbor = index, neighbor_index
    
    neighbor_index = image_mosaic_dict[index][1][0]
    if image_mosaic_dict[neighbor_index][0][0] == index and image_mosaic_dict[index][1][1] < curr_min:
        print(f"found right: curr = {index}, neighbor = {neighbor_index} w/ diff = {image_mosaic_dict[index][1][1]}")
        curr_min = image_mosaic_dict[index][1][1]
        curr, curr_neighbor = index, neighbor_index

    neighbor_index = image_mosaic_dict[index][2][0]
    if image_mosaic_dict[neighbor_index][3][0] == index and image_mosaic_dict[index][2][1] < curr_min:
        print(f"found top: curr = {index}, neighbor_index = {neighbor_index} w/ diff = {image_mosaic_dict[index][2][1]}")
        curr_min = image_mosaic_dict[index][2][1]
        curr, curr_neighbor = index, neighbor_index
    
    neighbor_index = image_mosaic_dict[index][3][0]
    if image_mosaic_dict[neighbor_index][2][0] == index and image_mosaic_dict[index][3][1] < curr_min:
        print(f"found bottom: curr = {index}, neighbor = {neighbor_index} w/ diff = {image_mosaic_dict[index][3][1]}")
        print(f'curr_min = {curr_min}')
        curr_min = image_mosaic_dict[index][3][1]
        curr, curr_neighbor = index, neighbor_index
        
    if index % (0.05 * num_jpgs) == 0 and index != 0:
            print(f"{(100 * index / num_jpgs)} % complete")
    
    if index == num_jpgs - 1:
        break

print(f"curr = {curr}, curr_neighbor = {curr_neighbor} and curr_min = {curr_min}")
# print(image_mosaic_dict)

5.0 % complete
10.0 % complete
15.0 % complete
20.0 % complete
25.0 % complete
30.0 % complete
35.0 % complete
40.0 % complete
45.0 % complete
50.0 % complete
55.0 % complete
60.0 % complete
65.0 % complete
70.0 % complete
75.0 % complete
80.0 % complete
85.0 % complete
90.0 % complete
95.0 % complete
curr = 0, curr_neighbor = 0 and curr_min = inf


In [81]:
# Channel subtraction is the issue. For some reason, when the second operand is greater than the first, there's an
# underflow and it loops back up to 255. To Fix: can't rely on np - operation, must do element-wise subtraction
# on our own.

print(image_mosaic_dict[3694])

# pixel = 116
# channel = 1

# print(f"{image_strip_list[4790][1][pixel][channel] = }")
# print(f"{image_strip_list[3694][0][pixel][channel] = }")
# print(f"image_strip[4920][0] = {image_strip_list[4920][0][:5]}")

# print(f"diff_want = {np.subtract(image_strip_list[4790][1][pixel][channel], image_strip_list[3694][0][pixel][channel])}\n",
#     f"diff_obtained = {(image_strip_list[4790][1] - image_strip_list[4920][0])[:5]}"
# )

# print(f"test_diff = {np.array([1, 2, 3]) - np.array([4, 2, 1])}")

[[2662, 4], [558, 3], [2691, 8], [1187, 8]]


# Preprocess data
---

In [None]:
# extract labels from training data
import pandas as pd

train_data = pd.read_csv('data/train_v2.csv/train_v2.csv')
print(train_data['tags'].values)

unique_labels = []
for line in train_data['tags'].values:
    for label in line.split():
        if label not in unique_labels:
            unique_labels.append(label)
            print(label)

print(len(unique_labels))

# Construct model
---

# Train model
---

# View results
---