### We can construct a mosaic of nearby tiles using this method: https://www.kaggle.com/c/planet-understanding-the-amazon-from-space/discussion/36738

# Construct image tile mosaic
---

In [42]:
# first pass, construct a list of image strips
import matplotlib
from matplotlib import pyplot as plt
import os
import numpy as np

train_path = 'data/train-jpg/'

num_images = len(os.listdir(train_path))

image_strip_list = []

num_jpgs = num_images
for iter, file in enumerate(os.listdir(train_path)[:num_jpgs]):
    # print(f"iter = {iter}")
    # load image as numpy array
    image = matplotlib.image.imread(train_path + file)

    # drop alpha channel
    image = image[:, :, :3]

    # 1) Get vectors of all edges of images. So 4 vectors of size 3*256 for each image.
    left = image[:, 0, :]
    right = image[:, 255, :]
    top = image[0, :, :]
    bottom = image[255, :, :]

    # append to list containing 4-tuples of image strips: (left, right, top, bottom)
    image_strip_list.append( (left, right, top, bottom) )

    if iter % 500 == 0:
        print(f"{(100 * iter / num_jpgs):.2f} % complete")

0.00 % complete
1.24 % complete
2.47 % complete
3.71 % complete
4.94 % complete
6.18 % complete
7.41 % complete
8.65 % complete
9.88 % complete
11.12 % complete
12.35 % complete
13.59 % complete
14.82 % complete
16.06 % complete
17.29 % complete
18.53 % complete
19.76 % complete
21.00 % complete
22.23 % complete
23.47 % complete
24.70 % complete
25.94 % complete
27.17 % complete
28.41 % complete
29.65 % complete
30.88 % complete
32.12 % complete
33.35 % complete
34.59 % complete
35.82 % complete
37.06 % complete
38.29 % complete
39.53 % complete
40.76 % complete
42.00 % complete
43.23 % complete
44.47 % complete
45.70 % complete
46.94 % complete
48.17 % complete
49.41 % complete
50.64 % complete
51.88 % complete
53.11 % complete
54.35 % complete
55.58 % complete
56.82 % complete
58.05 % complete
59.29 % complete
60.53 % complete
61.76 % complete
63.00 % complete
64.23 % complete
65.47 % complete
66.70 % complete
67.94 % complete
69.17 % complete
70.41 % complete
71.64 % complete
72.88 

In [17]:
# second pass, find neighboring images based on their strips

# In this algorithm, we assume that images have not been rotated

# 2) For each image find difference between top vector and bottom vector of all other images. Take the minimum. It will be the TOP neighbour. Do the same for BOTTOM, LEFT and RIGHT.

# my idea: construct a dictionary where: 
#   * key is an integer representing the image name
#   * value is a 4-list of 2-lists
#       * 4-list: [ [left, diff.], [right, diff.], [top, diff.], [bottom, diff.] ]
# The 4-list represents the current best neighboring images and their
# difference (after subtracting two image strips)

INIT_DIFF = 999999999

image_mosaic_dict = {i:[[None, INIT_DIFF], [None, INIT_DIFF], [None, INIT_DIFF], [None, INIT_DIFF]] for i in range(num_images)}

for image_index, image in enumerate(image_strip_list):

    # iterate over image strip sides. Note:
    # strip_indices are defined as follows:
    # 0: left
    # 1: right
    # 2: top
    # 3: bottom

    for strip_index, strip in enumerate(image):

        # check to see if strip already has a neighbor
        neighbor = image_mosaic_dict[image_index][strip_index][0]
        difference = image_mosaic_dict[image_index][strip_index][1]

        for possible_index, (possible_left, possible_right, possible_top, possible_bottom) in enumerate(image_strip_list):

            # image cannot match with itself
            if image_index == possible_index:
                continue

            # strip is left, search for rights
            if strip_index == 0:
            
                # if difference is greater, take the new strip
                new_difference = np.abs(np.sum(strip - possible_right))
                if new_difference < difference:
                    # update current strip info
                    image_mosaic_dict[image_index][strip_index][0] = possible_index
                    image_mosaic_dict[image_index][strip_index][1] = new_difference

                    # update new neighboring strip info
                    image_mosaic_dict[possible_index][1][0] = image_index
                    image_mosaic_dict[possible_index][1][1] = new_difference

            # strip is right, search for lefts
            elif strip_index == 1:

                    # if difference is greater, take the new strip
                    new_difference = np.abs(np.sum(strip - possible_left))
                    if new_difference < difference:
                        # update current strip info
                        image_mosaic_dict[image_index][strip_index][0] = possible_index
                        image_mosaic_dict[image_index][strip_index][1] = new_difference

                        # update new neighboring strip info
                        image_mosaic_dict[possible_index][0][0] = image_index
                        image_mosaic_dict[possible_index][0][1] = new_difference

            # strip is top, search for bottoms
            elif strip_index == 2:

                # if difference is greater, take the new strip
                new_difference = np.abs(np.sum(strip - possible_bottom))
                if new_difference < difference:
                    # update current strip info
                    image_mosaic_dict[image_index][strip_index][0] = possible_index
                    image_mosaic_dict[image_index][strip_index][1] = new_difference

                    # update new neighboring strip info
                    image_mosaic_dict[possible_index][3][0] = image_index
                    image_mosaic_dict[possible_index][3][1] = new_difference

            # strip is bottom, search for tops
            elif strip_index == 3:
                
                # if difference is greater, take the new strip
                new_difference = np.abs(np.sum(strip - possible_top))
                if new_difference < difference:
                    # update current strip info
                    image_mosaic_dict[image_index][strip_index][0] = possible_index
                    image_mosaic_dict[image_index][strip_index][1] = new_difference

                    # update new neighboring strip info
                    image_mosaic_dict[possible_index][2][0] = image_index
                    image_mosaic_dict[possible_index][2][1] = new_difference
    
    if image_index % 125 == 0 and image_index != 0:
            print(f"{(100 * image_index / num_jpgs)} % complete")

2.5 % complete, image_index = 125
5.0 % complete, image_index = 250
7.5 % complete, image_index = 375
10.0 % complete, image_index = 500
12.5 % complete, image_index = 625
15.0 % complete, image_index = 750
17.5 % complete, image_index = 875
20.0 % complete, image_index = 1000
22.5 % complete, image_index = 1125
25.0 % complete, image_index = 1250
27.5 % complete, image_index = 1375
30.0 % complete, image_index = 1500
32.5 % complete, image_index = 1625
35.0 % complete, image_index = 1750
37.5 % complete, image_index = 1875
40.0 % complete, image_index = 2000
42.5 % complete, image_index = 2125
45.0 % complete, image_index = 2250
47.5 % complete, image_index = 2375
50.0 % complete, image_index = 2500
52.5 % complete, image_index = 2625
55.0 % complete, image_index = 2750
57.5 % complete, image_index = 2875
60.0 % complete, image_index = 3000
62.5 % complete, image_index = 3125
65.0 % complete, image_index = 3250
67.5 % complete, image_index = 3375
70.0 % complete, image_index = 3500
72

In [19]:
# second pass, find neighboring images based on their strips

# In this algorithm, we assume that images have not been rotated

# 2) For each image find difference between top vector and bottom vector of all other images. Take the minimum. It will be the TOP neighbour. Do the same for BOTTOM, LEFT and RIGHT.

# my idea: construct a dictionary where: 
#   * key is an integer representing the image name
#   * value is a 4-list of 2-lists
#       * 4-list: [ [left, diff.], [right, diff.], [top, diff.], [bottom, diff.] ]
# The 4-list represents the current best neighboring images and their
# difference (after subtracting two image strips)

INIT_DIFF = float('inf')

image_mosaic_dict = {i:[[None, INIT_DIFF], [None, INIT_DIFF], [None, INIT_DIFF], [None, INIT_DIFF]] for i in range(num_images)}

for image_index, image in enumerate(image_strip_list):

    # iterate over image strip sides. Note:
    # strip_indices are defined as follows:
    # 0: left
    # 1: right
    # 2: top
    # 3: bottom
    # check to see if strip already has a neighbor
    neighbor = image_mosaic_dict[image_index][strip_index][0]
    difference = image_mosaic_dict[image_index][strip_index][1]

    (strip_left, strip_right, strip_top, strip_bottom) = image_strip_list[image_index]

    for possible_index, (possible_left, possible_right, possible_top, possible_bottom) in enumerate(image_strip_list):

        # image cannot match with itself
        if image_index == possible_index:
            continue

        # Comparing left side of current image with right side of possible neighbor
        new_difference = np.abs(np.sum(strip_left - possible_right))
        if new_difference < difference:
            # update current strip info
            image_mosaic_dict[image_index][0][0] = possible_index
            image_mosaic_dict[image_index][0][1] = new_difference

        # Comparing right side of current image with left side of possible neighbor
        new_difference = np.abs(np.sum(strip_right - possible_left))
        if new_difference < difference:
            # update current strip info
            image_mosaic_dict[image_index][1][0] = possible_index
            image_mosaic_dict[image_index][1][1] = new_difference
        
        # Comparing top side of current image with bottom side of possible neighbor
        new_difference = np.abs(np.sum(strip_top - possible_bottom))
        if new_difference < difference:
            # update current strip info
            image_mosaic_dict[image_index][2][0] = possible_index
            image_mosaic_dict[image_index][2][1] = new_difference

        # Comparing bottom side of current image with top side of possible neighbor
        new_difference = np.abs(np.sum(strip_bottom - possible_top))
        if new_difference < difference:
            # update current strip info
            image_mosaic_dict[image_index][3][0] = possible_index
            image_mosaic_dict[image_index][3][1] = new_difference
    
    if image_index % 250 == 0 and image_index != 0:
            print(f"{(100 * image_index / num_jpgs)} % complete")

5.0 % complete
10.0 % complete
15.0 % complete
20.0 % complete
25.0 % complete
30.0 % complete
35.0 % complete
40.0 % complete
45.0 % complete
50.0 % complete
55.0 % complete
60.0 % complete
65.0 % complete
70.0 % complete
75.0 % complete
80.0 % complete
85.0 % complete
90.0 % complete
95.0 % complete


In [41]:
# PEOPLE ON THE KAGGLE FORUMS SAY THAT, (in general), THE .tiff IMAGES WERE NOT 
# USEFUL. That might be the same case here.

# 3) Using the same algorithm get neighbours for all TIFF images.

# 4) Remove neighbours which is different for JPG and TIFF images.

# 5) Use diagonals to additional improvements. For example check that diagonal element have same neighbours as central one without conflicts and so on…

# 6) Use CNN averaged predictions for 4 or 8 neighbours as features for central element

curr_min = float('inf')
curr = curr_neighbor = 0
for index in image_mosaic_dict:
    neighbor_index = image_mosaic_dict[index][0][0]
    if image_mosaic_dict[neighbor_index][1][0] == index and image_mosaic_dict[index][0][1] < curr_min:
        print(f"found left: curr = {index}, neighbor = {neighbor_index} w/ diff = {image_mosaic_dict[index][0][1]}")
        curr_min = image_mosaic_dict[index][0][1]
        curr, curr_neighbor = index, neighbor_index
    
    neighbor_index = image_mosaic_dict[index][1][0]
    if image_mosaic_dict[neighbor_index][0][0] == index and image_mosaic_dict[index][1][1] < curr_min:
        print(f"found right: curr = {index}, neighbor = {neighbor_index} w/ diff = {image_mosaic_dict[index][1][1]}")
        curr_min = image_mosaic_dict[index][1][1]
        curr, curr_neighbor = index, neighbor_index

    neighbor_index = image_mosaic_dict[index][2][0]
    if image_mosaic_dict[neighbor_index][3][0] == index and image_mosaic_dict[index][2][1] < curr_min:
        print(f"found top: curr = {index}, neighbor_index = {neighbor_index} w/ diff = {image_mosaic_dict[index][2][1]}")
        curr_min = image_mosaic_dict[index][2][1]
        curr, curr_neighbor = index, neighbor_index
    
    neighbor_index = image_mosaic_dict[index][3][0]
    if image_mosaic_dict[neighbor_index][2][0] == index and image_mosaic_dict[index][3][1] < curr_min:
        print(f"found bottom: curr = {index}, neighbor = {neighbor_index} w/ diff = {image_mosaic_dict[index][3][1]}")
        print(f'curr_min = {curr_min}')
        curr_min = image_mosaic_dict[index][3][1]
        curr, curr_neighbor = index, neighbor_index
    
    if index == 4999:
        break

print(f"curr = {curr}, curr_neighbor = {curr_neighbor} and curr_min = {curr_min}")
# print(image_mosaic_dict)

found top: curr = 2016, neighbor_index = 4914 w/ diff = 40555
found top: curr = 3735, neighbor_index = 4664 w/ diff = 32955
found left: curr = 4160, neighbor = 4995 w/ diff = 27306
found bottom: curr = 4292, neighbor = 4876 w/ diff = 11828
curr_min = 27306
found left: curr = 4750, neighbor = 4896 w/ diff = 11787
found top: curr = 4839, neighbor_index = 4876 w/ diff = 11549
curr = 4839, curr_neighbor = 4876 and curr_min = 11549


# Preprocess data
---

In [32]:
# extract labels from training data
import pandas as pd

train_data = pd.read_csv('data/train_v2.csv/train_v2.csv')
print(train_data['tags'].values)

unique_labels = []
for line in train_data['tags'].values:
    for label in line.split():
        if label not in unique_labels:
            unique_labels.append(label)
            print(label)

print(len(unique_labels))

['haze primary' 'agriculture clear primary water' 'clear primary' ...
 'agriculture clear primary' 'agriculture clear primary road'
 'agriculture cultivation partly_cloudy primary']
haze
primary
agriculture
clear
water
habitation
road
cultivation
slash_burn
cloudy
partly_cloudy
conventional_mine
bare_ground
artisinal_mine
blooming
selective_logging
blow_down
17


# Construct model
---

# Train model
---

# View results
---