### We can construct a mosaic of nearby tiles using this method: https://www.kaggle.com/c/planet-understanding-the-amazon-from-space/discussion/36738

# Construct image tile mosaic
---

In [1]:
# first pass, construct a list of image strips
import matplotlib
from matplotlib import pyplot as plt
import os
import numpy as np

train_path = 'data/train-jpg/'

num_images = len(os.listdir(train_path))

image_strip_list = []

for file in os.listdir(train_path):
    # load image as numpy array
    image = matplotlib.image.imread(train_path + file)

    # drop alpha channel
    image = image[:, :, :3]

    # 1) Get vectors of all edges of images. So 4 vectors of size 3*256 for each image.
    left = image[:, 0, :]
    right = image[:, 255, :]
    top = image[0, :, :]
    bottom = image[255, :, :]

    # append to list containing 4-tuples of image strips: (left, right, top, bottom)
    image_strip_list.append( (left, right, top, bottom) )

In [26]:
# second pass, find neighboring images based on their strips

# In this algorithm, we assume that images have not been rotated

# 2) For each image find difference between top vector and bottom vector of all other images. Take the minimum. It will be the TOP neighbour. Do the same for BOTTOM, LEFT and RIGHT.

# my idea: construct a dictionary where: 
#   * key is an integer representing the image name
#   * value is a 4-list of 2-lists
#       * 4-list: [ [left, diff.], [right, diff.], [top, diff.], [bottom, diff.] ]
# The 4-list represents the current best neighboring images and their
# difference (after subtracting two image strips)

INIT_DIFF = 999999999

image_mosaic_dict = {i:[[None, INIT_DIFF], [None, INIT_DIFF], [None, INIT_DIFF], [None, INIT_DIFF]] for i in range(num_images)}

for image_index, image in enumerate(image_strip_list):

    # iterate over image strip sides
    for side_index, side in enumerate(image):

        # check to see if side already has a neighbor
        neighbor = image_mosaic_dict[image_index][side_index][0]
        difference = image_mosaic_dict[image_index][side_index][1]
        if (neighbor, difference) != (None, INIT_DIFF):
            continue
        else:

            for possible_index, (possible_left, possible_right, possible_top, possible_bottom) in enumerate(image_strip_list):

                # image cannot match with itself
                if image_index == possible_index:
                    continue

                # side is left, search for rights
                if side_index == 0:
                
                    # if difference is greater, take the new side
                    new_difference = np.abs(np.sum(side) - np.sum(possible_right))
                    if new_difference < difference:
                        # update current side info
                        image_mosaic_dict[image_index][side_index][0] = possible_index
                        image_mosaic_dict[image_index][side_index][1] = new_difference

                        # update new neighboring side info
                        image_mosaic_dict[possible_index][1][0] = image_index
                        image_mosaic_dict[possible_index][1][1] = new_difference

                # side is right, search for lefts
                elif side_index == 1:

                        # if difference is greater, take the new side
                        new_difference = np.abs(np.sum(side) - np.sum(possible_left))
                        if new_difference < difference:
                            # update current side info
                            image_mosaic_dict[image_index][side_index][0] = possible_index
                            image_mosaic_dict[image_index][side_index][1] = new_difference

                            # update new neighboring side info
                            image_mosaic_dict[possible_index][0][0] = image_index
                            image_mosaic_dict[possible_index][0][1] = new_difference

                # side is top, search for bottoms
                elif side_index == 2:

                    # if difference is greater, take the new side
                    new_difference = np.abs(np.sum(side) - np.sum(possible_bottom))
                    if new_difference < difference:
                        # update current side info
                        image_mosaic_dict[image_index][side_index][0] = possible_index
                        image_mosaic_dict[image_index][side_index][1] = new_difference

                        # update new neighboring side info
                        image_mosaic_dict[possible_index][3][0] = image_index
                        image_mosaic_dict[possible_index][3][1] = new_difference

                # side is bottom, search for tops
                elif side_index == 3:
                    
                    # if difference is greater, take the new side
                    new_difference = np.abs(np.sum(side) - np.sum(possible_top))
                    if new_difference < difference:
                        # update current side info
                        image_mosaic_dict[image_index][side_index][0] = possible_index
                        image_mosaic_dict[image_index][side_index][1] = new_difference

                        # update new neighboring side info
                        image_mosaic_dict[possible_index][2][0] = image_index
                        image_mosaic_dict[possible_index][2][1] = new_difference

In [22]:
for e in image_mosaic_dict:
    print(e, image_mosaic_dict[e])

0 [[256, 77517], [40417, 91601], [18265, 113686], [18987, 115184]]
1 [[256, 89750], [40417, 120634], [18265, 145645], [18987, 140974]]
2 [[256, 122934], [40417, 125807], [18265, 145311], [18987, 160991]]
3 [[256, 64046], [40417, 90884], [18265, 116822], [18987, 94464]]
4 [[256, 72110], [40417, 97096], [18265, 85603], [18987, 101723]]
5 [[256, 115389], [40417, 129328], [18265, 151913], [18987, 153373]]
6 [[256, 112839], [40417, 130266], [18265, 142459], [18987, 157921]]
7 [[256, 52531], [40417, 68485], [18265, 88748], [18987, 106115]]
8 [[256, 119021], [40417, 127827], [18265, 155631], [18987, 151729]]
9 [[256, 122643], [40417, 136179], [18265, 157986], [18987, 160132]]
10 [[256, 109293], [40417, 120990], [18265, 144935], [18987, 146621]]
11 [[256, 103785], [40417, 115804], [18265, 138610], [18987, 140091]]
12 [[256, 117343], [40417, 131557], [18265, 153768], [18987, 154036]]
13 [[256, 74118], [40417, 74630], [18265, 115561], [18987, 91273]]
14 [[256, 121214], [40417, 137117], [18265, 1

In [None]:
# PEOPLE ON THE KAGGLE FORUMS SAY THAT, (in general), THE .tiff IMAGES WERE NOT 
# USEFUL. That might be the same case here.

# 3) Using the same algorithm get neighbours for all TIFF images.

# 4) Remove neighbours which is different for JPG and TIFF images.

# 5) Use diagonals to additional improvements. For example check that diagonal element have same neighbours as central one without conflicts and so on…

# 6) Use CNN averaged predictions for 4 or 8 neighbours as features for central element

# Preprocess data
---

In [32]:
# extract labels from training data
import pandas as pd

train_data = pd.read_csv('data/train_v2.csv/train_v2.csv')
print(train_data['tags'].values)

unique_labels = []
for line in train_data['tags'].values:
    for label in line.split():
        if label not in unique_labels:
            unique_labels.append(label)
            print(label)

print(len(unique_labels))

['haze primary' 'agriculture clear primary water' 'clear primary' ...
 'agriculture clear primary' 'agriculture clear primary road'
 'agriculture cultivation partly_cloudy primary']
haze
primary
agriculture
clear
water
habitation
road
cultivation
slash_burn
cloudy
partly_cloudy
conventional_mine
bare_ground
artisinal_mine
blooming
selective_logging
blow_down
17


# Construct model
---

# Train model
---

# View results
---