In [1]:
%matplotlib inline
from collections import defaultdict
import csv
import os
import sys

import matplotlib.pyplot
import cv2
from shapely.geometry import MultiPolygon, Polygon
import shapely.wkt
import shapely.affinity
import numpy as np
import tifffile as tiff
import time
import pandas as pd
np.random.seed(42)
import math
from PIL import Image
csv.field_size_limit(sys.maxsize);
cur_dir = '/home/rob/Udacity/capstone/data'

ClassNames = {'1':'buildings', '2':'Misc. Manmade structures', '3': 'Road', '4':'Track', '5':'Trees',
                    '6':'Crops', '7':'Waterway', '8':'Standing water', '9':'Vehicle Large ', '10':'Vehicle Small'}

In [2]:
classifiers = {}
train_masks = {}
pred_masks = {}

trainIM_IDs = []
with open(cur_dir + '/train_wkt_v4.csv') as csvfile:
    reader = csv.reader(csvfile, delimiter=',', quotechar=',')
    for i,row in enumerate(reader):
        if i == 0:
            i = 1
        if (i%10) == 0:
            trainIM_IDs.append(row[0])

testIM_IDs = []
with open(cur_dir + '/sample_submission.csv') as csvfile:
    reader = csv.reader(csvfile, delimiter=',', quotechar=',')
    for i,row in enumerate(reader):
        if i == 0:
            i = 1
        if (i%10) == 0:
            testIM_IDs.append(row[0])

In [8]:
def get_scalers(im_size):
    h, w = im_size  # they are flipped so that mask_for_polygons works correctly
    h, w = float(h), float(w)
    w_ = w * (w / (w + 1))
    h_ = h * (h / (h + 1))
    return w_ / x_max, h_ / y_min

def mask_for_polygons(polygons):
    img_mask = np.zeros(im_size, np.uint8)
    if not polygons:
        return img_mask
    int_coords = lambda x: np.array(x).round().astype(np.int32)
    exteriors = [int_coords(poly.exterior.coords) for poly in polygons]
    interiors = [int_coords(pi.coords) for poly in polygons
                 for pi in poly.interiors]
    cv2.fillPoly(img_mask, exteriors, 1)
    cv2.fillPoly(img_mask, interiors, 0)
    return img_mask
    
def mask_to_polygons(mask, epsilon=10., min_area=10.):
    # first, find contours with cv2: it's much faster than shapely
    image, contours, hierarchy = cv2.findContours(
        ((mask == 1) * 255).astype(np.uint8),
        cv2.RETR_CCOMP, cv2.CHAIN_APPROX_TC89_KCOS)
    # create approximate contours to have reasonable submission size
    approx_contours = [cv2.approxPolyDP(cnt, epsilon, True)
                       for cnt in contours]
    if not contours:
        return MultiPolygon()
    # now messy stuff to associate parent and child contours
    cnt_children = defaultdict(list)
    child_contours = set()
    assert hierarchy.shape[0] == 1
    # http://docs.opencv.org/3.1.0/d9/d8b/tutorial_py_contours_hierarchy.html
    for idx, (_, _, _, parent_idx) in enumerate(hierarchy[0]):
        if parent_idx != -1:
            child_contours.add(idx)
            cnt_children[parent_idx].append(approx_contours[idx])
    # create actual polygons filtering by area (removes artifacts)
    all_polygons = []
    for idx, cnt in enumerate(approx_contours):
        if idx not in child_contours and cv2.contourArea(cnt) >= min_area:
            assert cnt.shape[1] == 1
            poly = Polygon(
                shell=cnt[:, 0, :],
                holes=[c[:, 0, :] for c in cnt_children.get(idx, [])
                       if cv2.contourArea(c) >= min_area])
            all_polygons.append(poly)
    # approximating polygons might have created invalid ones, fix them
    all_polygons = MultiPolygon(all_polygons)
    if not all_polygons.is_valid:
        all_polygons = all_polygons.buffer(0)
        # Sometimes buffer() converts a simple Multipolygon to just a Polygon,
        # need to keep it a Multi throughout
        if all_polygons.type == 'Polygon':
            all_polygons = MultiPolygon([all_polygons])
    return all_polygons

def pieceCutter(im_size, cuts, IM_ID, image, trainMasks):
    x_cutter = math.floor(im_size[0]/cuts)
    y_cutter = math.floor(im_size[1]/cuts)
    A = [int(x_cutter*i) for i in range(cuts+1)]
    B = [int(y_cutter*i) for i in range(cuts+1)]
    
    for x in range(cuts):
        for y in range(cuts):
            print "in loop", x, y, "report_visualization/{}-{}-{}".format(IM_ID,x,y)
            cutPicture = im[A[x]:A[x+1],B[y]:B[y+1],:]
            print cutPicture.shape
            cutMasks = trainMasks[A[x]:A[x+1],B[y]:B[y+1],:]
            print cutMasks.shape
            np.savez_compressed("report_visualization/{}-{}-{}".format(IM_ID,x,y), cutPicture, cutMasks)
            
def sizeNormalizer(minsize,im,trainMasks):
    image_out = im[:minsize[0],:minsize[1],:]
    mask_out = trainMasks[:minsize[0],:minsize[1],:]
    return image_out, mask_out

In [9]:
""" Cut pictures into 1 size """
import cv2

#These values were determined in a later part of this notebook
rgb_minsize = [3345, 3389]
M_minsize = [834, 832]
streets_minsize = [500,500]
A_minsize = [133, 132]
cars_minsize = [1700,1700]

minsize = M_minsize
nrCuts = 1
print len(trainIM_IDs)


for IM_ID in trainIM_IDs[:]:
    print IM_ID
    """
    im_rgb = tiff.imread(cur_dir +'/three_band/{}.tif'.format(IM_ID)).transpose([1, 2, 0])
    resized_image_rgb = cv2.resize(im_rgb, (minsize[0],minsize[1]), interpolation=cv2.INTER_AREA)
    im_size = resized_image_rgb.shape[:2]
    
    im = np.empty((minsize[0],minsize[1],3),dtype=np.uint16)
    trainMasks = np.empty([im_size[0],im_size[1],10])
    im = resized_image_rgb
    """
    
    trainMasks = np.empty([minsize[0],minsize[1],10])

    
    
    im = tiff.imread(cur_dir +'/sixteen_band/{}_A.tif'.format(IM_ID)).transpose([1, 2, 0])
    resized_image_A = cv2.resize(im, (minsize[1], minsize[0]), interpolation=cv2.INTER_CUBIC)

    im = tiff.imread(cur_dir +'/sixteen_band/{}_M.tif'.format(IM_ID)).transpose([1, 2, 0])
    resized_image_M = cv2.resize(im, (minsize[1], minsize[0]), interpolation=cv2.INTER_LINEAR)
    im_size = resized_image_M.shape[:2]
    
    im = np.empty((minsize[0], minsize[1],16),dtype=np.uint16)
    im[:,:,:8] = resized_image_M[:,:,:]
    im[:,:,8:16] = resized_image_A[:,:,:]



    x_max = y_min = None
    for _im_id, _x, _y in csv.reader(open(cur_dir + '/grid_sizes.csv')):
        if _im_id == IM_ID:
            x_max, y_min = float(_x), float(_y)
            break

    for i in range(1,11):
        POLY_TYPE = str(i)

        train_polygons = None
        for _im_id, _poly_type, _poly in csv.reader(open(cur_dir + '/train_wkt_v4.csv')):
            if _im_id == IM_ID and _poly_type == POLY_TYPE:
                train_polygons = shapely.wkt.loads(_poly)
                break

        x_scaler, y_scaler = get_scalers(im_size)

        train_polygons_scaled = shapely.affinity.scale(
            train_polygons, xfact=x_scaler, yfact=y_scaler, origin=(0, 0, 0))

        #train_mask = np.empty_like(im[:,:,:10])
        train_mask = mask_for_polygons(train_polygons_scaled)
        trainMasks[:,:,(int(i)-1)] = train_mask #trainMasks: im_size,im_size,10 classes

    #train_mask = np.empty_like(im[:,:,:10])
    pieceCutter(minsize, nrCuts, IM_ID, im, trainMasks)

    
print "job done"

25
6040_2_2
in loop 0 0 report_visualization/6040_2_2-0-0
(834, 832, 16)
(834, 832, 10)
6120_2_2
in loop 0 0 report_visualization/6120_2_2-0-0
(834, 832, 16)
(834, 832, 10)
6120_2_0
in loop 0 0 report_visualization/6120_2_0-0-0
(834, 832, 16)
(834, 832, 10)
6090_2_0
in loop 0 0 report_visualization/6090_2_0-0-0
(834, 832, 16)
(834, 832, 10)
6040_1_3
in loop 0 0 report_visualization/6040_1_3-0-0
(834, 832, 16)
(834, 832, 10)
6040_1_0
in loop 0 0 report_visualization/6040_1_0-0-0
(834, 832, 16)
(834, 832, 10)
6100_1_3
in loop 0 0 report_visualization/6100_1_3-0-0
(834, 832, 16)
(834, 832, 10)
6010_4_2
in loop 0 0 report_visualization/6010_4_2-0-0
(834, 832, 16)
(834, 832, 10)
6110_4_0
in loop 0 0 report_visualization/6110_4_0-0-0
(834, 832, 16)
(834, 832, 10)
6140_3_1
in loop 0 0 report_visualization/6140_3_1-0-0
(834, 832, 16)
(834, 832, 10)
6110_1_2
in loop 0 0 report_visualization/6110_1_2-0-0
(834, 832, 16)
(834, 832, 10)
6100_2_3
in loop 0 0 report_visualization/6100_2_3-0-0
(834, 8

In [None]:
''' Determine smallest size of pictures '''

all_pics = trainIM_IDs + testIM_IDs

liste = []
counter = 0
for IM_ID in all_pics:
    print "pic {}".format(counter)
    im = tiff.imread(cur_dir +'/sixteen_band/{}_M.tif'.format(IM_ID)).transpose([1, 2, 0])
    im_size = im.shape[:2]
    im = tiff.imread(cur_dir +'/three_band/{}.tif'.format(IM_ID)).transpose([1, 2, 0])
    im_size = im.shape[:2]
    liste.append(im_size)
    counter += 1

min(liste)

In [10]:
import os
import numpy
from random import shuffle
paths = os.listdir("report_visualization/")
print paths

['6170_4_1-0-0.npz', '6100_1_3-0-0.npz', '6160_2_1-0-0.npz', '6010_1_2-0-0.npz', '6010_4_4-0-0.npz', '6090_2_0-0-0.npz', '6040_2_2-0-0.npz', '6140_1_2-0-0.npz', '6120_2_2-0-0.npz', '6170_2_4-0-0.npz', '6100_2_3-0-0.npz', '6110_3_1-0-0.npz', '6150_2_3-0-0.npz', '6140_3_1-0-0.npz', '6110_4_0-0-0.npz', '6040_4_4-0-0.npz', '6060_2_3-0-0.npz', '6170_0_4-0-0.npz', '6110_1_2-0-0.npz', '6040_1_3-0-0.npz', '6070_2_3-0-0.npz', '6120_2_0-0-0.npz', '6010_4_2-0-0.npz', '6040_1_0-0-0.npz', '6100_2_2-0-0.npz']


In [None]:
shuffle(paths)

In [12]:
train_split = paths[:]
test_split = paths[20:]
print len(train_split), len(test_split)

masks1 = []
masks2 = []
print "start loading files.."
print range(8,10)
for path in train_split:
    data = numpy.load("report_visualization/"+path)
    masks1.append(data['arr_1'])
    
for path in test_split:
    data = numpy.load("report_visualization/"+path)
    masks2.append(data['arr_1'])
print "files loaded" 
mean = numpy.empty(10)
num = numpy.empty(10)
mean2 = numpy.empty(10)
num2 = numpy.empty(10)

for i in range(0,10):
    n_class_train = [sum(sum(mask[:,:,i])) for mask in masks1]
    n_class_test = [sum(sum(mask[:,:,i])) for mask in masks2]
    [num[i], mean[i]]=[sum(n_class_train),numpy.mean(n_class_train)]
    [num2[i], mean2[i]]=[sum(n_class_test),numpy.mean(n_class_test)]
    
all_pixels = minsize[0]*minsize[1]*25

print "all pixels:", all_pixels
print "number of class pixels", num
print "percentage of class-pixels per all pixels", num/all_pixels


25 5
start loading files..
[8, 9]
files loaded
all pixels: 17347200
number of class pixels [  7.39901000e+05   2.77289000e+05   1.55024000e+05   5.46430000e+05
   2.29509900e+06   4.83879000e+06   9.19360000e+04   3.28470000e+04
   1.08100000e+03   6.44700000e+03]
percentage of class-pixels per all pixels [  4.26524742e-02   1.59846546e-02   8.93654307e-03   3.14996080e-02
   1.32303715e-01   2.78937811e-01   5.29976019e-03   1.89350443e-03
   6.23155322e-05   3.71644992e-04]


In [None]:
final_train_split = train_split
final_test_split = test_split


masks = []
images = []

counter = 0
for path in final_train_split:
    data = numpy.load("all_bandsCuts100/"+path)
    mask = data['arr_1']
    images = data['arr_0']
    np.savez_compressed("train_cuts/{}".format(path[:-4]), images, mask)
    
    print counter
    counter += 1

masks = []
images = []


for path in final_test_split:
    data = numpy.load("all_bandsCuts100/"+path)
    mask = data['arr_1']
    images = data['arr_0']
    np.savez_compressed("test_cuts/{}".format(path[:-4]), images, mask)
    
    print counter
    counter += 1