# Run ML model

In [1]:
%load_ext autoreload
%autoreload 2

#from keras.layers import merge
from src.data import utils
from src.models.data import *
from src.models.model import *
from src.models.predict_model import *
from src.data.utils import get_tile_prefix

#from rasterio.plot import show, reshape_as_image
import matplotlib.pyplot as plt
import numpy as np

from sklearn.externals import joblib
from sklearn.ensemble import RandomForestClassifier
from scipy import signal

from pathlib import Path
import os, shutil
import sys
import time
%matplotlib inline

Using TensorFlow backend.


## User settings

In [2]:
# paths to append
sys.path.append("/home/ubuntu/roaddetection/")
sys.path.append("/media/hh/hd_internal/hh/DSR_Berlin_2018/roaddetection/")

# base directories with data (image tiles) to be analyzed
base_dir = "../../data"
dirs = []
train_dir = os.path.join(base_dir, "train")
dirs.append(train_dir)
train_partial_dir = os.path.join(base_dir, "special")
dirs.append(train_dir)
validation_dir = os.path.join(base_dir, "validate")
dirs.append(validation_dir)
# subdirs
dir_x = 'sat'
dir_y = 'map'

# max. number of samples (files) to analyze
max_num_x = 5

# ------------- image characteristics -----------------------------
# size of tiles
target_size = (512,512)


#--------------- model ----------------------------------------------------
# set to True if a binary model shall be run
model_is_binary = True
# path to & filename of model to save
trained_model_fn = '../../models/RandomForest_binary.pkl'
if False:
    # set to True if a binary model shall be run
    model_is_binary = False
    # path to & filename of model to save
    trained_model_fn = '../../models/RandomForest_multiclass.pkl'

#--------------- training details / hyperparameters -----------------------------------

# graphics defaults
figsize = (20,12)
plt.rcParams["figure.figsize"] = figsize

In [3]:
# obtain list and number of available samples (files)
file_list_x, num_x = utils.get_list_samplefiles(os.path.join(train_dir, dir_x))

# actual number of samples that will be used for training, given samples available and user's choice
num_x_use = min(num_x, max_num_x)


### Load data

In [4]:
CLASS_DICT = get_class_dict()
# set up list of functions producing matrices to be used in sequence for convolutional feature engineering
conv_matrix_fun = [conv_matrix_inhibsurround, 
                  conv_matrix_horizontalbar, 
                  conv_matrix_verticalbar,
                  conv_matrix_diag_ullr,
                  conv_matrix_diag_llur]

conv_matrix_fun = [conv_matrix_inhibsurround]

# number of features is the original number of bands plus the convolutions defined above
num_features = 4 + len(conv_matrix_fun)
# number of pixels per image
img_size = np.prod(target_size)
# preallocate arrays collecting features (x) and labels (y) of all samples
arr_x = np.empty((img_size * num_x_use, num_features), dtype=np.float32)
arr_y = np.empty(img_size * num_x_use, dtype=np.uint8)

for i, fn in enumerate(file_list_x[:num_x_use]):
    # read sat image tile
    x = io.imread(os.path.join(train_dir, dir_x, fn))
    # feature engineering
    x_f = feature_eng_conv(x, conv_matrix_fun, collapse_bands=True)
    # read corresponding label tile
    y = io.imread(os.path.join(train_dir, dir_y, fn))  
    # refactor labels
    y, mask = refactor_labels(x, y, class_dict=CLASS_DICT, model_is_binary=model_is_binary, meta=None)
    # scale x
    x = x/255.0
    # now append new features to x
    x = np.append(x, x_f, axis=2)
    print("{0:s}: ({1:0.0f} % non-image pixels)...".format(fn, 100*np.sum(mask)/img_size))
    # copy flattened features and labels in arrays
    arr_y[i*img_size:(i+1)*img_size] = y.reshape(img_size, order = 'C')
    arr_x[i*img_size:(i+1)*img_size,:] =x.reshape((img_size, num_features), order = 'C')
    
# retain all except no_img values
good_ix = arr_y != CLASS_DICT["no_img"]
arr_x = arr_x[good_ix, :]
arr_y = arr_y[good_ix]
print("{} classes present in data".format(len(np.unique(arr_y))))


20180419_074325_0c43_3B_0014.tif: (0 % non-image pixels)...
20180419_074325_0c43_3B_0015.tif: (0 % non-image pixels)...
20180419_074325_0c43_3B_0017.tif: (0 % non-image pixels)...
20180419_074325_0c43_3B_0018.tif: (0 % non-image pixels)...
20180419_074325_0c43_3B_0019.tif: (0 % non-image pixels)...
2 classes present in data


### Define model

In [5]:
mdl = RandomForestClassifier(
    n_estimators=5,
    criterion='gini',
    max_depth=8,
    min_samples_split=100,
    min_samples_leaf=50,
    min_weight_fraction_leaf=0.0,
    max_features='auto',
    max_leaf_nodes=None,
    min_impurity_decrease=0.0,
    min_impurity_split=None,
    bootstrap=True,
    oob_score=False,
    n_jobs=-1,
    random_state=None,
    verbose=1,
    warm_start=True,
    class_weight=None)


### Run training

In [6]:
# for now, divide large of data into smaller chunks of chunk_size samples and loop over these
chunk_size = 1e5
chunk_idx = np.int32(np.linspace(0, arr_x.shape[0], arr_x.shape[0]//int(chunk_size)))
t1 = time.time()
for i in range(len(chunk_idx)-1):
    print("chunk {}".format(i+1))
    mdl.fit(arr_x[chunk_idx[i]:chunk_idx[i+1],:], arr_y[chunk_idx[i]:chunk_idx[i+1]])
    
t2 = time.time()
print("Model fitting finished after {0:0.0f} s wall clock time".format(t2-t1))

chunk 1
chunk 2
chunk 3
chunk 4
chunk 5
chunk 6
chunk 7
chunk 8
chunk 9
chunk 10
chunk 11
chunk 12
Model fitting finished after 0 s wall clock time


[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    0.2s finished
  warn("Warm-start fitting without increasing n_estimators does not "


In [7]:
mdl.feature_importances_

array([0.36658345, 0.08498371, 0.15107657, 0.1287924 , 0.06574827,
       0.03611766, 0.11949418, 0.03025478, 0.01694899])

In [8]:
# save model
joblib.dump(mdl, trained_model_fn) 

['../../models/RandomForest_binary.pkl']

In [9]:
sys.exit()

SystemExit: 

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


In [None]:
xix1, xix2 = 0, 1
f,axs = plt.subplots(2,2)
axs[0,0].imshow(x[:,:,:3])
axs[0,1].imshow(y, cmap="gray")
axs[1,0].scatter(x[y==200,xix1], x[y==200,xix2])
axs[1,1].scatter(arr_x[arr_y==200,xix1], arr_x[arr_y==200,xix2])
