# Run ML model

In [None]:
%load_ext autoreload
%autoreload 2

#from keras.layers import merge
from src.data import utils
from src.models.data import *
from src.models.model import *
from src.models.predict_model import *
from src.data.utils import get_tile_prefix

#from rasterio.plot import show, reshape_as_image
import matplotlib.pyplot as plt
import numpy as np

from sklearn.externals import joblib
from sklearn.ensemble import RandomForestClassifier
from scipy import signal

from pathlib import Path
import os, shutil
import sys
import time
%matplotlib inline

## User settings

In [None]:
# Make the project sources importable from the known checkout locations.
# NOTE(review): the `src.*` imports in the first cell run before these paths
# are appended -- confirm the notebook's working directory makes that work.
for extra_path in (
    "/home/ubuntu/roaddetection/",
    "/media/hh/hd_internal/hh/DSR_Berlin_2018/roaddetection/",
):
    sys.path.append(extra_path)

# Base directory with the data (image tiles) to be analyzed, and the
# training / validation directories derived from it.
base_dir = "../../data"
train_dir = os.path.join(base_dir, "train")
validation_dir = os.path.join(base_dir, "validate")
dirs = [train_dir, validation_dir]
# Sub-directories holding the input tiles (x) and the label tiles (y).
dir_x = 'sat'
dir_y = 'map'

# Upper bound on the number of samples (files) to analyze.
max_num_x = 3

# ------------- image characteristics -----------------------------
# Tile size in pixels.
target_size = (512,512)


#--------------- model ----------------------------------------------------
# Defaults: binary model and the file it is saved to.
model_is_binary = True
trained_model_fn = '../../models/RandomForest_binary.pkl'
# Manual toggle: while this condition is True the binary defaults above are
# overridden and the multiclass configuration is the one in effect.
if True:
    model_is_binary = False
    trained_model_fn = '../../models/RandomForest_multiclass.pkl'

#--------------- training details / hyperparameters -----------------------------------

# Default figure size applied to all matplotlib figures.
figsize = (20,12)
plt.rcParams["figure.figsize"] = figsize

In [None]:
# obtain list and number of available samples (files) in the training
# input directory (<train_dir>/<dir_x>)
# NOTE(review): the return value is unpacked as (file names, count);
# confirm against utils.get_list_samplefiles
file_list_x, num_x = utils.get_list_samplefiles(os.path.join(train_dir, dir_x))

# actual number of samples that will be used for training, given samples available and user's choice
# (never more than max_num_x, never more than what is on disk)
num_x_use = min(num_x, max_num_x)


### Define model

In [None]:
# Random forest classifier fitted further below on per-pixel feature rows.
# The forest is kept small and shallow (10 trees, depth 5) with large
# minimum split/leaf sizes.
#
# Compatibility notes:
# - `max_features='sqrt'` is what `'auto'` meant for classifiers; the
#   'auto' spelling was removed in recent scikit-learn releases.
# - `min_impurity_split` is no longer passed: it was only ever given as
#   None (its default) and the parameter no longer exists in scikit-learn
#   >= 1.0, where passing it raises a TypeError.
mdl = RandomForestClassifier(
    n_estimators=10,
    criterion='gini',
    max_depth=5,
    min_samples_split=50,
    min_samples_leaf=20,
    min_weight_fraction_leaf=0.0,
    max_features='sqrt',
    max_leaf_nodes=None,
    min_impurity_decrease=0.0,
    bootstrap=True,
    oob_score=False,
    n_jobs=-1,  # use all available cores
    random_state=None,  # not seeded: results differ between runs
    verbose=1,
    warm_start=False,
    class_weight=None)


## Feature engineering

In [None]:
# matrices to be used for convolution
def conv_matrix_inhibsurround():
    """
    Build a 3 by 3 center-surround convolution kernel.

    The center element is +1 and each of the eight surrounding elements
    is -1/8, so the elements sum to zero.

    Returns:
        np.ndarray of shape (3, 3) and dtype float32.
    """
    kernel = np.full((3, 3), -1.0 / 8.0, dtype=np.float32)
    kernel[1, 1] = 1.0
    return kernel

def conv_matrix_horizontalbar():
    """
    Build a 3 by 3 horizontal-bar convolution kernel.

    The three elements of the center row are +1/3 each and the six
    elements of the top and bottom rows are -1/6 each, so the elements
    sum to zero.

    Returns:
        np.ndarray of shape (3, 3) and dtype float32.
    """
    kernel = np.full((3, 3), -1.0 / 6.0, dtype=np.float32)
    kernel[1, :] = 1.0 / 3.0
    return kernel






    

### Load data

In [None]:
# Class dictionary; its "no_img" entry is used below to discard pixels
# that are labelled as not belonging to the image.
CLASS_DICT = get_class_dict()
# number of features per pixel; the reshape in the loop requires every tile
# to hold exactly img_size * num_features values
num_features = 4
# number of pixels per image
img_size = np.prod(target_size)
# preallocate arrays collecting features (x) and labels (y) of all samples
arr_x = np.empty((img_size * num_x_use, num_features), dtype=np.float32)
arr_y = np.empty(img_size * num_x_use, dtype=np.uint8)

# convolution kernels for feature engineering
m_inhibsurround = conv_matrix_inhibsurround()
m_horizontalbar = conv_matrix_horizontalbar()


for i, fn in enumerate(file_list_x[:num_x_use]):
    # read sat image tile
    x = io.imread(os.path.join(train_dir, dir_x, fn))
    print(x.dtype)
    # feature engineering
    # NOTE(review): x_filt is only displayed below; it is not written to
    # arr_x, so the model is trained on the scaled raw bands alone.
    x_filt = feature_eng_conv(x, m_horizontalbar, collapse_bands=True)

    # Show the first three bands next to the filtered image. The size is
    # passed to plt.subplots because assigning `fig.figsize` merely sets an
    # unused attribute on the Figure and leaves the figure size unchanged.
    fig, axs = plt.subplots(1, 2, figsize=(20, 20))
    axs[0].imshow(x[:,:,:3])
    #axs[1].imshow(x_filt[:,:,3])
    axs[1].imshow(x_filt, cmap="gray")

    
    # read corresponding label tile
    y = io.imread(os.path.join(train_dir, dir_y, fn))  
    # refactor labels
    # NOTE(review): mask is summed below as the count of non-image pixels --
    # confirm against refactor_labels
    y, mask = refactor_labels(x, y, class_dict=CLASS_DICT, model_is_binary=model_is_binary, meta=None)
    # scale x
    x = x/255.0
    print("{0:s}: ({1:0.0f} % non-image pixels)...".format(fn, 100*np.sum(mask)/img_size))
    # copy flattened features and labels in arrays; sample i occupies rows
    # [i*img_size, (i+1)*img_size)
    arr_y[i*img_size:(i+1)*img_size] = y.reshape(img_size, order = 'C')
    arr_x[i*img_size:(i+1)*img_size,:] =x.reshape((img_size, num_features), order = 'C')
    
# retain all except no_img values
good_ix = arr_y != CLASS_DICT["no_img"]
arr_x = arr_x[good_ix, :]
arr_y = arr_y[good_ix]
print("{} classes present in data".format(len(np.unique(arr_y))))


In [None]:
# Raises SystemExit at this point.
# NOTE(review): presumably a deliberate stop so that running all cells does
# not continue into training, saving and plotting below -- confirm, and
# remove this cell when those steps should run.
sys.exit()

### Run training

In [None]:
# Fit the model on the collected per-pixel features (arr_x) and labels
# (arr_y) and report the elapsed wall clock time in whole seconds.
t1 = time.time()
mdl.fit(arr_x, arr_y)
t2 = time.time()
print("Model fitting finished after {0:0.0f} s wall clock time".format(t2-t1))

In [None]:
# Display the fitted forest's feature importances; the entries are in the
# same order as the feature columns of arr_x.
mdl.feature_importances_

In [None]:
# save model: serialize the fitted classifier to the file configured in
# trained_model_fn
joblib.dump(mdl, trained_model_fn) 

In [None]:
from mpl_toolkits.mplot3d import Axes3D
import matplotlib.pyplot as plt
import numpy as np

# Plot only every 100th pixel.
arr_x_plot = arr_x[::100, :]
arr_y_plot = arr_y[::100]


fig = plt.figure(figsize=(20, 20))
ax = fig.add_subplot(111, projection='3d')
# One scatter per label value, drawn in the listed order with its own style.
# NOTE(review): 40 and 200 are hard-coded label values, presumably two of
# the classes in the class dictionary -- confirm.
for label_value, scatter_kwargs in (
    (40, {"c": "gray", "alpha": 0.05}),
    (200, {"c": "red"}),
):
    ix = arr_y_plot == label_value
    # feature columns 0, 1 and 3 are plotted on the x, y and z axes
    ax.scatter(
        arr_x_plot[ix, 0],
        arr_x_plot[ix, 1],
        arr_x_plot[ix, 3],
        **scatter_kwargs
    )


ax.set_xlabel('B')
ax.set_ylabel('G')
ax.set_zlabel('IR')

plt.show()