# Random Forest for coastal feature extraction.

The code and this notebook can be found in the GitHub repository: https://github.com/Aryal007/ml_satellite_image_segmentation

In [None]:
from utils.data import Data
%matplotlib inline

* We encapsulate all our functions related to data processing in utils.data
* tiff_filename and shp_filename are two required parameters for Data class
* The other parameters include classes (Index for labels and its corresponding name), and default channel (for visualization in case of tiffs with multiple channels)
* We can use data.get_constants() to print all the parameters for data object

In [None]:
# Raster image and the shapefile that holds its labelled polygons.
tiff_name = "./Data/Images/5_band3.tif"
shp_name = "./Data/Labels/5_band3.shp"

# Build the Data helper for this image/label pair, then show its parameters.
data = Data(
    tiff_name,
    shp_name,
    classes=["water", "land"],
)
data.get_constants()

Once the Data object has been created, the following methods are available:
* data.read_tiff() -> reads the tiff_filename and returns rasterio object
* data.read_shp() -> reads shp_filename and returns geopandas object
* data.check_crs() -> raises an error if the crs of two objects don't match, verbose prints the crs for each object
* data.get_tiff_details(tiff) -> reads the rasterio object and prints details of the tiff

In [None]:
# Open the raster and the label shapefile that `data` was configured with.
tiff = data.read_tiff()
shp = data.read_shp()
# Compare the coordinate reference systems of the two inputs before going
# further; verbose=True asks for the CRS of each to be printed.
data.check_crs(shp.crs, tiff.crs, verbose=True)

In [None]:
# Print the details of the opened tiff.
data.get_tiff_details(tiff)

We need to convert the geopandas polygon information into masks that correspond to the given tiff image. For this we can use data.get_mask(). This function burns the polygons into the tiff's grid to produce an image mask of dimensions tiff_height x tiff_width x len(classes), where classes maps each label index to its name. (Since we have only two labels, land and water, the mask will have 2 channels.)

In [None]:
# Build the label mask for the tiff from the shapefile polygons.
mask = data.get_mask()

### Data Visualization
* data.view_tiff(tiff) -> reads rasterio object from read_tiff() and uses default channels to visualize as RGB
* you can also pass an integer as parameter channel to visualize that channel only
* data.view_mask(mask) -> reads the mask numpy array and visualizes it as labeled images

In [None]:
# Display the tiff; no channel is given, so the default channels are used.
data.view_tiff(tiff)

In [None]:
# Display a single channel of the tiff (channel=1).
data.view_tiff(tiff, channel=1)

In [None]:
# Display the label mask built by get_mask().
data.view_mask(mask)

In [None]:
# Turn the raster and its mask into feature/label arrays (n_sample is set to
# two million), then split them into train and test partitions. save=False is
# passed to the split.
X, y = data.get_Xy(tiff, mask, n_sample=2_000_000)
X_train, X_test, y_train, y_test = data.train_test_split(
    X,
    y,
    save=False,
)

In [None]:
# Plot the histogram of the training samples for channel 1.
data.get_histogram(X_train, y_train, channel = 1)

In [None]:
from utils.estimators import Dataset, Classifier

In [None]:
# Bundle the four splits into one Dataset object and print its summary.
dataset = Dataset(X_train, X_test, y_train, y_test)
dataset.info()

In [None]:
# Classifier instance used by every random_forest call later in the notebook.
classifier = Classifier()

### Let's create a test set from another tiff to test generalization across geographic regions

In [None]:
# Build a second dataset from a different tiff so the classifier can be
# evaluated on a region it was not trained on.
tiff_filename = "./Data/Images/5_band15.tif"
shp_filename = "./Data/Labels/5_band15.shp"
test_data = Data(tiff_filename, shp_filename, classes=["water", "land"])
test_tiff = test_data.read_tiff()
actual_mask = test_data.get_mask()
test_data.view_tiff(test_tiff)
test_data.view_mask(actual_mask)

# Sample and split through test_data, the object that was built for this tiff,
# rather than through the `data` object of the first image.
_X, _y = test_data.get_Xy(test_tiff, actual_mask, n_sample=2000000)
_X_train, _X_test, _y_train, _y_test = test_data.train_test_split(_X, _y, save=False)
test_dataset = Dataset(_X_train, _X_test, _y_train, _y_test)

# Plot the histogram of THIS region's samples. The cell previously passed
# X_train / y_train, which re-plotted the first image's training data.
test_data.get_histogram(_X_train, _y_train, channel=1)

### Testing across samples from same tiff

In [None]:
# Train on the first image's training split and evaluate on the test split of
# the same image; feature importances are requested.
classifier.random_forest(
    trainX=dataset.trainX,
    trainY=dataset.trainY,
    testX=dataset.testX,
    testY=dataset.testY,
    grid_search=False,
    train=True,
    n_estimators=10,
    max_depth=3,
    feature_importance=True,
)

### Testing on a different geographical region

In [None]:
# Same training data as before, but evaluated on the test split taken from the
# second image (test_dataset). Feature importances are not requested here.
classifier.random_forest(
    trainX=dataset.trainX,
    trainY=dataset.trainY,
    testX=test_dataset.testX,
    testY=test_dataset.testY,
    grid_search=False,
    train=True,
    n_estimators=10,
    max_depth=3,
    feature_importance=False,
)

### Using both images combined

In [None]:
import numpy as np

# Image/label pairs whose samples are pooled into a single dataset.
scene_paths = [
    ("./Data/Images/5_band1.tif", "./Data/Labels/5_band1.shp"),
    ("./Data/Images/5_band15.tif", "./Data/Labels/5_band15.shp"),
]

# Draw the same number of samples from every scene, in order.
scene_samples = []
for tiff_filename, shp_filename in scene_paths:
    combined_data = Data(tiff_filename, shp_filename, classes=["water", "land"])
    combined_tiff = combined_data.read_tiff()
    combined_mask = combined_data.get_mask()
    scene_samples.append(
        data.get_Xy(combined_tiff, combined_mask, n_sample=100000)
    )

# Stack the per-scene samples row-wise into one feature array and one label array.
(X, y), (_X, _y) = scene_samples
X = np.concatenate((X, _X), axis=0)
y = np.concatenate((y, _y), axis=0)

# Split, inspect, and train/evaluate on the pooled samples.
X_train, X_test, y_train, y_test = data.train_test_split(X, y, save=False)
data.get_histogram(X_train, y_train, channel=1)
added_test_dataset = Dataset(X_train, X_test, y_train, y_test)
classifier.random_forest(
    trainX=added_test_dataset.trainX,
    trainY=added_test_dataset.trainY,
    testX=added_test_dataset.testX,
    testY=added_test_dataset.testY,
    grid_search=False,
    train=True,
    n_estimators=10,
    max_depth=3,
    feature_importance=True,
)

### Using a folder path as input

In [None]:
# Folders holding the tiffs and their matching label shapefiles.
tiff_location = "./Sample/Images/"
shp_location = "./Sample/Labels/"

# Pass the folder paths to Data. The cell previously passed tiff_filename /
# shp_filename, which still pointed at the single image from the previous cell,
# so the folders defined above were never used.
all_data = Data(tiff_location, shp_location, classes=["water", "land"])
all_tiff = all_data.read_tiff()
all_mask = all_data.get_mask()

# Sample, split, inspect, and train/evaluate on the folder's data.
X, y = all_data.get_Xy(all_tiff, all_mask, n_sample=100000)
X_train, X_test, y_train, y_test = all_data.train_test_split(X, y, save=False)
all_data.get_histogram(X_train, y_train, channel=1)
all_dataset = Dataset(X_train, X_test, y_train, y_test)
classifier.random_forest(
    trainX=all_dataset.trainX,
    trainY=all_dataset.trainY,
    testX=all_dataset.testX,
    testY=all_dataset.testY,
    grid_search=False,
    train=True,
    n_estimators=10,
    max_depth=3,
    feature_importance=True,
)

In [None]:
# Train and evaluate the random forest on all_dataset, with feature
# importances requested.
classifier.random_forest(
    trainX=all_dataset.trainX,
    trainY=all_dataset.trainY,
    testX=all_dataset.testX,
    testY=all_dataset.testY,
    grid_search=False,
    train=True,
    n_estimators=10,
    max_depth=3,
    feature_importance=True,
)

In [None]:
# all_mask and all_tiff are indexed here, i.e. treated as sequences; show the
# first mask and the first tiff.
all_data.view_mask(all_mask[0])
all_data.view_tiff(all_tiff[0])
# Predict labels for the first tiff using the estimator file named below.
# NOTE(review): "/estimator.sav" starts with a slash - confirm how get_labels
# resolves this path (filesystem root vs. appended to a base directory).
prediction = classifier.get_labels(all_tiff[0],"/estimator.sav")
# Show the prediction with the same viewer as the reference mask above.
all_data.view_mask(prediction)