# Blue Rockfish Habitat Predictions

In [None]:
! pip install geopandas 
! pip install xgboost 
! pip install scikit-learn 
! pip install simpledbf

In [3]:
import os
import sys
import numpy as np
import matplotlib
import pandas as pd
import arcpy
import geopandas as gpd
import json
import pickle
import xgboost
import pickle
import xgboost as xgb
from xgboost import XGBClassifier
import sklearn
from sklearn.model_selection import GridSearchCV
from simpledbf import Dbf5
#import rasterio as rio

PyTables is not installed. No support for HDF output.
SQLalchemy is not installed. No support for SQL output.


In [None]:
def preprocess_eval_data(files):
    """
    Configure the arcpy environment for evaluation-data preprocessing.

    This is still a stub: it only sets geoprocessing environment options.
    `files` is not read yet and nothing is returned (implicitly None).

    param files: inputs for the prediction pipeline (currently unused)
    """
    # scratch folder for intermediate outputs (not used yet)
    scratch_dir = '../../scratch/'

    # geoprocessing environment: allow overwriting outputs, work from the data folder
    geo_env = arcpy.env
    geo_env.overwriteOutput = True
    geo_env.workspace = '../../data'

In [None]:
def _sample_rasters_at_points(rasters, points, out_table, label):
    """Sample `rasters` at `points` into `out_table`; return one `label` per sampled row."""
    arcpy.sa.Sample(rasters, points, out_table)
    # size the label array from the table that was actually written so the
    # labels line up row-for-row with the sampled values
    row_count = int(arcpy.management.GetCount(out_table)[0])
    return np.full(row_count, label, dtype=int)


def preprocess_train_data(files):
    """
    Sample the predictor rasters at the observed (presence) and random
    (absence) point locations and build the matching class labels.

    param files: sequence of paths ordered as
        [observed points, random points, raster 1, raster 2, ...].
        Raster entries equal to '#' are ignored.

    return: tuple (obs_table, obs_label, rnd_table, rnd_label) where the
        tables are the paths of the .dbf files written by arcpy.sa.Sample
        and the labels are integer arrays (1 for observed rows, 0 for
        random rows) with one entry per row of the matching table.

    raises ValueError: if the two point inputs are missing or no raster
        remains after dropping the '#' entries.
    """
    # validate before touching arcpy so bad input fails fast with a clear message
    files = list(files)
    if len(files) < 2:
        raise ValueError(
            'files must start with the observed and random point inputs; got '
            + str(len(files)) + ' entries')

    # gather the shape files
    obs, rnd = files[0], files[1]

    # gather the raster files as plain strings, skipping the '#' entries
    rasters = [path for path in files[2:] if path != '#']
    if not rasters:
        raise ValueError('files must contain at least one raster after the two point inputs')

    # set up env
    arcpy.env.workspace = '../../data'
    arcpy.env.overwriteOutput = True
    save_path = '../../scratch/'

    # sample rasters from observed points (label 1)
    obs_table = save_path + 'observed_presence_sampled.dbf'
    obs_label = _sample_rasters_at_points(rasters, obs, obs_table, 1)

    # sample rasters from absence points (label 0)
    rnd_table = save_path + 'random_absence_sampled.dbf'
    rnd_label = _sample_rasters_at_points(rasters, rnd, rnd_table, 0)

    return obs_table, obs_label, rnd_table, rnd_label

In [None]:
def build_model(files):
    """
    Train an XGBoost classifier on the sampled raster values and pickle it.

    The inputs are preprocessed with preprocess_train_data, the two sampled
    tables are read into DataFrames, labelled and concatenated, and a grid
    search over XGBClassifier hyper-parameters is fitted on the result.

    param files: training inputs, passed unchanged to preprocess_train_data

    return: None. Prints the training accuracy and writes the fitted
        GridSearchCV object to '../../scratch/xgb_model.pkl'.
    """
    # Local import: the notebook's import cell only brings in `sklearn` and
    # `sklearn.model_selection`, so `sklearn.metrics` is imported explicitly
    # instead of relying on another package having loaded it.
    from sklearn.metrics import accuracy_score

    # preprocess data
    obs_path, obs_label, rnd_path, rnd_label = preprocess_train_data(files)

    # read training file
    obs_df = Dbf5(obs_path).to_dataframe()
    rnd_df = Dbf5(rnd_path).to_dataframe()

    # prepare data for training
    # NOTE(review): 'brf_obs' / 'rand_obs' presumably are the point-id columns
    # named after the input shapefiles; confirm if the input names change.
    obs_df = obs_df.drop(columns=['brf_obs', 'X', 'Y'])
    rnd_df = rnd_df.drop(columns=['rand_obs', 'X', 'Y'])
    obs_df['label'] = obs_label
    rnd_df['label'] = rnd_label
    # ignore_index avoids duplicate row labels in the combined frame
    data = pd.concat([obs_df, rnd_df], ignore_index=True)
    X = data.drop(columns=['label'])
    Y = data['label']

    # build xgboost model
    clf = XGBClassifier(objective='binary:logistic')
    # 8 depths x 4 estimator counts x 3 learning rates = 96 candidates
    parameters = {
        'max_depth': range(2, 10),
        'n_estimators': range(60, 220, 40),
        'learning_rate': [0.1, 0.01, 0.05]
    }
    grid_search = GridSearchCV(
        estimator=clf,
        param_grid=parameters,
        scoring='roc_auc',
        n_jobs=5,
        cv=5,
        verbose=True)
    grid_search.fit(X, Y)

    # accuracy on the data the model was fitted on, not a held-out estimate
    acc = accuracy_score(Y, grid_search.predict(X))
    print('Training Accuracy is: ' + str(acc))

    # close the file deterministically so the pickle is fully flushed to disk
    model_path = "../../scratch/xgb_model.pkl"
    with open(model_path, "wb") as model_file:
        pickle.dump(grid_search, model_file)

In [None]:
def make_prediction(files):
    """
    Prepare the prediction inputs and load the trained model (work in progress).

    Prediction itself and writing the output raster are not implemented yet,
    so the function currently returns None.

    param files: prediction inputs; only the first element is forwarded to
        preprocess_eval_data
    """
    # NOTE(review): this keeps only the first element of `files`. If callers
    # pass a flat list of raster paths, every path after the first is
    # discarded. Confirm the intended input shape.
    files = files[0]

    # process files for prediction
    data = preprocess_eval_data(files)

    # read model
    # NOTE: unpickling can execute arbitrary code; only load a model file
    # that was produced by this project.
    with open('../../scratch/xgb_model.pkl', 'rb') as model_file:
        model = pickle.load(model_file)

    # save raster file
    # TODO: run `model` on `data` and write the prediction raster

### Run Training Pipeline

In [None]:
# Training inputs, in the order build_model expects them: the two point
# shapefiles first, then the rasters, then two trailing '#' entries.
training_input_files = [
    'V:\\ENV859_Final_Project_al512\\data\\' + name
    for name in (
        'brf_obs.shp',
        'rand_obs.shp',
        'bathy',
        *('habras10_' + str(i) for i in range(1, 9)),  # habras10_1 ... habras10_8
        'dist_kelp',
        'dist_100m',
        'botc10_8ws',
    )
] + ['#', '#']
build_model(training_input_files)

### Run Prediction Pipeline

In [None]:
# Prediction inputs: the rasters, followed by two trailing '#' entries.
pred_input_files = [
    'V:\\ENV859_Final_Project_al512\\data\\' + name
    for name in (
        'bathy',
        *('habras10_' + str(i) for i in range(1, 9)),  # habras10_1 ... habras10_8
        'dist_kelp',
        'dist_100m',
        'botc10_8ws',
    )
] + ['#', '#']
# make_prediction is the function defined in this notebook
raster = make_prediction(pred_input_files)

In [None]:
import rasterio as rio