In [93]:
# basic packages
import pandas as pd
import os
import warnings
warnings.filterwarnings("ignore")


#Shared/Utility scripts
import sys
sys.path.insert(0, '../..') #sys allows for the .ipynb file to connect to the shared folder files
from shared_scripts import Simple_Eval, dataloader, xgb_model, xgb_dataprocessing


# Name of the model under development; used for the model folder and the banner below.
modelname = 'XGBoost'

# Project paths are anchored at the current user's home directory.
HOME = os.path.expanduser('~')
model_path = f"{HOME}/NWM_ML/Model/{modelname}"

# Confirm which model this notebook is set up for.
print(f"{modelname} development script")

XGBoost development script


## 2. Prepare the data.

In [12]:
# Model name and model directory, repeated from the setup cell (HOME is defined there).
modelname = 'XGBoost'
model_path = "/".join([HOME, "NWM_ML", "Model", modelname])

# Unit-conversion constant; presumably acre-feet per cfs-day (1 cfs for one day) -- TODO confirm where it is used.
cfsday_AFday = 1.983

# Column the model predicts.
target = 'flow_cfs'

# Years handed to the train/test split as the test set.
test_years = [2019, 2020]

# Predictor columns, in the order they are passed to the model (order is kept as-is).
input_columns = [
    'Lat', 'Long', 'Drainage_area_mi2', 'Mean_Basin_Elev_ft',
    'Perc_Forest', 'Perc_Develop', 'Perc_Imperv', 'Perc_Herbace',
    'Perc_Slop_30', 'Mean_Ann_Precip_in', 's1',
    's2', 'storage', 'swe', 'NWM_flow', 'DOY', 'tempe(F)', 'precip(mm)',
]

# File name and folder of the prepared model-input table.
trainingfile = "final_input.parquet"
datapath = f"{HOME}/NWM_ML/Data/input"

# The loader returns the modelling dataframe plus a second object bound to StreamStats
# (passed to XGB_Predict later in the notebook).
df, StreamStats = dataloader.get_ML_Data(datapath, trainingfile)

# Preview the first rows of the loaded table.
df.head()

df needs no processing


Unnamed: 0,station_id,Lat,Long,Drainage_area_mi2,Mean_Basin_Elev_ft,Perc_Forest,Perc_Develop,Perc_Imperv,Perc_Herbace,Perc_Slop_30,...,datetime,flow_cfs,s1,s2,storage,swe,NWM_flow,DOY,tempe(F),precip(mm)
0,10011500,40.965225,-110.853508,174.0,9720.0,67.7,1.2,0.12,2.94,27.2,...,2010-10-28,78.55521,-0.891007,-0.453991,0.0,1.2,55.0,301,39.239582,0.0
1,10011500,40.965225,-110.853508,174.0,9720.0,67.7,1.2,0.12,2.94,27.2,...,2010-10-29,98.61146,-0.891007,-0.453991,0.0,1.2,55.0,302,45.068712,0.0
2,10011500,40.965225,-110.853508,174.0,9720.0,67.7,1.2,0.12,2.94,27.2,...,2010-10-30,97.60208,-0.891007,-0.453991,0.0,1.1,54.0,303,50.945891,0.0
3,10011500,40.965225,-110.853508,174.0,9720.0,67.7,1.2,0.12,2.94,27.2,...,2010-10-31,99.33125,-0.891007,-0.453991,0.0,1.2,54.0,304,45.480097,0.0
4,10011500,40.965225,-110.853508,174.0,9720.0,67.7,1.2,0.12,2.94,27.2,...,2010-11-01,95.76354,-0.99863,0.052336,0.0,1.2,54.0,305,46.656777,0.0


### Data processing
* Edit the features based on feature importance
* Remove headwater stations from the dataset
* Make sure dates are in datetime format

In [89]:
# Stations to exclude from the dataset; the per-station notes give the reason or a caveat.
headwater_stations = [
    '10011500',  # Bear River headwaters before WY state line
    '10109000',  # Logan River above dams
    '10113500',  # HW Blacksmith Fork
    '10128500',  # Upper Weber above Oakley
    '10131000',  # Chalk Creek before Weber - lots of upstream irrigation, potentially include
    '10146400',  # Currant Creek above Mona Reservoir - lots of upstream irrigation, potentially include
    '10150500',  # Spanish Fork after Diamond Fork - potentially include because of 6th water diversion CUP
    '10154200',  # Upper Provo River after confluence of N/S forks - potentially include because of Duchesne tunnel water diversion CUP
    '10172700',  # Vernon Creek, 2 ranges west of Utah Lake - should not be included because not in GSL basin
    '10172800',  # Willow Creek west of Grantsville - should not be included because it does not make it to GSL
    '10172952',  # Dunn Creek in Raft River Range - should not be included because it drains to the Bonneville Salt Flats
]

# Keep only the rows whose station is NOT on the exclusion list.
df = df.loc[~df['station_id'].isin(headwater_stations)]

# Select the stations to train on: those with usable swe AND storage features.
#
# Reference lists recorded by the author. They include stations that the headwater
# filter removes (e.g. '10011500'), so they are not limited to the stations kept here.
#
# Sites that have swe:
#   ['10011500', '10105900', '10109000', '10126000', '10131000',
#    '10133650', '10133800', '10133980', '10134500', '10136500',
#    '10140700', '10141000', '10150500', '10154200', '10155000',
#    '10155200']
#
# Sites that have swe and storage:
#   ['10126000', '10134500', '10136500', '10140700', '10141000',
#    '10155200']

# A station qualifies if at least one of its rows has both swe > 0 and storage > 0.
has_swe_and_storage = (df['swe'] > 0) & (df['storage'] > 0)
stations = df.loc[has_swe_and_storage, 'station_id'].unique()

# Build the training frame in one non-mutating chain:
#   1. keep every row of the qualifying stations,
#   2. convert the `datetime` column to pandas datetimes,
#   3. reset the index so it is contiguous after filtering.
# Using .assign() instead of `df.datetime = ...` avoids assigning into a filtered
# slice, which triggers pandas' chained-assignment warning (hidden in this notebook
# by the global warnings filter), and makes the cell safe to re-run.
df = (
    df[df['station_id'].isin(stations)]
    .assign(datetime=lambda frame: pd.to_datetime(frame['datetime']))
    .reset_index(drop=True)
)

# Split the data into train and test sets using `test_years`.
# Per the original note the helper also fits, saves and applies a scaler to the
# training data -- TODO confirm in xgb_dataprocessing.xgb_train_test.
x_train, y_train, x_test, y_test, station_index_list = xgb_dataprocessing.xgb_train_test(
    df, test_years, target, input_columns
)



## 3. XGBOOST Model Preparation.

### 3.3. Run and evaluate the model
First we train the model, then we test it.
The procedure is meant to be repeated 30 times to obtain a robust evaluation; note that the training cell below currently sets `tries = 1`.

In [74]:
# Training configuration.

# Fraction of the training data used to identify the optimal hyperparameters.
perc_data = 0.25

# Passed to XGB_Train as `tries`; presumably the number of repeated training runs
# -- TODO confirm against xgb_model.XGB_Train.
tries = 1

# Hyperparameter search grid: max_depth in {2, 7}, n_estimators in {100, 1100}, eta fixed at 0.1.
hyperparameters = dict(
    max_depth=range(2, 11, 5),
    n_estimators=range(100, 2100, 1000),
    eta=[0.1],
)
# Run the hyperparameter search and training with the configuration defined above.
xgb_model.XGB_Train(
    model_path, station_index_list,
    x_train, y_train,
    tries, hyperparameters, perc_data,
)

Tuning hyperparametetrs on 25.0% of training data
Fitting 3 folds for each of 4 candidates, totalling 12 fits
[CV 1/3] END eta=0.1, max_depth=2, n_estimators=100;, score=-108.536 total time=   2.0s
[CV 2/3] END eta=0.1, max_depth=2, n_estimators=100;, score=-106.127 total time=   3.2s
[CV 3/3] END eta=0.1, max_depth=2, n_estimators=100;, score=-104.032 total time=   3.3s
[CV 3/3] END eta=0.1, max_depth=7, n_estimators=100;, score=-58.412 total time=   4.5s
[CV 1/3] END eta=0.1, max_depth=7, n_estimators=100;, score=-62.441 total time=   6.2s
[CV 2/3] END eta=0.1, max_depth=7, n_estimators=100;, score=-64.493 total time=   6.7s
[CV 1/3] END eta=0.1, max_depth=2, n_estimators=1100;, score=-88.140 total time=  13.1s
[CV 2/3] END eta=0.1, max_depth=2, n_estimators=1100;, score=-84.541 total time=  14.4s
[CV 3/3] END eta=0.1, max_depth=2, n_estimators=1100;, score=-83.786 total time=  15.0s
[CV 2/3] END eta=0.1, max_depth=7, n_estimators=1100;, score=-60.203 total time=  26.2s
[CV 3/3] END 

In [92]:
# Test the model: run predictions for the test set and keep the returned result in Preds_Dict.
Preds_Dict = xgb_model.XGB_Predict(
    model_path, modelname,
    df,
    x_test, y_test,
    test_years,
    StreamStats,
    station_index_list,
)
