In [1]:
# This notebook applies the GXBoost model to a years worth a cliamte data. 

In [14]:
import pickle
import glob
from tqdm.auto import tqdm
import os
from datetime import datetime
import dateutil.relativedelta
import json
from pathlib import Path
from threading import Thread
import warnings

import geopandas as gpd
import pandas as pd
from sklearn.metrics import mean_absolute_error
from xgboost import XGBRegressor
from multiprocess import Pool

In [15]:
model_name = 'dam_forecast_11-10-22.pkl'

In [17]:
# read in vars
%store -r dam_forcast_working_dir
%store -r min_year
%store -r max_year
%store -r climate_types
%store -r point_data_input_path
%store -r dam_forcast_working_dir
%store -r time_step_folder
# set up some folders
output_folder = os.path.join(dam_forcast_working_dir,'monthly output v2')
Path(output_folder).mkdir(exist_ok=True)
# location of model file
model_folder = os.path.join(os.getcwd(),'data')
model_path = os.path.join(model_folder,model_name)
'found model?',os.path.isfile(model_path)

('found model?', True)

In [18]:
# load in data from meta file
with open(model_path.replace('.pkl','.json'), 'r') as fp:
    meta_dict = json.load(fp)

In [19]:
# load up model
xgb_reg_models = pickle.load(open(model_path, "rb"))

In [20]:
# get a refence to each time step
time_steps = glob.glob(time_step_folder+'/*.pkl')
len(time_steps)

1032

In [21]:
# open vector data
dam_points = gpd.read_file(point_data_input_path)
dam_points.head()
# reproject to WGS 84 if not already
if dam_points.crs.to_epsg() != 4326:
    print('Reprojecting')
    dam_points = dam_points.to_crs("EPSG:4326")

In [22]:
# this func takes in input date and return 13 months worth climate data for each point
def build_inf_df(start_date):
#     convert string date to datetime
    start_date_dt = datetime.strptime(start_date, '%Y%m')
    point_sample_df_list = []
#     loop over each calimte and relative month
    for climate_type in climate_types:
            for month_shift in range(0,13):
#                 get actual date from relative month
                shifted_date_dt = start_date_dt - dateutil.relativedelta.relativedelta(months=month_shift)
#                 build export path
                month_str = str(shifted_date_dt.month).zfill(2)
                file_name = f'{shifted_date_dt.year}{month_str}_{climate_type}.pkl'
                relative_date_name = f'{climate_type}_{month_shift}_months before'
                time_step_file_path = os.path.join(time_step_folder,file_name)
#                 open file and name col name relative
                try:
                    point_sample_df = pd.read_pickle(time_step_file_path)

                    point_sample_df.rename(columns={point_sample_df.columns[0]: relative_date_name}, inplace=True)

                    point_sample_df_list.append(point_sample_df)
                except Exception as e:
                    print(e)
#     combine all data
    all_time_steps = pd.concat(point_sample_df_list,axis=1)
    
    all_time_steps_with_point_data = pd.concat([dam_points['dam_area'],all_time_steps],axis=1)
#     make sure all data is in correct order
    all_time_steps_with_point_data = all_time_steps_with_point_data.reindex(meta_dict['col_names'], axis=1)

    return all_time_steps_with_point_data        

In [23]:
# run a test extraction
test_df = build_inf_df('202101')
test_df.head()

Unnamed: 0,dam_area,rain_0_months before,rain_10_months before,rain_11_months before,rain_12_months before,rain_1_months before,rain_2_months before,rain_3_months before,rain_4_months before,rain_5_months before,...,tavg_12_months before,tavg_1_months before,tavg_2_months before,tavg_3_months before,tavg_4_months before,tavg_5_months before,tavg_6_months before,tavg_7_months before,tavg_8_months before,tavg_9_months before
0,1782.758036,91.389145,102.111046,15.94266,34.318153,41.518482,29.074463,66.378281,41.120415,80.803795,...,24.253502,20.431652,20.566591,15.726265,12.332435,8.657789,8.072989,8.438752,10.184371,14.45916
1,1917.350912,91.599861,60.374527,86.209282,103.682495,92.023117,43.234932,137.324905,85.224243,133.838364,...,18.292,15.620107,16.618374,13.130466,11.56372,9.255119,8.712623,9.055582,10.510744,13.301476
2,2300.790143,94.956673,101.596077,55.016685,65.366745,87.820847,52.842342,129.010971,107.157036,113.654251,...,15.99195,13.331125,13.894073,10.666141,9.386144,7.773449,7.490423,8.403605,9.665147,11.672211
3,403.480805,25.214075,10.15592,46.595879,18.252161,24.598694,21.089201,67.21167,64.161491,58.338249,...,20.68634,17.684,19.030613,13.551085,12.200335,9.097849,8.387416,8.973354,10.934418,14.367694
4,1774.881116,48.494244,111.08374,99.955391,46.732124,128.312881,73.00634,86.361069,71.220161,54.039902,...,24.156582,18.429838,17.916143,14.671591,11.413791,7.220846,7.222924,7.85137,9.137348,13.59121


In [24]:
# the above df should have these cols
meta_dict['col_names']

['dam_area',
 'rain_0_months before',
 'rain_10_months before',
 'rain_11_months before',
 'rain_12_months before',
 'rain_1_months before',
 'rain_2_months before',
 'rain_3_months before',
 'rain_4_months before',
 'rain_5_months before',
 'rain_6_months before',
 'rain_7_months before',
 'rain_8_months before',
 'rain_9_months before',
 'tavg_0_months before',
 'tavg_10_months before',
 'tavg_11_months before',
 'tavg_12_months before',
 'tavg_1_months before',
 'tavg_2_months before',
 'tavg_3_months before',
 'tavg_4_months before',
 'tavg_5_months before',
 'tavg_6_months before',
 'tavg_7_months before',
 'tavg_8_months before',
 'tavg_9_months before']

In [25]:
# make a list of the start times
time_steps = []
for year in range(min_year,max_year+1):
#     loop over each month
    for month in range(1,13):
        month = str(month).zfill(2)
        time_steps.append(f'{year}{month}')
len(time_steps)

504

In [30]:
# func to run the model when given a start time
def run_model(time_step):
#     build inf df
    ts_df = build_inf_df(time_step)
#     loop over each model, and combine preds
    preds_df = pd.DataFrame()
    for model in xgb_reg_model:
        preds = model['model'].predict(ts_df)
        preds = preds*(preds>0)
        preds_df[model['model_number']] = preds
        
    preds_mean = preds_df.mean(axis=1).to_list()
    
#     make df with just files names to index to
    limited_df = pd.DataFrame(dam_points['file_name'].copy())
#     place the preds into df with time a col heading
    limited_df[time_step] = preds_mean
#     build export path
    file_name = f'{time_step}.pkl'
    pkl_out_path = os.path.join(output_folder,file_name)
#     export to disk
    limited_df.to_pickle(pkl_out_path)
    
    return pkl_out_path

In [31]:
with Pool(10) as p:
    monthly_ouputs = list(tqdm(p.imap(run_model,time_steps),total=len(time_steps)))

  0%|          | 0/504 [00:00<?, ?it/s]

In [32]:
monthly_ouputs = glob.glob(output_folder+'/*.pkl')

In [33]:
# get a list of all prediction files
len(monthly_ouputs)

1

In [34]:
dam_points.head()

Unnamed: 0,file_name,dam_area,water_area,SRC_DATE,file_name_pred,class,class_name,geometry
0,arcgisonline_1286093.tif,1782.758036,1231.106859,20170126.0,arcgisonline_1286093_pred_and_score.tif,0,dam and water,POINT (146.65881 -36.09349)
1,arcgisonline_1337387.tif,1917.350912,1042.750433,20151214.0,arcgisonline_1337387_pred_and_score.tif,0,dam and water,POINT (145.80580 -38.45117)
2,arcgisonline_1702518.tif,2300.790143,698.20756,20141023.0,arcgisonline_1702518_pred_and_score.tif,0,dam and water,POINT (145.69447 -41.09943)
3,arcgisonline_1274916.tif,403.480805,90.273377,20160112.0,arcgisonline_1274916_pred_and_score.tif,0,dam and water,POINT (141.22659 -36.64662)
4,arcgisonline_704977.tif,1774.881116,749.984968,20160502.0,arcgisonline_704977_pred_and_score.tif,0,dam and water,POINT (149.83683 -32.87746)


In [29]:
# make empty df to load preds into, we need to keep this separate from the base data we we can sort it by date
# this is a slow way to do this but it keep the RAM useage low
warnings.simplefilter(action='ignore', category=pd.errors.PerformanceWarning)
preds_df = pd.DataFrame()
# loop over each pred csv
for pkl in tqdm(monthly_ouputs):
#     get the file name
    file_name = os.path.basename(pkl)
#     get the date from the name
    date = file_name.replace('.pkl','')
#     open file
    current_df = pd.read_pickle(pkl)
#     extract the preds col
    preds = current_df[date].to_list()
#     name date a proper python date
    date_fixed = datetime.strptime(date, '%Y%m')
#     load preds into main df
    preds_df[date_fixed] = preds
# sort the cols by date
preds_df = preds_df[sorted(preds_df.columns)]

  0%|          | 0/504 [00:00<?, ?it/s]

In [25]:
# set any negative values to 0
preds_df[preds_df < 0] = 0
preds_df.head()

Unnamed: 0,1980-01-01,1980-02-01,1980-03-01,1980-04-01,1980-05-01,1980-06-01,1980-07-01,1980-08-01,1980-09-01,1980-10-01,...,2021-03-01,2021-04-01,2021-05-01,2021-06-01,2021-07-01,2021-08-01,2021-09-01,2021-10-01,2021-11-01,2021-12-01
0,790.933594,655.767883,610.735291,653.936096,772.319763,637.231506,649.77832,676.638489,760.892395,808.967712,...,746.420288,753.646912,731.979919,834.380493,810.75531,883.788574,958.149353,991.231323,849.591125,986.701355
1,1157.437256,1106.576416,1070.682739,1050.769775,1095.803101,1117.935669,1131.236084,1054.339844,1185.74939,1177.228149,...,1033.967773,1084.389526,1127.123291,1097.043701,1104.380127,1120.901123,1179.092529,1199.127319,1197.629272,1198.309937
2,1329.382812,1340.727173,1306.145874,1397.896729,1397.61499,1423.996948,1294.285645,1397.654663,1488.581787,1377.621826,...,1289.162964,1487.070801,1439.406982,1413.046387,1474.113037,1231.741821,1392.328613,1380.409302,1503.986328,1456.081543
3,128.22789,77.150169,88.893326,57.400185,93.619118,74.195648,172.957626,108.218353,83.857674,137.181564,...,103.824539,94.733353,116.369263,69.135582,106.265602,128.649338,170.825928,155.939804,151.296387,110.025597
4,690.196533,873.216187,724.159546,669.247375,745.380371,765.499634,642.054138,732.755859,733.004089,672.273071,...,979.035583,960.873047,988.989746,891.002197,997.48584,992.384521,1020.857727,1007.961914,1017.267273,1071.487549


In [26]:
# join the input dam points data with the preds
preds_with_meta = pd.concat([dam_points, preds_df], axis=1)
preds_with_meta.head()

Unnamed: 0,file_name,dam_area,water_area,SRC_DATE,file_name_pred,class,class_name,geometry,1980-01-01 00:00:00,1980-02-01 00:00:00,...,2021-03-01 00:00:00,2021-04-01 00:00:00,2021-05-01 00:00:00,2021-06-01 00:00:00,2021-07-01 00:00:00,2021-08-01 00:00:00,2021-09-01 00:00:00,2021-10-01 00:00:00,2021-11-01 00:00:00,2021-12-01 00:00:00
0,arcgisonline_1286093.tif,1782.758036,1231.106859,20170126.0,arcgisonline_1286093_pred_and_score.tif,0,dam and water,POINT (146.65881 -36.09349),790.933594,655.767883,...,746.420288,753.646912,731.979919,834.380493,810.75531,883.788574,958.149353,991.231323,849.591125,986.701355
1,arcgisonline_1337387.tif,1917.350912,1042.750433,20151214.0,arcgisonline_1337387_pred_and_score.tif,0,dam and water,POINT (145.80580 -38.45117),1157.437256,1106.576416,...,1033.967773,1084.389526,1127.123291,1097.043701,1104.380127,1120.901123,1179.092529,1199.127319,1197.629272,1198.309937
2,arcgisonline_1702518.tif,2300.790143,698.20756,20141023.0,arcgisonline_1702518_pred_and_score.tif,0,dam and water,POINT (145.69447 -41.09943),1329.382812,1340.727173,...,1289.162964,1487.070801,1439.406982,1413.046387,1474.113037,1231.741821,1392.328613,1380.409302,1503.986328,1456.081543
3,arcgisonline_1274916.tif,403.480805,90.273377,20160112.0,arcgisonline_1274916_pred_and_score.tif,0,dam and water,POINT (141.22659 -36.64662),128.22789,77.150169,...,103.824539,94.733353,116.369263,69.135582,106.265602,128.649338,170.825928,155.939804,151.296387,110.025597
4,arcgisonline_704977.tif,1774.881116,749.984968,20160502.0,arcgisonline_704977_pred_and_score.tif,0,dam and water,POINT (149.83683 -32.87746),690.196533,873.216187,...,979.035583,960.873047,988.989746,891.002197,997.48584,992.384521,1020.857727,1007.961914,1017.267273,1071.487549


In [33]:
# convert col headings to strings to improve file compatability
str_names_dict = {}
for name in preds_with_meta.columns.to_list():
    str_names_dict[name] = str(name)
    
preds_with_meta = preds_with_meta.rename(index=str, columns=str_names_dict)
preds_with_meta.head()

Unnamed: 0,file_name,dam_area,water_area,SRC_DATE,file_name_pred,class,class_name,geometry,1980-01-01 00:00:00,1980-02-01 00:00:00,...,2021-03-01 00:00:00,2021-04-01 00:00:00,2021-05-01 00:00:00,2021-06-01 00:00:00,2021-07-01 00:00:00,2021-08-01 00:00:00,2021-09-01 00:00:00,2021-10-01 00:00:00,2021-11-01 00:00:00,2021-12-01 00:00:00
0,arcgisonline_1286093.tif,1782.758036,1231.106859,20170126.0,arcgisonline_1286093_pred_and_score.tif,0,dam and water,POINT (146.65881 -36.09349),790.933594,655.767883,...,746.420288,753.646912,731.979919,834.380493,810.75531,883.788574,958.149353,991.231323,849.591125,986.701355
1,arcgisonline_1337387.tif,1917.350912,1042.750433,20151214.0,arcgisonline_1337387_pred_and_score.tif,0,dam and water,POINT (145.80580 -38.45117),1157.437256,1106.576416,...,1033.967773,1084.389526,1127.123291,1097.043701,1104.380127,1120.901123,1179.092529,1199.127319,1197.629272,1198.309937
2,arcgisonline_1702518.tif,2300.790143,698.20756,20141023.0,arcgisonline_1702518_pred_and_score.tif,0,dam and water,POINT (145.69447 -41.09943),1329.382812,1340.727173,...,1289.162964,1487.070801,1439.406982,1413.046387,1474.113037,1231.741821,1392.328613,1380.409302,1503.986328,1456.081543
3,arcgisonline_1274916.tif,403.480805,90.273377,20160112.0,arcgisonline_1274916_pred_and_score.tif,0,dam and water,POINT (141.22659 -36.64662),128.22789,77.150169,...,103.824539,94.733353,116.369263,69.135582,106.265602,128.649338,170.825928,155.939804,151.296387,110.025597
4,arcgisonline_704977.tif,1774.881116,749.984968,20160502.0,arcgisonline_704977_pred_and_score.tif,0,dam and water,POINT (149.83683 -32.87746),690.196533,873.216187,...,979.035583,960.873047,988.989746,891.002197,997.48584,992.384521,1020.857727,1007.961914,1017.267273,1071.487549


In [30]:
# make csv export path
final_export = os.path.join(dam_forcast_working_dir,'Dam forecast preds v6.csv')
final_export

'/mnt/2TB Working/Projects/Dam forecast/v5/Dam forecast preds v6.csv'

In [31]:
# make gpkg export path
gpkg_file_name = final_export.replace('.csv','.gpkg')
gpkg_file_name

'/mnt/2TB Working/Projects/Dam forecast/v5/Dam forecast preds v6.gpkg'

In [26]:
# export files
preds_with_meta.to_csv(final_export)
final_export

In [26]:
preds_with_meta.to_file(gpkg_file_name,driver='GPKG')
gpkg_file_name