In [1]:
# This notebook applies the GXBoost model to a years worth a cliamte data.

In [1]:
import pickle
import glob
from tqdm.auto import tqdm
import os
from datetime import datetime
import dateutil.relativedelta
import json
from pathlib import Path
from threading import Thread
import warnings

import geopandas as gpd
import pandas as pd
from sklearn.metrics import mean_absolute_error
from xgboost import XGBRegressor
from multiprocessing.pool import ThreadPool


import os
os.environ['USE_PYGEOS'] = '0'
import geopandas

In the next release, GeoPandas will switch to using Shapely by default, even if PyGEOS is installed. If you only have PyGEOS installed to get speed-ups, this switch should be smooth. However, if you are using PyGEOS directly (calling PyGEOS functions on geometries from GeoPandas), this will then stop working and you are encouraged to migrate from PyGEOS to Shapely 2.0 (https://shapely.readthedocs.io/en/latest/migration_pygeos.html).
  import geopandas as gpd


In [2]:
model_name = "dam_forecast_03-11-23.pkl"

In [3]:
# read in vars
%store -r dam_forcast_working_dir
%store -r min_year
%store -r max_year
%store -r climate_types
%store -r point_data_input_path
%store -r dam_forcast_working_dir
%store -r time_step_folder
# set up some folders
output_folder = os.path.join(dam_forcast_working_dir,'monthly output')
Path(output_folder).mkdir(exist_ok=True)
# location of model file
model_folder = os.path.join(os.getcwd(),'data')
model_path = os.path.join(model_folder,model_name)
'found model?',os.path.isfile(model_path)

('found model?', True)

In [4]:
# load in data from meta file
with open(model_path.replace(".pkl", ".json"), "r") as fp:
    meta_dict = json.load(fp)

In [5]:
# load up model
xgb_reg_models = pickle.load(open(model_path, "rb"))

In [6]:
# get a refence to each time step
time_steps = glob.glob(time_step_folder + "/*.pkl")
len(time_steps)

1056

In [7]:
# open vector data
dam_points = gpd.read_file(point_data_input_path)
dam_points.head()
# reproject to WGS 84 if not already
if dam_points.crs.to_epsg() != 4326:
    print("Reprojecting")
    dam_points = dam_points.to_crs("EPSG:4326")

Reprojecting


In [8]:
# this func takes in input date and return 13 months worth climate data for each point
def build_inf_df(start_date):
    #     convert string date to datetime
    start_date_dt = datetime.strptime(start_date, "%Y%m")
    point_sample_df_list = []
    #     loop over each calimte and relative month
    for climate_type in climate_types:
        for month_shift in range(0, 13):
            #                 get actual date from relative month
            shifted_date_dt = start_date_dt - dateutil.relativedelta.relativedelta(
                months=month_shift
            )
            #                 build export path
            month_str = str(shifted_date_dt.month).zfill(2)
            file_name = f"{shifted_date_dt.year}{month_str}_{climate_type}.pkl"
            relative_date_name = f"{climate_type}_{month_shift}_months before"
            time_step_file_path = os.path.join(time_step_folder, file_name)
            #                 open file and name col name relative
            try:
                point_sample_df = pd.read_pickle(time_step_file_path)

                point_sample_df.rename(
                    columns={point_sample_df.columns[0]: relative_date_name},
                    inplace=True,
                )

                point_sample_df_list.append(point_sample_df)
            except Exception as e:
                print(e)
    #     combine all data
    all_time_steps = pd.concat(point_sample_df_list, axis=1)

    all_time_steps_with_point_data = pd.concat(
        [dam_points["dam_area"], all_time_steps], axis=1
    )
    #     make sure all data is in correct order
    all_time_steps_with_point_data = all_time_steps_with_point_data.reindex(
        meta_dict["col_names"], axis=1
    )

    return all_time_steps_with_point_data

In [9]:
# run a test extraction
test_df = build_inf_df("202101")
test_df.head()

Unnamed: 0,dam_area,rain_0_months before,rain_10_months before,rain_11_months before,rain_12_months before,rain_1_months before,rain_2_months before,rain_3_months before,rain_4_months before,rain_5_months before,...,tavg_12_months before,tavg_1_months before,tavg_2_months before,tavg_3_months before,tavg_4_months before,tavg_5_months before,tavg_6_months before,tavg_7_months before,tavg_8_months before,tavg_9_months before
0,1782.758036,91.389145,102.111046,15.94266,34.318153,41.518482,29.074463,66.378281,41.120415,80.803795,...,24.253502,20.431652,20.566591,15.726265,12.332435,8.657789,8.072989,8.438752,10.184371,14.45916
1,1917.350912,91.599861,60.374527,86.209282,103.682495,92.023117,43.234932,137.324905,85.224243,133.838364,...,18.292,15.620107,16.618374,13.130466,11.56372,9.255119,8.712623,9.055582,10.510744,13.301476
2,2300.790143,94.956673,101.596077,55.016685,65.366745,87.820847,52.842342,129.010971,107.157036,113.654251,...,15.99195,13.331125,13.894073,10.666141,9.386144,7.773449,7.490423,8.403605,9.665147,11.672211
3,403.480805,25.214075,10.15592,46.595879,18.252161,24.598694,21.089201,67.21167,64.161491,58.338249,...,20.68634,17.684,19.030613,13.551085,12.200335,9.097849,8.387416,8.973354,10.934418,14.367694
4,1774.881116,48.494244,111.08374,99.955391,46.732124,128.312881,73.00634,86.361069,71.220161,54.039902,...,24.156582,18.429838,17.916143,14.671591,11.413791,7.220846,7.222924,7.85137,9.137348,13.59121


In [10]:
# the above df should have these cols
meta_dict["col_names"]

['dam_area',
 'rain_0_months before',
 'rain_10_months before',
 'rain_11_months before',
 'rain_12_months before',
 'rain_1_months before',
 'rain_2_months before',
 'rain_3_months before',
 'rain_4_months before',
 'rain_5_months before',
 'rain_6_months before',
 'rain_7_months before',
 'rain_8_months before',
 'rain_9_months before',
 'tavg_0_months before',
 'tavg_10_months before',
 'tavg_11_months before',
 'tavg_12_months before',
 'tavg_1_months before',
 'tavg_2_months before',
 'tavg_3_months before',
 'tavg_4_months before',
 'tavg_5_months before',
 'tavg_6_months before',
 'tavg_7_months before',
 'tavg_8_months before',
 'tavg_9_months before']

In [11]:
# make a list of the start times
time_steps = []
for year in range(min_year, max_year + 1):
    #     loop over each month
    for month in range(1, 13):
        month = str(month).zfill(2)
        time_steps.append(f"{year}{month}")
len(time_steps)

516

In [12]:
# func to run the model when given a start time
def run_model(time_step):
    #     build inf df
    ts_df = build_inf_df(time_step)
    #     loop over each model, and combine preds
    preds_df = pd.DataFrame()
    for model in xgb_reg_models:
        preds = model["model"].predict(ts_df)
        preds = preds * (preds > 0)
        preds_df[model["model_number"]] = preds

    preds_mean = preds_df.mean(axis=1).to_list()

    #     make df with just files names to index to
    limited_df = pd.DataFrame(dam_points["file_name"].copy())
    #     place the preds into df with time a col heading
    limited_df[time_step] = preds_mean
    #     build export path
    file_name = f"{time_step}.pkl"
    pkl_out_path = os.path.join(output_folder, file_name)
    #     export to disk
    limited_df.to_pickle(pkl_out_path)

    return pkl_out_path

In [13]:
with ThreadPool(2) as p:
    monthly_ouputs = list(tqdm(p.imap(run_model, time_steps), total=len(time_steps)))

  0%|          | 0/516 [00:00<?, ?it/s]

In [14]:
monthly_ouputs = glob.glob(output_folder + "/*.pkl")

In [15]:
# get a list of all prediction files
len(monthly_ouputs)

516

In [16]:
dam_points.head()

Unnamed: 0,file_name,dam_area,water_area,SRC_DATE,file_name_pred,class,class_name,geometry
0,arcgisonline_1286093.tif,1782.758036,1231.106859,20170126.0,arcgisonline_1286093_pred_and_score.tif,0,dam and water,POINT (146.65881 -36.09349)
1,arcgisonline_1337387.tif,1917.350912,1042.750433,20151214.0,arcgisonline_1337387_pred_and_score.tif,0,dam and water,POINT (145.80580 -38.45117)
2,arcgisonline_1702518.tif,2300.790143,698.20756,20141023.0,arcgisonline_1702518_pred_and_score.tif,0,dam and water,POINT (145.69447 -41.09943)
3,arcgisonline_1274916.tif,403.480805,90.273377,20160112.0,arcgisonline_1274916_pred_and_score.tif,0,dam and water,POINT (141.22659 -36.64662)
4,arcgisonline_704977.tif,1774.881116,749.984968,20160502.0,arcgisonline_704977_pred_and_score.tif,0,dam and water,POINT (149.83683 -32.87746)


In [17]:
# make empty df to load preds into, we need to keep this separate from the base data so we can sort it by date
warnings.simplefilter(action="ignore", category=pd.errors.PerformanceWarning)
preds_df = pd.DataFrame()
# loop over each pred csv
for pkl in tqdm(monthly_ouputs):
    #     get the file name
    file_name = os.path.basename(pkl)
    #     get the date from the name
    date = file_name.replace(".pkl", "")
    #     open file
    current_df = pd.read_pickle(pkl)
    #     extract the preds col
    preds = current_df[date].to_list()
    #     name date a proper python date
    date_fixed = datetime.strptime(date, "%Y%m")
    #     load preds into main df
    preds_df[date_fixed] = preds
# sort the cols by date
preds_df = preds_df[sorted(preds_df.columns)]

  0%|          | 0/516 [00:00<?, ?it/s]

In [18]:
# set any negative values to 0
preds_df[preds_df < 0] = 0
preds_df.head()

Unnamed: 0,1980-01-01,1980-02-01,1980-03-01,1980-04-01,1980-05-01,1980-06-01,1980-07-01,1980-08-01,1980-09-01,1980-10-01,...,2022-03-01,2022-04-01,2022-05-01,2022-06-01,2022-07-01,2022-08-01,2022-09-01,2022-10-01,2022-11-01,2022-12-01
0,139.090164,440.722351,533.859436,225.291016,362.781769,686.803101,306.69223,583.081177,538.820435,769.32959,...,738.362427,720.732971,791.82428,862.342407,642.234436,801.303406,1006.093567,1083.208008,1052.922852,841.679077
1,1141.240601,1104.833008,1020.540039,1063.985718,1076.482544,1214.999756,1153.713867,1132.891846,1126.38269,1129.948853,...,1032.007935,981.14978,1082.205444,1019.574829,1148.150391,1002.561523,1092.068604,1119.369873,1139.743042,1074.500732
2,1232.107422,1225.967651,1220.685791,1165.686279,1011.339172,915.999878,988.188599,983.056458,1230.209351,1273.924805,...,1236.947266,1341.016602,1086.881714,1114.958374,1198.959229,1264.677002,1371.303711,1184.474365,1355.056396,1438.732666
3,121.82373,53.625244,53.633934,77.689629,55.531483,88.961731,70.950058,120.466148,13.224772,113.023132,...,61.714344,52.977379,96.109589,126.396461,129.619705,128.466125,141.96994,155.037476,162.826736,141.418976
4,434.31366,704.102966,471.895264,240.52063,339.484344,429.325378,505.257904,457.215271,238.559662,499.774414,...,1059.373047,914.175415,1021.35791,1038.465454,963.3078,909.59729,1100.491211,995.817505,953.03894,1148.715332


In [19]:
# join the input dam points data with the preds
preds_with_meta = pd.concat([dam_points, preds_df], axis=1)
preds_with_meta.head()

Unnamed: 0,file_name,dam_area,water_area,SRC_DATE,file_name_pred,class,class_name,geometry,1980-01-01 00:00:00,1980-02-01 00:00:00,...,2022-03-01 00:00:00,2022-04-01 00:00:00,2022-05-01 00:00:00,2022-06-01 00:00:00,2022-07-01 00:00:00,2022-08-01 00:00:00,2022-09-01 00:00:00,2022-10-01 00:00:00,2022-11-01 00:00:00,2022-12-01 00:00:00
0,arcgisonline_1286093.tif,1782.758036,1231.106859,20170126.0,arcgisonline_1286093_pred_and_score.tif,0,dam and water,POINT (146.65881 -36.09349),139.090164,440.722351,...,738.362427,720.732971,791.82428,862.342407,642.234436,801.303406,1006.093567,1083.208008,1052.922852,841.679077
1,arcgisonline_1337387.tif,1917.350912,1042.750433,20151214.0,arcgisonline_1337387_pred_and_score.tif,0,dam and water,POINT (145.80580 -38.45117),1141.240601,1104.833008,...,1032.007935,981.14978,1082.205444,1019.574829,1148.150391,1002.561523,1092.068604,1119.369873,1139.743042,1074.500732
2,arcgisonline_1702518.tif,2300.790143,698.20756,20141023.0,arcgisonline_1702518_pred_and_score.tif,0,dam and water,POINT (145.69447 -41.09943),1232.107422,1225.967651,...,1236.947266,1341.016602,1086.881714,1114.958374,1198.959229,1264.677002,1371.303711,1184.474365,1355.056396,1438.732666
3,arcgisonline_1274916.tif,403.480805,90.273377,20160112.0,arcgisonline_1274916_pred_and_score.tif,0,dam and water,POINT (141.22659 -36.64662),121.82373,53.625244,...,61.714344,52.977379,96.109589,126.396461,129.619705,128.466125,141.96994,155.037476,162.826736,141.418976
4,arcgisonline_704977.tif,1774.881116,749.984968,20160502.0,arcgisonline_704977_pred_and_score.tif,0,dam and water,POINT (149.83683 -32.87746),434.31366,704.102966,...,1059.373047,914.175415,1021.35791,1038.465454,963.3078,909.59729,1100.491211,995.817505,953.03894,1148.715332


In [20]:
# convert col headings to strings to improve file compatability
str_names_dict = {}
for name in preds_with_meta.columns.to_list():
    str_names_dict[name] = str(name)

preds_with_meta = preds_with_meta.rename(index=str, columns=str_names_dict)
preds_with_meta.head()

Unnamed: 0,file_name,dam_area,water_area,SRC_DATE,file_name_pred,class,class_name,geometry,1980-01-01 00:00:00,1980-02-01 00:00:00,...,2022-03-01 00:00:00,2022-04-01 00:00:00,2022-05-01 00:00:00,2022-06-01 00:00:00,2022-07-01 00:00:00,2022-08-01 00:00:00,2022-09-01 00:00:00,2022-10-01 00:00:00,2022-11-01 00:00:00,2022-12-01 00:00:00
0,arcgisonline_1286093.tif,1782.758036,1231.106859,20170126.0,arcgisonline_1286093_pred_and_score.tif,0,dam and water,POINT (146.65881 -36.09349),139.090164,440.722351,...,738.362427,720.732971,791.82428,862.342407,642.234436,801.303406,1006.093567,1083.208008,1052.922852,841.679077
1,arcgisonline_1337387.tif,1917.350912,1042.750433,20151214.0,arcgisonline_1337387_pred_and_score.tif,0,dam and water,POINT (145.80580 -38.45117),1141.240601,1104.833008,...,1032.007935,981.14978,1082.205444,1019.574829,1148.150391,1002.561523,1092.068604,1119.369873,1139.743042,1074.500732
2,arcgisonline_1702518.tif,2300.790143,698.20756,20141023.0,arcgisonline_1702518_pred_and_score.tif,0,dam and water,POINT (145.69447 -41.09943),1232.107422,1225.967651,...,1236.947266,1341.016602,1086.881714,1114.958374,1198.959229,1264.677002,1371.303711,1184.474365,1355.056396,1438.732666
3,arcgisonline_1274916.tif,403.480805,90.273377,20160112.0,arcgisonline_1274916_pred_and_score.tif,0,dam and water,POINT (141.22659 -36.64662),121.82373,53.625244,...,61.714344,52.977379,96.109589,126.396461,129.619705,128.466125,141.96994,155.037476,162.826736,141.418976
4,arcgisonline_704977.tif,1774.881116,749.984968,20160502.0,arcgisonline_704977_pred_and_score.tif,0,dam and water,POINT (149.83683 -32.87746),434.31366,704.102966,...,1059.373047,914.175415,1021.35791,1038.465454,963.3078,909.59729,1100.491211,995.817505,953.03894,1148.715332


In [21]:
# make csv export path
final_export = os.path.join(dam_forcast_working_dir, "Dam forecast preds v7.csv")
final_export

'/Volumes/4TB SSD/W2W/Dam forecast preds v7.csv'

In [22]:
# make gpkg export path
gpkg_file_name = final_export.replace(".csv", ".gpkg")
gpkg_file_name

'/Volumes/4TB SSD/W2W/Dam forecast preds v7.gpkg'

In [23]:
pkl_file_name = final_export.replace(".csv", ".pkl")
pkl_file_name

'/Volumes/4TB SSD/W2W/Dam forecast preds v7.pkl'

In [24]:
preds_with_meta.to_pickle(pkl_file_name)

In [25]:
# export files
preds_with_meta.to_csv(final_export)
final_export

'/Volumes/4TB SSD/W2W/Dam forecast preds v7.csv'

In [26]:
preds_with_meta.to_file(gpkg_file_name, driver="GPKG")
gpkg_file_name