In [8]:
# This notebook applies the XGBoost model to a year's worth of climate data.

In [1]:
import pickle
import glob
from tqdm.auto import tqdm
import os
from datetime import datetime
import dateutil.relativedelta
import json
from pathlib import Path
from threading import Thread
import warnings

import geopandas as gpd
import pandas as pd
from sklearn.metrics import mean_absolute_error
from xgboost import XGBRegressor
from multiprocessing.pool import ThreadPool

In [75]:
# file name of the pickled model to load; the matching .json meta file is derived from this name further down
model_name = "dam_forecast_15-10-24.pkl"

In [3]:
# read in vars
%store -r dam_forcast_working_dir
%store -r min_year
%store -r max_year
%store -r climate_types
%store -r point_data_input_path
%store -r dam_forcast_working_dir
%store -r time_step_folder
# set up some folders
output_folder = os.path.join(dam_forcast_working_dir,'monthly output')
Path(output_folder).mkdir(exist_ok=True)
# location of model file
model_folder = os.path.join(os.getcwd(),'data')
model_path = os.path.join(model_folder,model_name)
'found model?',os.path.isfile(model_path)

('found model?', True)

In [4]:
# load in data from the JSON meta file that shares the model file's stem.
# Only the extension is swapped: str.replace(".pkl", ".json") would also
# rewrite any ".pkl" that happens to appear earlier in the path.
meta_path = os.path.splitext(model_path)[0] + ".json"
with open(meta_path, "r") as fp:
    meta_dict = json.load(fp)

In [5]:
# load up model; the context manager closes the file handle once unpickling is done
# NOTE(review): pickle.load can execute arbitrary code - only load model files from a trusted source
with open(model_path, "rb") as model_file:
    xgb_reg_models = pickle.load(model_file)

In [6]:
# get a reference to each time step file on disk.
# Kept under its own name because `time_steps` is re-bound further down
# to the list of "YYYYMM" start dates, which is a different kind of value.
time_step_files = glob.glob(os.path.join(time_step_folder, "*.pkl"))
len(time_step_files)

192

In [7]:
# open vector data
dam_points = gpd.read_file(point_data_input_path)
# reproject to WGS 84 if not already; a layer without any CRS cannot be
# reprojected, so fail with a clear message instead of an AttributeError
if dam_points.crs is None:
    raise ValueError(
        f"{point_data_input_path} has no CRS defined, cannot reproject to EPSG:4326"
    )
if dam_points.crs.to_epsg() != 4326:
    print("Reprojecting")
    dam_points = dam_points.to_crs("EPSG:4326")

Reprojecting


In [11]:
# this func takes an input date and returns 13 months' worth of climate data for each point
def build_inf_df(start_date):
    """Build the model input table for one prediction month.

    For every entry in ``climate_types`` and every month offset 0..12 counted
    back from ``start_date``, the pickle ``<YYYYMM>_<climate_type>.pkl`` in
    ``time_step_folder`` is loaded and its first column is renamed to
    ``"<climate_type>_<offset>_months before"``.  The loaded frames are joined
    side by side with ``dam_points["dam_area"]`` (rows are matched on the
    index by ``pd.concat``) and the columns are reordered to
    ``meta_dict["col_names"]``.

    Loading is best effort: a file that cannot be read is skipped with a
    warning, and the column it would have supplied ends up all-NaN after the
    final reindex.

    Args:
        start_date: prediction month as a ``"%Y%m"`` string, e.g. ``"202101"``.

    Returns:
        DataFrame whose columns are ``meta_dict["col_names"]``, in that order.

    Raises:
        ValueError: if ``start_date`` does not match ``"%Y%m"`` or if not a
            single time-step file could be loaded.
    """
    #     convert string date to datetime
    start_date_dt = datetime.strptime(start_date, "%Y%m")
    point_sample_df_list = []
    #     loop over each climate type and relative month
    for climate_type in climate_types:
        for month_shift in range(0, 13):
            #     get actual date from relative month
            shifted_date_dt = start_date_dt - dateutil.relativedelta.relativedelta(
                months=month_shift
            )
            #     build the input path and the relative column name
            file_name = (
                f"{shifted_date_dt.year}{shifted_date_dt.month:02d}_{climate_type}.pkl"
            )
            relative_date_name = f"{climate_type}_{month_shift}_months before"
            time_step_file_path = os.path.join(time_step_folder, file_name)
            #     open file and give its first column the relative name
            try:
                point_sample_df = pd.read_pickle(time_step_file_path)
                point_sample_df = point_sample_df.rename(
                    columns={point_sample_df.columns[0]: relative_date_name}
                )
            except Exception as e:
                #     keep going, but say exactly which input is missing
                warnings.warn(
                    f"Skipping '{relative_date_name}': could not load "
                    f"{time_step_file_path} ({e})",
                    stacklevel=2,
                )
                continue
            point_sample_df_list.append(point_sample_df)

    #     pd.concat would fail with a generic message on an empty list
    if not point_sample_df_list:
        raise ValueError(
            f"No time step files could be loaded for start date '{start_date}' "
            f"from {time_step_folder}"
        )

    #     combine all data
    all_time_steps = pd.concat(point_sample_df_list, axis=1)
    all_time_steps_with_point_data = pd.concat(
        [dam_points["dam_area"], all_time_steps], axis=1
    )

    #     reindex silently creates all-NaN columns for anything missing, so report them
    expected_cols = meta_dict["col_names"]
    missing_cols = [
        col for col in expected_cols if col not in all_time_steps_with_point_data.columns
    ]
    if missing_cols:
        warnings.warn(
            f"Start date '{start_date}': columns filled with NaN because their "
            f"data was not loaded: {missing_cols}",
            stacklevel=2,
        )

    #     make sure all data is in correct order
    return all_time_steps_with_point_data.reindex(expected_cols, axis=1)

In [12]:
# run a test extraction for start date "202101" to check the feature table builds
test_df = build_inf_df("202101")
test_df.head()

Unnamed: 0,dam_area,rain_0_months before,rain_10_months before,rain_11_months before,rain_12_months before,rain_1_months before,rain_2_months before,rain_3_months before,rain_4_months before,rain_5_months before,...,tavg_12_months before,tavg_1_months before,tavg_2_months before,tavg_3_months before,tavg_4_months before,tavg_5_months before,tavg_6_months before,tavg_7_months before,tavg_8_months before,tavg_9_months before
0,1782.758036,91.389145,102.111046,15.94266,34.318153,41.518482,29.074463,66.378281,41.120415,80.803795,...,24.253502,20.431652,20.566591,15.726265,12.332435,8.657789,8.072989,8.438752,10.184371,14.45916
1,1917.350912,91.599861,60.374527,86.209282,103.682495,92.023117,43.234932,137.324905,85.224243,133.838364,...,18.292,15.620107,16.618374,13.130466,11.56372,9.255119,8.712623,9.055582,10.510744,13.301476
2,2300.790143,94.956673,101.596077,55.016685,65.366745,87.820847,52.842342,129.010971,107.157036,113.654251,...,15.99195,13.331125,13.894073,10.666141,9.386144,7.773449,7.490423,8.403605,9.665147,11.672211
3,403.480805,25.214075,10.15592,46.595879,18.252161,24.598694,21.089201,67.21167,64.161491,58.338249,...,20.68634,17.684,19.030613,13.551085,12.200335,9.097849,8.387416,8.973354,10.934418,14.367694
4,1774.881116,48.494244,111.08374,99.955391,46.732124,128.312881,73.00634,86.361069,71.220161,54.039902,...,24.156582,18.429838,17.916143,14.671591,11.413791,7.220846,7.222924,7.85137,9.137348,13.59121


In [13]:
# the above df should have these cols, in this order
# (build_inf_df reindexes its result to this list)
meta_dict["col_names"]

['dam_area',
 'rain_0_months before',
 'rain_10_months before',
 'rain_11_months before',
 'rain_12_months before',
 'rain_1_months before',
 'rain_2_months before',
 'rain_3_months before',
 'rain_4_months before',
 'rain_5_months before',
 'rain_6_months before',
 'rain_7_months before',
 'rain_8_months before',
 'rain_9_months before',
 'tavg_0_months before',
 'tavg_10_months before',
 'tavg_11_months before',
 'tavg_12_months before',
 'tavg_1_months before',
 'tavg_2_months before',
 'tavg_3_months before',
 'tavg_4_months before',
 'tavg_5_months before',
 'tavg_6_months before',
 'tavg_7_months before',
 'tavg_8_months before',
 'tavg_9_months before']

In [14]:
# make a list of the start times: one "YYYYMM" string per month,
# from January of min_year through December of max_year
time_steps = [
    f"{year}{month:02d}"
    for year in range(min_year, max_year + 1)
    for month in range(1, 13)
]
len(time_steps)

36

In [33]:
# preview for the first start date: pull out the prediction month's tavg
# as a one-column frame whose column is named after that date
time_step = time_steps[0]
ts_df = build_inf_df(time_step)
pred_month_tavg = ts_df["tavg_0_months before"].to_frame(name=f"{time_step} tavg")
pred_month_tavg

Unnamed: 0,202101 tavg
0,23.394915
1,18.056446
2,15.724895
3,20.996769
4,19.666962
...,...
1246412,20.065868
1246413,21.775904
1246414,25.483133
1246415,22.691793


In [41]:
# func to run the model when given a start time
def run_model(time_step):
    """Predict one month for every dam point and pickle the results.

    The feature table for ``time_step`` is built with ``build_inf_df`` and
    passed to each entry of ``xgb_reg_models``; negative predictions are
    zeroed and the row-wise mean over all entries is stored next to
    ``dam_points["file_name"]``.  The prediction month's
    ``"tavg_0_months before"`` column is saved separately.

    Args:
        time_step: start date string, used as the prediction column name and
            as the prefix of both output file names.

    Returns:
        Tuple ``(pkl_out_path, tavg_pkl_out_path)`` with the paths of the
        prediction pickle and the tavg pickle inside ``output_folder``.
    """
    features = build_inf_df(time_step)

    # one-column frame with the prediction month's tavg, named after the date
    month_tavg = features["tavg_0_months before"].to_frame(name=f"{time_step} tavg")

    # one column of predictions per model, with negatives multiplied down to zero
    per_model_preds = pd.DataFrame()
    for entry in xgb_reg_models:
        raw = entry["model"].predict(features)
        per_model_preds[entry["model_number"]] = raw * (raw > 0)

    # file names alongside the mean prediction, with the time step as col heading
    result = pd.DataFrame(dam_points["file_name"].copy())
    result[time_step] = per_model_preds.mean(axis=1).to_list()

    # export both frames to disk
    pred_out = os.path.join(output_folder, f"{time_step}_pred.pkl")
    tavg_out = os.path.join(output_folder, f"{time_step}_tavg.pkl")
    result.to_pickle(pred_out)
    month_tavg.to_pickle(tavg_out)

    return pred_out, tavg_out

In [42]:
# run every start date through the model on a pool of 2 worker threads;
# imap yields results in the order of time_steps, so monthly_outputs[i] is the
# (prediction path, tavg path) pair returned by run_model for time_steps[i]
with ThreadPool(2) as p:
    monthly_outputs = list(tqdm(p.imap(run_model, time_steps), total=len(time_steps)))

  0%|          | 0/36 [00:00<?, ?it/s]

In [44]:
# split the (prediction path, tavg path) pairs into two parallel lists
monthly_pred = [pred for pred, _ in monthly_outputs]
monthly_tavg = [tavg for _, tavg in monthly_outputs]
# every start date must have produced an output
assert len(monthly_pred) == len(time_steps)
len(time_steps)

36

In [54]:
# number of (prediction, tavg) output path pairs produced by the model run
len(monthly_outputs)

36

In [46]:
# quick look at the dam point attributes that the predictions get joined onto below
dam_points.head()

Unnamed: 0,file_name,dam_area,water_area,SRC_DATE,file_name_pred,class,class_name,geometry
0,arcgisonline_1286093.tif,1782.758036,1231.106859,20170126.0,arcgisonline_1286093_pred_and_score.tif,0,dam and water,POINT (146.65881 -36.09349)
1,arcgisonline_1337387.tif,1917.350912,1042.750433,20151214.0,arcgisonline_1337387_pred_and_score.tif,0,dam and water,POINT (145.80580 -38.45117)
2,arcgisonline_1702518.tif,2300.790143,698.20756,20141023.0,arcgisonline_1702518_pred_and_score.tif,0,dam and water,POINT (145.69447 -41.09943)
3,arcgisonline_1274916.tif,403.480805,90.273377,20160112.0,arcgisonline_1274916_pred_and_score.tif,0,dam and water,POINT (141.22659 -36.64662)
4,arcgisonline_704977.tif,1774.881116,749.984968,20160502.0,arcgisonline_704977_pred_and_score.tif,0,dam and water,POINT (149.83683 -32.87746)


In [58]:
# gather each month's saved predictions and temps into dicts keyed by a real
# datetime; they are kept separate from the base data so they can be sorted by date
all_preds = {}
all_temps = {}
# loop over each pair of output pickles
for pred_path, tavg_path in tqdm(monthly_outputs):
    #     the date is the part of the file name in front of "_pred.pkl"
    date = os.path.basename(pred_path).replace("_pred.pkl", "")
    #     make the date a proper python date
    date_fixed = datetime.strptime(date, "%Y%m")
    #     store this month's values under that date
    all_preds[date_fixed] = pd.read_pickle(pred_path)[date].to_list()
    all_temps[date_fixed] = pd.read_pickle(tavg_path)[f"{date} tavg"]

  0%|          | 0/36 [00:00<?, ?it/s]

In [53]:
# one column per month of predictions, columns put in date order
preds_df = pd.DataFrame(all_preds)
preds_df = preds_df.loc[:, sorted(preds_df.columns)]
preds_df

Unnamed: 0,2021-01-01,2021-02-01,2021-03-01,2021-04-01,2021-05-01,2021-06-01,2021-07-01,2021-08-01,2021-09-01,2021-10-01,...,2023-03-01,2023-04-01,2023-05-01,2023-06-01,2023-07-01,2023-08-01,2023-09-01,2023-10-01,2023-11-01,2023-12-01
0,765.874512,847.972961,749.838196,605.308533,765.567383,771.985474,817.672974,951.386902,1037.939209,1032.813232,...,862.216431,809.222778,670.039917,933.242981,928.596802,890.365601,712.583374,705.275269,836.771301,699.068237
1,1140.886963,1076.730225,1013.740234,1019.048218,986.204590,1138.237183,1137.182617,1128.917236,1128.306519,1150.368286,...,1075.012939,986.214050,939.212219,921.366577,1150.405884,1185.731079,1058.899902,1141.486572,1031.736572,1152.135254
2,1334.021729,1309.985596,1202.851929,1305.883545,1423.440918,1226.459473,1391.183472,1228.584595,1318.343018,1354.052612,...,1272.070557,1267.041016,1097.546753,1155.979858,1095.693726,1213.050537,1509.002075,1448.021362,1393.711304,1505.985962
3,100.544395,93.530525,96.963310,84.411362,73.800797,98.903450,118.789368,140.702530,142.798035,158.138199,...,104.010971,97.298676,108.136131,147.315430,148.127228,169.204346,150.753738,130.708710,117.937454,120.220680
4,996.638062,973.759155,956.489441,448.283600,893.305969,679.194824,768.516479,814.229126,828.569336,1056.969360,...,925.588379,879.336121,729.499817,742.824402,830.408325,767.549377,848.057800,737.685547,665.722656,821.412415
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1246412,565.197144,592.918579,603.247559,596.380310,603.477600,654.280701,590.850403,713.936951,664.162842,769.160706,...,532.842651,576.668152,604.328125,643.961426,649.395264,501.776123,503.825775,470.636566,621.415344,811.633850
1246413,215.839554,445.517761,423.536469,47.179043,187.512787,271.026886,154.895844,228.381149,378.261932,399.627869,...,363.387665,386.698242,353.530212,460.240417,397.038910,360.278625,369.653625,419.796967,408.896576,402.586731
1246414,946.043335,807.684082,1018.773438,900.626465,911.997437,959.942017,1022.746216,924.926453,688.598206,886.816406,...,1532.449585,1505.593506,1061.563843,1308.988892,954.689087,1213.755493,834.801880,555.568481,350.765442,942.434753
1246415,5930.447266,6343.326660,6641.341309,4513.762695,5418.237793,5078.354980,6484.660645,5983.817383,6686.324219,6585.497559,...,5968.087402,5881.017090,5166.090820,6221.470703,6832.887695,4632.944336,6480.630859,5269.216309,5738.416504,5179.333496


In [59]:
# one column per month of tavg values, columns put in date order
tavg_df = pd.DataFrame(all_temps)
tavg_df = tavg_df.loc[:, sorted(tavg_df.columns)]
tavg_df

Unnamed: 0,2021-01-01,2021-02-01,2021-03-01,2021-04-01,2021-05-01,2021-06-01,2021-07-01,2021-08-01,2021-09-01,2021-10-01,...,2023-03-01,2023-04-01,2023-05-01,2023-06-01,2023-07-01,2023-08-01,2023-09-01,2023-10-01,2023-11-01,2023-12-01
0,23.394915,21.645954,18.364866,14.249258,11.518285,8.928223,8.130416,9.540399,11.698494,13.588672,...,20.308769,14.827796,9.910368,9.371562,8.775784,10.045410,13.097185,14.001484,19.558014,22.467890
1,18.056446,17.379704,16.361622,13.914953,11.062444,9.535575,8.930173,10.181051,11.358953,12.538924,...,16.835918,14.049404,10.746694,9.874914,9.807578,10.204640,12.551055,12.570178,15.328485,16.990316
2,15.724895,15.665099,14.484789,11.772554,9.640451,9.081878,8.075239,8.613865,9.050312,10.901698,...,14.034088,12.396546,9.456347,8.694619,9.189066,8.877248,10.180424,10.435368,13.774740,15.087011
3,20.996769,19.797209,17.708242,14.610948,11.815395,10.366504,9.080671,10.547727,11.785295,12.647124,...,17.394182,14.657558,11.197517,10.559347,9.646945,10.366800,12.950374,13.223871,16.688864,18.861603
4,19.666962,19.025469,16.902338,12.522317,9.839494,7.163271,6.575868,8.133168,10.352775,12.714746,...,19.202953,13.119492,8.176756,7.547411,7.969513,9.240471,12.535453,14.465531,17.863266,21.130512
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1246412,20.065868,18.952520,17.514664,14.716192,11.785879,9.821359,9.573815,10.632910,12.453941,13.850285,...,18.704561,14.826365,11.104086,10.034918,10.051415,10.881070,13.740545,14.204240,17.187248,19.210045
1246413,21.775904,20.912369,17.933016,13.288532,10.585876,8.115474,7.284571,8.695584,10.990107,13.258316,...,19.806530,14.176953,8.984231,8.568007,8.322306,9.401345,12.716261,14.522363,18.968554,21.960922
1246414,25.483133,26.554119,24.722330,20.166204,17.002880,14.491276,14.151595,15.895869,18.025505,23.104153,...,25.356836,20.781359,15.160785,15.736105,14.768120,16.449141,19.095501,21.984009,24.141596,26.613298
1246415,22.691793,20.916340,17.760057,13.787441,11.115993,8.696679,7.886405,9.293661,11.400219,13.118352,...,19.661921,14.338357,9.583755,9.083367,8.599970,9.791127,12.780075,13.434568,18.937593,21.721962


In [60]:
# set any negative values to 0 (everything else, including NaN, is left as is)
preds_df = preds_df.mask(preds_df < 0, 0)
preds_df.head()

Unnamed: 0,2021-01-01,2021-02-01,2021-03-01,2021-04-01,2021-05-01,2021-06-01,2021-07-01,2021-08-01,2021-09-01,2021-10-01,...,2023-03-01,2023-04-01,2023-05-01,2023-06-01,2023-07-01,2023-08-01,2023-09-01,2023-10-01,2023-11-01,2023-12-01
0,765.874512,847.972961,749.838196,605.308533,765.567383,771.985474,817.672974,951.386902,1037.939209,1032.813232,...,862.216431,809.222778,670.039917,933.242981,928.596802,890.365601,712.583374,705.275269,836.771301,699.068237
1,1140.886963,1076.730225,1013.740234,1019.048218,986.20459,1138.237183,1137.182617,1128.917236,1128.306519,1150.368286,...,1075.012939,986.21405,939.212219,921.366577,1150.405884,1185.731079,1058.899902,1141.486572,1031.736572,1152.135254
2,1334.021729,1309.985596,1202.851929,1305.883545,1423.440918,1226.459473,1391.183472,1228.584595,1318.343018,1354.052612,...,1272.070557,1267.041016,1097.546753,1155.979858,1095.693726,1213.050537,1509.002075,1448.021362,1393.711304,1505.985962
3,100.544395,93.530525,96.96331,84.411362,73.800797,98.90345,118.789368,140.70253,142.798035,158.138199,...,104.010971,97.298676,108.136131,147.31543,148.127228,169.204346,150.753738,130.70871,117.937454,120.22068
4,996.638062,973.759155,956.489441,448.2836,893.305969,679.194824,768.516479,814.229126,828.569336,1056.96936,...,925.588379,879.336121,729.499817,742.824402,830.408325,767.549377,848.0578,737.685547,665.722656,821.412415


In [72]:
# earliest and latest prediction dates, taken from the df col names
min_date, max_date = preds_df.columns.min(), preds_df.columns.max()
min_date.date().isoformat(), max_date.date().isoformat()

('2021-01-01', '2023-12-01')

In [61]:
# put the monthly predictions beside the input dam point attributes
# (pd.concat matches the rows of the two frames on their index)
preds_with_meta = pd.concat([dam_points, preds_df], axis="columns")
preds_with_meta.head()

Unnamed: 0,file_name,dam_area,water_area,SRC_DATE,file_name_pred,class,class_name,geometry,2021-01-01 00:00:00,2021-02-01 00:00:00,...,2023-03-01 00:00:00,2023-04-01 00:00:00,2023-05-01 00:00:00,2023-06-01 00:00:00,2023-07-01 00:00:00,2023-08-01 00:00:00,2023-09-01 00:00:00,2023-10-01 00:00:00,2023-11-01 00:00:00,2023-12-01 00:00:00
0,arcgisonline_1286093.tif,1782.758036,1231.106859,20170126.0,arcgisonline_1286093_pred_and_score.tif,0,dam and water,POINT (146.65881 -36.09349),765.874512,847.972961,...,862.216431,809.222778,670.039917,933.242981,928.596802,890.365601,712.583374,705.275269,836.771301,699.068237
1,arcgisonline_1337387.tif,1917.350912,1042.750433,20151214.0,arcgisonline_1337387_pred_and_score.tif,0,dam and water,POINT (145.80580 -38.45117),1140.886963,1076.730225,...,1075.012939,986.21405,939.212219,921.366577,1150.405884,1185.731079,1058.899902,1141.486572,1031.736572,1152.135254
2,arcgisonline_1702518.tif,2300.790143,698.20756,20141023.0,arcgisonline_1702518_pred_and_score.tif,0,dam and water,POINT (145.69447 -41.09943),1334.021729,1309.985596,...,1272.070557,1267.041016,1097.546753,1155.979858,1095.693726,1213.050537,1509.002075,1448.021362,1393.711304,1505.985962
3,arcgisonline_1274916.tif,403.480805,90.273377,20160112.0,arcgisonline_1274916_pred_and_score.tif,0,dam and water,POINT (141.22659 -36.64662),100.544395,93.530525,...,104.010971,97.298676,108.136131,147.31543,148.127228,169.204346,150.753738,130.70871,117.937454,120.22068
4,arcgisonline_704977.tif,1774.881116,749.984968,20160502.0,arcgisonline_704977_pred_and_score.tif,0,dam and water,POINT (149.83683 -32.87746),996.638062,973.759155,...,925.588379,879.336121,729.499817,742.824402,830.408325,767.549377,848.0578,737.685547,665.722656,821.412415


In [82]:
# put the monthly tavg values beside the input dam point attributes
# (pd.concat matches the rows of the two frames on their index)
tavg_with_meta = pd.concat([dam_points, tavg_df], axis="columns")
tavg_with_meta.head()

Unnamed: 0,file_name,dam_area,water_area,SRC_DATE,file_name_pred,class,class_name,geometry,2021-01-01 00:00:00,2021-02-01 00:00:00,...,2023-03-01 00:00:00,2023-04-01 00:00:00,2023-05-01 00:00:00,2023-06-01 00:00:00,2023-07-01 00:00:00,2023-08-01 00:00:00,2023-09-01 00:00:00,2023-10-01 00:00:00,2023-11-01 00:00:00,2023-12-01 00:00:00
0,arcgisonline_1286093.tif,1782.758036,1231.106859,20170126.0,arcgisonline_1286093_pred_and_score.tif,0,dam and water,POINT (146.65881 -36.09349),23.394915,21.645954,...,20.308769,14.827796,9.910368,9.371562,8.775784,10.04541,13.097185,14.001484,19.558014,22.46789
1,arcgisonline_1337387.tif,1917.350912,1042.750433,20151214.0,arcgisonline_1337387_pred_and_score.tif,0,dam and water,POINT (145.80580 -38.45117),18.056446,17.379704,...,16.835918,14.049404,10.746694,9.874914,9.807578,10.20464,12.551055,12.570178,15.328485,16.990316
2,arcgisonline_1702518.tif,2300.790143,698.20756,20141023.0,arcgisonline_1702518_pred_and_score.tif,0,dam and water,POINT (145.69447 -41.09943),15.724895,15.665099,...,14.034088,12.396546,9.456347,8.694619,9.189066,8.877248,10.180424,10.435368,13.77474,15.087011
3,arcgisonline_1274916.tif,403.480805,90.273377,20160112.0,arcgisonline_1274916_pred_and_score.tif,0,dam and water,POINT (141.22659 -36.64662),20.996769,19.797209,...,17.394182,14.657558,11.197517,10.559347,9.646945,10.3668,12.950374,13.223871,16.688864,18.861603
4,arcgisonline_704977.tif,1774.881116,749.984968,20160502.0,arcgisonline_704977_pred_and_score.tif,0,dam and water,POINT (149.83683 -32.87746),19.666962,19.025469,...,19.202953,13.119492,8.176756,7.547411,7.969513,9.240471,12.535453,14.465531,17.863266,21.130512


In [62]:
# convert col headings to strings to improve file compatibility
# (index=str turns the row labels into strings as well)
str_names_dict = {name: str(name) for name in preds_with_meta.columns.to_list()}
preds_with_meta = preds_with_meta.rename(index=str, columns=str_names_dict)
preds_with_meta.head()

Unnamed: 0,file_name,dam_area,water_area,SRC_DATE,file_name_pred,class,class_name,geometry,2021-01-01 00:00:00,2021-02-01 00:00:00,...,2023-03-01 00:00:00,2023-04-01 00:00:00,2023-05-01 00:00:00,2023-06-01 00:00:00,2023-07-01 00:00:00,2023-08-01 00:00:00,2023-09-01 00:00:00,2023-10-01 00:00:00,2023-11-01 00:00:00,2023-12-01 00:00:00
0,arcgisonline_1286093.tif,1782.758036,1231.106859,20170126.0,arcgisonline_1286093_pred_and_score.tif,0,dam and water,POINT (146.65881 -36.09349),765.874512,847.972961,...,862.216431,809.222778,670.039917,933.242981,928.596802,890.365601,712.583374,705.275269,836.771301,699.068237
1,arcgisonline_1337387.tif,1917.350912,1042.750433,20151214.0,arcgisonline_1337387_pred_and_score.tif,0,dam and water,POINT (145.80580 -38.45117),1140.886963,1076.730225,...,1075.012939,986.21405,939.212219,921.366577,1150.405884,1185.731079,1058.899902,1141.486572,1031.736572,1152.135254
2,arcgisonline_1702518.tif,2300.790143,698.20756,20141023.0,arcgisonline_1702518_pred_and_score.tif,0,dam and water,POINT (145.69447 -41.09943),1334.021729,1309.985596,...,1272.070557,1267.041016,1097.546753,1155.979858,1095.693726,1213.050537,1509.002075,1448.021362,1393.711304,1505.985962
3,arcgisonline_1274916.tif,403.480805,90.273377,20160112.0,arcgisonline_1274916_pred_and_score.tif,0,dam and water,POINT (141.22659 -36.64662),100.544395,93.530525,...,104.010971,97.298676,108.136131,147.31543,148.127228,169.204346,150.753738,130.70871,117.937454,120.22068
4,arcgisonline_704977.tif,1774.881116,749.984968,20160502.0,arcgisonline_704977_pred_and_score.tif,0,dam and water,POINT (149.83683 -32.87746),996.638062,973.759155,...,925.588379,879.336121,729.499817,742.824402,830.408325,767.549377,848.0578,737.685547,665.722656,821.412415


In [83]:
# same conversion for the tavg table: col headings (and row labels) become strings
str_names_dict = {name: str(name) for name in tavg_with_meta.columns.to_list()}
tavg_with_meta = tavg_with_meta.rename(index=str, columns=str_names_dict)
tavg_with_meta.head()

Unnamed: 0,file_name,dam_area,water_area,SRC_DATE,file_name_pred,class,class_name,geometry,2021-01-01 00:00:00,2021-02-01 00:00:00,...,2023-03-01 00:00:00,2023-04-01 00:00:00,2023-05-01 00:00:00,2023-06-01 00:00:00,2023-07-01 00:00:00,2023-08-01 00:00:00,2023-09-01 00:00:00,2023-10-01 00:00:00,2023-11-01 00:00:00,2023-12-01 00:00:00
0,arcgisonline_1286093.tif,1782.758036,1231.106859,20170126.0,arcgisonline_1286093_pred_and_score.tif,0,dam and water,POINT (146.65881 -36.09349),23.394915,21.645954,...,20.308769,14.827796,9.910368,9.371562,8.775784,10.04541,13.097185,14.001484,19.558014,22.46789
1,arcgisonline_1337387.tif,1917.350912,1042.750433,20151214.0,arcgisonline_1337387_pred_and_score.tif,0,dam and water,POINT (145.80580 -38.45117),18.056446,17.379704,...,16.835918,14.049404,10.746694,9.874914,9.807578,10.20464,12.551055,12.570178,15.328485,16.990316
2,arcgisonline_1702518.tif,2300.790143,698.20756,20141023.0,arcgisonline_1702518_pred_and_score.tif,0,dam and water,POINT (145.69447 -41.09943),15.724895,15.665099,...,14.034088,12.396546,9.456347,8.694619,9.189066,8.877248,10.180424,10.435368,13.77474,15.087011
3,arcgisonline_1274916.tif,403.480805,90.273377,20160112.0,arcgisonline_1274916_pred_and_score.tif,0,dam and water,POINT (141.22659 -36.64662),20.996769,19.797209,...,17.394182,14.657558,11.197517,10.559347,9.646945,10.3668,12.950374,13.223871,16.688864,18.861603
4,arcgisonline_704977.tif,1774.881116,749.984968,20160502.0,arcgisonline_704977_pred_and_score.tif,0,dam and water,POINT (149.83683 -32.87746),19.666962,19.025469,...,19.202953,13.119492,8.176756,7.547411,7.969513,9.240471,12.535453,14.465531,17.863266,21.130512


In [84]:
# make csv export paths; the file names carry the first and last prediction
# date (a date object renders as YYYY-MM-DD inside an f-string)
final_export_pred = os.path.join(
    dam_forcast_working_dir,
    f"Dam forecast preds v9 {min_date.date()} to {max_date.date()}.csv",
)
final_export_tavg = os.path.join(
    dam_forcast_working_dir,
    f"Dam forecast tavg v9 {min_date.date()} to {max_date.date()}.csv",
)
# show both paths (the undefined name `final_export` raised NameError on a fresh kernel)
final_export_pred, final_export_tavg

('/home/nick/Documents/Work code/Weather-to-water/working/Dam forecast preds v9 2021-01-01 to 2023-12-01.csv',
 '/home/nick/Documents/Work code/Weather-to-water/working/Dam forecast tavg v9 2021-01-01 to 2023-12-01.csv')

In [85]:
# make gpkg export paths by swapping ".csv" for ".gpkg" in both csv paths
gpkg_file_name_pred, gpkg_file_name_tavg = (
    csv_path.replace(".csv", ".gpkg")
    for csv_path in (final_export_pred, final_export_tavg)
)
gpkg_file_name_pred, gpkg_file_name_tavg

('/home/nick/Documents/Work code/Weather-to-water/working/Dam forecast preds v9 2021-01-01 to 2023-12-01.gpkg',
 '/home/nick/Documents/Work code/Weather-to-water/working/Dam forecast tavg v9 2021-01-01 to 2023-12-01.gpkg')

In [86]:
# make pickle export paths by swapping ".csv" for ".pkl" in both csv paths
pkl_file_name_pred, pkl_file_name_tavg = (
    csv_path.replace(".csv", ".pkl")
    for csv_path in (final_export_pred, final_export_tavg)
)
pkl_file_name_pred, pkl_file_name_tavg

('/home/nick/Documents/Work code/Weather-to-water/working/Dam forecast preds v9 2021-01-01 to 2023-12-01.pkl',
 '/home/nick/Documents/Work code/Weather-to-water/working/Dam forecast tavg v9 2021-01-01 to 2023-12-01.pkl')

In [87]:
# also export as parquet: swap ".csv" for ".parquet" in both csv paths
parquet_file_name_pred, parquet_file_name_tavg = (
    csv_path.replace(".csv", ".parquet")
    for csv_path in (final_export_pred, final_export_tavg)
)
parquet_file_name_pred, parquet_file_name_tavg

('/home/nick/Documents/Work code/Weather-to-water/working/Dam forecast preds v9 2021-01-01 to 2023-12-01.parquet',
 '/home/nick/Documents/Work code/Weather-to-water/working/Dam forecast tavg v9 2021-01-01 to 2023-12-01.parquet')

In [88]:
# write both result tables out as pickles
for frame, pkl_path in (
    (preds_with_meta, pkl_file_name_pred),
    (tavg_with_meta, pkl_file_name_tavg),
):
    frame.to_pickle(pkl_path)

In [89]:
# export both result tables as csv, leaving the row index out of the file
for frame, csv_path in (
    (preds_with_meta, final_export_pred),
    (tavg_with_meta, final_export_tavg),
):
    frame.to_csv(csv_path, index=False)

In [92]:
# write both result tables out as GeoPackage files
for frame, gpkg_path in (
    (preds_with_meta, gpkg_file_name_pred),
    (tavg_with_meta, gpkg_file_name_tavg),
):
    frame.to_file(gpkg_path, driver="GPKG")

In [91]:
# write both result tables out as parquet files
for frame, parquet_path in (
    (preds_with_meta, parquet_file_name_pred),
    (tavg_with_meta, parquet_file_name_tavg),
):
    frame.to_parquet(parquet_path)