In [4]:
from sklearn.metrics import mean_absolute_error
from xgboost import XGBRegressor
import pandas as pd
import pickle
import glob
from tqdm.auto import tqdm
import os
from datetime import datetime
import json

In [5]:
# --- inputs ---
# folder holding the csv files produced by the extraction script
tile_step_folder = '/mnt/2TB Working/Projects/Dam forecast/v4/forecast/time steps'

# --- outputs ---
# folder this notebook writes its prediction csv files into
output_folder = '/mnt/2TB Working/Projects/Dam forecast/v4/forecast/preds'

# --- model ---
# the pickled model is expected at <model_folder>/<model_name>
model_folder = '/mnt/2TB Working/Projects/Dam forecast/v4'
model_name = 'dam_forecast_23-11-21.pkl'
model_path = os.path.join(model_folder, model_name)

# quick sanity check, displayed as the cell output
('found model?', os.path.isfile(model_path))

('found model?', True)

In [6]:
# load in data from the meta file that sits next to the model
# (same path as the model, with the file extension swapped for .json)
# splitext only touches the final extension, so a '.pkl' appearing elsewhere
# in the path (e.g. in a folder name) is left alone
meta_path = os.path.splitext(model_path)[0] + '.json'
with open(meta_path, 'r') as fp:
    meta_dict = json.load(fp)

In [5]:
# load up model
# NOTE(review): unpickling can execute arbitrary code, so only load model
# files that come from a trusted source
# the with-block makes sure the file handle is closed once the model is loaded
with open(model_path, "rb") as model_file:
    xgb_reg_model = pickle.load(model_file)

In [6]:
# get a reference to each csv file
# glob returns files in arbitrary (filesystem dependent) order; sorting makes
# the run reproducible, which matters because time_steps[0] is used further
# down as the reference file for column names and base data
time_steps = sorted(glob.glob(tile_step_folder+'/*.csv'))
len(time_steps)

120

In [7]:
# work out which cols we need to remove, otherwise the model won't know what's happening
# the names to keep are taken from the model's meta file
cols_to_keep = meta_dict['col_names']
# the first csv file is used as the reference for which columns exist
ts_df = pd.read_csv(time_steps[0])
# every column of the reference file that is not in the keep list gets dropped
drop_cols = [col for col in ts_df.columns if col not in cols_to_keep]

In [8]:
# make sure the output location exists, otherwise the first to_csv call fails
os.makedirs(output_folder, exist_ok=True)

# loop over each csv file and run model
for time_step in tqdm(time_steps):
    # read csv file
    full_ts_df = pd.read_csv(time_step)
    # drop cols we dont need (drop_cols was worked out from the first csv file)
    ts_df = full_ts_df.drop(columns=drop_cols)
    # put the feature columns into alphabetical order
    # NOTE(review): this assumes the model was trained on alphabetically
    # sorted columns - confirm against the training script
    ts_df = ts_df.reindex(sorted(ts_df.columns), axis=1)
    # run model
    preds = xgb_reg_model.predict(ts_df)

    # grab the filename which should also tell us the date
    file_name = os.path.basename(time_step)
    date = file_name.replace('.csv','')
    # make a new df with just files names as a reference
    limited_df = full_ts_df[['file_name']].copy()
    # place the preds into the new df, in a column named after the date
    limited_df[date] = preds
    # save the df out to disk, using the same file name as the input csv
    csv_out_path = os.path.join(output_folder,file_name)
    limited_df.to_csv(csv_out_path)
    

  0%|          | 0/120 [00:00<?, ?it/s]

In [9]:
# get a list of all prediction files
# the combined export at the end of this notebook ('full preds.csv') is written
# into this same folder, so on a re-run a plain '*.csv' glob would pick it up
# and the date parsing further down would fail; keep only the files whose name
# parses as a year-month date
def _is_time_step_csv(path):
    """Return True when the csv's file name (minus '.csv') parses as '%Y-%m'."""
    stem = os.path.basename(path).replace('.csv','')
    try:
        datetime.strptime(stem, '%Y-%m')
    except ValueError:
        return False
    return True

outputs_csvs = [path for path in glob.glob(output_folder+'/*.csv') if _is_time_step_csv(path)]
len(outputs_csvs)

120

In [10]:
# open one input csv and keep only the non weather data
base_data = pd.read_csv(time_steps[0])
# collect every column whose name contains 'before' or 'Unnamed'
# (a name matching both markers is listed once per marker)
drop_list = [
    col
    for col in base_data.columns
    for marker in ('before', 'Unnamed')
    if marker in col
]

# remove the collected columns and preview what is left
base_data = base_data.drop(columns=drop_list)
base_data.head()

Unnamed: 0,file_name,area,area_2,SRC_DATE,file_name_pred,class,class_name,geometry
0,arcgisonline_1686584.tif,1472.265188,293.852789,20160217.0,arcgisonline_1686584_pred_and_score.tif,0,dam and water,POINT (121.8136426111555 -33.00994748470646)
1,arcgisonline_1686963.tif,3511.531725,1348.727385,20150121.0,arcgisonline_1686963_pred_and_score.tif,0,dam and water,POINT (121.5687803560598 -33.11646584652933)
2,arcgisonline_437392.tif,7400.931025,,20161230.0,arcgisonline_437392_pred_and_score.tif,2,no water,POINT (147.1971736181173 -29.86992984418018)
3,arcgisonline_1747284.tif,2925.907475,2924.952538,20160302.0,arcgisonline_1747284_pred_and_score.tif,1,no dam,POINT (147.4819841980934 -42.99338319262259)
4,arcgisonline_411890.tif,5270.39767,15.310323,20150315.0,arcgisonline_411890_pred_and_score.tif,2,no water,POINT (144.992580889845 -34.25548900296211)


In [11]:
# collect the preds of every csv first and build the df in one go at the end;
# inserting the columns one at a time into an existing df fragments it and
# makes pandas emit a PerformanceWarning
# the preds are kept separate from the base data so we can sort them by date
pred_columns = {}
# loop over each pred csv
for csv_path in tqdm(outputs_csvs):
    # get the file name
    file_name = os.path.basename(csv_path)
    # get the date from the name
    date = file_name.replace('.csv','')
    # open file
    current_df = pd.read_csv(csv_path)
    # turn the date string into a proper python date
    date_fixed = datetime.strptime(date, '%Y-%m')
    # extract the preds col (it is named after the date) and store it under the parsed date
    pred_columns[date_fixed] = current_df[date].to_list()
# build the df with the cols already sorted by date
preds_df = pd.DataFrame({date_key: pred_columns[date_key] for date_key in sorted(pred_columns)})

  0%|          | 0/120 [00:00<?, ?it/s]

  app.launch_new_instance()


In [12]:
# have a look at the first few rows of the df and make sure it looks good
# (one column per date, one row per row of the prediction csv files)
preds_df.head()

Unnamed: 0,2011-01-01,2011-02-01,2011-03-01,2011-04-01,2011-05-01,2011-06-01,2011-07-01,2011-08-01,2011-09-01,2011-10-01,...,2020-03-01,2020-04-01,2020-05-01,2020-06-01,2020-07-01,2020-08-01,2020-09-01,2020-10-01,2020-11-01,2020-12-01
0,110.85137,158.93771,135.37556,174.14394,248.0782,251.61627,160.77977,170.73775,128.36656,272.68427,...,112.9549,147.74187,158.13924,48.90754,251.9874,254.27902,229.15556,249.95795,130.4924,279.4671
1,986.4987,1004.3118,783.4278,840.6355,637.389,687.73334,689.98193,913.81793,883.188,1061.0504,...,660.58887,557.06476,1121.6897,729.6852,679.86444,717.4055,658.72125,723.2383,750.75574,641.2314
2,3771.777,4333.3096,4195.4985,3063.5037,4462.9375,4086.7673,4482.805,3314.652,3450.3079,2962.835,...,2262.5144,3125.919,3401.7087,3969.8037,3402.1606,2814.8416,2916.503,1241.2765,2473.93,2293.2986
3,1610.1354,1557.6421,1653.8916,1685.155,1587.2556,1650.9581,1792.3448,1799.7651,1432.6643,1551.1932,...,1036.5771,874.4764,1293.2852,1353.5822,1298.9188,1603.8694,1323.4735,1740.5289,1773.8369,1794.5408
4,3298.7012,3308.745,3177.3223,3302.8086,2992.125,2418.5566,2799.3367,2801.649,2754.4136,2269.6582,...,1143.6658,1918.0707,1957.5326,1832.511,1462.479,2087.6624,2190.707,1690.8441,1627.6273,1221.1123


In [13]:
# join the base data with the preds
# the join is positional: both frames carry a default 0..n-1 row index, so row i
# of the base data is paired with row i of the preds
# NOTE(review): this assumes every csv file lists the same rows in the same
# order as the first one - confirm against the extraction script
# a row count mismatch would otherwise be padded with NaN silently by concat
if len(base_data) != len(preds_df):
    raise ValueError(
        f'row count mismatch: base data has {len(base_data)} rows, '
        f'preds have {len(preds_df)} rows'
    )
preds_with_meta = pd.concat([base_data, preds_df], axis=1)

In [14]:
# write the combined table (base data + one column of preds per date) to a single csv
# note: this lands in the same folder as the per time step prediction files
final_export = os.path.join(output_folder, 'full preds.csv')
preds_with_meta.to_csv(path_or_buf=final_export)