# DHMZ

Note: on July 1, 2020 the resolution of the Aladin forecasting model changed from 8 km to 4 km, so forecasts from before that date are not comparable with later ones.

In [13]:
import pandas as pd
from datetime import datetime, timedelta, timezone
import time
import numpy as np
import json
import pickle
from dateutil.relativedelta import relativedelta
import collections

In [14]:
# Memory reducer
def reduce_mem_usage(df, verbose=True):
    """Downcast numeric columns of ``df`` to the narrowest dtype covering their range.

    Columns are converted in place: the caller's frame is modified and the
    same object is returned.

    Parameters
    ----------
    df : pd.DataFrame
        Frame to shrink. Only columns whose dtype is one of int16, int32,
        int64, float16, float32 or float64 are considered; every other column
        is left untouched.
    verbose : bool, default True
        When true, print the memory footprint after conversion and the
        percentage saved.

    Returns
    -------
    pd.DataFrame
        ``df`` itself, with the downcast columns.

    Notes
    -----
    The target dtype is chosen from the column's min/max only. float16 and
    float32 carry fewer significant digits than float64, so float values may
    be rounded by the downcast.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue
        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            # Bounds are inclusive: a column spanning exactly -128..127 fits int8.
            # For an empty column min/max are NaN, every comparison is False and
            # the column keeps its dtype.
            for candidate in (np.int8, np.int16, np.int32, np.int64):
                limits = np.iinfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            for candidate in (np.float16, np.float32):
                limits = np.finfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # Out of float32 range, infinite, or NaN min/max: use float64.
                df[col] = df[col].astype(np.float64)
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Avoid dividing by zero when the frame occupied no memory to begin with.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df

In [23]:
# Short name of the measurement to process; it selects the forecast column
# that is loaded from the CSV.
meas = 'temperature'

# Pandas frequency alias of the regular grid the forecasts are resampled onto.
sampling_freq = 'h'

# Forecast horizon (presumably 3 days x 24 hourly steps); not referenced in
# the visible cells.
no_future_forecasts = 3 * 24
# Number of lagged "forecast-Nh" columns placed in front of every row.
no_last_lares_measurements = 4

In [16]:
# Maps each short measurement name to the corresponding forecast column in
# the DHMZ CSV export.
name_pairs = dict([
    ('temperature', 'weather_prediction_temperature_at_2m'),
    ('humidity', 'weather_prediction_relative_humidity_at_2m'),
    ('global_irradiance', 'weather_prediction_total_solar_irradiance'),
    ('precipitation', 'weather_prediction_total_precipitation'),
])

In [17]:
# Load the two timestamp columns and the selected measurement from the DHMZ export.
# Indexing `name_pairs` directly raises a KeyError for an unknown `meas`
# instead of passing `None` on to read_csv.
columns_to_use = ["weather_prediction_timestamp", "weather_prediction_start_timestamp", name_pairs[meas]]
colnames = ["prediction_timestamp", "timestamp", meas]

dhmz_data = pd.read_csv(
    "dhmz.csv",
    engine="python",
    on_bad_lines='skip',  # rows the parser rejects are dropped instead of raising
    usecols=columns_to_use,
    parse_dates=["weather_prediction_timestamp", "weather_prediction_start_timestamp"],
)
# read_csv returns the selected columns in file order, not in `usecols` order,
# so rename by name (a positional rename could mislabel columns) and then put
# the columns in the `colnames` order.
dhmz_data = dhmz_data.rename(columns=dict(zip(columns_to_use, colnames)))[colnames]

In [18]:
# Cut off everything before the model change: keep rows whose `timestamp`
# is on or after 2020-07-01.
# NOTE(review): `new_model_timestamp` is timezone-naive; this presumes the
# parsed `timestamp` column is naive as well - confirm.
new_model_timestamp = datetime(year=2020, month=7, day=1)
# reset_index returns a new frame, so its result must be assigned; calling it
# as a bare statement left the index unchanged.
df_for = (
    dhmz_data
    .query('timestamp >= @new_model_timestamp')
    .reset_index(drop=True)
)

In [19]:
# Switch the forecast sequences from JSON strings to NumPy arrays.
# Missing cells are passed through unchanged (na_action='ignore').
# Plain column assignment replaces the column; `.loc[:, col] = ...` would
# write the arrays into the existing string column in place instead.
df_for[meas] = df_for[meas].map(lambda x: np.array(json.loads(x)), na_action='ignore')

# Set the timestamp as a sorted index; the column itself is dropped and the
# index keeps the name 'timestamp'.
df_for = df_for.set_index('timestamp').sort_index()

In [20]:
# Spread every forecast array over one column per lead time ("+0h", "+1h", ...).
# The column count is read from the first row's array.
# NOTE(review): assumes every row holds an array of that same length - confirm.
df_cols_ext = [f"{meas}_forecast+{step}h" for step in range(len(df_for[meas].iloc[0]))]

df_for = pd.DataFrame(df_for[meas].tolist(), index=df_for.index, columns=df_cols_ext)

In [21]:
# cut off unneeded predictions
# forecast_idx = np.where(df_for.columns == meas+"_forecast+72h")[0][0]
# forecast_columns = df_for.columns[:forecast_idx]
# forecast_columns

# df_for = df_for[forecast_columns].copy()

In [22]:
# Resample the DHMZ data onto a regular `sampling_freq` grid: timestamps
# without a row of their own receive a copy of the most recent earlier row.
# The numeric columns are then downcast to save memory.
df_for = reduce_mem_usage(df_for.asfreq(sampling_freq).ffill())

Mem. usage decreased to  3.46 Mb (74.0% reduction)


In [24]:
# Labels of the lag columns, ordered from the largest lag down to "-1h".
previous_columns = [meas + '_forecast-' + str(shift) + 'h' for shift in range(no_last_lares_measurements,0,-1)]
# Remember the current forecast column labels and values before `df_for` is rebuilt.
future_columns = df_for.columns.values.tolist()
future_values = df_for.values
# Rebuild the frame as all-NaN with the lag columns placed in front of the forecast columns.
df_for = pd.DataFrame(np.nan, index=df_for.index, columns=previous_columns + future_columns)
# Copy the forecast values back; the lag columns stay NaN at this point.
# NOTE(review): the new frame is created from np.nan, so the dtypes chosen by
# reduce_mem_usage may not survive this step - confirm.
df_for.loc[:, future_columns] = future_values

In [25]:
# Align every row so that its forecast columns start at the row's own hour,
# and fill the lag columns from the rows before it.
# `previous_values` is a bounded queue (maxlen = number of lag columns) seeded
# from the first row's lag columns (all NaN when this runs right after the
# frame was rebuilt).
previous_values = df_for.iloc[0, :no_last_lares_measurements].copy()
previous_values = collections.deque(previous_values, len(previous_values))
for ts in df_for.index:
    # DHMZ forecast arrives every 6 hours, so a row is taken to be
    # `ts.hour % 6` hours past the start of the forecast it carries.
    # NOTE(review): assumes forecasts start at hours divisible by 6 and that
    # none are missing - confirm.
    column_shift = ts.hour % 6

    # Shift the whole row left by that many columns; the cells vacated at the
    # right end become NaN.
    df_for.loc[ts] = df_for.loc[ts].shift(periods=-1*column_shift)
    # Overwrite the lag columns with the queued values (oldest first).
    df_for.loc[ts, previous_columns] = previous_values
    # Queue this row's first forecast column; once full, the deque drops its
    # oldest entry.
    previous_values.append(df_for.loc[ts, future_columns[0]])

In [26]:
# Display the resulting frame (the notebook shows a truncated view of it).
df_for

Unnamed: 0_level_0,temperature_forecast-4h,temperature_forecast-3h,temperature_forecast-2h,temperature_forecast-1h,temperature_forecast+0h,temperature_forecast+1h,temperature_forecast+2h,temperature_forecast+3h,temperature_forecast+4h,temperature_forecast+5h,...,temperature_forecast+63h,temperature_forecast+64h,temperature_forecast+65h,temperature_forecast+66h,temperature_forecast+67h,temperature_forecast+68h,temperature_forecast+69h,temperature_forecast+70h,temperature_forecast+71h,temperature_forecast+72h
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2020-07-01 00:00:00,,,,,21.296875,20.906250,20.796875,20.796875,21.000000,22.093750,...,22.406250,22.796875,22.796875,22.296875,21.500000,20.906250,20.593750,20.406250,20.093750,19.796875
2020-07-01 01:00:00,,,,21.296875,20.906250,20.796875,20.796875,21.000000,22.093750,23.703125,...,22.796875,22.796875,22.296875,21.500000,20.906250,20.593750,20.406250,20.093750,19.796875,
2020-07-01 02:00:00,,,21.296875,20.906250,20.796875,20.796875,21.000000,22.093750,23.703125,25.203125,...,22.796875,22.296875,21.500000,20.906250,20.593750,20.406250,20.093750,19.796875,,
2020-07-01 03:00:00,,21.296875,20.906250,20.796875,20.796875,21.000000,22.093750,23.703125,25.203125,26.500000,...,22.296875,21.500000,20.906250,20.593750,20.406250,20.093750,19.796875,,,
2020-07-01 04:00:00,21.296875,20.906250,20.796875,20.796875,21.000000,22.093750,23.703125,25.203125,26.500000,28.203125,...,21.500000,20.906250,20.593750,20.406250,20.093750,19.796875,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023-03-10 02:00:00,10.898438,10.601562,8.000000,8.203125,8.296875,8.500000,8.500000,8.500000,8.703125,10.101562,...,7.500000,6.000000,5.199219,4.500000,4.101562,3.500000,2.900391,2.599609,,
2023-03-10 03:00:00,10.601562,8.000000,8.203125,8.296875,8.500000,8.500000,8.500000,8.703125,10.101562,12.296875,...,6.000000,5.199219,4.500000,4.101562,3.500000,2.900391,2.599609,,,
2023-03-10 04:00:00,8.000000,8.203125,8.296875,8.500000,8.500000,8.500000,8.703125,10.101562,12.296875,15.500000,...,5.199219,4.500000,4.101562,3.500000,2.900391,2.599609,,,,
2023-03-10 05:00:00,8.203125,8.296875,8.500000,8.500000,8.500000,8.703125,10.101562,12.296875,15.500000,17.593750,...,4.500000,4.101562,3.500000,2.900391,2.599609,,,,,


# EOF