### Time Series Forecasting using LSTM

In [1]:
# Importing libraries
import tensorflow as tf
import matplotlib.pyplot as plt
import matplotlib as mpl
import pandas as pd
import numpy as np
import os
from convert_columns_to_floats import *

# Notebook-wide plotting defaults: 8x6 figures, no grid on the axes.
mpl.rcParams.update({
    'figure.figsize': (8,6),
    'axes.grid': False,
})



In [2]:
## Memory Reducer
# :df pandas dataframe to reduce size             # type: pd.DataFrame()
# :verbose                                        # type: bool
def reduce_mem_usage(df, verbose=True):
    """Downcast numeric columns of ``df`` to the narrowest dtype covering their range.

    Columns whose dtype is one of int16/int32/int64/float16/float32/float64 are
    inspected; every other column is left untouched. Integer columns are tried
    against int8, int16, int32 and int64 in that order; float columns against
    float16 and float32, falling back to float64.

    The range checks are inclusive, so a value that sits exactly on a dtype
    limit (for example 127 for int8) still selects that dtype.

    NOTE(review): float16/float32 hold fewer significant digits than float64,
    so the float downcast can round stored values - confirm this is acceptable
    for downstream use.

    Args:
        df: pandas DataFrame; modified in place and also returned.
        verbose: when true, print the resulting memory usage and the
            percentage saved.

    Returns:
        The same DataFrame object with downcast columns.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)
    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue
        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            # If min/max are NaN (e.g. an empty column) no candidate matches
            # and the column keeps its current dtype.
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                if c_min >= limits.min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if c_min >= limits.min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # Out of float32 range, or min/max not comparable (NaN/inf).
                df[col] = df[col].astype(np.float64)
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Guard the percentage against a zero-byte starting footprint.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df

In [3]:
# Load the forecast CSV and keep the raw frame in `orig_df`; all later work
# happens on the copy `df`.
# NOTE(review): the path is an absolute, machine-specific location.
orig_df = pd.read_csv('/Users/faymajidelhassan/Downloads/Master project /Data/Weather/forecasts/open_meteo.csv') 
df = orig_df.copy() 
print(f'Size of the dataset: {df.shape} \n')  
print() 
# `display` is not imported in this file - presumably the IPython/Jupyter helper; confirm.
display(df.head(5))
# Parse the timestamp column to datetimes and make it the index.
df['timestamp'] = pd.to_datetime(df['timestamp'])
df.set_index('timestamp', inplace=True)

Size of the dataset: (5292, 16) 




Unnamed: 0,timestamp,temperature,humidity,pressure_msl,pressure_surface,global_irradiance,direct_irradiance,diffuse_irradiance,cloud_cover,wind_speed,wind_direction,precipitation,rain,showers,snowfall,weather_code
0,2022-07-11 12:00:00,"[22.0, 22.6, 23.1, 23.4, 23.2, 22.7, 21.6, 20....","[42.0, 42.0, 41.0, 39.0, 40.0, 41.0, 49.0, 58....","[1021.5, 1021.0, 1020.6, 1020.2, 1019.7, 1019....","[1007.9, 1007.4, 1007.1, 1006.7, 1006.2, 1006....","[599.0, 645.0, 546.0, 507.0, 393.0, 245.0, 88....","[271.9, 390.3, 333.4, 422.8, 450.5, 389.1, 127...","[352.0, 307.0, 285.0, 226.0, 159.0, 106.0, 64....","[67.0, 80.0, 68.0, 54.0, 57.0, 90.0, 100.0, 10...","[6.2, 6.0, 5.8, 6.2, 6.6, 6.6, 3.2, 1.5, 4.0, ...","[173.0, 147.0, 150.0, 159.0, 158.0, 167.0, 153...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[2.0, 2.0, 2.0, 2.0, 2.0, 3.0, 3.0, 3.0, 3.0, ..."
1,2022-07-11 13:00:00,"[22.6, 23.1, 23.4, 23.2, 22.7, 21.6, 20.3, 18....","[42.0, 41.0, 39.0, 40.0, 41.0, 49.0, 58.0, 64....","[1021.0, 1020.6, 1020.2, 1019.7, 1019.8, 1020....","[1007.4, 1007.1, 1006.7, 1006.2, 1006.2, 1006....","[645.0, 546.0, 507.0, 393.0, 245.0, 88.0, 10.0...","[390.3, 333.4, 422.8, 450.5, 389.1, 127.0, 0.0...","[307.0, 285.0, 226.0, 159.0, 106.0, 64.0, 10.0...","[80.0, 68.0, 54.0, 57.0, 90.0, 100.0, 100.0, 1...","[6.0, 5.8, 6.2, 6.6, 6.6, 3.2, 1.5, 4.0, 2.9, ...","[147.0, 150.0, 159.0, 158.0, 167.0, 153.0, 104...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[2.0, 2.0, 2.0, 2.0, 3.0, 3.0, 3.0, 3.0, 3.0, ..."
2,2022-07-11 14:00:00,"[23.2, 23.6, 23.5, 23.0, 22.1, 20.4, 18.9, 18....","[38.0, 38.0, 38.0, 39.0, 46.0, 54.0, 63.0, 62....","[1020.6, 1020.3, 1019.6, 1019.6, 1020.0, 1020....","[1007.1, 1006.8, 1006.1, 1006.1, 1006.4, 1006....","[417.0, 442.0, 417.0, 267.0, 96.0, 11.0, 0.0, ...","[149.5, 287.4, 525.5, 481.5, 158.8, 0.0, 0.0, ...","[300.0, 251.0, 144.0, 95.0, 66.0, 11.0, 0.0, 0...","[86.0, 62.0, 24.0, 88.0, 100.0, 100.0, 100.0, ...","[4.0, 5.6, 6.8, 5.3, 4.5, 2.5, 2.9, 3.7, 1.3, ...","[170.0, 165.0, 155.0, 152.0, 166.0, 90.0, 7.0,...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[3.0, 2.0, 1.0, 2.0, 3.0, 3.0, 3.0, 3.0, 3.0, ..."
3,2022-07-11 15:00:00,"[23.6, 23.5, 23.0, 22.1, 20.4, 18.9, 18.2, 18....","[38.0, 38.0, 39.0, 46.0, 54.0, 63.0, 62.0, 65....","[1020.3, 1019.6, 1019.6, 1020.0, 1020.5, 1021....","[1006.8, 1006.1, 1006.1, 1006.4, 1006.8, 1007....","[442.0, 417.0, 267.0, 96.0, 11.0, 0.0, 0.0, 0....","[287.4, 525.5, 481.5, 158.8, 0.0, 0.0, 0.0, 0....","[251.0, 144.0, 95.0, 66.0, 11.0, 0.0, 0.0, 0.0...","[62.0, 24.0, 88.0, 100.0, 100.0, 100.0, 92.0, ...","[5.6, 6.8, 5.3, 4.5, 2.5, 2.9, 3.7, 1.3, 2.1, ...","[165.0, 155.0, 152.0, 166.0, 90.0, 7.0, 11.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[2.0, 1.0, 2.0, 3.0, 3.0, 3.0, 3.0, 3.0, 2.0, ..."
4,2022-07-11 16:00:00,"[23.5, 23.0, 22.1, 20.4, 18.9, 18.2, 18.0, 17....","[38.0, 39.0, 46.0, 54.0, 63.0, 62.0, 65.0, 65....","[1019.6, 1019.6, 1020.0, 1020.5, 1021.1, 1021....","[1006.1, 1006.1, 1006.4, 1006.8, 1007.4, 1007....","[417.0, 267.0, 96.0, 11.0, 0.0, 0.0, 0.0, 0.0,...","[525.5, 481.5, 158.8, 0.0, 0.0, 0.0, 0.0, 0.0,...","[144.0, 95.0, 66.0, 11.0, 0.0, 0.0, 0.0, 0.0, ...","[24.0, 88.0, 100.0, 100.0, 100.0, 92.0, 90.0, ...","[6.8, 5.3, 4.5, 2.5, 2.9, 3.7, 1.3, 2.1, 3.1, ...","[155.0, 152.0, 166.0, 90.0, 7.0, 11.0, 146.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[1.0, 2.0, 3.0, 3.0, 3.0, 3.0, 3.0, 2.0, 2.0, ..."


In [4]:
# Forecast columns that are parsed and length-normalised further down in this cell.
# 'weather_code' is not part of this list.
features=[
    'temperature', 'humidity', 'pressure_msl', 'pressure_surface', 
    'global_irradiance', 'direct_irradiance', 'diffuse_irradiance', 
    'cloud_cover', 'wind_speed', 'wind_direction', 'precipitation', 
    'rain', 'showers', 'snowfall']
def convert_columns_to_floats2(df, columns_to_convert):
    '''
    Run parse_complex_string over the string form of every cell in the given columns.

    Each column is replaced in place on df, with a progress line printed before
    and after it is handled. parse_complex_string is not defined in this file;
    presumably it comes from the convert_columns_to_floats wildcard import and
    turns a "[1.0, 2.0, ...]" string into a list of floats - confirm there.
    Returns the same df.
    '''
    for name in columns_to_convert:
        print(f"Processing column: {name}")
        as_text = df[name].astype(str)
        df[name] = as_text.apply(parse_complex_string)
        print(f"Processed column: {name}")
    return df
def reduce_cells_for_all_columns(df, columns_to_reduce, n=96):
    '''
    Force every list cell of the given columns to hold exactly n entries.

    Lists longer than n keep only their first n entries; shorter lists are
    padded at the end with None. Cells that are not lists are left unchanged,
    matching the isinstance(x, list) check used by get_lengths_of_cells.

    df                : DataFrame, modified in place and returned
    columns_to_reduce : iterable of column names to process
    n                 : target length per cell (default 96)
    '''
    def _fit_to_length(cell):
        # Leave anything that is not a list (NaN, unparsed strings, ...) as it is.
        if not isinstance(cell, list):
            return cell
        if len(cell) >= n:
            return cell[:n]
        return cell + [None] * (n - len(cell))

    for col in columns_to_reduce:
        df[col] = df[col].apply(_fit_to_length)
    return df
def get_lengths_of_cells(df, columns):
    '''
    Build a DataFrame holding, per requested column, the length of each cell.

    A cell that is a list contributes len(cell); any other cell contributes NaN.
    '''
    def _cell_length(cell):
        if isinstance(cell, list):
            return len(cell)
        return np.nan

    per_column = {}
    for name in columns:
        per_column[name] = df[name].apply(_cell_length)
    return pd.DataFrame(per_column)
# df = convert_columns_to_numeric_lists(df, columns_to_convert2)
# Parse every feature column, then record the per-cell lengths *before* the
# cells are cut or padded, so the printed table reflects the parsed data.
df= convert_columns_to_floats2(df, features)
lengths_df = get_lengths_of_cells(df, features)
# Bring every cell to 96 entries.
df= reduce_cells_for_all_columns(df, features, 96)
# print("Lengths of cells before reduction:")
print(lengths_df)
df

Processing column: temperature
Processed column: temperature
Processing column: humidity
Processed column: humidity
Processing column: pressure_msl
Processed column: pressure_msl
Processing column: pressure_surface
Processed column: pressure_surface
Processing column: global_irradiance
Processed column: global_irradiance
Processing column: direct_irradiance
Processed column: direct_irradiance
Processing column: diffuse_irradiance
Processed column: diffuse_irradiance
Processing column: cloud_cover
Processed column: cloud_cover
Processing column: wind_speed
Processed column: wind_speed
Processing column: wind_direction
Processed column: wind_direction
Processing column: precipitation
Processed column: precipitation
Processing column: rain
Processed column: rain
Processing column: showers
Processed column: showers
Processing column: snowfall
Processed column: snowfall
                     temperature  humidity  pressure_msl  pressure_surface  \
timestamp                                   

Unnamed: 0_level_0,temperature,humidity,pressure_msl,pressure_surface,global_irradiance,direct_irradiance,diffuse_irradiance,cloud_cover,wind_speed,wind_direction,precipitation,rain,showers,snowfall,weather_code
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
2022-07-11 12:00:00,"[22.0, 22.6, 23.1, 23.4, 23.2, 22.7, 21.6, 20....","[42.0, 42.0, 41.0, 39.0, 40.0, 41.0, 49.0, 58....","[1021.5, 1021.0, 1020.6, 1020.2, 1019.7, 1019....","[1007.9, 1007.4, 1007.1, 1006.7, 1006.2, 1006....","[599.0, 645.0, 546.0, 507.0, 393.0, 245.0, 88....","[271.9, 390.3, 333.4, 422.8, 450.5, 389.1, 127...","[352.0, 307.0, 285.0, 226.0, 159.0, 106.0, 64....","[67.0, 80.0, 68.0, 54.0, 57.0, 90.0, 100.0, 10...","[6.2, 6.0, 5.8, 6.2, 6.6, 6.6, 3.2, 1.5, 4.0, ...","[173.0, 147.0, 150.0, 159.0, 158.0, 167.0, 153...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[2.0, 2.0, 2.0, 2.0, 2.0, 3.0, 3.0, 3.0, 3.0, ..."
2022-07-11 13:00:00,"[22.6, 23.1, 23.4, 23.2, 22.7, 21.6, 20.3, 18....","[42.0, 41.0, 39.0, 40.0, 41.0, 49.0, 58.0, 64....","[1021.0, 1020.6, 1020.2, 1019.7, 1019.8, 1020....","[1007.4, 1007.1, 1006.7, 1006.2, 1006.2, 1006....","[645.0, 546.0, 507.0, 393.0, 245.0, 88.0, 10.0...","[390.3, 333.4, 422.8, 450.5, 389.1, 127.0, 0.0...","[307.0, 285.0, 226.0, 159.0, 106.0, 64.0, 10.0...","[80.0, 68.0, 54.0, 57.0, 90.0, 100.0, 100.0, 1...","[6.0, 5.8, 6.2, 6.6, 6.6, 3.2, 1.5, 4.0, 2.9, ...","[147.0, 150.0, 159.0, 158.0, 167.0, 153.0, 104...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[2.0, 2.0, 2.0, 2.0, 3.0, 3.0, 3.0, 3.0, 3.0, ..."
2022-07-11 14:00:00,"[23.2, 23.6, 23.5, 23.0, 22.1, 20.4, 18.9, 18....","[38.0, 38.0, 38.0, 39.0, 46.0, 54.0, 63.0, 62....","[1020.6, 1020.3, 1019.6, 1019.6, 1020.0, 1020....","[1007.1, 1006.8, 1006.1, 1006.1, 1006.4, 1006....","[417.0, 442.0, 417.0, 267.0, 96.0, 11.0, 0.0, ...","[149.5, 287.4, 525.5, 481.5, 158.8, 0.0, 0.0, ...","[300.0, 251.0, 144.0, 95.0, 66.0, 11.0, 0.0, 0...","[86.0, 62.0, 24.0, 88.0, 100.0, 100.0, 100.0, ...","[4.0, 5.6, 6.8, 5.3, 4.5, 2.5, 2.9, 3.7, 1.3, ...","[170.0, 165.0, 155.0, 152.0, 166.0, 90.0, 7.0,...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[3.0, 2.0, 1.0, 2.0, 3.0, 3.0, 3.0, 3.0, 3.0, ..."
2022-07-11 15:00:00,"[23.6, 23.5, 23.0, 22.1, 20.4, 18.9, 18.2, 18....","[38.0, 38.0, 39.0, 46.0, 54.0, 63.0, 62.0, 65....","[1020.3, 1019.6, 1019.6, 1020.0, 1020.5, 1021....","[1006.8, 1006.1, 1006.1, 1006.4, 1006.8, 1007....","[442.0, 417.0, 267.0, 96.0, 11.0, 0.0, 0.0, 0....","[287.4, 525.5, 481.5, 158.8, 0.0, 0.0, 0.0, 0....","[251.0, 144.0, 95.0, 66.0, 11.0, 0.0, 0.0, 0.0...","[62.0, 24.0, 88.0, 100.0, 100.0, 100.0, 92.0, ...","[5.6, 6.8, 5.3, 4.5, 2.5, 2.9, 3.7, 1.3, 2.1, ...","[165.0, 155.0, 152.0, 166.0, 90.0, 7.0, 11.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[2.0, 1.0, 2.0, 3.0, 3.0, 3.0, 3.0, 3.0, 2.0, ..."
2022-07-11 16:00:00,"[23.5, 23.0, 22.1, 20.4, 18.9, 18.2, 18.0, 17....","[38.0, 39.0, 46.0, 54.0, 63.0, 62.0, 65.0, 65....","[1019.6, 1019.6, 1020.0, 1020.5, 1021.1, 1021....","[1006.1, 1006.1, 1006.4, 1006.8, 1007.4, 1007....","[417.0, 267.0, 96.0, 11.0, 0.0, 0.0, 0.0, 0.0,...","[525.5, 481.5, 158.8, 0.0, 0.0, 0.0, 0.0, 0.0,...","[144.0, 95.0, 66.0, 11.0, 0.0, 0.0, 0.0, 0.0, ...","[24.0, 88.0, 100.0, 100.0, 100.0, 92.0, 90.0, ...","[6.8, 5.3, 4.5, 2.5, 2.9, 3.7, 1.3, 2.1, 3.1, ...","[155.0, 152.0, 166.0, 90.0, 7.0, 11.0, 146.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[1.0, 2.0, 3.0, 3.0, 3.0, 3.0, 3.0, 2.0, 2.0, ..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023-02-17 08:00:00,"[4.0, 7.3, 10.3, 12.4, 13.9, 14.9, 15.9, 15.6,...","[90.0, 75.0, 64.0, 58.0, 50.0, 48.0, 47.0, 48....","[1028.1, 1028.2, 1028.1, 1026.9, 1026.2, 1025....","[1013.5, 1013.8, 1013.8, 1012.8, 1012.1, 1011....","[170.0, 309.0, 415.0, 468.0, 491.0, 438.0, 337...","[258.1, 487.2, 617.8, 644.2, 673.1, 675.3, 573...","[76.0, 96.0, 112.0, 135.0, 119.0, 128.0, 126.0...","[0.0, 10.0, 0.0, 33.0, 100.0, 100.0, 0.0, 70.0...","[9.6, 10.1, 10.3, 10.5, 10.2, 10.0, 9.7, 11.8,...","[236.0, 235.0, 234.0, 239.0, 231.0, 240.0, 239...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0, 1, 1, 2, 3, 3, 0, 2, 2, 3, 2, 2, 2, 2, 2, ..."
2023-02-17 09:00:00,"[7.3, 10.3, 12.4, 13.9, 14.9, 15.9, 15.6, 13.6...","[75.0, 64.0, 58.0, 50.0, 48.0, 47.0, 48.0, 54....","[1028.2, 1028.1, 1026.9, 1026.2, 1025.5, 1025....","[1013.8, 1013.8, 1012.8, 1012.1, 1011.5, 1011....","[309.0, 415.0, 468.0, 491.0, 438.0, 337.0, 220...","[487.2, 617.8, 644.2, 673.1, 675.3, 573.8, 493...","[96.0, 112.0, 135.0, 119.0, 128.0, 126.0, 87.0...","[10.0, 0.0, 33.0, 100.0, 100.0, 0.0, 70.0, 64....","[10.1, 10.3, 10.5, 10.2, 10.0, 9.7, 11.8, 10.0...","[235.0, 234.0, 239.0, 231.0, 240.0, 239.0, 239...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[1, 1, 2, 3, 3, 0, 2, 2, 3, 2, 2, 2, 2, 2, 1, ..."
2023-02-17 10:00:00,"[10.3, 12.4, 13.9, 14.9, 15.9, 15.6, 13.6, 10....","[64.0, 58.0, 50.0, 48.0, 47.0, 48.0, 54.0, 67....","[1028.1, 1026.9, 1026.2, 1025.5, 1025.3, 1024....","[1013.8, 1012.8, 1012.1, 1011.5, 1011.4, 1010....","[415.0, 468.0, 491.0, 438.0, 337.0, 220.0, 79....","[617.8, 644.2, 673.1, 675.3, 573.8, 493.9, 331...","[112.0, 135.0, 119.0, 128.0, 126.0, 87.0, 51.0...","[0.0, 33.0, 100.0, 100.0, 0.0, 70.0, 64.0, 100...","[10.3, 10.5, 10.2, 10.0, 9.7, 11.8, 10.0, 7.1,...","[234.0, 239.0, 231.0, 240.0, 239.0, 239.0, 240...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[1, 2, 3, 3, 0, 2, 2, 3, 2, 2, 2, 2, 2, 1, 1, ..."
2023-02-17 11:00:00,"[13.0, 14.2, 15.1, 15.4, 15.5, 13.7, 11.0, 8.7...","[57.0, 51.0, 48.0, 50.0, 49.0, 55.0, 68.0, 78....","[1026.4, 1026.0, 1025.2, 1025.1, 1024.4, 1024....","[1012.3, 1012.0, 1011.2, 1011.1, 1010.4, 1010....","[437.0, 482.0, 444.0, 326.0, 215.0, 84.0, 3.0,...","[544.5, 571.3, 674.9, 579.0, 458.4, 357.9, 29....","[178.0, 134.0, 118.0, 127.0, 90.0, 45.0, 3.0, ...","[100.0, 0.0, 86.0, 99.0, 44.0, 37.0, 71.0, 69....","[10.8, 10.7, 9.9, 7.2, 8.9, 11.8, 7.6, 7.6, 9....","[244.0, 237.0, 251.0, 276.0, 249.0, 243.0, 239...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[3, 0, 2, 3, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, ..."


In [5]:
# import json 
# def safe_json_loads(x):
#     if isinstance(x, str):
#         try:
#             return np.array(json.loads(x))
#         except (json.JSONDecodeError, TypeError) as e:
#             print(f"Error parsing JSON for value: {x}, error: {e}")
#             return np.nan  # Or handle differently as needed
#     elif isinstance(x, np.ndarray):
#         return x
#     else:
#         return np.nan

# # Apply JSON parsing to the DataFrame columns
# for for_val in [
#     'temperature', 'humidity', 'pressure_msl', 'pressure_surface', 
#     'global_irradiance', 'direct_irradiance', 'diffuse_irradiance', 
#     'cloud_cover', 'wind_speed', 'wind_direction', 'precipitation', 
#     'rain', 'showers', 'snowfall', 'weather_code'
# ]:
#     df[for_val] = df[for_val].map(safe_json_loads)

# # Inspect the DataFrame to ensure proper parsing
# df

In [6]:
# # switch forecasts sequences from string to np array
# import json
# for for_val in [
#     'temperature', 'humidity', 'pressure_msl', 'pressure_surface', 
#     'global_irradiance', 'direct_irradiance', 'diffuse_irradiance', 
#     'cloud_cover', 'wind_speed', 'wind_direction', 'precipitation', 
#     'rain', 'showers', 'snowfall', 'weather_code'
# ]:
#         df[for_val] = df[for_val].map(lambda x: np.array(json.loads(x)), na_action='ignore')
# df

In [7]:
# pick one meteo variable
# The list inside .loc keeps the selection a one-column DataFrame rather than a Series.
meas = 'temperature'
df_for = df.loc[:,[meas]]
df_for

Unnamed: 0_level_0,temperature
timestamp,Unnamed: 1_level_1
2022-07-11 12:00:00,"[22.0, 22.6, 23.1, 23.4, 23.2, 22.7, 21.6, 20...."
2022-07-11 13:00:00,"[22.6, 23.1, 23.4, 23.2, 22.7, 21.6, 20.3, 18...."
2022-07-11 14:00:00,"[23.2, 23.6, 23.5, 23.0, 22.1, 20.4, 18.9, 18...."
2022-07-11 15:00:00,"[23.6, 23.5, 23.0, 22.1, 20.4, 18.9, 18.2, 18...."
2022-07-11 16:00:00,"[23.5, 23.0, 22.1, 20.4, 18.9, 18.2, 18.0, 17...."
...,...
2023-02-17 08:00:00,"[4.0, 7.3, 10.3, 12.4, 13.9, 14.9, 15.9, 15.6,..."
2023-02-17 09:00:00,"[7.3, 10.3, 12.4, 13.9, 14.9, 15.9, 15.6, 13.6..."
2023-02-17 10:00:00,"[10.3, 12.4, 13.9, 14.9, 15.9, 15.6, 13.6, 10...."
2023-02-17 11:00:00,"[13.0, 14.2, 15.1, 15.4, 15.5, 13.7, 11.0, 8.7..."


In [8]:
# pick one meteo variable
# The list inside .loc keeps the selection a one-column DataFrame rather than a Series.
measP = 'precipitation'
df_forP = df.loc[:,[measP]]
df_forP

Unnamed: 0_level_0,precipitation
timestamp,Unnamed: 1_level_1
2022-07-11 12:00:00,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2022-07-11 13:00:00,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2022-07-11 14:00:00,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2022-07-11 15:00:00,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2022-07-11 16:00:00,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
...,...
2023-02-17 08:00:00,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2023-02-17 09:00:00,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2023-02-17 10:00:00,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2023-02-17 11:00:00,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


In [9]:
#unpack forecasts
# Drop rows whose forecast cell is missing.
df_for.dropna(how='any',inplace=True)
# One column name per list position; the count is taken from the first row's list.
df_for_cols_ext = [meas+'_forecast+'+str(i)+'h' for i in range(0,len(df_for[meas].values[0]))]
# Spread each list cell across its own columns, keeping the timestamp index.
df_for = pd.DataFrame(index=df_for.index, data=df_for[meas].to_list(), columns=df_for_cols_ext)

# Conform the index to a regular 5-minute frequency; timestamps that were not
# in the original index appear as all-NaN rows.
df_for = df_for.asfreq('5min')
df_for

Unnamed: 0_level_0,temperature_forecast+0h,temperature_forecast+1h,temperature_forecast+2h,temperature_forecast+3h,temperature_forecast+4h,temperature_forecast+5h,temperature_forecast+6h,temperature_forecast+7h,temperature_forecast+8h,temperature_forecast+9h,...,temperature_forecast+86h,temperature_forecast+87h,temperature_forecast+88h,temperature_forecast+89h,temperature_forecast+90h,temperature_forecast+91h,temperature_forecast+92h,temperature_forecast+93h,temperature_forecast+94h,temperature_forecast+95h
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2022-07-11 12:00:00,22.0,22.6,23.1,23.4,23.2,22.7,21.6,20.3,18.6,17.4,...,20.3,20.2,21.1,22.6,24.4,25.3,26.1,27.0,28.0,29.0
2022-07-11 12:05:00,,,,,,,,,,,...,,,,,,,,,,
2022-07-11 12:10:00,,,,,,,,,,,...,,,,,,,,,,
2022-07-11 12:15:00,,,,,,,,,,,...,,,,,,,,,,
2022-07-11 12:20:00,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023-02-17 11:40:00,,,,,,,,,,,...,,,,,,,,,,
2023-02-17 11:45:00,,,,,,,,,,,...,,,,,,,,,,
2023-02-17 11:50:00,,,,,,,,,,,...,,,,,,,,,,
2023-02-17 11:55:00,,,,,,,,,,,...,,,,,,,,,,


In [10]:
#unpack forecasts
# Drop rows whose forecast cell is missing.
df_forP.dropna(how='any',inplace=True)
# One column name per list position; the count is taken from the first row's list.
# NOTE: this rebinds df_for_cols_ext, the same variable name used for the temperature columns.
df_for_cols_ext = [measP+'_forecast+'+str(i)+'h' for i in range(0,len(df_forP[measP].values[0]))]
# Spread each list cell across its own columns, keeping the timestamp index.
df_forP = pd.DataFrame(index=df_forP.index, data=df_forP[measP].to_list(), columns=df_for_cols_ext)
# Conform the index to a regular 5-minute frequency; timestamps that were not
# in the original index appear as all-NaN rows.
df_forP = df_forP.asfreq('5min')
df_forP

Unnamed: 0_level_0,precipitation_forecast+0h,precipitation_forecast+1h,precipitation_forecast+2h,precipitation_forecast+3h,precipitation_forecast+4h,precipitation_forecast+5h,precipitation_forecast+6h,precipitation_forecast+7h,precipitation_forecast+8h,precipitation_forecast+9h,...,precipitation_forecast+86h,precipitation_forecast+87h,precipitation_forecast+88h,precipitation_forecast+89h,precipitation_forecast+90h,precipitation_forecast+91h,precipitation_forecast+92h,precipitation_forecast+93h,precipitation_forecast+94h,precipitation_forecast+95h
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2022-07-11 12:00:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2022-07-11 12:05:00,,,,,,,,,,,...,,,,,,,,,,
2022-07-11 12:10:00,,,,,,,,,,,...,,,,,,,,,,
2022-07-11 12:15:00,,,,,,,,,,,...,,,,,,,,,,
2022-07-11 12:20:00,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023-02-17 11:40:00,,,,,,,,,,,...,,,,,,,,,,
2023-02-17 11:45:00,,,,,,,,,,,...,,,,,,,,,,
2023-02-17 11:50:00,,,,,,,,,,,...,,,,,,,,,,
2023-02-17 11:55:00,,,,,,,,,,,...,,,,,,,,,,


In [11]:
# Fill the NaN rows of the 5-minute grid.
# Temperature: carry the last available forecast row forward. DataFrame.ffill()
# replaces the deprecated fillna(method='ffill') spelling and fills identically.
dfT = df_for.ffill()
# Precipitation: interpolate linearly down the rows, filling forward only.
dfP = df_forP.interpolate(method='linear', limit_direction='forward', axis=0)



In [None]:
# reduce memory usage
# reduce_mem_usage downcasts the numeric columns of the frame it is given and
# returns that same frame, printing the saving because verbose defaults to True.
dfT= reduce_mem_usage(dfT)
dfP= reduce_mem_usage(dfP)

In [None]:
# Show the filled temperature forecast frame.
dfT

In [None]:
# Show the filled precipitation forecast frame.
dfP

In [None]:
# # Convert specific columns to lists of floats
# # columns_to_convert1 = [
# #     'temperature', 'humidity', 'pressure',	'cloud_cover',	'wind_speed',
# #         	'wind_direction',	'weather_code'	,'poprecipitation'
# # ]

# columns_to_convert2 = [
#     'temperature', 'humidity', 'pressure',
#     'cloud_cover', 'wind_speed', 'wind_direction', 'poprecipitation',
#      'weather_code'
# ]
# def parse_complex_string(s):
#     try:
#         return [float(x) for x in s.strip('[]').split(',')]
#     except ValueError:
#         return s

# # Function to convert columns to lists of floats
# def convert_columns_to_numeric_lists(df, columns_to_convert):
#     for col in columns_to_convert:
#         df[col] = df[col].astype(str).apply(parse_complex_string)
#     return df

# def convert_columns_to_floats2(df, columns_to_convert):
#     for col in columns_to_convert:
#         print(f"Processing column: {col}")
#         df[col] = df[col].astype(str).apply(parse_complex_string)
#         print(f"Processed column: {col}")
#     return df

# def flatten_columns(df, columns_to_flatten):
#     for col in columns_to_flatten:
#         df[col] = df[col].apply(lambda x: np.mean(x) if isinstance(x, list) else x)
#     return df
# def reduce_cells_for_all_columns(df, columns_to_reduce,n=int):
#     '''
#     Reduce each cell in each column by 96 values from the end
#     '''
#     for cols in columns_to_reduce:
#         df[cols] = df[cols].apply(lambda x: x[:-n])
#     return df
# def get_lengths_of_cells(df, columns):
#     '''
#     Get the lengths of cells in each specified column
#     '''
#     lengths = {col: df[col].apply(lambda x: len(x) if isinstance(x, list) else np.nan) for col in columns}
#     return pd.DataFrame(lengths)
# df = convert_columns_to_numeric_lists(df, columns_to_convert2)
# # df= convert_columns_to_floats2(df, columns_to_convert2)
# # lengths_df = get_lengths_of_cells(df, columns_to_convert2)

# # print("Lengths of cells before reduction:")
# # print(lengths_df)
# # df = convert_columns_to_floats2(df, columns_to_convert1)

# # # Flatten the columns
# # # df= reduce_cells_for_all_columns(df,columns_to_convert2,n=72)
# # # df = flatten_columns(df, columns_to_convert1)
# # df = flatten_columns(df, columns_to_convert2)

In [None]:
# # Function to aggregate list features
# def aggregate_features(df, columns_to_aggregate):
#     for col in columns_to_aggregate:
#         df[f'{col}_mean'] = df[col].apply(lambda x: np.mean(x) if isinstance(x, list) else x)
#         df[f'{col}_std'] = df[col].apply(lambda x: np.std(x) if isinstance(x, list) else 0)
#         df[f'{col}_min'] = df[col].apply(lambda x: np.min(x) if isinstance(x, list) else x)
#         df[f'{col}_max'] = df[col].apply(lambda x: np.max(x) if isinstance(x, list) else x)
#         # Drop original column if not needed
#         df.drop(columns=[col], inplace=True)
#     return df

# # Aggregate the columns
# df = aggregate_features(df, columns_to_convert2)

# print("\nDataFrame with aggregated features:")
# print(df)


In [None]:
# df['timestamp'] = pd.to_datetime(df['timestamp'])
# df.set_index('timestamp', inplace=True)

# # Resample and aggregate the data to every 5 minutes
# df = df.resample('5T').agg({
#     'temperature_mean': 'mean', 'temperature_std': 'mean', 'temperature_min': 'mean', 'temperature_max': 'mean',
#     'humidity_mean': 'mean', 'humidity_std': 'mean', 'humidity_min': 'mean', 'humidity_max': 'mean',
#     'pressure_mean': 'mean', 'pressure_std': 'mean', 'pressure_min': 'mean', 'pressure_max': 'mean',
#     'cloud_cover_mean': 'mean', 'cloud_cover_std': 'mean', 'cloud_cover_min': 'mean', 'cloud_cover_max': 'mean',
#     'wind_speed_mean': 'mean', 'wind_speed_std': 'mean', 'wind_speed_min': 'mean', 'wind_speed_max': 'mean',
#     'wind_direction_mean': 'mean', 'wind_direction_std': 'mean', 'wind_direction_min': 'mean', 'wind_direction_max': 'mean',
#     'poprecipitation_mean': 'sum', 'poprecipitation_std': 'sum', 'poprecipitation_min': 'sum', 'poprecipitation_max': 'sum',
#     'weather_code_mean': 'sum', 'weather_code_std': 'sum', 'weather_code_min': 'sum', 'weather_code_max': 'sum'
# })

# df.head()
# df.isnull().sum()
# df = df.fillna(method='ffill')
# df.head()

In [None]:
# # Convert timestamp to datetime and set as index
# df['timestamp'] = pd.to_datetime(df['timestamp'])
# df.set_index('timestamp', inplace=True)

# # Fill missing values using forward fill
# # df = df.fillna(method='ffill')

# # # Plot univariate data (temperature)
# # uni_data = df['temperature']
# # uni_data.plot()

# # Resample and aggregate the data
# df = df.resample('10T').agg({
#     'temperature': 'mean',  
#     'humidity': 'mean',     
#     'pressure': 'mean',  
    
#     'cloud_cover': 'mean',
#     'wind_speed': 'mean',
#     'wind_direction': 'mean',
#     'poprecipitation': 'sum',
    
#     'weather_code': 'sum'
# })



Observations:
1) One reading every 10 mins (from the datetime column: time difference between consecutive records)
2) 1 day = 6*24 = 144 readings
Task: Forecasting temperature (in degrees) in the future




In [None]:

from sklearn.preprocessing import StandardScaler
# import pandas as pd

# Assuming 'dfT' is the DataFrame that contains the temperature forecast data.

# Column names of the first 48 forecast horizons: 'temperature_forecast+0h' .. '+47h'
temperature_cols = [f'temperature_forecast+{hour}h' for hour in range(48)]

# Select the temperature forecast columns from the DataFrame
temperature_features = dfT[temperature_cols]

# Split the data into training and validation sets (first 80% of rows = training)
train_split = int(len(temperature_features) * 0.8)

# Fit the scaler on the training rows only, so that no validation statistics
# leak into the standardisation, then apply that scaling to the whole series.
scaler_temp = StandardScaler()
scaler_temp.fit(temperature_features.values[:train_split])
uni_data_temp = scaler_temp.transform(temperature_features.values)

# Set random seed for reproducibility
tf.random.set_seed(13)

# Check the shape of the data to ensure it is correct
print("Shape of uni_data_temp:", uni_data_temp.shape)
print("Training data shape:", uni_data_temp[:train_split].shape)
print("Validation data shape:", uni_data_temp[train_split:].shape)


# # Standardize data
# uni_data_temp_mean = uni_data_temp[:train_split].mean(axis=0)
# uni_data_temp_std = uni_data_temp[:train_split].std(axis=0)
# uni_data_temp = (uni_data_temp - uni_data_temp_mean) / uni_data_temp_std



In [None]:
# Function to create data for univariate forecasting
def univariate_data(dataset, start_idx, end_idx, history_size, target_size):
    """Cut ``dataset`` into (history window, label) pairs.

    For every position ``p`` from ``start_idx + history_size`` up to (but not
    including) ``end_idx``, the window is the ``history_size`` entries just
    before ``p`` and the label is ``dataset[p + target_size]``. When
    ``end_idx`` is None it becomes ``len(dataset) - target_size``.

    Returns:
        A tuple ``(windows, labels)`` of numpy arrays.
    """
    first = start_idx + history_size
    last = len(dataset) - target_size if end_idx is None else end_idx
    positions = range(first, last)
    windows = [dataset[range(pos - history_size, pos)] for pos in positions]
    targets = [dataset[pos + target_size] for pos in positions]
    return np.array(windows), np.array(targets)

uni_data_history = 20  # Number of past rows in each input window
uni_data_future = 0    # Label offset past the window; 0 means the row right after it

# Training windows come from rows before train_split; validation windows from
# train_split to the end of the series.
x_train_uni_temp, y_train_uni_temp = univariate_data(uni_data_temp, 0, train_split, uni_data_history, uni_data_future)
x_val_uni_temp, y_val_uni_temp = univariate_data(uni_data_temp, train_split, None, uni_data_history, uni_data_future)

# NOTE(review): the shapes previously noted here (e.g. (151495, 20, 4)) look stale
# for this dataset - confirm against the printed output.
print(x_train_uni_temp.shape)  # expected (n_train_windows, 20, n_columns), assuming uni_data_temp is 2-D
print(y_train_uni_temp.shape)  # expected (n_train_windows, n_columns)
print(x_val_uni_temp.shape)    # expected (n_val_windows, 20, n_columns)
print(y_val_uni_temp.shape)    # expected (n_val_windows, n_columns)

# Function to create time steps
def create_time_steps(length):
    """Return the negative offsets ``-length, ..., -1`` as a list."""
    return [-offset for offset in range(length, 0, -1)]

# Function to plot time series data
def plot_time_series(plot_data, delta, title):
    """Plot a history window followed by single future points, then show it.

    Args:
        plot_data: sequence whose first item is the history window and whose
            remaining items (at most two) are single values drawn one step
            after the window: the true future and the model prediction.
            The window may be 1-D, or 2-D ``(steps, features)``, in which
            case only the first feature column is drawn.
        delta: unused; kept so existing callers keep working.
        title: figure title.
    """
    labels = ["History", "True Future", "Model Predicted"]
    marker = ['.-', 'rx', 'go']

    history = np.asarray(plot_data[0])
    if history.ndim > 1:
        # Plot only the first feature for simplicity
        history = history[:, 0]
    time_steps = create_time_steps(history.shape[0])
    # x-position of the step right after the window
    future = time_steps[-1] + 1 if time_steps else 0

    plt.title(title)
    plt.plot(time_steps, history, marker[0], label=labels[0])
    for i, point in enumerate(plot_data[1:], start=1):
        plt.plot([future], point, marker[i], markersize=10, label=labels[i])
    plt.legend()
    plt.xlabel('Time Steps')
    plt.show()

# Plot two example windows (samples 0 and 20) with the first column of their label
for i in (0, 20):
    plot_time_series([x_train_uni_temp[i], y_train_uni_temp[i][0]], 0, 'Sample Example - Temperature')



In [None]:
# Prepare the precipitation forecast data.
# StandardScaler is imported locally because the notebook-level sklearn import
# only appears in a later cell.
from sklearn.preprocessing import StandardScaler

# Columns for the precipitation forecasts at horizons +0h .. +47h
precipitation_cols = ['precipitation_forecast+{}h'.format(hour) for hour in range(48)]

# Extract precipitation forecast features
precipitation_features = dfP[precipitation_cols]

# Assuming train_split is already defined appropriately.
# Fit the scaler on the training rows only, then transform every row, so that
# validation statistics do not leak into the standardization.
scaler_precip = StandardScaler()
scaler_precip.fit(precipitation_features.values[:train_split])
uni_data_precip = scaler_precip.transform(precipitation_features.values)

tf.random.set_seed(13)

x_train_uni_precip, y_train_uni_precip = univariate_data(uni_data_precip, 0, train_split, uni_data_history, uni_data_future)
x_val_uni_precip, y_val_uni_precip = univariate_data(uni_data_precip, train_split, None, uni_data_history, uni_data_future)

# Bare expressions in the middle of a cell are discarded, so print the shapes
print(x_train_uni_precip.shape, y_train_uni_precip.shape)
print(x_val_uni_precip.shape, y_val_uni_precip.shape)

plot_time_series([x_train_uni_precip[0], y_train_uni_precip[0][0]], 0, 'Sample Example - Precipitation')
i = 20
plot_time_series([x_train_uni_precip[i], y_train_uni_precip[i][0]], 0, 'Sample Example - Precipitation')


In [None]:
# Batching parameters shared by the tf.data pipelines
batch_size = 256
buffer_size = 10000

# Temperature pipelines: cache -> shuffle -> batch -> repeat indefinitely
train_uni_temp = (
    tf.data.Dataset.from_tensor_slices((x_train_uni_temp, y_train_uni_temp))
    .cache()
    .shuffle(buffer_size)
    .batch(batch_size)
    .repeat()
)
val_uni_temp = (
    tf.data.Dataset.from_tensor_slices((x_val_uni_temp, y_val_uni_temp))
    .cache()
    .shuffle(buffer_size)
    .batch(batch_size)
    .repeat()
)

# Precipitation pipelines, built the same way
train_uni_precip = (
    tf.data.Dataset.from_tensor_slices((x_train_uni_precip, y_train_uni_precip))
    .cache()
    .shuffle(buffer_size)
    .batch(batch_size)
    .repeat()
)
val_uni_precip = (
    tf.data.Dataset.from_tensor_slices((x_val_uni_precip, y_val_uni_precip))
    .cache()
    .shuffle(buffer_size)
    .batch(batch_size)
    .repeat()
)


In [None]:
# Define LSTM model for temperature.
# Each label row holds one value per forecast column, so the output layer is
# sized from the label array. A single output unit would be silently broadcast
# against every label column by the loss instead of predicting each column.
lstm_model_temp = tf.keras.models.Sequential([
    tf.keras.layers.LSTM(32, input_shape=x_train_uni_temp.shape[-2:]),
    tf.keras.layers.Dense(y_train_uni_temp.shape[-1])
])

# Compile the model with optimizer and loss function
lstm_model_temp.compile(optimizer=tf.keras.optimizers.Adam(),
                        loss='mae')

# Print model summary
lstm_model_temp.summary()

# Training schedule: the datasets repeat forever, so an "epoch" is a fixed
# number of batches rather than one pass over the data.
EVALUATION_INTERVAL = 200
EPOCHS = 10

# Train LSTM model for temperature
temp_history = lstm_model_temp.fit(train_uni_temp, epochs=EPOCHS, steps_per_epoch=EVALUATION_INTERVAL,
                                   validation_data=val_uni_temp, validation_steps=50)


In [None]:
# Evaluate the temperature LSTM on 100 batches of the validation dataset.
# NOTE(review): `mae_lstm_single` is assigned again by later evaluation cells,
# so this value is overwritten when those cells run.
mae_lstm_single = lstm_model_temp.evaluate(val_uni_temp, steps=100)

In [None]:
# Define LSTM model for precipitation.
# Each label row holds one value per forecast column, so the output layer is
# sized from the label array. A single output unit would be silently broadcast
# against every label column by the loss instead of predicting each column.
lstm_model_precip = tf.keras.models.Sequential([
    tf.keras.layers.LSTM(32, input_shape=x_train_uni_precip.shape[-2:]),
    tf.keras.layers.Dense(y_train_uni_precip.shape[-1])
])
lstm_model_precip.compile(optimizer='adam', loss='mae')
lstm_model_precip.summary()

# Train LSTM model for precipitation
precip_history = lstm_model_precip.fit(train_uni_precip, epochs=EPOCHS, steps_per_epoch=EVALUATION_INTERVAL,
                                       validation_data=val_uni_precip, validation_steps=50)


In [None]:
# Evaluate the precipitation LSTM on 100 batches of the validation dataset.
# NOTE(review): this reuses the name `mae_lstm_single`, replacing the value
# stored by the temperature evaluation.
mae_lstm_single = lstm_model_precip.evaluate(val_uni_precip, steps=100)

In [None]:
# Plotting function for training history
def plot_train_history(history, title):
    """Draw training and validation loss per epoch in a new figure and show it.

    ``history`` must expose a ``history`` mapping with ``'loss'`` and
    ``'val_loss'`` entries.
    """
    train_curve = history.history['loss']
    val_curve = history.history['val_loss']
    epoch_axis = range(len(train_curve))

    plt.figure()
    curves = (
        (train_curve, 'b', 'Training loss'),
        (val_curve, 'r', 'Validation loss'),
    )
    for values, fmt, name in curves:
        plt.plot(epoch_axis, values, fmt, label=name)
    plt.title(title)
    plt.legend()
    plt.show()

# Loss curves of the temperature and precipitation models, one figure each
plot_train_history(temp_history, 'Temperature Training and Validation Loss')
plot_train_history(precip_history, 'Precipitation Training and Validation Loss')


In [None]:
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt

# Relies on train_split, buffer_size, batch_size and create_time_steps from earlier cells

# Columns used by the multivariate model, grouped by measurement
features = [
    'temperature_mean',
    'temperature_std',
    'temperature_min',
    'temperature_max',
    'humidity_mean',
    'humidity_std',
    'humidity_min',
    'humidity_max',
    'poprecipitation_mean',
    'poprecipitation_std',
    'poprecipitation_min',
    'poprecipitation_max',
]

# Keep only the selected columns, as a numpy array
multi_features = df[features].values

# Standardize every column with the mean/std of the training rows
multi_data_mean = np.mean(multi_features[:train_split], axis=0)
multi_data_std = np.std(multi_features[:train_split], axis=0)
multi_features = (multi_features - multi_data_mean) / multi_data_std

# Function to create multivariate data
def multivariate_data(dataset, target, start_index, end_index, history_size,
                      target_size, step, single_step=False):
    """Cut a multivariate series into (window, label) pairs.

    For every position ``i`` in ``[start_index + history_size, end_index)``
    the window is ``dataset[i - history_size:i:step]`` and the label is
    ``target[i + target_size]`` when ``single_step`` is true, otherwise
    ``target[i:i + target_size]``.

    Args:
        dataset: numpy array indexed by time along its first axis.
        target: array the labels are read from, indexed like ``dataset``.
        start_index: first row that may appear in a window.
        end_index: exclusive upper bound for ``i``; ``None`` means
            ``len(dataset) - target_size``.
        history_size: number of rows spanned by each window before striding.
        target_size: label offset (single step) or label length (multi step).
        step: stride used when sampling rows inside a window.
        single_step: select a single future value instead of a sequence.

    Returns:
        Tuple ``(data, labels)`` of numpy arrays, one entry per window.

    Raises:
        IndexError: if windows are requested beyond the end of ``dataset``.
    """
    end_index = len(dataset) - target_size if end_index is None else end_index
    first = start_index + history_size

    # Slices truncate silently at the end of an array, so reject out-of-range
    # requests explicitly instead of returning short windows.
    if first < end_index and end_index > len(dataset):
        raise IndexError(
            'end_index {} is beyond the dataset length {}'.format(end_index, len(dataset)))

    data = []
    labels = []
    for i in range(first, end_index):
        # A strided slice selects the same rows as indexing with a range
        data.append(dataset[i - history_size:i:step])
        if single_step:
            labels.append(target[i + target_size])
        else:
            labels.append(target[i:i + target_size])

    return np.array(data), np.array(labels)

history_size = 1440
target_size = 144
STEP = 6

# Create training and validation data.
# single_step is left False so every label is the sequence of the next
# `target_size` target values, matching the Dense(target_size) output layer.
# With single_step=True each label is a single scalar that the loss would
# broadcast against all output units.
x_train_multi, y_train_multi = multivariate_data(multi_features, multi_features[:, 0], 0, train_split, history_size, target_size, STEP)
x_val_multi, y_val_multi = multivariate_data(multi_features, multi_features[:, 0], train_split, None, history_size, target_size, STEP)

# Create TensorFlow datasets
train_data_multi = tf.data.Dataset.from_tensor_slices((x_train_multi, y_train_multi))
train_data_multi = train_data_multi.cache().shuffle(buffer_size).batch(batch_size).repeat()

val_data_multi = tf.data.Dataset.from_tensor_slices((x_val_multi, y_val_multi))
val_data_multi = val_data_multi.cache().shuffle(buffer_size).batch(batch_size).repeat()

# Multivariate LSTM model
multi_step_model = tf.keras.models.Sequential([
    tf.keras.layers.LSTM(64, return_sequences=True, input_shape=x_train_multi.shape[-2:]),
    tf.keras.layers.LSTM(16, return_sequences=False, activation='relu'),
    tf.keras.layers.Dense(4, activation='relu'),  # Additional dense layer if needed
    tf.keras.layers.Dense(target_size)  # One output per future time step
])

multi_step_model.compile(optimizer='adam', loss='mae')

# Verify model output shape
for x, y in val_data_multi.take(1):
    print("Model Output Shape:", multi_step_model.predict(x).shape)

# Define training parameters
EVALUATION_INTERVAL = 200
EPOCHS = 10

# Train the model
multi_step_history = multi_step_model.fit(train_data_multi, epochs=EPOCHS, steps_per_epoch=EVALUATION_INTERVAL,
                                          validation_data=val_data_multi, validation_steps=50)

# Plot training history
plot_train_history(multi_step_history, 'Multi-Step Training and Validation Loss')

# Plot some predictions: first sample of each of five validation batches.
# plot_time_series draws single future points only, so the multi-step curves
# are drawn directly here.
for x, y in val_data_multi.take(5):
    prediction = multi_step_model.predict(x)[0]
    history_steps = create_time_steps(x[0].shape[0])
    # History windows keep every STEP-th row; rescale the future axis to match
    future_steps = np.arange(len(prediction)) / STEP
    plt.figure()
    plt.plot(history_steps, x[0][:, 0].numpy(), '.-', label='History')
    plt.plot(future_steps, y[0].numpy(), 'rx', label='True Future')
    plt.plot(future_steps, prediction, 'go', label='Model Predicted')
    plt.title('LSTM Multi-Step')
    plt.legend()
    plt.xlabel('Time Steps')
    plt.show()


In [None]:
# Evaluate the multi-step model on 100 batches of its validation dataset.
# The validation dataset built for this model is `val_data_multi`; the name
# `val_uni_multi` used previously is not defined.
# NOTE(review): the result name `mae_lstm_single` is kept for compatibility,
# although it holds the multi-step model's loss and replaces earlier values.
mae_lstm_single = multi_step_model.evaluate(val_data_multi, steps=100)

Moving Window Average


1.   Given the last 20 temperature observations, predict the next observation.
2.   MWA: prediction = average of the last 20 values.




In [None]:
## utility functions

## funtion to create data for univariate forecasting

def univariate_data(dataset, start_idx, end_idx, history_size, target_size):
  """Build ``(history_size, 1)`` windows and their targets from a series.

  For every position ``pos`` in ``[start_idx + history_size, end_idx)`` the
  window holds the ``history_size`` entries before ``pos`` and the label is
  ``dataset[pos + target_size]``. ``end_idx=None`` stands for
  ``len(dataset) - target_size``.
  """
  first = start_idx + history_size
  last = len(dataset) - target_size if end_idx is None else end_idx

  # Build window and label together so they are evaluated position by position
  pairs = [
      (
          np.reshape(dataset[range(pos - history_size, pos)], (history_size, 1)),
          dataset[pos + target_size],
      )
      for pos in range(first, last)
  ]
  windows = [window for window, _ in pairs]
  targets = [label for _, label in pairs]
  return np.array(windows), np.array(targets)

uni_data_history = 20   ## each window holds the last 20 values
uni_data_future = 0     ## label offset: 0 selects the value right after the window

# Training windows come from the rows before train_split.
# NOTE(review): `uni_data` is defined outside this section -- presumably a 1-D
# array, since univariate_data reshapes every window to (history_size, 1).
x_train_uni , y_train_uni = univariate_data(uni_data , 0 , train_split , uni_data_history , uni_data_future)

# Validation windows come from train_split up to the end of the series
x_val_uni , y_val_uni = univariate_data(uni_data , train_split , None ,uni_data_history , uni_data_future)

In [None]:
# Echo the training windows as the cell output
x_train_uni

In [None]:
# Shapes of the window and label arrays: training first, then validation
print(x_train_uni.shape , y_train_uni.shape)
print(x_val_uni.shape , y_val_uni.shape)

In [None]:
# Show the first training window ...
print('Single window of history data' , x_train_uni[0])

# ... and the label that goes with it
print('Target Temperature to predict ' , y_train_uni[0])


In [None]:
### fucntion to create time steps
def create_time_steps(length):
  """Return ``[-length, ..., -1]``: the x-positions of a window of ``length`` steps."""
  return [-back for back in reversed(range(1, length + 1))]

### function to plot time series data

def plot_time_series(plot_data, delta, title):
  """Plot a history window plus future points and return ``plt``.

  Args:
    plot_data: sequence whose first item is the history window (flattened
      before plotting) and whose remaining items (at most two) are values
      drawn at the future position: the true future and the model prediction.
    delta: x-position of the future points; a falsy value means 0.
    title: figure title.

  Returns:
    The ``matplotlib.pyplot`` module, so callers can call ``.show()`` on it.
  """
  labels = ["History", 'True Future', 'Model Predicted']
  marker = ['.-', 'rx', 'go']
  time_steps = create_time_steps(plot_data[0].shape[0])
  future = delta if delta else 0

  plt.title(title)
  plt.plot(time_steps, plot_data[0].flatten(), marker[0], label=labels[0])
  for i, point in enumerate(plot_data[1:], start=1):
    plt.plot(future, point, marker[i], markersize=10, label=labels[i])
  plt.legend()
  # Leave room to the right of the future point
  plt.xlim([time_steps[0], (future + 5) * 2])

  plt.xlabel('Time_Step')
  return plt
## function to plot time series data



# Plot the first training window together with its label
plot_time_series([x_train_uni[0] , y_train_uni[0]] , 0 , 'Sample Example')

In [None]:
# Same plot for the training sample at index 20
i = 20
plot_time_series([x_train_uni[i], y_train_uni[i]] , 0 , 'Sample Example')

In [None]:
### Moving window average

def MWA(history):
  """Moving-window-average baseline: the mean over all values in ``history``."""
  window_average = np.mean(history)
  return window_average




In [None]:
# Plot sample 20 with its label and the moving-window-average prediction
i = 20
plot_time_series([x_train_uni[i] , y_train_uni[i] , MWA(x_train_uni[i])] , 0 , 'MWA predicted')

Univariate time-series forecasting


*   Only single feature as temperature(historical data)
*   Task:  Given last 20 observations(history) , predict next temperature value 



In [None]:
## Build the tf.data pipelines for the univariate model
batch_size = 256
buffer_size = 10000

# cache -> shuffle -> batch -> repeat indefinitely
train_uni = (
    tf.data.Dataset.from_tensor_slices((x_train_uni, y_train_uni))
    .cache()
    .shuffle(buffer_size)
    .batch(batch_size)
    .repeat()
)
val_uni = (
    tf.data.Dataset.from_tensor_slices((x_val_uni, y_val_uni))
    .cache()
    .shuffle(buffer_size)
    .batch(batch_size)
    .repeat()
)

print(train_uni)
print(val_uni)

In [None]:
## Define and train the univariate LSTM model

# The datasets repeat forever, so an epoch is a fixed number of batches
steps = 200
EPOCHS = 10

# One LSTM layer with 8 units followed by a single-value output
lstm_model = tf.keras.models.Sequential()
lstm_model.add(tf.keras.layers.LSTM(8, input_shape=x_train_uni.shape[-2:]))
lstm_model.add(tf.keras.layers.Dense(1))
lstm_model.compile(optimizer='adam', loss='mae')

lstm_model.fit(
    train_uni,
    epochs=EPOCHS,
    steps_per_epoch=steps,
    validation_data=val_uni,
    validation_steps=50,
)




In [None]:
# For five validation batches, plot the first window (i[0]), its label (j[0])
# and the model's prediction for that window.
# The whole batch is predicted each time; only the first result is used.
for i , j in val_uni.take(5):
  plot = plot_time_series([i[0].numpy() , j[0].numpy() , lstm_model.predict(i)[0]] ,0 , 'LSTM UNIVARIATE')
  plot.show()

Multivariate  and Single step Forecasting


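*   Task: Given several features at each time step (the code below uses seven: temperature, humidity, pressure, cloud cover, wind speed, wind direction and poprecipitation), can we predict the temperature at a single future time step?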




In [None]:
## features 

# features_6 = ['temperature', 'humidity', 'pressure', 'global_irradiance', 'direct_irradiance', 'diffuse_irradiance']
# Columns fed to the multivariate models.
# NOTE(review): despite its name, `features14` lists seven columns.
features14 = [
    'temperature', 'humidity', 'pressure',
    'cloud_cover', 'wind_speed', 'wind_direction', 'poprecipitation'
]
# Keep only those columns; this rebinds `features` to a DataFrame
features = df[features14]
# Preview the first rows as the cell output
features.head()



In [None]:
# Report the number of missing values per column. As a bare expression that is
# not the last statement of the cell, the result would be discarded.
print(features.isnull().sum())
# Replace missing values with the mean of their column
features = features.fillna(features.mean())

In [None]:
# Plot every feature column in its own subplot
features.plot(subplots=True)

In [None]:
# Standardize every column with the statistics of the training rows
dataset = features.values
data_mean = np.mean(dataset[:train_split], axis=0)
data_std = np.std(dataset[:train_split], axis=0)
dataset = (dataset - data_mean) / data_std



In [None]:
# # ### create mutlivariate data

# def multivariate_data(dataset, target, start_idx, end_idx, history_size, target_size, step, single_step=False):
#     data, labels = [], []
#     start_idx += history_size
#     if end_idx is None:
#         end_idx = len(dataset) - target_size
#     for i in range(start_idx, end_idx):
#         indices = range(i-history_size, i, step)
#         data.append(dataset[indices])
#         if single_step:
#             labels.append(target[i+target_size])
#         else:
#             labels.append(target[i:i+target_size])
#     return np.array(data), np.array(labels)
def multivariate_data(dataset, target, start_index, end_index, history_size,
                      target_size, step, single_step=False):
    """Cut a multivariate series into (window, label) pairs.

    For every position ``i`` in ``[start_index + history_size, end_index)``
    the window is ``dataset[i - history_size:i:step]`` and the label is
    ``target[i + target_size]`` when ``single_step`` is true, otherwise
    ``target[i:i + target_size]``.

    Args:
        dataset: numpy array indexed by time along its first axis.
        target: array the labels are read from, indexed like ``dataset``.
        start_index: first row that may appear in a window.
        end_index: exclusive upper bound for ``i``; ``None`` means
            ``len(dataset) - target_size``.
        history_size: number of rows spanned by each window before striding.
        target_size: label offset (single step) or label length (multi step).
        step: stride used when sampling rows inside a window.
        single_step: select a single future value instead of a sequence.

    Returns:
        Tuple ``(data, labels)`` of numpy arrays, one entry per window.

    Raises:
        IndexError: if windows are requested beyond the end of ``dataset``.
    """
    # Default end_index leaves room for the labels after the last window
    end_index = len(dataset) - target_size if end_index is None else end_index

    # The first usable position needs a full history window before it
    first = start_index + history_size

    # Slices truncate silently at the end of an array, so reject out-of-range
    # requests explicitly instead of returning short windows.
    if first < end_index and end_index > len(dataset):
        raise IndexError(
            'end_index {} is beyond the dataset length {}'.format(end_index, len(dataset)))

    data = []
    labels = []
    for i in range(first, end_index):
        # A strided slice selects the same rows as indexing with a range
        data.append(dataset[i - history_size:i:step])
        if single_step:
            labels.append(target[i + target_size])
        else:
            labels.append(target[i:i + target_size])

    return np.array(data), np.array(labels)


In [None]:
# ### generate multivariate data
from sklearn.preprocessing import StandardScaler
# history = 720
# future_target = 72
# STEP = 6

# x_train_ss, y_train_ss = multivariate_data(dataset, dataset[:, 1], 0, train_split, history,
#                                            future_target, STEP, single_step=True)


# x_val_ss , y_val_ss = multivariate_data(dataset , dataset[:,1] , train_split , None , history,
#                                         future_target, STEP, single_step = True)


# print(x_train_ss.shape , y_train_ss.shape)
# Windowing parameters for the single-step dataset
history = 720        # passed as history_size: rows spanned by each window
future_target = 72   # passed as target_size: the label lies 72 rows after the window end
STEP = 6             # passed as step: every 6th row of a window is kept
# NOTE(review): train_split is recomputed here as 70% of the rows, but `dataset`
# was standardized earlier with the statistics of the previous train_split --
# confirm the two splits are meant to differ.
train_split = int(len(dataset) * 0.7)
# scaler = StandardScaler()
# scaler.fit(dataset[:train_split])

# # Transform the entire dataset
# dataset = scaler.transform(dataset)
# Get training data
# NOTE(review): dataset[:, 1] is the second column of features14 ('humidity'),
# while the surrounding notes describe predicting temperature -- confirm the
# intended target column.
x_train_ss, y_train_ss = multivariate_data(dataset, dataset[:, 1], 0, train_split, history, future_target, STEP, single_step=True)

# Get validation data
x_val_ss, y_val_ss = multivariate_data(dataset, dataset[:, 1], train_split, None, history, future_target, STEP, single_step=True)

# Check shapes
print(x_train_ss.shape, y_train_ss.shape)
print(x_val_ss.shape, y_val_ss.shape)



In [None]:
# Echo the single-step training windows as the cell output
x_train_ss

In [None]:
## tf.data pipelines for the single-step model

# cache -> shuffle -> batch -> repeat indefinitely
train_ss = (
    tf.data.Dataset.from_tensor_slices((x_train_ss, y_train_ss))
    .cache()
    .shuffle(buffer_size)
    .batch(batch_size)
    .repeat()
)
val_ss = (
    tf.data.Dataset.from_tensor_slices((x_val_ss, y_val_ss))
    .cache()
    .shuffle(buffer_size)
    .batch(batch_size)
    .repeat()
)

print(train_ss)
print(val_ss)



In [None]:
### Modelling using LSTM
from keras.callbacks import EarlyStopping

# Stop when the validation loss has not improved for 10 epochs and keep the
# best weights seen.
callbacks = EarlyStopping(
    patience=10,
    restore_best_weights=True,
    monitor='val_loss'
)

single_step_model = tf.keras.models.Sequential()

# return_sequences=False: only the final LSTM state reaches the Dense layer, so
# the model emits one value per sample, matching the scalar single-step labels.
# With return_sequences=True the Dense layer would emit one value per time step.
single_step_model.add(tf.keras.layers.LSTM(16, return_sequences=False, input_shape=x_train_ss.shape[-2:]))
single_step_model.add(tf.keras.layers.Dense(1))
single_step_model.compile(optimizer=tf.keras.optimizers.Adam(clipvalue=1.0, weight_decay=1e-6), loss='mae')
single_step_model.summary()

# The early-stopping callback was built but never handed to fit(); pass it in
single_step_model_history = single_step_model.fit(train_ss, epochs=EPOCHS,
                                                  steps_per_epoch=steps, verbose=1, validation_data=val_ss,
                                                  validation_steps=50, callbacks=[callbacks])


In [None]:
## plot train test loss 

def plot_loss(history, title):
  """Draw train and validation loss per epoch on a gridded figure and show it.

  ``history`` must expose a ``history`` mapping with ``'loss'`` and
  ``'val_loss'`` entries.
  """
  train_curve = history.history['loss']
  val_curve = history.history['val_loss']
  epoch_axis = range(len(train_curve))

  plt.figure()
  curves = (
      (train_curve, 'b', 'Train Loss'),
      (val_curve, 'r', 'Validation Loss'),
  )
  for values, fmt, name in curves:
    plt.plot(epoch_axis, values, fmt, label=name)
  plt.title(title)
  plt.legend()
  plt.grid()
  plt.show()

# Loss curves of the single-step model
plot_loss(single_step_model_history , 'Single Step Training and validation loss')

In [None]:
# plot time series and predicted values

# For five validation batches, plot column 1 of the first window, its label and
# the model output for that window; the future points are drawn at x = 12.
# NOTE(review): column 1 is 'humidity' in the features14 ordering -- confirm
# this is the series meant to be shown.
for x, y in val_ss.take(5):
  plot = plot_time_series([x[0][:, 1].numpy(), y[0].numpy(),
                    single_step_model.predict(x)[0]], 12,
                   'Single Step Prediction')
  plot.show()

Multi-variate & multi-step forecasting
-> Generate multiple future values of temperature

In [None]:
# Each label is now a sequence of the next 72 target values
future_target = 72

# single_step=False (the default) makes every label a slice of length future_target
x_train_multi, y_train_multi = multivariate_data(
    dataset, dataset[:, 1], 0, train_split, history, future_target, STEP,
    single_step=False,
)
x_val_multi, y_val_multi = multivariate_data(
    dataset, dataset[:, 1], train_split, None, history, future_target, STEP,
    single_step=False,
)

# Training window shape, then training label shape
print(x_train_multi.shape, y_train_multi.shape, sep='\n')

In [None]:
# tf.data pipelines for the multi-step model

# Training data: cache -> shuffle -> batch -> repeat indefinitely
train_data_multi = (
    tf.data.Dataset.from_tensor_slices((x_train_multi, y_train_multi))
    .cache()
    .shuffle(buffer_size)
    .batch(batch_size)
    .repeat()
)

# Validation data keeps its original order: batch -> repeat only
val_data_multi = (
    tf.data.Dataset.from_tensor_slices((x_val_multi, y_val_multi))
    .batch(batch_size)
    .repeat()
)

In [None]:
#plotting function
def multi_step_plot(history, true_future, prediction):
  """Plot column 1 of a history window, the true future and, if given, a prediction.

  The future x-axis is ``arange(len(true_future)) / STEP``. ``prediction`` is
  drawn only when it contains at least one non-zero value, so an all-zero
  array serves as "no prediction".
  """
  plt.figure(figsize=(12, 6))
  history_axis = create_time_steps(len(history))
  future_axis = np.arange(len(true_future)) / STEP

  plt.grid()
  plt.plot(history_axis, np.array(history[:, 1]), label='History')
  plt.plot(future_axis, np.array(true_future), 'bo', label='True Future')
  has_prediction = prediction.any()
  if has_prediction:
    plt.plot(future_axis, np.array(prediction), 'ro', label='Predicted Future')
  plt.legend(loc='upper left')
  plt.show()
  


# Plot the first sample of one training batch; np.array([0]) contains no
# non-zero value, so multi_step_plot skips the prediction curve.
for x, y in train_data_multi.take(1):
  multi_step_plot(x[0], y[0], np.array([0]))

In [None]:
# multi_step_model = tf.keras.models.Sequential()
# multi_step_model.add(tf.keras.layers.LSTM(16,
#                                           return_sequences=True,
#                                           input_shape=x_train_multi.shape[-2:]))
# multi_step_model.add(tf.keras.layers.LSTM(32,return_sequences=False, activation='relu'))
# multi_step_model.add(tf.keras.layers.Dense(4))
# multi_step_model.add(tf.keras.layers.Dense(72)) # for 72 outputs

# multi_step_model.compile(optimizer=tf.keras.optimizers.SGD(clipvalue=1.0,weight_decay=1e-6), loss='mae')
# multi_step_model.summary()
# multi_step_history = multi_step_model.fit(train_data_multi, epochs=EPOCHS,
#                                           steps_per_epoch=steps,
#                                           validation_data=val_data_multi,
#                                           validation_steps=50,callbacks = [callbacks])



# Define the model
multi_step_model = tf.keras.models.Sequential([
    tf.keras.layers.LSTM(64, return_sequences=True, input_shape=x_train_multi.shape[-2:]),
    tf.keras.layers.LSTM(16, return_sequences=False, activation='relu'),
    tf.keras.layers.Dense(4, activation='relu'),  # Additional dense layer
    # One output per future value; sized from future_target so the layer stays
    # in step with the label length instead of repeating the literal 72.
    tf.keras.layers.Dense(future_target)
])

# Compile the model
multi_step_model.compile(optimizer=tf.keras.optimizers.Adam(clipvalue=1.0, weight_decay=1e-6), loss='mae')

# Print the model summary
multi_step_model.summary()

# Fit the model
enhanced_history = multi_step_model.fit(train_data_multi,
                                        epochs=EPOCHS,
                                        steps_per_epoch=steps,
                                        validation_data=val_data_multi,
                                        validation_steps=50)

In [None]:
# Loss curves of the multi-step model
plot_loss(enhanced_history, 'Multi-Step Training and validation loss')


In [None]:
# For five validation batches, plot the first sample with the model's prediction for it
for x, y in val_data_multi.take(5):
  multi_step_plot(x[0], y[0], multi_step_model.predict(x)[0])

In [None]:
# Evaluate both models on 100 batches of their validation datasets.
# NOTE(review): `mae_lstm_single` is also assigned by earlier evaluation cells;
# this assignment replaces those values.
mae_lstm_single = single_step_model.evaluate(val_ss, steps=100)
mae_lstm_multi=multi_step_model.evaluate(val_data_multi, steps=100)

In [None]:
from tensorflow.keras.models import load_model

# Load the saved multi-step and single-step models
loaded_model = load_model('/Users/faymajidelhassan/Downloads/Master project /CODE/EDA/Saved_models/Lstm_multi_step_model_measure+precip.h5')
loaded_model2 = load_model('/Users/faymajidelhassan/Downloads/Master project /CODE/EDA/Saved_models/Lstm_single_step_model_measure+precip.h5')

# Optionally, verify the multi-step model by plotting predictions
for x, y in val_data_multi.take(5):
    # Predict once per batch and reuse the result for the plot
    predictions = loaded_model.predict(x)
    multi_step_plot(x[0], y[0], predictions[0])


In [None]:
from tensorflow.keras.models import load_model
from tensorflow.keras.losses import MeanAbsoluteError
import numpy as np
import matplotlib.pyplot as plt

# Map the name 'mae' to a MeanAbsoluteError instance and hand it to load_model
# through `custom_objects`.
custom_objects = {'mae': MeanAbsoluteError()}

# Load the models.
# NOTE(review): the paths are absolute and machine-specific; the files must
# exist at exactly these locations.
loaded_model = load_model('/Users/faymajidelhassan/Downloads/Master project /CODE/EDA/Saved_models/Lstm_multi_step_model_measure+precip.h5', custom_objects=custom_objects)
loaded_model2 = load_model('/Users/faymajidelhassan/Downloads/Master project /CODE/EDA/Saved_models/Lstm_single_step_model_measure+precip.h5', custom_objects=custom_objects)

print("Models loaded successfully")

# Define the multi-step plot function
def multi_step_plot(history, true_future, prediction):
    """Plot column 1 of a history window with the true and predicted futures.

    History is drawn at x = -len(history) .. -1, and both future series at
    x = 0 .. len(true_future) - 1, as lines with markers.
    """
    plt.figure(figsize=(12, 6))
    history_axis = list(range(-len(history), 0))
    future_axis = list(range(len(true_future)))

    plt.plot(history_axis, np.array(history[:, 1]), label='History')
    future_series = (
        (true_future, 'bo-', 'True Future'),
        (prediction, 'ro-', 'Predicted Future'),
    )
    for values, fmt, name in future_series:
        plt.plot(future_axis, np.array(values), fmt, label=name)

    plt.legend(loc='upper left')
    plt.xlabel('Time Steps')
    plt.ylabel('Value')
    plt.title('Multi-Step Forecasting')
    plt.grid(True)
    plt.show()

# Use the loaded model to make predictions and plot them
for x, y in val_data_multi.take(5):
    prediction = loaded_model.predict(x)[0]
    multi_step_plot(x[0], y[0], prediction)


In [None]:
# Evaluate the loaded multi-step model on 100 validation batches.
# NOTE(review): this replaces the `mae_lstm_multi` value stored for the freshly
# trained model.
mae_lstm_multi=loaded_model.evaluate(val_data_multi, steps=100)

In [None]:
# Plot predictions of the transformer model for five validation batches.
# NOTE(review): `transformer_model` is not defined in this part of the
# notebook -- confirm it is created before this cell runs.
for x, y in val_ss.take(5):
    prediction = transformer_model.predict(x)
    # plot_time_series takes (plot_data, delta, title). Without `delta` the
    # call fails with a TypeError. The history must also be a single column,
    # because the full multi-feature window cannot be flattened onto the time
    # axis. Column 1 and delta=12 follow the single-step plot on the same dataset.
    plot = plot_time_series([x[0][:, 1].numpy(), y[0].numpy(), prediction[0]], 12, 'Transformer UNIVARIATE')
    plot.show()
