# Recurrent Neural Networks
You should build an end-to-end machine learning pipeline using a recurrent neural network model. In particular, you should do the following:
- Load the `jena climate` dataset using [Pandas](https://pandas.pydata.org/docs/reference/api/pandas.read_csv.html). You can find this dataset in the [keras repository](https://keras.io/examples/timeseries/timeseries_weather_forecasting/).
- Split the dataset into training, validation, and test sets. Note that you cannot split time series using [Scikit-Learn](https://keras.io/examples/timeseries/timeseries_weather_forecasting/).
- Build an end-to-end machine learning pipeline, including a [recurrent neural network](https://keras.io/examples/timeseries/timeseries_weather_forecasting/) model.
- Optimize your pipeline by validating your design decisions.
- Test the best pipeline on the test set and report various [evaluation metrics](https://scikit-learn.org/0.15/modules/model_evaluation.html).  
- Check the documentation to identify the most important hyperparameters, attributes, and methods of the model. Use them in practice.

In [1]:
import pandas as pd

# Load dataset
data_path = '/content/jena_climate_2009_2016.csv'
df = pd.read_csv(data_path)
print(df.head())


             Date Time  p (mbar)  T (degC)  Tpot (K)  Tdew (degC)  rh (%)  \
0  01.01.2009 00:10:00    996.52     -8.02    265.40        -8.90    93.3   
1  01.01.2009 00:20:00    996.57     -8.41    265.01        -9.28    93.4   
2  01.01.2009 00:30:00    996.53     -8.51    264.91        -9.31    93.9   
3  01.01.2009 00:40:00    996.51     -8.31    265.12        -9.07    94.2   
4  01.01.2009 00:50:00    996.51     -8.27    265.15        -9.04    94.1   

   VPmax (mbar)  VPact (mbar)  VPdef (mbar)  sh (g/kg)  H2OC (mmol/mol)  \
0          3.33          3.11          0.22       1.94             3.12   
1          3.23          3.02          0.21       1.89             3.03   
2          3.21          3.01          0.20       1.88             3.02   
3          3.26          3.07          0.19       1.92             3.08   
4          3.27          3.08          0.19       1.92             3.09   

   rho (g/m**3)  wv (m/s)  max. wv (m/s)  wd (deg)  
0       1307.75      1.03        

In [2]:
num_samples = len(df)
train_split = int(num_samples * 0.7)
val_split = int(num_samples * 0.9)

train_data = df[:train_split]
val_data = df[train_split:val_split]
test_data = df[val_split:]


In [3]:
print(df.columns)


Index(['Date Time', 'p (mbar)', 'T (degC)', 'Tpot (K)', 'Tdew (degC)',
       'rh (%)', 'VPmax (mbar)', 'VPact (mbar)', 'VPdef (mbar)', 'sh (g/kg)',
       'H2OC (mmol/mol)', 'rho (g/m**3)', 'wv (m/s)', 'max. wv (m/s)',
       'wd (deg)'],
      dtype='object')


In [4]:
# Assuming first column is timestamp, exclude it
numeric_train_data = train_data.iloc[:, 1:]  # select all columns except the first

mean = numeric_train_data.mean()
std = numeric_train_data.std()

# Normalize only numeric columns
train_data.iloc[:, 1:] = (train_data.iloc[:, 1:] - mean) / std
val_data.iloc[:, 1:] = (val_data.iloc[:, 1:] - mean) / std
test_data.iloc[:, 1:] = (test_data.iloc[:, 1:] - mean) / std


In [5]:
df['Date Time'] = pd.to_datetime(df['Date Time'], format='%d.%m.%Y %H:%M:%S')


In [6]:
# Assuming timestamp is first column, exclude it from normalization
mean = train_data.iloc[:, 1:].mean()
std = train_data.iloc[:, 1:].std()

train_data.iloc[:, 1:] = (train_data.iloc[:, 1:] - mean) / std
val_data.iloc[:, 1:] = (val_data.iloc[:, 1:] - mean) / std
test_data.iloc[:, 1:] = (test_data.iloc[:, 1:] - mean) / std


In [7]:
import tensorflow as tf

def create_tf_dataset(data, lookback, delay, step, batch_size=128):
    data = data.values[:, 1:]  # exclude timestamp column
    targets = data[:, 0]  # e.g. first numeric column (temperature)

    # Inputs: sliding windows of shape (lookback/step, features)
    inputs = tf.keras.utils.timeseries_dataset_from_array(
        data=data[:-delay],
        targets=targets[lookback + delay:],
        sequence_length=lookback // step,
        sequence_stride=1,
        sampling_rate=step,
        batch_size=batch_size
    )
    return inputs


In [8]:
print(train_data.dtypes)          # Confirm numeric columns
print(train_data.isnull().sum())  # Check for missing values


Date Time           object
p (mbar)           float64
T (degC)           float64
Tpot (K)           float64
Tdew (degC)        float64
rh (%)             float64
VPmax (mbar)       float64
VPact (mbar)       float64
VPdef (mbar)       float64
sh (g/kg)          float64
H2OC (mmol/mol)    float64
rho (g/m**3)       float64
wv (m/s)           float64
max. wv (m/s)      float64
wd (deg)           float64
dtype: object
Date Time          0
p (mbar)           0
T (degC)           0
Tpot (K)           0
Tdew (degC)        0
rh (%)             0
VPmax (mbar)       0
VPact (mbar)       0
VPdef (mbar)       0
sh (g/kg)          0
H2OC (mmol/mol)    0
rho (g/m**3)       0
wv (m/s)           0
max. wv (m/s)      0
wd (deg)           0
dtype: int64


In [9]:
train_data = train_data.fillna(method='ffill').fillna(method='bfill')
val_data = val_data.fillna(method='ffill').fillna(method='bfill')
test_data = test_data.fillna(method='ffill').fillna(method='bfill')


  train_data = train_data.fillna(method='ffill').fillna(method='bfill')
  val_data = val_data.fillna(method='ffill').fillna(method='bfill')
  test_data = test_data.fillna(method='ffill').fillna(method='bfill')


In [10]:
def create_tf_dataset(data, lookback, delay, step, batch_size=128):
    # Exclude timestamp column
    data_array = data.iloc[:, 1:].to_numpy(dtype=np.float32)
    targets = data_array[:, 0]  # example target: first numeric column

    dataset = tf.keras.utils.timeseries_dataset_from_array(
        data=data_array[:-delay],
        targets=targets[lookback + delay:],
        sequence_length=lookback // step,
        sequence_stride=1,
        sampling_rate=step,
        batch_size=batch_size
    )
    return dataset


In [11]:
# Drop timestamp column for normalization and dataset creation
# (Assuming it is first column)

# Fill missing values
train_data = train_data.fillna(method='ffill').fillna(method='bfill')
val_data = val_data.fillna(method='ffill').fillna(method='bfill')
test_data = test_data.fillna(method='ffill').fillna(method='bfill')

# Normalize numeric columns only (exclude timestamp)
mean = train_data.iloc[:, 1:].mean()
std = train_data.iloc[:, 1:].std()

train_data.iloc[:, 1:] = (train_data.iloc[:, 1:] - mean) / std
val_data.iloc[:, 1:] = (val_data.iloc[:, 1:] - mean) / std
test_data.iloc[:, 1:] = (test_data.iloc[:, 1:] - mean) / std


  train_data = train_data.fillna(method='ffill').fillna(method='bfill')
  val_data = val_data.fillna(method='ffill').fillna(method='bfill')
  test_data = test_data.fillna(method='ffill').fillna(method='bfill')


In [12]:
import numpy as np

batch_size = 128
lookback = 720
delay = 72
step = 6

train_dataset = create_tf_dataset(train_data, lookback, delay, step, batch_size)
val_dataset = create_tf_dataset(val_data, lookback, delay, step, batch_size)
test_dataset = create_tf_dataset(test_data, lookback, delay, step, batch_size)


In [13]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout

# Assume lookback, step, etc. defined as before

# input_shape = (sequence_length, number_of_features)
input_shape = (lookback // step, train_data.shape[1] - 1)  # exclude timestamp col

model = Sequential([
    LSTM(32, input_shape=input_shape),
    Dropout(0.2),
    Dense(1)
])

model.compile(optimizer='rmsprop', loss='mae', metrics=['mae'])

# Train using the tf.data.Dataset objects, not numpy arrays
history = model.fit(
    train_dataset,
    epochs=10,
    validation_data=val_dataset
)

# Evaluate on test dataset
model.evaluate(test_dataset)


Epoch 1/10


  super().__init__(**kwargs)


[1m2294/2294[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m184s[0m 79ms/step - loss: 0.4585 - mae: 0.4585 - val_loss: 0.3806 - val_mae: 0.3806
Epoch 2/10
[1m2294/2294[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m179s[0m 78ms/step - loss: 0.3631 - mae: 0.3631 - val_loss: 0.3476 - val_mae: 0.3476
Epoch 3/10
[1m2294/2294[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m184s[0m 80ms/step - loss: 0.3431 - mae: 0.3431 - val_loss: 0.3366 - val_mae: 0.3366
Epoch 4/10
[1m2294/2294[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m199s[0m 87ms/step - loss: 0.3326 - mae: 0.3326 - val_loss: 0.3315 - val_mae: 0.3315
Epoch 5/10
[1m2294/2294[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m177s[0m 77ms/step - loss: 0.3269 - mae: 0.3269 - val_loss: 0.3281 - val_mae: 0.3281
Epoch 6/10
[1m2294/2294[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m203s[0m 78ms/step - loss: 0.3216 - mae: 0.3216 - val_loss: 0.3235 - val_mae: 0.3235
Epoch 7/10
[1m2294/2294[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m 

[0.2547207176685333, 0.2547207176685333]

In [17]:
model.evaluate(test_dataset)

[1m323/323[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 36ms/step - loss: 0.2553 - mae: 0.2553


[0.27107036113739014, 0.27107036113739014]

In [19]:
test_loss, test_mae = model.evaluate(test_dataset)
print(f'Test MAE: {test_mae}')


[1m323/323[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 35ms/step - loss: 0.2553 - mae: 0.2553
Test MAE: 0.27107036113739014


In [23]:
from sklearn.metrics import mean_absolute_error, mean_squared_error
import numpy as np

mae = mean_absolute_error(y_true, y_pred)
rmse = np.sqrt(mean_squared_error(y_true, y_pred))

print(f"📊 Final Evaluation Metrics:")
print(f"MAE : {mae:.4f}")
print(f"RMSE: {rmse:.4f}")


📊 Final Evaluation Metrics:
MAE : 0.2711
RMSE: 0.3532
