# Natural Gas Price Prediction using RNN

This notebook demonstrates how to build an RNN model in PyTorch to predict the next-day natural gas price based on the previous 10 days of prices. The dataset used here contains daily (and optionally monthly) natural gas prices (in nominal dollars) starting from January 1997 up to the current year. Two sample datasets are available at:

- [DataHub: Daily Natural Gas Prices](https://datahub.io/core/natural-gas#resource-daily)
- [Kaggle: Natural Gas Prices](https://www.kaggle.com/datasets/joebeachcapital/natural-gas-prices)

### How to Use This Notebook with Your Custom Dataset

1. **Dataset Format:** Make sure your CSV file contains a column named `Price` that stores the natural gas price for each day. Additional columns (such as dates) are not used as model inputs in this example.
2. **File Location:** Place your CSV file (e.g., `daily.csv`) in a folder (here assumed as `./data/NaturalGasPrice/`). Update the file path in the notebook if necessary.
3. **Sequence Creation:** The notebook creates time-series sequences where the input `X[i]` is a window of 10 consecutive days’ normalized prices and the target `Y[i]` is the normalized price on the following (11th) day.

Once you update the file path if needed, simply run the notebook cells in order. The final cells train the RNN and produce plots of the predictions compared to the original prices.

In [None]:
# Import necessary libraries
import numpy as np
import torch
import matplotlib.pyplot as plt
import pandas as pd
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from torch import nn

# Pick the compute device: use the GPU when PyTorch reports CUDA, otherwise the CPU
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print(f"Using device: {device}")

In [None]:
# Load the dataset
df = pd.read_csv("./data/NaturalGasPrice/daily.csv")

# Preprocess the data: only the 'Price' column is used below, so drop a row
# only when its price is missing. A gap in some unrelated column must not
# discard an otherwise valid price record.
df = df.dropna(subset=['Price'])

# We assume the CSV has a column named 'Price'
y = df['Price'].values
print(f"Total number of price records: {len(y)}")

# Normalize the prices to range [0, 1] (min-max scaling).
# NOTE: min/max are taken over the whole series, so the scaling also sees the
# most recent rows that are later held out as the test split.
minm = y.min()
maxm = y.max()
print(f"Min price: {minm}, Max price: {maxm}")
if maxm == minm:
    # A constant series would make the division below 0/0 and fill y_norm
    # with NaN, which would only surface later as a NaN training loss.
    raise ValueError(
        "Cannot min-max normalize: every value in the 'Price' column is identical."
    )
y_norm = (y - minm) / (maxm - minm)

# Set the sequence length (last 10 days as input, predict the 11th day)
sequence_length = 10

# Upper bound on how many (window, target) pairs are built
max_samples = 5900

# A series of n prices yields exactly n - sequence_length complete pairs:
# window i covers positions i .. i + sequence_length - 1 and its target sits at
# position i + sequence_length, which must not exceed n - 1. The lower clamp
# keeps the count at zero when the series is shorter than one full window.
prices = np.asarray(y_norm)
num_samples = max(0, min(max_samples, len(prices) - sequence_length))

# Row i of window_index holds the positions i .. i + sequence_length - 1, so a
# single fancy-indexing step builds every input window at once.
window_index = np.arange(num_samples)[:, None] + np.arange(sequence_length)[None, :]
X = prices[window_index]

# The target is the value immediately following each window
Y = prices[sequence_length:sequence_length + num_samples].copy()
print(f"Total samples created: {len(X)}")

In [None]:
# Chronological hold-out: with shuffling disabled, the last 10% of the windows
# form the test set and everything before them is used for training.
split_options = dict(test_size=0.10, random_state=42, shuffle=False)
x_train, x_test, y_train, y_test = train_test_split(X, Y, **split_options)

print(f"Training samples: {len(x_train)}")
print(f"Testing samples: {len(x_test)}")

# Create a custom Dataset for the time-series data
class NGTimeSeries(Dataset):
    """Map-style dataset pairing each input window with its target value.

    Both inputs are converted to ``float32`` tensors once, at construction.

    Args:
        x: Input windows, one row per sample. Anything ``torch.tensor``
            accepts (NumPy array, nested list, ...).
        y: Targets, one entry per row of ``x``.

    Raises:
        ValueError: If ``x`` and ``y`` hold a different number of samples.
    """

    def __init__(self, x, y):
        self.x = torch.tensor(x, dtype=torch.float32)
        self.y = torch.tensor(y, dtype=torch.float32)
        # Read the length from the converted tensor rather than from the raw
        # argument, so plain Python lists (which have no .shape) work too.
        self.len = self.x.shape[0]
        if self.y.shape[0] != self.len:
            raise ValueError(
                f"x and y must have the same number of samples, "
                f"got {self.len} and {self.y.shape[0]}"
            )

    def __getitem__(self, idx):
        """Return the ``(window, target)`` pair stored at position ``idx``."""
        return self.x[idx], self.y[idx]

    def __len__(self):
        """Return the number of samples in the dataset."""
        return self.len

# Wrap both splits in Dataset objects first
train_dataset = NGTimeSeries(x_train, y_train)
test_dataset = NGTimeSeries(x_test, y_test)

# Training batches are drawn in shuffled order; test batches keep the dataset
# order so predictions can be plotted as a series. Batch size can be adjusted.
train_loader = DataLoader(dataset=train_dataset, batch_size=256, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=256, shuffle=False)

In [None]:
# Define the RNN Model
class RNNModel(nn.Module):
    """Recurrent regressor: RNN -> ReLU -> linear layer with a single output.

    The defaults reproduce the original fixed architecture (one feature per
    time step, hidden size 5, one layer).

    Args:
        input_size: Number of features per time step. The default of 1 fits a
            series with a single normalized price per step.
        hidden_size: Width of the RNN hidden state.
        num_layers: Number of stacked RNN layers.
    """

    def __init__(self, input_size: int = 1, hidden_size: int = 5, num_layers: int = 1):
        super().__init__()
        self.rnn = nn.RNN(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
        )
        # The head consumes one hidden-state vector, so its width follows hidden_size
        self.fc1 = nn.Linear(in_features=hidden_size, out_features=1)

    def forward(self, x):
        """Predict one value per sequence.

        Args:
            x: Tensor of shape ``(batch_size, sequence_length, input_size)``.

        Returns:
            Tensor of shape ``(batch_size, 1)``.
        """
        output, _ = self.rnn(x)
        # Use the output from the last time step
        output = output[:, -1, :]
        output = self.fc1(torch.relu(output))
        return output

# Build the network, then place its parameters on the selected device
model = RNNModel()
model = model.to(device)

# Mean-squared-error objective, minimized with Adam
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-4)

# Number of full passes over the training data
epochs = 1500
print(model)

In [None]:
# Training Loop
model.train()
for epoch in range(epochs):
    running_loss = 0.0
    samples_seen = 0
    for inputs, targets in train_loader:
        # Reshape inputs to (batch_size, sequence_length, 1)
        inputs = inputs.view(-1, sequence_length, 1).to(device)
        targets = targets.to(device)

        optimizer.zero_grad()
        outputs = model(inputs).view(-1)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()

        # The criterion returns a per-batch mean; weight it by the batch size so
        # the smaller final batch does not count as much as a full one.
        batch_count = targets.size(0)
        running_loss += loss.item() * batch_count
        samples_seen += batch_count

    # Report every 50th epoch, plus the last one so the final loss is visible
    if epoch % 50 == 0 or epoch == epochs - 1:
        print(f"Epoch [{epoch}/{epochs}], Loss: {running_loss/samples_seen:.6f}")

print("Training complete.")

In [None]:
# Evaluation on Test Data: collect predictions, then plot predicted vs. actual
# values, first in normalized units and then in original dollars
def _plot_predictions(predicted, actual, title, y_label, predicted_label, actual_label):
    """Draw predicted and actual series on one figure and show it."""
    plt.figure(figsize=(12,6))
    plt.plot(predicted, label=predicted_label)
    plt.plot(actual, label=actual_label)
    plt.title(title)
    plt.xlabel('Sample Index')
    plt.ylabel(y_label)
    plt.legend()
    plt.show()


model.eval()
all_preds = []
all_targets = []
# Gradients are not needed for inference
with torch.no_grad():
    for inputs, targets in test_loader:
        inputs = inputs.view(-1, sequence_length, 1).to(device)
        outputs = model(inputs).view(-1)
        all_preds.extend(outputs.cpu().numpy())
        all_targets.extend(targets.cpu().numpy())

_plot_predictions(
    all_preds,
    all_targets,
    title='Test Set: Normalized Price Prediction',
    y_label='Normalized Price',
    predicted_label='Predicted (Normalized)',
    actual_label='Actual (Normalized)',
)

# Undo normalization to plot prices in original dollars
price_range = maxm - minm
all_preds_orig = np.array(all_preds) * price_range + minm
all_targets_orig = np.array(all_targets) * price_range + minm

_plot_predictions(
    all_preds_orig,
    all_targets_orig,
    title='Test Set: Natural Gas Price Prediction (Original Scale)',
    y_label='Price (Nominal Dollars)',
    predicted_label='Predicted Price',
    actual_label='Actual Price',
)