In [None]:
# Importing required libraries
import numpy as np  # For handling numbers and arrays (tables of data)
from math import sqrt  # To calculate square roots
from sklearn.metrics import mean_squared_error  # To measure how far off our predictions are
from matplotlib import pyplot as plt  # For drawing graphs

In [None]:
# Importing the machine learning tools from Keras
from keras.models import Sequential  # Helps us build a model one layer at a time
from keras.layers import Dense, LSTM  # The types of layers we'll use in the model

In [None]:
# Step 1: Read the training, validation and test splits from CSV.
# Every file holds comma-separated numeric rows: the weather-forecast inputs
# followed by the solar-energy output.
# Relative paths are used (the original code used Windows paths), so the CSV
# files must be in the notebook's working directory.
train_data, validate_data, test_data = [
    np.loadtxt(csv_path, delimiter=',')
    for csv_path in (
        "train_NREL_solar_data.csv",
        "validate_NREL_solar_data.csv",
        "test_NREL_solar_data.csv",
    )
]

In [None]:
# Step 2: Separate model inputs from targets in every split.
# Columns 0-8 (nine columns) are the weather inputs; the last column is the
# solar-irradiance target.

# Training split
x_tr = train_data[:, :9]
t_tr = train_data[:, -1]

# Validation split
x_va = validate_data[:, :9]
t_va = validate_data[:, -1]

# Test split
x_te = test_data[:, :9]
t_te = test_data[:, -1]

In [None]:
# Step 3: Count the whole days contained in each split.
# One "day" is 11 consecutive rows (time steps); floor division ignores any
# incomplete trailing day when counting.
Ndays_tr, Ndays_va, Ndays_te = (
    split.shape[0] // 11 for split in (x_tr, x_va, x_te)
)

In [None]:
# Step 4: Arrange the flat rows as [samples, time steps, features], the layout
# the LSTM layer is given below: one sample per day, 11 time steps per day,
# 9 input features (or 1 target value) per time step.

# Training split
train_x = np.reshape(x_tr, (Ndays_tr, 11, 9))
train_t = np.reshape(t_tr, (Ndays_tr, 11, 1))

# Validation split
validate_x = np.reshape(x_va, (Ndays_va, 11, 9))
validate_t = np.reshape(t_va, (Ndays_va, 11, 1))

# Test split
test_x = np.reshape(x_te, (Ndays_te, 11, 9))
test_t = np.reshape(t_te, (Ndays_te, 11, 1))

In [None]:
# Step 5: Assemble and compile the LSTM network.
# The layer stack is passed to Sequential in one go:
#   1. LSTM with 50 units over inputs of shape (11 time steps, 9 features);
#      return_sequences=True keeps an output for every time step.
#   2. Dense layer with a single linear unit, i.e. one real-valued
#      prediction per time step.
model = Sequential([
    LSTM(50, input_shape=(11, 9), return_sequences=True),
    Dense(1, activation='linear'),
])

# Train by minimising mean squared error with the Adam optimiser.
model.compile(optimizer='adam', loss='mse')

In [None]:
# Step 6: Fit the LSTM.
# 100 passes (epochs) over the training days in mini-batches of 50; the
# validation split is evaluated alongside so its loss is recorded in `history`.
history = model.fit(
    x=train_x,
    y=train_t,
    batch_size=50,
    epochs=100,
    validation_data=(validate_x, validate_t),
)

In [None]:
# Step 7: Draw the per-epoch training loss recorded by model.fit
plt.plot(history.history['loss'], label='Training Loss')
plt.title("Training Loss Over Epochs")
plt.xlabel("Epoch")
plt.ylabel("Loss (Mean Squared Error)")
plt.legend()
plt.show()

In [None]:
# Step 8: Run the trained LSTM on the test days.
# test_x has shape (days, 11, 9); the model returns one value per time step.
yhat = model.predict(x=test_x)

In [None]:
# Step 9: Flatten the predictions to a 1-D vector of Ndays_te * 11 values so
# they line up element-for-element with the flat target vector t_te.
y_te = np.reshape(yhat, (Ndays_te * 11,))

In [None]:
# Step 10: Report the test RMSE (Root Mean Squared Error); lower is better.
# The sample count is taken from the data instead of the former hard-coded
# 4026 (= 366 days x 11 steps, presumably the original test-set size), so the
# figure stays correct if the test split has a different length.
n_test = Ndays_te * 11

# Sum of squared errors over all test samples (kept under its original name).
rmse2 = mean_squared_error(t_te, y_te) * n_test

# NOTE(review): 1087.4396 / 2 is the scaling used in the original study,
# presumably mapping normalised values back to physical irradiance units.
# Confirm against the study before changing it.
rmse = sqrt(rmse2 / n_test) * 1087.4396 / 2
print('Test RMSE: %.3f' % rmse)

In [None]:
# STEP 11 - Load the CSV files and check for missing values using NumPy

import numpy as np  # Make sure this is already imported earlier

# Re-read the three CSV splits (headerless files with the same column layout)
train_data, validate_data, test_data = [
    np.loadtxt(csv_path, delimiter=',')
    for csv_path in (
        'train_NREL_solar_data.csv',
        'validate_NREL_solar_data.csv',
        'test_NREL_solar_data.csv',
    )
]

# Show the first five training rows
print("Preview of training data (first 5 rows):")
print(train_data[:5])

# Count NaN entries in each column of the training data
print("\nMissing values in training data (per column):")
print(np.sum(np.isnan(train_data), axis=0))

In [None]:
import numpy as np

# STEP 12 - Reshape the data into sequences for CNN-LSTM
def create_sequences(data, time_steps):
    """Cut a 2-D series into overlapping look-back windows with next-step targets.

    Window ``i`` contains rows ``i`` .. ``i + time_steps - 1`` (all columns);
    its target is the last column of row ``i + time_steps``.

    Parameters
    ----------
    data : array-like, shape (n_rows, n_columns)
        Rows in time order; the last column is the value to predict.
    time_steps : int
        Number of consecutive rows in each input window.

    Returns
    -------
    X : np.ndarray, shape (n_windows, time_steps, n_columns)
    y : np.ndarray, shape (n_windows,)
        ``n_windows = max(n_rows - time_steps, 0)``. When there are too few
        rows for a single window, correctly shaped empty arrays are returned
        (not a flat shape-(0,) array) so code reading ``X.shape[1]`` or
        ``X.shape[2]`` keeps working.
    """
    data = np.asarray(data)
    n_windows = max(len(data) - time_steps, 0)

    if n_windows == 0:
        empty_X = np.empty((0, time_steps) + data.shape[1:], dtype=data.dtype)
        empty_y = np.empty((0,), dtype=data.dtype)
        return empty_X, empty_y

    # Input windows: one (time_steps, n_columns) slice per start row
    X = np.stack([data[start:start + time_steps] for start in range(n_windows)])
    # Targets: last column of the row right after each window (copied so the
    # result does not alias `data`)
    y = data[time_steps:time_steps + n_windows, -1].copy()
    return X, y

# Look-back length: each input window spans this many consecutive rows
time_steps = 10

# Build windows and next-step targets from the training data
X_train, y_train = create_sequences(data=train_data, time_steps=time_steps)

# Report the resulting array shapes
print("X_train shape:", X_train.shape)
print("y_train shape:", y_train.shape)


In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, LSTM, Dense

# STEP 12.1 - Define the CNN-LSTM model as a single layer stack:
#   Conv1D (64 filters, kernel size 3, ReLU) over each input window,
#   then an LSTM with 50 ReLU units, then one linear output unit.
# The input shape is (time steps, features), read from X_train.
model = Sequential([
    Conv1D(
        filters=64,
        kernel_size=3,
        activation='relu',
        input_shape=X_train.shape[1:3],
    ),
    LSTM(50, activation='relu'),
    Dense(1),
])

# STEP 12.2 - Compile the model (mean squared error loss, Adam optimiser)
model.compile(loss='mean_squared_error', optimizer='adam')

# STEP 12.3 - Train the model (use fewer epochs if you're testing)
history = model.fit(
    x=X_train,
    y=y_train,
    batch_size=32,
    epochs=10,
    verbose=1,
)


In [None]:
import matplotlib.pyplot as plt

# STEP 13 - Show how the training loss evolved, one marker per epoch
plt.plot(history.history['loss'], marker='o')
plt.grid(True)
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.title('Training Loss Over Epochs')
plt.show()

In [None]:
# STEP 14: Load the data with NumPy and scale it with scikit-learn's MinMaxScaler
import numpy as np
from sklearn.preprocessing import MinMaxScaler

# Read the raw splits from CSV with NumPy (files are assumed to have no header)
train_data, validate_data, test_data = [
    np.loadtxt(csv_path, delimiter=',')
    for csv_path in (
        'train_NREL_solar_data.csv',
        'validate_NREL_solar_data.csv',
        'test_NREL_solar_data.csv',
    )
]

# Learn the min-max scaling from the training split only, then apply that same
# transform to all three splits (the raw arrays are replaced by scaled ones).
scaler = MinMaxScaler().fit(train_data)
train_data, validate_data, test_data = (
    scaler.transform(split)
    for split in (train_data, validate_data, test_data)
)

# Show the first five rows of the scaled training data
print("Training data after scaling (first 5 rows):")
print(train_data[:5])

In [None]:
# STEP 15: Reshape into sequences
import numpy as np

def create_sequences(data, time_steps):
    """Build sliding input windows and their next-step targets from a 2-D array.

    For every start row ``i`` the input is ``data[i : i + time_steps]`` (all
    columns) and the target is ``data[i + time_steps, -1]``, i.e. the last
    column of the row immediately after the window.

    Parameters
    ----------
    data : array-like, shape (n_rows, n_columns)
        Rows in time order; the last column is the value to predict.
    time_steps : int
        Length of each input window, in rows.

    Returns
    -------
    X : np.ndarray, shape (n_windows, time_steps, n_columns)
    y : np.ndarray, shape (n_windows,)
        ``n_windows = max(n_rows - time_steps, 0)``. If the input is too short
        for one window, empty arrays with the correct trailing dimensions are
        returned instead of a flat shape-(0,) array.
    """
    data = np.asarray(data)
    n_windows = max(len(data) - time_steps, 0)

    if n_windows == 0:
        empty_X = np.empty((0, time_steps) + data.shape[1:], dtype=data.dtype)
        empty_y = np.empty((0,), dtype=data.dtype)
        return empty_X, empty_y

    # One (time_steps, n_columns) window per start row
    X = np.stack([data[start:start + time_steps] for start in range(n_windows)])
    # Target for each window: last column of the following row (copied so the
    # result does not share memory with `data`)
    y = data[time_steps:time_steps + n_windows, -1].copy()
    return X, y

# Look-back length: number of consecutive rows in each input window
time_steps = 10

# Window the scaled training and validation data
X_train, y_train = create_sequences(data=train_data, time_steps=time_steps)
X_val, y_val = create_sequences(data=validate_data, time_steps=time_steps)

# Report the resulting array shapes
print("X_train shape:", X_train.shape)
print("y_train shape:", y_train.shape)
print("X_val shape:", X_val.shape)
print("y_val shape:", y_val.shape)

In [None]:
# STEP 16: Build a fresh CNN-LSTM model and train it on the scaled sequences
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import Conv1D, Dense, LSTM
from tensorflow.keras.models import Sequential

# Build the network in this cell rather than reusing whatever `model` object is
# left in the kernel. Re-compiling an existing model does not reset its
# weights, so training would otherwise continue from a model already fitted on
# differently-scaled data, and the loss curves and early stopping would not
# describe a clean training run.
model = Sequential()
model.add(Conv1D(filters=64, kernel_size=3, activation='relu',
                 input_shape=(X_train.shape[1], X_train.shape[2])))
model.add(LSTM(50, activation='relu'))
model.add(Dense(1))

# Compile the model
model.compile(optimizer='adam', loss='mean_squared_error')

# Early stopping: halt once the validation loss has not improved for 5 epochs
# and roll back to the best weights seen.
early_stop = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

# Train the model (up to 100 epochs; early stopping usually ends it sooner)
history = model.fit(X_train, y_train,
                    validation_data=(X_val, y_val),
                    epochs=100, batch_size=32,
                    callbacks=[early_stop],
                    verbose=1)

In [None]:
import matplotlib.pyplot as plt

# STEP 17: Compare training and validation loss across epochs

plt.figure(figsize=(10,6))

# Both curves come from the History object returned by model.fit
plt.plot(history.history['loss'], label='Training Loss', marker='o')
plt.plot(history.history['val_loss'], label='Validation Loss', marker='x')

plt.grid(True)
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.title('Training and Validation Loss Over Epochs')
plt.legend()
plt.show()

In [None]:
# Run the model on the validation windows
predictions = model.predict(x=X_val)

# Step 18: Overlay the first 100 predictions on the matching actual targets
plt.figure(figsize=(12,5))
plt.plot(predictions[:100], label='Predicted', linestyle='--')
plt.plot(y_val[:100], label='Actual', alpha=0.7)
plt.grid(True)
plt.xlabel('Sample Index')
plt.ylabel('Scaled Irradiance')
plt.title('Predicted vs Actual Solar Irradiance on Validation Set (First 100 Samples)')
plt.legend()
plt.show()

In [None]:
# STEP 19: Write the trained CNN-LSTM model to an .h5 file in the working
# directory so a later cell or session can load it instead of retraining.
model.save(filepath='cnn_lstm_solar_model.h5')

In [None]:
# STEP 20: Define the sequence helper and prepare the test-set sequences

# Sequence creation function using NumPy slicing
def create_sequences(data, time_steps):
    """Turn a 2-D array into (window, next-step target) pairs.

    Each input window is ``time_steps`` consecutive rows with all columns; the
    matching target is the last column (irradiance) of the row that follows
    the window.

    Parameters
    ----------
    data : array-like, shape (n_rows, n_columns)
        Rows in time order; the last column is the value to predict.
    time_steps : int
        Number of rows per input window.

    Returns
    -------
    X : np.ndarray, shape (n_windows, time_steps, n_columns)
    y : np.ndarray, shape (n_windows,)
        ``n_windows = max(n_rows - time_steps, 0)``. Inputs too short for a
        single window give empty arrays that still carry the
        ``(time_steps, n_columns)`` trailing shape, rather than a flat
        shape-(0,) array.
    """
    data = np.asarray(data)
    n_windows = max(len(data) - time_steps, 0)

    if n_windows == 0:
        empty_X = np.empty((0, time_steps) + data.shape[1:], dtype=data.dtype)
        empty_y = np.empty((0,), dtype=data.dtype)
        return empty_X, empty_y

    # All features in each window
    X = np.stack([data[start:start + time_steps] for start in range(n_windows)])
    # Target = last column at the step right after each window (copied so the
    # result does not alias `data`)
    y = data[time_steps:time_steps + n_windows, -1].copy()
    return X, y

# Load the raw test data and scale it with the scaler that was fitted on the
# TRAINING data (`scaler`, fitted where the training set was scaled).
# Fitting a new scaler on the test set would map the test features and targets
# onto a different range from the one the model was trained on, which makes the
# predictions and the reported RMSE incomparable and leaks test-set statistics.
test_data_raw = np.loadtxt('test_NREL_solar_data.csv', delimiter=',')
test_data = scaler.transform(test_data_raw)

# Prepare test sequences for model input (same look-back length as training)
time_steps = 10
X_test, y_test = create_sequences(test_data, time_steps)

In [None]:
# STEP 21: Restore the saved CNN-LSTM from disk (needed when the trained model
# is no longer in memory); this rebinds `model` to the loaded copy.
from tensorflow.keras.models import load_model
model = load_model(filepath='cnn_lstm_solar_model.h5')

In [None]:
# STEP 22: Run the model on the test windows (one prediction per window)
predictions_test = model.predict(x=X_test)

In [None]:
# STEP 23: Score the test predictions with RMSE.
# The value is in the same (scaled) units as y_test.
rmse_test = sqrt(
    mean_squared_error(y_true=y_test, y_pred=predictions_test)
)
print(f"Test RMSE: {rmse_test:.4f}")

In [None]:
# STEP 24: Calculate and print RMSE on test set
from sklearn.metrics import mean_squared_error
from math import sqrt

# Root of the mean squared error between the test targets and predictions.
# Note that this rebinds the global name `rmse`.
rmse = sqrt(
    mean_squared_error(y_true=y_test, y_pred=predictions_test)
)
print(f"Test RMSE: {rmse:.4f}")

In [None]:
# STEP 25: Plot predictions vs actual values for the first 100 test samples
plt.figure(figsize=(12,5))
plt.plot(predictions_test[:100], label='Predicted', linestyle='--')
# The slice bound is 100; a stray character inside the number previously made
# this line a syntax error.
plt.plot(y_test[:100], label='Actual', alpha=0.7)
plt.title('Predicted vs Actual Solar Irradiance on Test Set (First 100 Samples)')
plt.xlabel('Sample Index')
plt.ylabel('Scaled Irradiance')
plt.legend()
plt.grid(True)
plt.show()