In [None]:
! pip install tensorflow
! pip install pandas
! pip install scikit-learn
! pip install opencv-python

In [None]:
# preprocessing/data_preprocessing.py

import pandas as pd
import numpy as np
from sklearn.impute import KNNImputer
from sklearn.preprocessing import MinMaxScaler
import joblib

def preprocess_meteorological_data(file_path):
    # Load data
    met_data = pd.read_csv(file_path)

    # Combine DATE and MST into a single datetime column
    # met_data['Timestamp'] = pd.to_datetime(met_data['DATE'] + ' ' + met_data['MST'])
    # Combine DATE and MST into a single datetime column
    met_data['Timestamp'] = pd.to_datetime(met_data['datetime'].astype(str), format='%Y%m%d%H%M%S')

    # Sort data by Timestamp
    met_data.sort_values('Timestamp', inplace=True)

    # Handle missing values in input features using KNN imputation
    input_features = [
        'Tower Dry Bulb Temp [deg C]', 'Tower RH [%]', 'Station Pressure [mBar]',
        'Avg Wind Speed @ 6ft [m/s]', 'Avg Wind Direction @ 6ft [deg from N]'
    ]

    # Initialize KNN imputer
    imputer = KNNImputer(n_neighbors=5)

    # Fit and transform the input features
    met_data_imputed = imputer.fit_transform(met_data[input_features])

    # Update the DataFrame with imputed values
    met_data[input_features] = met_data_imputed

    # Handle missing values in the target variable separately
    target_variable = 'Global CMP22 (vent/cor) [W/m^2]'

    # Optionally interpolate missing target values
    # met_data[target_variable].interpolate(method='time', inplace=True)
    
    # Option 2: Drop rows with missing target values (uncomment if preferred)
    met_data.dropna(subset=[target_variable], inplace=True)

    # Rename columns for simplicity
    met_data.rename(columns={
        'Tower Dry Bulb Temp [deg C]': 'Temperature',
        'Tower RH [%]': 'Humidity',
        'Station Pressure [mBar]': 'Pressure',
        'Avg Wind Speed @ 6ft [m/s]': 'Wind Speed',
        'Avg Wind Direction @ 6ft [deg from N]': 'Wind Direction',
        'Global CMP22 (vent/cor) [W/m^2]': 'Irradiance'
    }, inplace=True)

    # Feature scaling for input features
    scaler = MinMaxScaler()
    met_data[['Temperature', 'Humidity', 'Pressure', 'Wind Speed']] = scaler.fit_transform(
        met_data[['Temperature', 'Humidity', 'Pressure', 'Wind Speed']]
    )
    joblib.dump(scaler, 'scaler_y.pkl')

    # Wind Direction encoding (convert degrees to sine and cosine components)
    met_data['Wind Dir Sin'] = np.sin(np.deg2rad(met_data['Wind Direction']))
    met_data['Wind Dir Cos'] = np.cos(np.deg2rad(met_data['Wind Direction']))
    met_data.drop('Wind Direction', axis=1, inplace=True)

    # Temporal features
    met_data['Hour'] = met_data['Timestamp'].dt.hour / 23.0  # Normalize Hour
    met_data['DayOfYear'] = met_data['Timestamp'].dt.dayofyear / 365.0  # Normalize DayOfYear

    # Prepare target variables (future irradiance)
    target = 'Irradiance'
    for minutes in [5, 15, 30, 60]:
        met_data[f'Irradiance_{minutes}min_ahead'] = met_data[target].shift(-minutes)

    # **Removed the line that drops rows with NaN values after shifting**
    # We will handle dropping NaN values after merging with images
    # met_data.dropna(inplace=True)

    # Reset index and return the processed DataFrame
    return met_data.reset_index(drop=True)

In [None]:
# preprocessing/image_preprocessing.py

import cv2
import numpy as np
import glob
import os
import pandas as pd

def preprocess_images(image_folder):
    image_paths = sorted(glob.glob(os.path.join(image_folder, '*.jpg')))
    images = []
    image_timestamps = []

    for path in image_paths:
        # Extract timestamp from image filename
        # Assuming filename format: YYYYMMDDHHMMSS.jpg
        filename = os.path.basename(path)
        timestamp_str = filename.replace('.jpg', '')
        timestamp = pd.to_datetime(timestamp_str, format='%Y%m%d%H%M%S')

        img = cv2.imread(path)
        if img is None:
            continue  # Skip if the image is not readable
        # Iterates over each image path, reads the image using OpenCV, and resizes it to 128x128 pixels
	    # Normalizes pixel values to the range [0, 1] by dividing by 255
        img = cv2.resize(img, (128, 128))
        img = img / 255.0  # Normalize pixel values
        images.append(img)
        image_timestamps.append(timestamp)

    return images, image_timestamps

In [None]:
met_data = preprocess_meteorological_data('weather_data.csv')
met_data

In [None]:
images, image_timestamps = preprocess_images('pics')
len(image_timestamps)

In [54]:
# Step 2: Align Data with Images
def align_data_with_images(met_data, images, image_timestamps):
    """
    Aligns meteorological data with corresponding images based on timestamps.

    Parameters:
    - met_data: pandas DataFrame containing meteorological data.
    - images: list or numpy array of preprocessed images.
    - image_timestamps: list or pandas Series of image timestamps.

    Returns:
    - merged_data: pandas DataFrame containing aligned meteorological data and images.
    """
    # Create a DataFrame for image timestamps
    image_df = pd.DataFrame({'Timestamp': image_timestamps, 'Image': images})
    image_df['Timestamp'] = pd.to_datetime(image_df['Timestamp'])
    
    # Ensure meteorological data has datetime objects
    met_data['Timestamp'] = pd.to_datetime(met_data['Timestamp'])
    
    # Merge meteorological data with images using an inner join to keep only matching timestamps
    merged_data = pd.merge(met_data, image_df, on='Timestamp', how='inner')
    
    # Prepare target variables (future irradiance)
    target = 'Irradiance'
    for minutes in [5, 15, 30, 60]:
        merged_data[f'Irradiance_{minutes}min_ahead'] = merged_data[target].shift(-minutes)
    
    # Drop rows with any remaining missing values (due to shifting)
    merged_data.dropna(inplace=True)
    
    # Reset index after dropping rows
    merged_data.reset_index(drop=True, inplace=True)
    
    return merged_data

merged_data = align_data_with_images(met_data, images, image_timestamps)

# Step 3: Split Data into Training and Testing Sets BEFORE Sequence Creation
split_ratio = 0.8  # 80% for training, 20% for testing
split_index = int(len(merged_data) * split_ratio)

# Perform the split
train_data = merged_data.iloc[:split_index]
test_data = merged_data.iloc[split_index:]

print(f"Training data contains {len(train_data)} records.")
print(f"Testing data contains {len(test_data)} records.")


Training data contains 499 records.
Testing data contains 125 records.


In [None]:
# training/train_model.py

import numpy as np
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from models.hybrid import create_hybrid_model  # Ensure correct import path
import pandas as pd

def train_hybrid_model(train_data, sequence_length=60):
    """
    Trains the hybrid CNN-LSTM model on the provided training data.

    Parameters:
    - train_data: pandas DataFrame containing training meteorological data and images.
    - sequence_length: Number of past minutes to consider for each sequence.

    Returns:
    - model: Trained Keras model.
    - history: Training history object.
    """
    
    # Define feature columns
    features = [
        'Temperature', 'Humidity', 'Pressure', 'Wind Speed',
        'Wind Dir Sin', 'Wind Dir Cos', 'Hour', 'DayOfYear'
    ]  # Exclude direct current irradiance as a feature

    # Extract features and targets
    X_num_train, X_img_train, y_train = create_sequences(train_data, sequence_length, features)

    print(f"Created {len(X_num_train)} training sequences.")

    # Create and compile the model
    num_features = X_num_train.shape[2]
    model = create_hybrid_model(sequence_length, num_features)

    model.compile(optimizer=Adam(learning_rate=0.001), loss='mse', metrics=['mae'])

    # Define callbacks
    early_stopping = EarlyStopping(patience=5, restore_best_weights=True)
    reduce_lr = ReduceLROnPlateau(factor=0.5, patience=3)

    # Train the model with dictionary inputs
    history = model.fit(
        {'Image_Input': X_img_train, 'LSTM_Input': X_num_train},
        y_train,
        validation_split=0.1,  # Further split training data for validation
        epochs=50,
        batch_size=32,
        callbacks=[early_stopping, reduce_lr]
    )

    return model, history

def create_sequences(data, seq_length, features):
    """
    Creates input sequences and corresponding targets from the data.

    Parameters:
    - data: pandas DataFrame containing the data.
    - seq_length: Length of each input sequence.
    - features: List of feature column names.

    Returns:
    - X_num_seq: Numpy array of numerical feature sequences.
    - X_img_seq: Numpy array of image sequences.
    - y_seq: Numpy array of target irradiance values.
    """
    X_num_seq, X_img_seq, y_seq = [], [], []
    for i in range(len(data) - seq_length):
        # Extract numerical features for the sequence
        X_num_seq.append(data[features].values[i:i+seq_length])
        
        # Extract the corresponding image at the last timestamp of the sequence
        X_img_seq.append(data['Image'].values[i+seq_length-1])
        
        # Extract the target irradiance values at the end of the sequence
        y_seq.append(data[[f'Irradiance_{minutes}min_ahead' for minutes in [5, 15, 30, 60]]].values[i+seq_length-1])
    
    return np.array(X_num_seq), np.array(X_img_seq), np.array(y_seq)



In [56]:
# Step 4: Train the Model
model, history = train_hybrid_model(train_data, sequence_length=60)

# Save the model
model.save('trainedModels/test.keras')

Created 439 training sequences.
Epoch 1/50




[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 154ms/step - loss: 177137.6250 - mae: 391.5333 - val_loss: 143341.7969 - val_mae: 355.1158 - learning_rate: 0.0010
Epoch 2/50
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 132ms/step - loss: 48604.2656 - mae: 177.2811 - val_loss: 100337.8203 - val_mae: 305.9944 - learning_rate: 0.0010
Epoch 3/50
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 130ms/step - loss: 25964.6992 - mae: 140.5359 - val_loss: 90704.2500 - val_mae: 299.3154 - learning_rate: 0.0010
Epoch 4/50
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 142ms/step - loss: 19079.4746 - mae: 121.5322 - val_loss: 96893.2031 - val_mae: 310.8622 - learning_rate: 0.0010
Epoch 5/50
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 135ms/step - loss: 15244.1357 - mae: 108.4160 - val_loss: 52859.3750 - val_mae: 207.5781 - learning_rate: 0.0010
Epoch 6/50
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m

In [52]:
# evaluation/evaluate_model.py

import numpy as np
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import matplotlib.pyplot as plt
import os
from datetime import datetime

def evaluate_model(model, test_data, sequence_length=60):
    """
    Evaluates the regression model on the provided test_data.
    Saves evaluation plots in a uniquely named subfolder within 'evaluation_plots'.
    
    Parameters:
    - model: Trained Keras model.
    - test_data: pandas DataFrame containing testing meteorological data and images.
    - sequence_length: Number of past minutes to consider for each sequence.
    """
    
    # -----------------------------
    # 1. Setup Directory for Saving Plots
    # -----------------------------
    
    # Define the base directory for evaluation plots
    base_dir = 'evaluation_plots'
    
    # Create the base directory if it doesn't exist
    if not os.path.exists(base_dir):
        os.makedirs(base_dir)
        print(f"Created base directory for evaluation plots at '{base_dir}'.")
    
    # Generate a unique subfolder name using the current timestamp
    run_timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    run_folder = os.path.join(base_dir, f'run_{run_timestamp}')
    
    # Create the subfolder
    os.makedirs(run_folder, exist_ok=True)
    print(f"Created run-specific directory at '{run_folder}'.")
    
    # -----------------------------
    # 2. Prepare Data for Evaluation
    # -----------------------------
    
    # Define feature columns
    features = [
        'Temperature', 'Humidity', 'Pressure', 'Wind Speed',
        'Wind Dir Sin', 'Wind Dir Cos', 'Hour', 'DayOfYear'
    ]  # Exclude direct current irradiance as a feature

    # Extract features and targets
    X_num = test_data[features].values
    y = test_data[[f'Irradiance_{minutes}min_ahead' for minutes in [5, 15, 30, 60]]].values
    X_img = np.array(test_data['Image'].tolist())

    # -----------------------------
    # 3. Create Sequences
    # -----------------------------
    
    def create_sequences(X_num, X_img, y, seq_length):
        """
        Creates input sequences for the model.
        
        Parameters:
        - X_num: Numpy array of numerical features.
        - X_img: Numpy array of images.
        - y: Numpy array of target variables.
        - seq_length: Length of the input sequences.
        
        Returns:
        - Tuple of Numpy arrays: (X_num_seq, X_img_seq, y_seq)
        """
        X_num_seq, X_img_seq, y_seq = [], [], []
        for i in range(len(X_num) - seq_length):
            X_num_seq.append(X_num[i:i+seq_length])
            X_img_seq.append(X_img[i+seq_length-1])  # Use image at the last timestamp
            y_seq.append(y[i+seq_length-1])
        return np.array(X_num_seq), np.array(X_img_seq), np.array(y_seq)

    # Generate sequences from test data
    X_num_seq, X_img_seq, y_seq = create_sequences(X_num, X_img, y, sequence_length)
    print(f"Created {len(X_num_seq)} sequences for evaluation.")

    # -----------------------------
    # 4. No Further Splitting Needed
    # -----------------------------
    
    # Since test_data is already separate, no need to split again
    X_num_test = X_num_seq
    X_img_test = X_img_seq
    y_test = y_seq
    
    print(f"Evaluation split: {len(X_num_test)} samples.")
    
    # -----------------------------
    # 5. Make Predictions
    # -----------------------------
    
    # Generate predictions using the trained model
    y_pred = model.predict({'Image_Input': X_img_test, 'LSTM_Input': X_num_test})
    print("Generated predictions for the test set.")

    # -----------------------------
    # 6. Calculate and Save Metrics and Plots
    # -----------------------------
    
    horizons = [5, 15, 30, 60]  # Prediction horizons in minutes
    
    # Initialize a text file to save metrics
    metrics_file = os.path.join(run_folder, 'metrics.txt')
    with open(metrics_file, 'w') as f:
        f.write("Evaluation Metrics:\n")
        f.write("===================\n\n")
    
    for i, minutes in enumerate(horizons):
        # Calculate metrics for each horizon
        rmse = np.sqrt(mean_squared_error(y_test[:, i], y_pred[:, i]))
        mae = mean_absolute_error(y_test[:, i], y_pred[:, i])
        r2 = r2_score(y_test[:, i], y_pred[:, i])
        metric_str = f"{minutes}-Minute Ahead Prediction - RMSE: {rmse:.2f}, MAE: {mae:.2f}, R²: {r2:.2f}"
        print(metric_str)
        
        # Append metrics to the text file
        with open(metrics_file, 'a') as f:
            f.write(metric_str + "\n")
        
        # Plot Actual vs Predicted
        plt.figure(figsize=(10, 4))
        plt.plot(y_test[:, i], label='Actual', alpha=0.7)
        plt.plot(y_pred[:, i], label='Predicted', alpha=0.7)
        plt.title(f'{minutes}-Minute Ahead Prediction')
        plt.xlabel('Samples')
        plt.ylabel('Irradiance (W/m²)')
        plt.legend()
        plt.tight_layout()
        
        # Save the plot
        plot_filename = f'{minutes}_min_ahead_prediction.png'
        plot_path = os.path.join(run_folder, plot_filename)
        plt.savefig(plot_path)
        plt.close()
        print(f"Saved plot: {plot_path}")
        
        # Plot Scatter of Actual vs Predicted
        plt.figure(figsize=(6, 6))
        plt.scatter(y_test[:, i], y_pred[:, i], alpha=0.5)
        plt.plot([y_test[:, i].min(), y_test[:, i].max()],
                 [y_test[:, i].min(), y_test[:, i].max()],
                 'r--', lw=2)
        plt.title(f'Actual vs Predicted Irradiance ({minutes} min Ahead)')
        plt.xlabel('Actual Irradiance (W/m²)')
        plt.ylabel('Predicted Irradiance (W/m²)')
        plt.tight_layout()
        
        # Save the scatter plot
        scatter_filename = f'actual_vs_predicted_{minutes}_min_ahead.png'
        scatter_path = os.path.join(run_folder, scatter_filename)
        plt.savefig(scatter_path)
        plt.close()
        print(f"Saved scatter plot: {scatter_path}")
    
    # -----------------------------
    # 7. Calculate and Save Overall Metrics and Plots
    # -----------------------------
    
    # Calculate overall performance metrics across all horizons
    overall_rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    overall_mae = mean_absolute_error(y_test, y_pred)
    overall_r2 = r2_score(y_test, y_pred)
    overall_metric_str = f"Overall Performance - RMSE: {overall_rmse:.2f}, MAE: {overall_mae:.2f}, R²: {overall_r2:.2f}"
    print(overall_metric_str)
    
    # Append overall metrics to the text file
    with open(metrics_file, 'a') as f:
        f.write("\n" + overall_metric_str + "\n")
    
    # Plot Overall Actual vs Predicted
    plt.figure(figsize=(6,6))
    plt.scatter(y_test, y_pred, alpha=0.5)
    plt.plot([y_test.min(), y_test.max()],
             [y_test.min(), y_test.max()],
             'r--', lw=2)
    plt.title('Actual vs Predicted Irradiance (Overall)')
    plt.xlabel('Actual Irradiance (W/m²)')
    plt.ylabel('Predicted Irradiance (W/m²)')
    plt.tight_layout()
    
    # Save the overall scatter plot
    overall_scatter_filename = 'actual_vs_predicted_overall.png'
    overall_scatter_path = os.path.join(run_folder, overall_scatter_filename)
    plt.savefig(overall_scatter_path)
    plt.close()
    print(f"Saved overall scatter plot: {overall_scatter_path}")

In [57]:
evaluate_model(model, test_data, sequence_length=60)

Created run-specific directory at 'evaluation_plots/run_20241119_211151'.
Created 65 sequences for evaluation.
Evaluation split: 65 samples.
[1m1/3[0m [32m━━━━━━[0m[37m━━━━━━━━━━━━━━[0m [1m0s[0m 204ms/step



[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 100ms/step
Generated predictions for the test set.
5-Minute Ahead Prediction - RMSE: 143.93, MAE: 133.36, R²: -34.46
Saved plot: evaluation_plots/run_20241119_211151/5_min_ahead_prediction.png
Saved scatter plot: evaluation_plots/run_20241119_211151/actual_vs_predicted_5_min_ahead.png
15-Minute Ahead Prediction - RMSE: 118.69, MAE: 107.22, R²: -10.87
Saved plot: evaluation_plots/run_20241119_211151/15_min_ahead_prediction.png
Saved scatter plot: evaluation_plots/run_20241119_211151/actual_vs_predicted_15_min_ahead.png
30-Minute Ahead Prediction - RMSE: 108.67, MAE: 95.90, R²: -4.32
Saved plot: evaluation_plots/run_20241119_211151/30_min_ahead_prediction.png
Saved scatter plot: evaluation_plots/run_20241119_211151/actual_vs_predicted_30_min_ahead.png
60-Minute Ahead Prediction - RMSE: 71.20, MAE: 51.93, R²: -0.63
Saved plot: evaluation_plots/run_20241119_211151/60_min_ahead_prediction.png
Saved scatter plot: evaluation_plots/