In [None]:
! pip install tensorflow
! pip install pandas
! pip install scikit-learn
! pip install opencv-python

In [27]:
# preprocessing/data_preprocessing.py

import pandas as pd
import numpy as np
from sklearn.impute import KNNImputer
from sklearn.preprocessing import MinMaxScaler
import joblib

def preprocess_meteorological_data(file_path):
    # Load data
    met_data = pd.read_csv(file_path)

    # Combine DATE and MST into a single datetime column
    # met_data['Timestamp'] = pd.to_datetime(met_data['DATE'] + ' ' + met_data['MST'])
    # Combine DATE and MST into a single datetime column
    met_data['Timestamp'] = pd.to_datetime(met_data['datetime'].astype(str), format='%Y%m%d%H%M%S')

    # Sort data by Timestamp
    met_data.sort_values('Timestamp', inplace=True)

    # Handle missing values in input features using KNN imputation
    input_features = [
        'Tower Dry Bulb Temp [deg C]', 'Tower RH [%]', 'Station Pressure [mBar]',
        'Avg Wind Speed @ 6ft [m/s]', 'Avg Wind Direction @ 6ft [deg from N]'
    ]

    # Initialize KNN imputer
    imputer = KNNImputer(n_neighbors=5)

    # Fit and transform the input features
    met_data_imputed = imputer.fit_transform(met_data[input_features])

    # Update the DataFrame with imputed values
    met_data[input_features] = met_data_imputed

    # Handle missing values in the target variable separately
    target_variable = 'Global CMP22 (vent/cor) [W/m^2]'

    # Optionally interpolate missing target values
    # met_data[target_variable].interpolate(method='time', inplace=True)
    
    # Option 2: Drop rows with missing target values (uncomment if preferred)
    met_data.dropna(subset=[target_variable], inplace=True)

    # Rename columns for simplicity
    met_data.rename(columns={
        'Tower Dry Bulb Temp [deg C]': 'Temperature',
        'Tower RH [%]': 'Humidity',
        'Station Pressure [mBar]': 'Pressure',
        'Avg Wind Speed @ 6ft [m/s]': 'Wind Speed',
        'Avg Wind Direction @ 6ft [deg from N]': 'Wind Direction',
        'Global CMP22 (vent/cor) [W/m^2]': 'Irradiance'
    }, inplace=True)

    # Feature scaling for input features
    scaler = MinMaxScaler()
    met_data[['Temperature', 'Humidity', 'Pressure', 'Wind Speed']] = scaler.fit_transform(
        met_data[['Temperature', 'Humidity', 'Pressure', 'Wind Speed']]
    )
    joblib.dump(scaler, 'scaler_y.pkl')

    # Wind Direction encoding (convert degrees to sine and cosine components)
    met_data['Wind Dir Sin'] = np.sin(np.deg2rad(met_data['Wind Direction']))
    met_data['Wind Dir Cos'] = np.cos(np.deg2rad(met_data['Wind Direction']))
    met_data.drop('Wind Direction', axis=1, inplace=True)

    # Temporal features
    met_data['Hour'] = met_data['Timestamp'].dt.hour / 23.0  # Normalize Hour
    met_data['DayOfYear'] = met_data['Timestamp'].dt.dayofyear / 365.0  # Normalize DayOfYear

    # Prepare target variables (future irradiance)
    target = 'Irradiance'
    for minutes in [5, 15, 30, 60]:
        met_data[f'Irradiance_{minutes}min_ahead'] = met_data[target].shift(-minutes)

    # **Removed the line that drops rows with NaN values after shifting**
    # We will handle dropping NaN values after merging with images
    # met_data.dropna(inplace=True)

    # Reset index and return the processed DataFrame
    return met_data.reset_index(drop=True)

In [None]:
# preprocessing/image_preprocessing.py

import cv2
import numpy as np
import glob
import os
import pandas as pd
from tqdm import tqdm
from concurrent.futures import ProcessPoolExecutor
from functools import partial

def process_image(path):
    """Process a single image"""
    try:
        # Extract timestamp from filename
        filename = os.path.basename(path)
        timestamp_str = filename.replace('.jpg', '')
        timestamp = pd.to_datetime(timestamp_str, format='%Y%m%d%H%M%S')
        
        # Read and process image
        img = cv2.imread(path)
        if img is None:
            return None
        
        img = cv2.resize(img, (128, 128))
        img = img / 255.0
        
        return (img, timestamp)
    except Exception as e:
        print(f"Error processing {path}: {str(e)}")
        return None

def preprocess_images(image_folder, n_workers=None):
    """Parallel image preprocessing"""
    if n_workers is None:
        n_workers = os.cpu_count()
    
    # Get sorted image paths
    image_paths = sorted(glob.glob(os.path.join(image_folder, '*.jpg')))
    
    # Process images in parallel
    with ProcessPoolExecutor(max_workers=n_workers) as executor:
        results = list(tqdm(
            executor.map(process_image, image_paths),
            total=len(image_paths),
            desc="Processing images"
        ))
    
    # Filter out None results and unzip the valid results
    valid_results = [r for r in results if r is not None]
    images, timestamps = zip(*valid_results) if valid_results else ([], [])
    
    return list(images), list(timestamps)


In [None]:
met_data = preprocess_meteorological_data('data/combined_data.csv')
met_data

Unnamed: 0,datetime,Irradiance,Temperature,Humidity,Wind Speed,Pressure,Timestamp,Wind Dir Sin,Wind Dir Cos,Hour,DayOfYear,Irradiance_5min_ahead,Irradiance_15min_ahead,Irradiance_30min_ahead,Irradiance_60min_ahead
0,20241116065000,8.16102,0.160293,0.701203,0.178161,0.493881,2024-11-16 06:50:00,-0.761538,0.648120,0.260870,0.879452,14.6895,31.3017,66.4804,152.526
1,20241116065100,9.39915,0.167970,0.682487,0.211264,0.495752,2024-11-16 06:51:00,-0.814116,0.580703,0.260870,0.879452,16.1444,33.2346,69.2944,155.594
2,20241116065200,10.62110,0.174731,0.692848,0.156667,0.495596,2024-11-16 06:52:00,-0.719340,0.694658,0.260870,0.879452,17.6113,35.2643,72.1972,158.274
3,20241116065300,11.92040,0.174942,0.698529,0.103793,0.495830,2024-11-16 06:53:00,-0.846193,0.532876,0.260870,0.879452,19.0965,37.5257,74.9238,161.364
4,20241116065400,13.26040,0.173885,0.697193,0.100805,0.496220,2024-11-16 06:54:00,-0.931056,0.364877,0.260870,0.879452,20.6639,39.6576,77.6095,164.285
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1766,20241118163600,6.95426,0.580252,0.399733,0.436782,0.138748,2024-11-18 16:36:00,-0.979223,-0.202787,0.695652,0.884932,,,,
1767,20241118163700,6.47514,0.580956,0.403743,0.428161,0.146075,2024-11-18 16:37:00,-0.993961,-0.109734,0.695652,0.884932,,,,
1768,20241118163800,6.01318,0.577435,0.409425,0.288851,0.147011,2024-11-18 16:38:00,-0.996637,0.081939,0.695652,0.884932,,,,
1769,20241118163900,5.64745,0.574618,0.411096,0.218391,0.150596,2024-11-18 16:39:00,-0.990748,0.135716,0.695652,0.884932,,,,


In [None]:
images, image_timestamps = preprocess_images('data/training_images/')
len(image_timestamps)

Training data contains 37 records.
Testing data contains 10 records.


Unnamed: 0,datetime,Irradiance,Temperature,Humidity,Wind Speed,Pressure,Timestamp,Wind Dir Sin,Wind Dir Cos,Hour,DayOfYear,Irradiance_5min_ahead,Irradiance_15min_ahead,Irradiance_30min_ahead,Irradiance_60min_ahead,Image
37,20241117134000,473.125,0.754208,0.069853,0.341954,0.768727,2024-11-17 13:40:00,0.596225,-0.802817,0.565217,0.882192,360.821,72.0372,308.907,338.085,"[[[0.12549019607843137, 0.12549019607843137, 0..."
38,20241117135000,454.147,0.754912,0.079545,0.196897,0.762413,2024-11-17 13:50:00,0.376224,-0.926529,0.565217,0.882192,334.077,16.3083,348.337,290.025,"[[[0.14901960784313725, 0.14901960784313725, 0..."
39,20241117140000,432.693,0.775336,0.060829,0.310345,0.75532,2024-11-17 14:00:00,0.658689,-0.752415,0.608696,0.882192,306.988,10.7664,340.34,214.875,"[[[0.13333333333333333, 0.13333333333333333, 0..."
40,20241117141000,409.844,0.769702,0.051136,0.211264,0.753839,2024-11-17 14:10:00,0.584958,-0.811064,0.608696,0.882192,279.045,5.72718,361.599,166.499,"[[[0.14901960784313725, 0.14901960784313725, 0..."
41,20241117142000,385.446,0.774632,0.065842,0.198276,0.75267,2024-11-17 14:20:00,0.782608,-0.622515,0.608696,0.882192,250.542,7.27595,410.749,130.699,"[[[0.13725490196078433, 0.13725490196078433, 0..."
42,20241117143000,360.821,0.749278,0.05615,0.231379,0.75228,2024-11-17 14:30:00,0.72176,-0.692143,0.608696,0.882192,220.526,11.9655,430.627,100.118,"[[[0.1450980392156863, 0.1450980392156863, 0.1..."
43,20241117144000,334.077,0.766885,0.047794,0.255747,0.742458,2024-11-17 14:40:00,0.722967,-0.690882,0.608696,0.882192,190.616,25.3018,447.64,70.0308,"[[[0.1411764705882353, 0.1411764705882353, 0.1..."
44,20241117145000,306.988,0.762659,0.051136,0.117011,0.744173,2024-11-17 14:50:00,0.812084,-0.583541,0.608696,0.882192,160.296,78.0141,465.999,16.0786,"[[[0.13725490196078433, 0.13725490196078433, 0..."
45,20241117150000,279.045,0.783788,0.055147,0.152299,0.73708,2024-11-17 15:00:00,0.469472,-0.882948,0.652174,0.882192,129.766,114.07,481.154,10.1133,"[[[0.14901960784313725, 0.14901960784313725, 0..."
46,20241117151000,250.542,0.763364,0.049799,0.235632,0.734352,2024-11-17 15:10:00,0.783693,-0.621148,0.652174,0.882192,100.39,143.951,496.807,5.26882,"[[[0.14901960784313725, 0.14901960784313725, 0..."


In [None]:
# training/train_model.py

import numpy as np
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from models.hybrid import create_hybrid_model
import pandas as pd

def train_hybrid_model(met_data, images, image_timestamps):
    # Prepare data
    sequence_length = 60  # Number of past minutes to consider
    features = ['Temperature', 'Humidity', 'Pressure', 'Wind Speed', 'Wind Dir Sin', 'Wind Dir Cos',
                'Hour', 'DayOfYear']

    # Align images with meteorological data
    merged_data = align_data_with_images(met_data, images, image_timestamps)

    # Extract features and targets
    X_num = merged_data[features].values
    y = merged_data[[f'Irradiance_{minutes}min_ahead' for minutes in [5, 15, 30, 60]]].values
    X_img = np.array(merged_data['Image'].tolist())

    # Create sequences
    def create_sequences(X_num, X_img, y, seq_length):
        X_num_seq, X_img_seq, y_seq = [], [], []
        for i in range(len(X_num) - seq_length):
            X_num_seq.append(X_num[i:i+seq_length])
            X_img_seq.append(X_img[i+seq_length-1])  # Use image at last timestamp
            y_seq.append(y[i+seq_length-1])
        return np.array(X_num_seq), np.array(X_img_seq), np.array(y_seq)

    X_num_seq, X_img_seq, y_seq = create_sequences(X_num, X_img, y, sequence_length)

    # Train-test split
    split_index = int(0.8 * len(X_num_seq))
    X_num_train, X_num_test = X_num_seq[:split_index], X_num_seq[split_index:]
    X_img_train, X_img_test = X_img_seq[:split_index], X_img_seq[split_index:]
    y_train, y_test = y_seq[:split_index], y_seq[split_index:]

    # Create model
    num_features = X_num_train.shape[2]
    model = create_hybrid_model(sequence_length, num_features)

    # Compile model
    model.compile(optimizer=Adam(learning_rate=0.001), loss='mse', metrics=['mae'])

    # Callbacks
    early_stopping = EarlyStopping(patience=5, restore_best_weights=True)
    reduce_lr = ReduceLROnPlateau(factor=0.5, patience=3)

    # Train model
    history = model.fit(
        [X_img_train, X_num_train],
        y_train,
        validation_data=([X_img_test, X_num_test], y_test),
        epochs=50,
        batch_size=32,
        callbacks=[early_stopping, reduce_lr]
    )

    return model, history, merged_data

def align_data_with_images(met_data, images, image_timestamps):
    # Create a DataFrame for image timestamps
    image_df = pd.DataFrame({'Timestamp': image_timestamps, 'Image': images})
    image_df['Timestamp'] = pd.to_datetime(image_df['Timestamp'])

    # Merge meteorological data with images using an inner join
    met_data['Timestamp'] = pd.to_datetime(met_data['Timestamp'])
    merged_data = pd.merge(met_data, image_df, on='Timestamp', how='inner')

    # Prepare target variables (future irradiance)
    target = 'Irradiance'
    for minutes in [5, 15, 30, 60]:
        merged_data[f'Irradiance_{minutes}min_ahead'] = merged_data[target].shift(-minutes)

    # Drop rows with any remaining missing values (after shifting)
    merged_data.dropna(inplace=True)

    # Reset index
    merged_data.reset_index(drop=True, inplace=True)

    return merged_data

In [None]:
# Train the model
model, history, merged_data = train_hybrid_model(met_data, images, image_timestamps)

# Save the model
model.save('trainedModels/test.keras')

IndexError: tuple index out of range

In [None]:
# evaluation/evaluate_model.py

import numpy as np
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import matplotlib.pyplot as plt
import os
from datetime import datetime

def evaluate_model(model, merged_data):
    """
    Evaluates the regression model on the provided merged_data.
    Saves evaluation plots in a uniquely named subfolder within 'evaluation_plots'.
    
    Parameters:
    - model: Trained Keras model.
    - merged_data: pandas DataFrame containing meteorological data and associated images.
    """
    
    # -----------------------------
    # 1. Setup Directory for Saving Plots
    # -----------------------------
    
    # Define the base directory for evaluation plots
    base_dir = 'evaluation_plots'
    
    # Create the base directory if it doesn't exist
    if not os.path.exists(base_dir):
        os.makedirs(base_dir)
        print(f"Created base directory for evaluation plots at '{base_dir}'.")
    
    # Generate a unique subfolder name using the current timestamp
    run_timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    run_folder = os.path.join(base_dir, f'run_{run_timestamp}')
    
    # Create the subfolder
    os.makedirs(run_folder, exist_ok=True)
    print(f"Created run-specific directory at '{run_folder}'.")
    
    # -----------------------------
    # 2. Prepare Data for Evaluation
    # -----------------------------
    
    # Define the sequence length and feature columns
    sequence_length = 60
    features = [
        'Temperature', 'Humidity', 'Pressure', 'Wind Speed',
        'Wind Dir Sin', 'Wind Dir Cos', 'Hour', 'DayOfYear'
    ]  # Exclude direct current irradiance as a feature

    # Extract feature values and target variables
    X_num = merged_data[features].values
    y = merged_data[[f'Irradiance_{minutes}min_ahead' for minutes in [5, 15, 30, 60]]].values
    X_img = np.array(merged_data['Image'].tolist())

    # -----------------------------
    # 3. Create Sequences
    # -----------------------------
    
    def create_sequences(X_num, X_img, y, seq_length):
        """
        Creates input sequences for the model.
        
        Parameters:
        - X_num: Numpy array of numerical features.
        - X_img: Numpy array of images.
        - y: Numpy array of target variables.
        - seq_length: Length of the input sequences.
        
        Returns:
        - Tuple of Numpy arrays: (X_num_seq, X_img_seq, y_seq)
        """
        X_num_seq, X_img_seq, y_seq = [], [], []
        for i in range(len(X_num) - seq_length):
            X_num_seq.append(X_num[i:i+seq_length])
            X_img_seq.append(X_img[i+seq_length-1])  # Use image at the last timestamp
            y_seq.append(y[i+seq_length-1])
        return np.array(X_num_seq), np.array(X_img_seq), np.array(y_seq)

    # Generate sequences
    X_num_seq, X_img_seq, y_seq = create_sequences(X_num, X_img, y, sequence_length)
    print(f"Created {len(X_num_seq)} sequences for evaluation.")

    # -----------------------------
    # 4. Split Data into Test Set
    # -----------------------------
    
    # Define the split index for the last 20% as the test set
    split_index = int(0.99 * len(X_num_seq))
    
    # Split the data
    X_num_test = X_num_seq[split_index:]
    X_img_test = X_img_seq[split_index:]
    y_test = y_seq[split_index:]
    
    print(f"Evaluation split: {len(X_num_test)} samples.")

    # -----------------------------
    # 5. Make Predictions
    # -----------------------------
    
    # Generate predictions using the trained model
    y_pred = model.predict([X_img_test, X_num_test])
    print("Generated predictions for the test set.")

    # -----------------------------
    # 6. Calculate and Save Metrics and Plots
    # -----------------------------
    
    horizons = [5, 15, 30, 60]  # Prediction horizons in minutes
    
    # Initialize a text file to save metrics
    metrics_file = os.path.join(run_folder, 'metrics.txt')
    with open(metrics_file, 'w') as f:
        f.write("Evaluation Metrics:\n")
        f.write("===================\n\n")
    
    for i, minutes in enumerate(horizons):
        # Calculate metrics for each horizon
        rmse = np.sqrt(mean_squared_error(y_test[:, i], y_pred[:, i]))
        mae = mean_absolute_error(y_test[:, i], y_pred[:, i])
        r2 = r2_score(y_test[:, i], y_pred[:, i])
        metric_str = f"{minutes}-Minute Ahead Prediction - RMSE: {rmse:.2f}, MAE: {mae:.2f}, R²: {r2:.2f}"
        print(metric_str)
        
        # Append metrics to the text file
        with open(metrics_file, 'a') as f:
            f.write(metric_str + "\n")
        
        # Plot Actual vs Predicted
        plt.figure(figsize=(10, 4))
        plt.plot(y_test[:, i], label='Actual', alpha=0.7)
        plt.plot(y_pred[:, i], label='Predicted', alpha=0.7)
        plt.title(f'{minutes}-Minute Ahead Prediction')
        plt.xlabel('Samples')
        plt.ylabel('Irradiance (W/m²)')
        plt.legend()
        plt.tight_layout()
        
        # Save the plot
        plot_filename = f'{minutes}_min_ahead_prediction.png'
        plot_path = os.path.join(run_folder, plot_filename)
        plt.savefig(plot_path)
        plt.close()
        print(f"Saved plot: {plot_path}")
        
        # Plot Scatter of Actual vs Predicted
        plt.figure(figsize=(6, 6))
        plt.scatter(y_test[:, i], y_pred[:, i], alpha=0.5)
        plt.plot([y_test[:, i].min(), y_test[:, i].max()],
                 [y_test[:, i].min(), y_test[:, i].max()],
                 'r--', lw=2)
        plt.title(f'Actual vs Predicted Irradiance ({minutes} min Ahead)')
        plt.xlabel('Actual Irradiance (W/m²)')
        plt.ylabel('Predicted Irradiance (W/m²)')
        plt.tight_layout()
        
        # Save the scatter plot
        scatter_filename = f'actual_vs_predicted_{minutes}_min_ahead.png'
        scatter_path = os.path.join(run_folder, scatter_filename)
        plt.savefig(scatter_path)
        plt.close()
        print(f"Saved scatter plot: {scatter_path}")
    
    # -----------------------------
    # 7. Calculate and Save Overall Metrics and Plots
    # -----------------------------
    
    # Calculate overall performance metrics across all horizons
    overall_rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    overall_mae = mean_absolute_error(y_test, y_pred)
    overall_r2 = r2_score(y_test, y_pred)
    overall_metric_str = f"Overall Performance - RMSE: {overall_rmse:.2f}, MAE: {overall_mae:.2f}, R²: {overall_r2:.2f}"
    print(overall_metric_str)
    
    # Append overall metrics to the text file
    with open(metrics_file, 'a') as f:
        f.write("\n" + overall_metric_str + "\n")
    
    # Plot Overall Actual vs Predicted
    plt.figure(figsize=(6,6))
    plt.scatter(y_test, y_pred, alpha=0.5)
    plt.plot([y_test.min(), y_test.max()],
             [y_test.min(), y_test.max()],
             'r--', lw=2)
    plt.title('Actual vs Predicted Irradiance (Overall)')
    plt.xlabel('Actual Irradiance (W/m²)')
    plt.ylabel('Predicted Irradiance (W/m²)')
    plt.tight_layout()
    
    # Save the overall scatter plot
    overall_scatter_filename = 'actual_vs_predicted_overall.png'
    overall_scatter_path = os.path.join(run_folder, overall_scatter_filename)
    plt.savefig(overall_scatter_path)
    plt.close()
    print(f"Saved overall scatter plot: {overall_scatter_path}")

In [None]:
evaluate_model(model, merged_data)