In [1]:
import torch
from torch import nn
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import os

# Additional metric for MAPE
def mean_absolute_percentage_error(y_true, y_pred):
    """Return the mean absolute percentage error (in percent).

    Both inputs are flattened to 1-D before comparison so that a column
    vector of shape (n, 1) and a flat vector of shape (n,) are compared
    element by element instead of being broadcast into an (n, n) matrix.

    Entries whose actual value is exactly zero are excluded from the average,
    because the percentage error is undefined there and would otherwise turn
    the whole metric into inf/nan.

    Args:
        y_true: Actual values (array-like).
        y_pred: Predicted values (array-like) with the same number of elements.

    Returns:
        The MAPE as a float percentage, or ``nan`` when no actual value is
        non-zero.
    """
    y_true, y_pred = np.broadcast_arrays(
        np.asarray(y_true, dtype=float).reshape(-1),
        np.asarray(y_pred, dtype=float).reshape(-1),
    )

    nonzero = y_true != 0
    if not nonzero.any():
        return float("nan")

    relative_error = (y_true[nonzero] - y_pred[nonzero]) / y_true[nonzero]
    return np.mean(np.abs(relative_error)) * 100

# Load and preprocess the air pollution data
def load_and_preprocess_data(file_path, target_feature):
    """Read a CSV, index it by its 'Date' column and scale it with MinMaxScaler.

    Args:
        file_path: Path of the CSV file; it must contain a 'Date' column.
        target_feature: Name of the column to be predicted.

    Returns:
        A tuple ``(features_scaled, target_scaled, feature_scaler,
        target_scaler)``. ``features_scaled`` covers every non-date column,
        the target column included; ``target_scaled`` covers the target
        column alone. Each scaler is the fitted ``MinMaxScaler`` that produced
        the corresponding array.

    Raises:
        ValueError: If ``target_feature`` is not a column of the dataset.
    """
    raw = pd.read_csv(file_path)

    # Timestamps are expected as day-month-year hour:minute; anything that
    # does not parse is coerced to NaT instead of raising.
    raw['Date'] = pd.to_datetime(raw['Date'], format='%d-%m-%Y %H:%M', dayfirst=True, errors='coerce')

    # Discard rows without a usable timestamp, then index the rest by time.
    indexed = raw.dropna(subset=['Date']).set_index('Date')

    if target_feature not in indexed.columns:
        raise ValueError(f"Target column '{target_feature}' not found in the dataset.")

    # Every remaining column, the target included, is used as a model input.
    input_columns = indexed.columns

    # Separate scalers so the target can be inverse-transformed on its own.
    feature_scaler = MinMaxScaler()
    features_scaled = feature_scaler.fit_transform(indexed[input_columns])

    target_scaler = MinMaxScaler()
    target_scaled = target_scaler.fit_transform(indexed[[target_feature]])

    return features_scaled, target_scaled, feature_scaler, target_scaler

# Prepare the dataset for the model input (general structure for time series)
def create_dataset(features, target, seq_length):
    """Build sliding-window samples for one-step-ahead prediction.

    Sample ``i`` consists of rows ``i .. i + seq_length - 1`` of ``features``
    flattened into a single vector, paired with ``target[i + seq_length]``.

    The windows are written into one pre-allocated NumPy buffer before being
    handed to torch; converting a Python list of ndarrays with
    ``torch.tensor`` is very slow and triggers a PyTorch warning.

    Args:
        features: Array-like whose first axis is time.
        target: Array-like aligned with ``features`` along the first axis.
        seq_length: Number of consecutive rows in each input window.

    Returns:
        ``(X, y)`` as float32 tensors. ``X`` has shape
        ``(n_windows, seq_length * values_per_row)`` and ``y`` has shape
        ``(n_windows, *target.shape[1:])``. When there are not enough rows for
        a single window, both tensors are empty but keep those trailing
        dimensions.

    Raises:
        IndexError: If ``target`` has fewer rows than ``features`` while at
            least one window exists.
    """
    features = np.asarray(features)
    target = np.asarray(target)

    window_count = max(len(features) - seq_length, 0)
    if window_count > 0 and len(target) < len(features):
        raise IndexError("target has fewer rows than features")

    values_per_row = int(np.prod(features.shape[1:]))
    X = np.empty((window_count, seq_length * values_per_row), dtype=np.float32)
    for start in range(window_count):
        X[start] = features[start:start + seq_length].reshape(-1)

    # The label of window `start` is the row right after the window ends.
    y = target[seq_length:seq_length + window_count]

    return torch.from_numpy(X), torch.tensor(y, dtype=torch.float32)

# Define the TimesFM Model (placeholder for demonstration)
class TimesFMModel(nn.Module):
    """Stacked-LSTM regressor used as a placeholder under the TimesFM name.

    The network runs a batch-first LSTM over the input sequence and maps the
    hidden output of the final time step through a linear layer.

    Args:
        input_size: Number of values per time step.
        hidden_size: Width of the LSTM hidden state.
        output_size: Number of values produced per sample.
        num_layers: Number of stacked LSTM layers.
    """

    def __init__(self, input_size, hidden_size, output_size, num_layers=2):
        super().__init__()
        self.lstm = nn.LSTM(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
        )
        self.fc = nn.Linear(in_features=hidden_size, out_features=output_size)
        # Hypothetical frequency layer: registered as a submodule but never
        # called in forward().
        self.freq_layer = nn.Linear(in_features=hidden_size, out_features=output_size)

    def forward(self, x):
        """Return the prediction computed from the last time step of ``x``.

        ``x`` is indexed as (batch, time, feature).
        """
        sequence_out, _ = self.lstm(x)
        final_step = sequence_out[:, -1, :]
        return self.fc(final_step)

# Training the model
def train_model(model, X_train, y_train, X_val, y_val, epochs=50, batch_size=4, lr=0.001, output_dir="loss_results", target_feature=""):
    """Train ``model`` with Adam on an MSE objective and save the loss history.

    Every epoch performs a single gradient step on the whole training set,
    followed by a gradient-free evaluation on the validation set. The
    per-epoch losses are printed and finally written to
    ``<output_dir>/<target_feature>_losses.csv``.

    Args:
        model: The ``nn.Module`` to optimise in place.
        X_train, y_train: Training inputs and targets.
        X_val, y_val: Validation inputs and targets.
        epochs: Number of optimisation steps.
        batch_size: Accepted for interface compatibility; it is not used,
            since each step consumes the full training set.
        lr: Adam learning rate.
        output_dir: Directory for the loss CSV; created if missing.
        target_feature: Prefix of the CSV file name.

    Raises:
        RuntimeError: If a target tensor cannot be reshaped to the shape of
            the model's predictions.
    """
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    loss_fn = nn.MSELoss()

    # Loss history, one entry per epoch.
    train_loss_values = []
    val_loss_values = []

    for epoch in range(epochs):
        model.train()
        optimizer.zero_grad()

        y_pred_train = model(X_train)

        # Give the target the prediction's shape. Comparing a squeezed (N,)
        # prediction with an (N, 1) target would make MSELoss broadcast both
        # to (N, N) and average over every prediction/target pair.
        train_loss = loss_fn(y_pred_train, y_train.reshape(y_pred_train.shape))
        train_loss.backward()
        optimizer.step()

        train_loss_values.append(train_loss.item())

        # Validation pass: no dropout/batch-norm updates, no gradients.
        model.eval()
        with torch.no_grad():
            y_pred_val = model(X_val)
            val_loss = loss_fn(y_pred_val, y_val.reshape(y_pred_val.shape))

        val_loss_values.append(val_loss.item())

        print(f"Epoch {epoch + 1}/{epochs}, Train Loss: {train_loss.item()}, Val Loss: {val_loss.item()}")

    loss_df = pd.DataFrame({
        "Epoch": list(range(1, epochs+1)),
        "Train Loss": train_loss_values,
        "Validation Loss": val_loss_values
    })

    # The default directory is relative and may not exist yet.
    os.makedirs(output_dir, exist_ok=True)
    loss_csv_path = os.path.join(output_dir, f"{target_feature}_losses.csv")
    loss_df.to_csv(loss_csv_path, index=False)
    print(f"Losses saved to {loss_csv_path}")

# Testing the model
def test_model(model, X_test, target_scaler):
    model.eval()

    with torch.no_grad():
        predictions = model(X_test).squeeze()

        # Ensure non-negative predictions
        predictions = torch.relu(predictions).numpy()

        # Rescale predictions back to original scale
        predictions_rescaled = target_scaler.inverse_transform(predictions.reshape(-1, 1))

        return predictions_rescaled

# Evaluate model and store results
def evaluate_model(y_true, y_pred):
    """Compute the regression metrics reported for a trained model.

    Args:
        y_true: Actual values.
        y_pred: Predicted values.

    Returns:
        A tuple ``(mse, rmse, mae, mape, r2)`` where ``rmse`` is the square
        root of ``mse`` and ``mape`` comes from this module's
        ``mean_absolute_percentage_error``.
    """
    squared_error = mean_squared_error(y_true, y_pred)
    return (
        squared_error,
        np.sqrt(squared_error),
        mean_absolute_error(y_true, y_pred),
        mean_absolute_percentage_error(y_true, y_pred),
        r2_score(y_true, y_pred),
    )

# Save results to CSV
def save_results(predictions, actual_values, feature_name, output_dir):
    """Write predictions next to actual values as a two-column CSV.

    The file is stored as ``<output_dir>/<feature_name>_predictions.csv`` with
    the columns ``Predictions`` and ``Actual``. ``output_dir`` is created when
    it does not exist yet.

    Args:
        predictions: Array-like of predicted values; flattened to 1-D.
        actual_values: Array-like of actual values; flattened to 1-D.
        feature_name: Prefix of the CSV file name.
        output_dir: Destination directory.

    Raises:
        ValueError: Raised by pandas if the two inputs do not hold the same
            number of values.
    """
    # np.asarray lets plain lists be passed as well as ndarrays.
    results = pd.DataFrame({
        "Predictions": np.asarray(predictions).reshape(-1),
        "Actual": np.asarray(actual_values).reshape(-1)
    })

    os.makedirs(output_dir, exist_ok=True)
    results.to_csv(os.path.join(output_dir, f"{feature_name}_predictions.csv"), index=False)
    print(f"Results saved for {feature_name}")

# Main function to load data, train, test, and evaluate the model
if __name__ == "__main__":
    # Directory that receives every CSV written by this run.
    output_dir = r"E:/Q/RESULTS"
    os.makedirs(output_dir, exist_ok=True)

    # Input data and the columns for which a separate model is trained.
    file_path = r"E:/Q/Q_DATA/pm_sr.csv"  # Update file path for multivariate data
    target_features = ['PM2.5', 'PM10', 'RH', 'SR']

    seq_length = 12   # Rows per input window
    hidden_size = 64  # LSTM hidden width
    num_layers = 2    # Stacked LSTM layers
    output_size = 1   # One predicted value per window

    results = []

    for target_feature in target_features:
        print(f"Training and evaluating model for target: {target_feature}")

        # The target scaler is fitted per target, so the data is prepared
        # again for every target feature.
        features_scaled, target_scaled, feature_scaler, target_scaler = load_and_preprocess_data(file_path, target_feature)

        # Sliding windows come back flattened; restore the
        # [batch_size, seq_length, input_size] layout the LSTM expects.
        X, y = create_dataset(features_scaled, target_scaled, seq_length)
        X = X.view(X.shape[0], seq_length, -1)

        # Chronological 70/15/15 split. Consecutive windows overlap in all but
        # one row, so a shuffled split would place near-copies of the test
        # windows into the training set and inflate the reported scores.
        X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, shuffle=False)
        X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, shuffle=False)

        # Build and train the model; input_size is the number of values per time step.
        input_size = X_train.shape[2]
        model = TimesFMModel(input_size=input_size, hidden_size=hidden_size, output_size=output_size, num_layers=num_layers)
        train_model(model, X_train, y_train, X_val, y_val, epochs=50, batch_size=4, lr=0.001, output_dir=output_dir, target_feature=target_feature)

        # Predictions and actual values, both mapped back to original units.
        predictions = test_model(model, X_test, target_scaler)
        y_test_rescaled = target_scaler.inverse_transform(y_test.numpy().reshape(-1, 1))

        save_results(predictions, y_test_rescaled, target_feature, output_dir)

        mse, rmse, mae, mape, r2 = evaluate_model(y_test_rescaled, predictions)

        results.append({
            'Target Feature': target_feature,
            'MSE': mse,
            'RMSE': rmse,
            'MAE': mae,
            'MAPE': mape,
            'R2': r2
        })

    # One summary row per target feature.
    results_df = pd.DataFrame(results)
    results_df.to_csv(os.path.join(output_dir, "model_evaluation_results_EVAL.csv"), index=False)

    print("Evaluation results saved to 'model_evaluation_results_EVAL.csv'")


Training and evaluating model for target: PM2.5


FileNotFoundError: [Errno 2] No such file or directory: 'E:/Q/Q_DATA/pm_sr.csv'