## Visualization of Results of a Model

This notebook visualizes the performance of different machine learning models based on their predictions and evaluation metrics.

The visualizations include:
- **True vs. Predicted scatter plots** to assess how closely predictions align with actual values.
- **Time series plots** showing the evolution of true and predicted values over time.
- **Residual analysis** via histograms and residual vs. predicted scatter plots to evaluate model errors.
- **Bar plots of performance metrics** (R², RMSE, MAE) across regions and datasets (train/test) for comparison.

These plots are generated automatically for each model and time period from pre-saved CSV files containing prediction results and evaluation metrics.

In [1]:
import os
import glob
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
def safe_str(s):
    """Creates a safe string for filenames and directory names.

    Args:
        s: The input string or NaN.

    Returns:
        A sanitized string safe for use in file or directory names.
    """
    if pd.isnull(s):
        return "unknown"
    s = str(s)
    return (s.replace(' ', '_')
              .replace('–', '-')
              .replace('ä', 'ae')
              .replace('ö', 'oe')
              .replace('ü', 'ue')
              .replace('ß', 'ss'))

# Configuration
PREDICTION_DIR = "../results/predictions/"
BASE_OUTPUT_DIR = "../results/figures/"
METRIC_FILE = "../results/model_comparison.csv"
os.makedirs(BASE_OUTPUT_DIR, exist_ok=True)

# Load prediction CSV files
csv_files = glob.glob(os.path.join(PREDICTION_DIR, "*.csv"))
df = pd.concat([pd.read_csv(f) for f in csv_files], ignore_index=True)
df["model"] = df["model"].astype(str)
df["periode"] = df["periode"].astype(str)

def plot_true_vs_pred(df, path):
    """Scatter plot of true values vs. predicted values.

    Args:
        df: DataFrame containing 'true' and 'prediction' columns.
        path: File path to save the plot.
    """
    plt.figure(figsize=(6, 6))
    sns.scatterplot(x='true', y='prediction', data=df, alpha=0.5)
    minval = min(df['true'].min(), df['prediction'].min())
    maxval = max(df['true'].max(), df['prediction'].max())
    plt.plot([minval, maxval], [minval, maxval], 'r--', label="Ideal")
    plt.xlabel("True")
    plt.ylabel("Prediction")
    plt.title("True vs. Prediction")
    plt.legend()
    plt.tight_layout()
    plt.savefig(path)
    plt.close()

def plot_true_pred_over_time(df, path):
    """Time series plot comparing true and predicted values.

    Args:
        df: DataFrame with 'time', 'true', and 'prediction' columns.
        path: File path to save the plot.
    """
    plt.figure(figsize=(14, 5))
    plt.plot(pd.to_datetime(df['time']), df['true'], label='True', linewidth=2)
    plt.plot(pd.to_datetime(df['time']),
             df['prediction'],
             label='Prediction',
             linestyle='--',
             linewidth=2)
    plt.xlabel("Time")
    plt.ylabel("N₂O")
    plt.title("Time Series: True vs. Prediction")
    plt.legend()
    plt.tight_layout()
    plt.savefig(path)
    plt.close()


def plot_residual_hist(df, path):
    """Histogram of residuals (true - prediction).

    Args:
        df: DataFrame with 'true' and 'prediction' columns.
        path: File path to save the plot.
    """
    residuals = df['true'] - df['prediction']
    plt.figure(figsize=(8, 4))
    sns.histplot(residuals, kde=True, bins=30, color='purple')
    plt.xlabel("Residual (True - Prediction)")
    plt.title("Histogram of Residuals")
    plt.tight_layout()
    plt.savefig(path)
    plt.close()

def plot_residual_vs_pred(df, path):
    """Plot residuals against predicted values.

    Args:
        df: DataFrame with 'true' and 'prediction' columns.
        path: File path to save the plot.
    """
    residuals = df['true'] - df['prediction']
    plt.figure(figsize=(8, 5))
    sns.scatterplot(x=df['prediction'], y=residuals, alpha=0.5)
    plt.axhline(0, color='r', linestyle='--')
    plt.xlabel("Prediction")
    plt.ylabel("Residual (True - Prediction)")
    plt.title("Residuals vs. Prediction")
    plt.tight_layout()
    plt.savefig(path)
    plt.close()

# Generate plots for each model and period
for model in sorted(df["model"].unique()):
    model_safe = safe_str(model)
    model_output_dir = os.path.join(BASE_OUTPUT_DIR, model_safe)
    os.makedirs(model_output_dir, exist_ok=True)

    for period in sorted(df["periode"].unique()):
        df_sel = df[(df["model"] == model) & (df["periode"] == period)]
        if df_sel.empty:
            continue
        period_safe = safe_str(period)
        suffix = f"{model_safe}_{period_safe}"
        plot_true_vs_pred(df_sel, os.path.join(model_output_dir, f"true_vs_pred_{suffix}.png"))
        plot_true_pred_over_time(df_sel, os.path.join(model_output_dir, f"time_series_{suffix}.png"))
        plot_residual_hist(df_sel, os.path.join(model_output_dir, f"residual_hist_{suffix}.png"))
        plot_residual_vs_pred(df_sel, os.path.join(model_output_dir, f"residual_vs_pred_{suffix}.png"))
        print(f"Plots for {model}, {period} saved.")

# Comparison plots (R2, RMSE, MAE)
df_metric = pd.read_csv(METRIC_FILE)

METRICS = [
    ("R2", "Train R2", "Test R2"),
    ("RMSE", "Train RMSE", "Test RMSE"),
    ("MAE", "Train MAE", "Test MAE")
]

for model in df_metric["Modell"].unique():
    model_safe = safe_str(model)
    model_output_dir = os.path.join(BASE_OUTPUT_DIR, model_safe)
    os.makedirs(model_output_dir, exist_ok=True)

    df_model_metric = df_metric[df_metric["Modell"] == model]
    if df_model_metric.empty:
        continue

    for metric_name, train_col, test_col in METRICS:
        df_plot = df_model_metric.melt(
            id_vars=["Bereich", "Modell"],
            value_vars=[train_col, test_col],
            var_name="Dataset",
            value_name=metric_name
        )
        df_plot["Dataset"] = df_plot["Dataset"].apply(
            lambda x: "Train" if "Train" in x else "Test"
        )

        plt.figure(figsize=(10, 5))
        sns.barplot(
            data=df_plot,
            x="Bereich",
            y=metric_name,
            hue="Dataset",
            ci=None,
            palette={"Train": "skyblue", "Test": "salmon"}
        )
        plt.title(f"{model}: Train and Test {metric_name}")
        plt.xlabel("Region")
        plt.ylabel(metric_name)
        plt.legend(title="Dataset")
        plt.tight_layout()

        filename = os.path.join(
            model_output_dir,
            f"model_comparison_{metric_name.lower()}.png"
        )
        plt.savefig(filename)
        plt.close()
        print(f"Model comparison for {model}, {metric_name} saved to {filename}")

Plots for KNN, Januar–März saved.
Plots for KNN, Mai–Oktober saved.
Plots for KNN, März–Mitte Mai saved.
Model comparison for KNN, R2 saved to ../results/figures/KNN\model_comparison_r2.png



The `ci` parameter is deprecated. Use `errorbar=None` for the same effect.

  sns.barplot(

The `ci` parameter is deprecated. Use `errorbar=None` for the same effect.

  sns.barplot(


Model comparison for KNN, RMSE saved to ../results/figures/KNN\model_comparison_rmse.png
Model comparison for KNN, MAE saved to ../results/figures/KNN\model_comparison_mae.png



The `ci` parameter is deprecated. Use `errorbar=None` for the same effect.

  sns.barplot(
