In [1]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.graph_objects as go
import seaborn as sns
from sqlalchemy import create_engine

In [2]:

# The connection URL (which embeds the database password) is read from the
# WEATHER_DB_URL environment variable so that credentials are never stored in,
# or leaked through, the notebook file.
DATABASE_URL = os.environ.get("WEATHER_DB_URL")
if not DATABASE_URL:
    raise RuntimeError(
        "Set the WEATHER_DB_URL environment variable, e.g. "
        "postgresql+psycopg2://<user>:<password>@localhost:5432/weatherdb"
    )
engine = create_engine(DATABASE_URL)

In [3]:
# Base names of the weather metrics compared between observations and forecasts.
# After the merge each one appears as <name>_actual and <name>_forecast.
metrics = [
    'temp_c',
    'wind_kph',
    'humidity',
    'precip_mm',
    'pressure_mb',
    'cloud',
    'feelslike_c',
]

In [4]:
def compute_errors(df, metrics):
    """Add per-metric error columns to ``df`` and summarise them as MAE / RMSE.

    For every metric whose ``<metric>_actual`` and ``<metric>_forecast``
    columns are both present, an ``error_<metric>`` column (actual minus
    forecast) is written into ``df`` itself. Metrics with a missing column
    are skipped without a message.

    Args:
        df: DataFrame holding the ``*_actual`` / ``*_forecast`` columns.
            It is modified in place.
        metrics: Iterable of metric base names.

    Returns:
        A ``(df, summary)`` tuple: the same DataFrame object that was passed
        in, and a dict mapping each processed metric to
        ``{"mae": ..., "rmse": ...}``.
    """
    summary = {}
    for name in metrics:
        observed_col, predicted_col = f"{name}_actual", f"{name}_forecast"

        # Guard clause: nothing to compare unless both sides are available.
        if not all(col in df.columns for col in (observed_col, predicted_col)):
            continue

        residual_col = f"error_{name}"
        df[residual_col] = df[observed_col] - df[predicted_col]
        residuals = df[residual_col]

        mean_abs_error = residuals.abs().mean()
        root_mean_sq_error = (residuals ** 2).mean() ** 0.5
        summary[name] = {"mae": mean_abs_error, "rmse": root_mean_sq_error}

    return df, summary


def plot_forecast_vs_actual(df, metrics, city_filter=None):
    """Plot actual vs forecast values for each metric using Matplotlib.

    One figure is drawn per metric, with ``as_of_hour`` on the x-axis. Rows
    are ordered by time before plotting so the lines run left to right
    instead of following the arbitrary row order of ``df``.

    Args:
        df: DataFrame with an ``as_of_hour`` column and, per metric,
            ``<metric>_actual`` / ``<metric>_forecast`` columns. When
            ``city_filter`` is given a ``city`` column is also required.
            The frame is not modified.
        metrics: Iterable of metric base names. Metrics whose columns are
            missing are skipped without a message.
        city_filter: Optional city name; when truthy only rows whose ``city``
            equals it are plotted.

    Returns:
        None. Figures are shown via ``plt.show()``; a message is printed when
        there is nothing to plot.
    """
    filtered_df = df.copy()
    if city_filter:
        filtered_df = filtered_df[filtered_df['city'] == city_filter]

    if filtered_df.empty:
        print(f"No data for city: {city_filter}")
        return

    for metric in metrics:
        actual_col = f"{metric}_actual"
        forecast_col = f"{metric}_forecast"
        if actual_col not in filtered_df.columns or forecast_col not in filtered_df.columns:
            continue

        subset = filtered_df[['as_of_hour', actual_col, forecast_col]].dropna()
        if subset.empty:
            print(f"No data for metric: {metric}")
            continue

        # A line plot connects points in row order, so sort chronologically.
        # .assign() builds a new frame, which avoids writing into a slice.
        subset = subset.assign(
            as_of_hour=pd.to_datetime(subset['as_of_hour'])
        ).sort_values('as_of_hour')

        fig, ax = plt.subplots(figsize=(12, 5))
        ax.plot(subset['as_of_hour'], subset[actual_col], label=f"Actual {metric}")
        ax.plot(subset['as_of_hour'], subset[forecast_col], label=f"Forecast {metric}")
        ax.set_title(f"{metric.capitalize()} - Forecast vs Actual ({city_filter or 'All Cities'})")
        ax.set_xlabel("Time")
        ax.set_ylabel(metric.capitalize())
        ax.legend()
        ax.grid(True)
        fig.tight_layout()
        plt.show()


def analyze_forecast_error_trends(df, metrics, city_filter=None, turn_parameter=2, eps=1e-3):
    """Analyze error trends for each metric, detect turning points and linear trends.

    For every metric with an ``error_<metric>`` column, the non-null errors
    are split into segments at "turning points" and a straight line is fitted
    to each segment. Results are printed, nothing is returned.

    Position ``i + 1`` is a turning point when the magnitude of the error at
    ``i`` is more than ``turn_parameter`` times the magnitude of both of the
    next two errors, each offset by ``eps``.

    Args:
        df: DataFrame holding the ``error_<metric>`` columns. When
            ``city_filter`` is given a ``city`` column is required. If an
            ``as_of_hour`` column is present the rows are ordered by it, so
            the trend is evaluated chronologically. The frame is not modified.
        metrics: Iterable of metric base names; metrics without an error
            column are skipped without a message.
        city_filter: Optional city name; when truthy only rows whose ``city``
            equals it are analysed.
        turn_parameter: Magnitude ratio that marks a turning point.
        eps: Offset added to the following values before comparing.

    Returns:
        None.
    """
    filtered_df = df.copy()
    if city_filter:
        filtered_df = filtered_df[filtered_df['city'] == city_filter]

    # A trend only makes sense over time; order chronologically when possible.
    # The stable sort keeps the original relative order of same-hour rows.
    if 'as_of_hour' in filtered_df.columns:
        filtered_df = filtered_df.assign(
            as_of_hour=pd.to_datetime(filtered_df['as_of_hour'])
        ).sort_values('as_of_hour', kind='stable')

    for metric in metrics:
        error_col = f"error_{metric}"
        if error_col not in filtered_df.columns:
            continue

        series = filtered_df[error_col].dropna().reset_index(drop=True)
        if series.empty:
            print(f"No error data for {metric}")
            continue

        print(f"\nAnalyzing error trend for: {metric}")
        values = series.tolist()
        turning_points = []

        for i in range(1, len(values) - 2):
            current = abs(values[i])
            next_val = abs(values[i + 1] + eps)
            next_next_val = abs(values[i + 2] + eps)
            # Same test as |v[i] / next| > turn_parameter, written as a product
            # so that a following value of exactly -eps cannot divide by zero.
            if current > turn_parameter * next_val and current > turn_parameter * next_next_val:
                turning_points.append(i + 1)

        print(f"Turning points: {turning_points}")

        # Cut the series at each turning point: [0, t0), [t0, t1), ..., [tk, end).
        bounds = [0, *turning_points, len(values)]
        segments = [values[start:end] for start, end in zip(bounds, bounds[1:])]

        for seg in segments:
            if len(seg) < 2:
                # A line cannot be fitted to fewer than two points.
                continue
            x = list(range(len(seg)))
            try:
                a, b = np.polyfit(x, seg, 1)
            except (np.linalg.LinAlgError, ValueError, TypeError) as e:
                # Only fitting problems are reported here; programming errors
                # such as a missing import are left to surface.
                print(f"Trend analysis failed: {e}")
                continue
            trend = "increasing" if a > 0 else "decreasing" if a < 0 else "flat"
            print(f"Segment trend: {trend}, slope: {a:.4f}, intercept: {b:.4f}")


def plot_forecast_vs_actual_plotly(df, metrics, city_filter=None):
    """Interactive Plotly chart: forecast vs actual values for each metric.

    One figure is shown per metric, with ``as_of_hour`` (converted to
    datetime and sorted ascending) on the x-axis.

    Args:
        df: DataFrame with an ``as_of_hour`` column and, per metric,
            ``<metric>_actual`` / ``<metric>_forecast`` columns. When
            ``city_filter`` is given a ``city`` column is also required.
            The frame is not modified.
        metrics: Iterable of metric base names. Metrics whose columns are
            missing are skipped without a message, matching
            ``plot_forecast_vs_actual``.
        city_filter: Optional city name; when truthy only rows whose ``city``
            equals it are plotted.

    Returns:
        None. Figures are rendered with ``fig.show()``; a message is printed
        when there is nothing to plot.
    """
    filtered_df = df.copy()
    if city_filter:
        filtered_df = filtered_df[filtered_df['city'] == city_filter]

    if filtered_df.empty:
        print(f"No data for city: {city_filter}")
        return

    # .assign() returns a new frame, so nothing is written into the filtered
    # slice (which would raise SettingWithCopyWarning).
    filtered_df = filtered_df.assign(
        as_of_hour=pd.to_datetime(filtered_df['as_of_hour'])
    ).sort_values('as_of_hour')

    for metric in metrics:
        actual_col = f"{metric}_actual"
        forecast_col = f"{metric}_forecast"
        # Skip metrics that were not merged in instead of failing with KeyError.
        if actual_col not in filtered_df.columns or forecast_col not in filtered_df.columns:
            continue

        df_metric = filtered_df[['as_of_hour', actual_col, forecast_col]].dropna()

        if df_metric.empty:
            print(f"No data for metric: {metric}")
            continue

        fig = go.Figure([
            go.Scatter(x=df_metric['as_of_hour'], y=df_metric[actual_col], mode='lines+markers', name=f"Actual {metric}"),
            go.Scatter(x=df_metric['as_of_hour'], y=df_metric[forecast_col], mode='lines+markers', name=f"Forecast {metric}")
        ])

        fig.update_layout(
            title=f"{metric.capitalize()} Forecast vs Actual{' - ' + city_filter if city_filter else ''}",
            xaxis_title="Time",
            yaxis_title=metric.capitalize(),
            template="plotly_white",
            legend=dict(x=0, y=1)
        )
        fig.show()


In [5]:
# Load data
# The sub-query picks, for every (city, forecast_for) pair, the most recent
# prediction_generated_at, so only the latest forecast for each target time
# is compared against the observation.
forecast_sql = """
    SELECT wf.*
    FROM reporting_data.weather_forecast wf
    INNER JOIN (
        SELECT city, forecast_for, MAX(prediction_generated_at) AS max_generated
        FROM reporting_data.weather_forecast
        GROUP BY city, forecast_for
    ) latest
    ON wf.city = latest.city
    AND wf.forecast_for = latest.forecast_for
    AND wf.prediction_generated_at = latest.max_generated
    ORDER BY wf.city, wf.forecast_for
"""
forecast_df = pd.read_sql(forecast_sql, engine)
current_df = pd.read_sql("SELECT * FROM reporting_data.weather_current", engine)

# Convert timestamps to datetime
forecast_df["forecast_for"] = pd.to_datetime(forecast_df["forecast_for"])
forecast_df["forecast_for_hour"] = forecast_df["forecast_for"].dt.floor("h")

current_df["as_of"] = pd.to_datetime(current_df["as_of"])

# Filter actual observations to full hours only.
# .copy() makes the filtered frame independent, so adding a column below does
# not write into a slice of the raw frame (SettingWithCopyWarning).
current_df = current_df[current_df["as_of"].dt.minute == 0].copy()
current_df["as_of_hour"] = current_df["as_of"]

# Merge actual and forecast data on city and hour (inner join: only hours that
# have both an observation and a forecast survive). Columns present on both
# sides get the _actual / _forecast suffixes.
df = pd.merge(
    current_df,
    forecast_df,
    left_on=["city", "as_of_hour"],
    right_on=["city", "forecast_for_hour"],
    suffixes=("_actual", "_forecast")
)

# Compute deltas between forecast and actual values (forecast minus actual).
# `metrics` is the list defined once in the configuration cell near the top of
# the notebook; it is deliberately not re-declared here.
for metric in metrics:
    df[f"delta_{metric}"] = df[f"{metric}_forecast"] - df[f"{metric}_actual"]


In [6]:
# Preview the first two merged rows to sanity-check the suffixed
# *_actual / *_forecast columns and the derived delta_* columns.
df.head(2)

Unnamed: 0,id_actual,city,as_of,temp_c_actual,temp_f_actual,is_day_actual,condition_text_actual,condition_icon_actual,condition_code_actual,wind_mph_actual,...,prediction_generated_at,fetched_at_forecast,forecast_for_hour,delta_temp_c,delta_wind_kph,delta_humidity,delta_precip_mm,delta_pressure_mb,delta_cloud,delta_feelslike_c
0,17,Prague,2025-07-12 16:00:00,18.1,64.6,1,Light drizzle,//cdn.weatherapi.com/weather/64x64/day/266.png,1153,3.4,...,2025-07-12 20:00:05.983759,2025-07-12 20:00:05.979629,2025-07-12 16:00:00,-3.6,2.5,14,0.48,1.0,77,-3.8
1,18,London,2025-07-12 16:00:00,30.4,86.7,1,Sunny,//cdn.weatherapi.com/weather/64x64/day/113.png,1000,10.5,...,2025-07-12 20:00:08.387602,2025-07-12 20:00:08.386310,2025-07-12 16:00:00,-1.5,1.5,-3,0.0,-1.0,8,-1.5


In [7]:
# Errors
# compute_errors writes error_<metric> columns into `df` itself and returns
# that same frame, so `comparison_df` and `df` refer to one object.
metrics_to_compare = [
    'temp_c',
    'wind_kph',
    'humidity',
    'precip_mm',
    'pressure_mb',
    'cloud',
    'feelslike_c',
]
comparison_df, summary = compute_errors(df, metrics_to_compare)

# One row per metric, with the mae / rmse values as columns.
pd.DataFrame(summary).T



Unnamed: 0,mae,rmse
temp_c,1.9,2.200175
wind_kph,1.053846,1.973868
humidity,8.230769,11.458957
precip_mm,0.099231,0.188292
pressure_mb,0.384615,0.620174
cloud,17.923077,31.024804
feelslike_c,2.169231,2.302173


####  Forecast Error Summary

This summary evaluates the accuracy of weather forecasts compared to actual measured data. Two key error metrics are used:

- **MAE (Mean Absolute Error):** Represents the average size of prediction errors, regardless of direction.
- **RMSE (Root Mean Squared Error):** Penalizes larger errors more heavily and emphasizes outliers.

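Lower values indicate more accurate predictions. Each error is expressed in the unit of its own metric (°C, km/h, %, mm, mb), so the values should be judged against that metric's typical range rather than compared directly across metrics. With that in mind, temperature, wind speed, and pressure forecasts are reasonably accurate. Larger discrepancies are seen in cloud cover and humidity, which are more variable and harder to predict precisely.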


In [10]:
# Interactive forecast-vs-actual charts for Prague, limited to four metrics.
metrics_to_compare = [
    'temp_c',
    'wind_kph',
    'humidity',
    'precip_mm',
]
plot_forecast_vs_actual_plotly(
    df,
    metrics_to_compare,
    city_filter='Prague',
)


In [11]:
# Interactive forecast-vs-actual charts for London, same four metrics.
metrics_to_compare = [
    'temp_c',
    'wind_kph',
    'humidity',
    'precip_mm',
]
plot_forecast_vs_actual_plotly(
    df,
    metrics_to_compare,
    city_filter='London',
)

In [15]:
# The city is stored under its English name ("Prague"), as in the plotting
# cell for Prague and the merged-frame preview; "Praha" matches no rows and
# produced only "No error data" messages.
analyze_forecast_error_trends(df, metrics_to_compare, city_filter="Prague")