In [None]:
import os

import pandas as pd
import numpy as np

import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots

from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.tree import DecisionTreeRegressor

from datetime import date, timedelta

In [None]:
import warnings

# Install a blanket "ignore" filter so no warning of any category is shown
# by the cells below.
warnings.simplefilter("ignore")

In [None]:
# The data files are read through paths relative to the project root
# ("resources/..."), so the working directory has to be moved one level up.
# The move is guarded: an unconditional os.chdir("..") climbs one more level
# every time the cell is re-run, which silently breaks the relative paths.
if not os.path.isfile(os.path.join("resources", "weather_prediction_dataset.csv")):
    os.chdir("..")
os.getcwd()

# Load data

In [None]:
origin_df = pd.read_csv("./resources/weather_prediction_dataset.csv")

# Keep the two calendar columns plus every column whose name starts with
# "OSLO", and drop the final row of the file.
oslo_columns = [name for name in origin_df.columns if name.startswith("OSLO")]
base_columns = ["DATE", "MONTH"]
columns = base_columns + oslo_columns
origin_df = origin_df[columns].iloc[:-1]

# DATE is read as text here: the first four characters give the year and the
# last two give the day. MONTH comes from its own column.
origin_df["YEAR"] = origin_df["DATE"].map(lambda stamp: int(str(stamp)[:4]))
origin_df["DAY"] = origin_df["DATE"].map(lambda stamp: int(str(stamp)[-2:]))

# Replace DATE with real datetime.date objects built from the three parts.
origin_df["DATE"] = [
    date(year=year, month=month, day=day_of_month)
    for year, month, day_of_month in zip(origin_df["YEAR"], origin_df["MONTH"], origin_df["DAY"])
]

origin_df.head()

In [None]:
# Second dataset; its OSLO_temp_mean column is used below as "stationary_temp".
stationary_df = pd.read_csv(
    filepath_or_buffer="resources/weather_prediction_stationary_dataset.csv",
)

# Prepare dataset

In [None]:
# Mean of OSLO_temp_mean per MONTH value, as a {month: mean} dictionary.
monthly_temp = origin_df.groupby("MONTH")["OSLO_temp_mean"].mean().to_dict()
monthly_temp

In [None]:
# One row per date: the raw Oslo mean temperature from origin_df next to the
# OSLO_temp_mean column of stationary_df (matched on the row index).
temperature_df = (
    origin_df[["DATE", "OSLO_temp_mean"]]
    .rename(columns={"DATE": "date", "OSLO_temp_mean": "raw_temp"})
    .assign(stationary_temp=stationary_df["OSLO_temp_mean"])
)
temperature_df

In [None]:
# Add seven lag features: stationary_temp shifted down by 1..7 rows, so the
# first rows of each lag column are NaN.
temperature_df = temperature_df.assign(
    **{
        f"stationary_temp_lag{offset}": temperature_df["stationary_temp"].shift(offset)
        for offset in range(1, 8)
    }
)

temperature_df.head(10)

# Implement DecisionTree
- predict mean temp for each week
- starting point 2000-01-08

In [None]:
def train_model(current_df: pd.DataFrame, lag_cols: list[str], random_state: int = 0):
    """Fit a decision tree on every row of ``current_df`` except the last 7.

    Args:
        current_df: Frame holding a ``stationary_temp`` column (the target)
            and the feature columns named in ``lag_cols``.
        lag_cols: Names of the feature columns.
        random_state: Seed handed to ``DecisionTreeRegressor`` so repeated
            runs build the same tree and report the same metrics.

    Returns:
        The fitted ``DecisionTreeRegressor``.
    """
    # The last 7 rows are the forecast window and must not be seen in training.
    features = current_df[lag_cols].to_numpy()[:-7]
    target = current_df["stationary_temp"].to_numpy()[:-7]

    # Lag columns built with shift() start with NaN; drop incomplete rows so
    # the fit does not depend on the estimator's handling of missing values.
    complete = ~(pd.isna(features).any(axis=1) | pd.isna(target))

    return DecisionTreeRegressor(random_state=random_state).fit(
        features[complete], target[complete]
    )
    

def predict(current_df: pd.DataFrame, lag_cols: list[str], model, day: int) -> float:
    """Return the model's prediction for one row of the trailing 7-row window.

    ``day`` is the offset inside that window: 0 selects the 7th row from the
    end of ``current_df`` and 6 selects the last row. Only the ``lag_cols``
    columns of that row are passed to ``model.predict``.
    """
    lag_matrix = current_df[lag_cols].to_numpy()
    row_position = day - 7
    # List indexing keeps the 2-D (1, n_features) shape the model expects.
    single_sample = lag_matrix[[row_position]]
    prediction = model.predict(single_sample)
    return prediction[0]


def update_current_df(
    day: int,
    lag_cols: list[str],
    current_df: pd.DataFrame,
    preds: list[float],
    horizon: int = 7,
) -> pd.DataFrame:
    """Feed the predictions made so far into the row that is predicted next.

    The last ``horizon`` rows of ``current_df`` are the forecast window and
    ``day`` (0-based) is the row of that window that has just been predicted.
    The next row (``day + 1``) gets ``lag_cols[0]`` set to the newest
    prediction, ``lag_cols[1]`` to the one before it, and so on, for as many
    lags as there are predictions in the current window. Longer lags keep
    their existing values.

    The row written is ``day + 1 - horizon`` counted from the end. Writing to
    ``-7 - day`` instead targets the row just predicted or a training row, so
    predictions never reach the rows still to be forecast.

    Args:
        day: 0-based index, inside the window, of the row just predicted.
        lag_cols: Lag column names ordered from lag 1 upwards.
        current_df: Frame whose trailing ``horizon`` rows form the window;
            it is modified in place.
        preds: All predictions so far; the last ``day + 1`` entries belong
            to the current window.
        horizon: Number of rows in the forecast window.

    Returns:
        The same ``current_df`` object, for call-site convenience.
    """
    next_day = day + 1
    rows_from_end = horizon - next_day

    # Nothing to update after the last day of the window, or when the frame
    # is too short to contain the target row.
    if rows_from_end <= 0 or len(current_df) < rows_from_end:
        return current_df

    target_row = -rows_from_end
    usable_lags = min(next_day, len(lag_cols), len(preds))
    for i in range(usable_lags):
        # lag_cols[i] is the (i + 1)-step lag, i.e. the prediction made
        # i steps before the newest one.
        column_position = current_df.columns.get_loc(lag_cols[i])
        current_df.iloc[target_row, column_position] = preds[-(i + 1)]

    return current_df


def show_predictions(targets, preds, mae, mse, lag):
    """Draw the real and predicted series as two lines on one figure.

    Both series are plotted against their position (0, 1, 2, ...) and the
    title reports the lag count together with the rounded MAE and MSE.
    The figure is displayed; nothing is returned.
    """
    day_axis = list(range(len(targets)))
    labelled_series = (
        ("Real", targets),
        ("7-days predictions", preds),
    )

    fig = go.Figure()
    for label, values in labelled_series:
        fig.add_trace(go.Scattergl(name=label, x=day_axis, y=values, mode="lines"))

    title_text = f"<b>7 days predictions in 2009</b><br>Lag days = {lag}<br>mae={round(mae, 2)}, mse={round(mse, 2)}"
    # Horizontal legend anchored at the left edge, just under the title.
    legend_style = dict(orientation="h", yanchor="bottom", y=0.91, xanchor="left", x=0)

    fig.update_layout(
        title=title_text,
        height=500,
        legend=legend_style,
        xaxis_title="Day",
        yaxis_title="Temp [C]",
    )

    fig.show()

In [None]:
# Names of the seven lag feature columns, ordered from lag 1 to lag 7.
origin_lag_cols = [f"stationary_temp_lag{n}" for n in range(1, 8)]
mae_data, mse_data = {}, {}

# Evaluate one model family per number of lag features (2 to 7).
for lag in range(2, 8):
    targets, preds, months = [], [], []
    lag_cols = origin_lag_cols[:lag]
    finish_date = date(year=2009, month=1, day=1)

    # Walk forward in 7-day steps: each pass trains on the rows before
    # finish_date minus their last 7, then predicts those last 7 rows.
    while True:
        current_df = temperature_df[temperature_df["date"] < finish_date]
        current_df = current_df[["date", "raw_temp", "stationary_temp"] + lag_cols]

        # Stop once fewer than 7 rows of temperature_df lie outside the slice.
        if len(temperature_df) - len(current_df) < 7:
            break

        model = train_model(current_df, lag_cols)
        targets.extend(current_df["stationary_temp"].to_list()[-7:])
        months.extend(x.month for x in current_df["date"].to_list()[-7:])

        for day in range(7):
            preds.append(predict(current_df, lag_cols, model, day))
            # Hand the predictions so far to update_current_df, which rewrites
            # lag values inside current_df.
            current_df = update_current_df(day, lag_cols, current_df, preds)

        finish_date += timedelta(days=7)

    assert len(preds) == len(targets)
    assert len(months) == len(preds)

    # Add the per-month mean back so errors are measured on the temperature
    # scale rather than on the stationary series.
    targets = [monthly_temp[month] + value for month, value in zip(months, targets)]
    preds = [monthly_temp[month] + value for month, value in zip(months, preds)]

    mae = mean_absolute_error(targets, preds)
    mse = mean_squared_error(targets, preds)
    mae_data[lag], mse_data[lag] = mae, mse

    show_predictions(targets, preds, mae, mse, lag)

In [None]:
# Two bar charts side by side: MAE per lag count on the left, MSE on the right.
fig = make_subplots(rows=1, cols=2, subplot_titles=("mae", "mse"))

for col_number, metric_by_lag in enumerate((mae_data, mse_data), start=1):
    fig.add_trace(
        go.Bar(
            x=list(metric_by_lag),
            y=list(metric_by_lag.values()),
            text=[round(value, 2) for value in metric_by_lag.values()],
            marker_color="teal",
            showlegend=False,
        ),
        row=1,
        col=col_number,
    )

# Both panels share one y-range: 0 up to the largest value of either metric
# plus 1.
shared_axis = dict(
    range=(0, max([*mae_data.values(), *mse_data.values()]) + 1),
    title="error",
)

fig.update_layout(
    title="Error metrics",
    width=1200,
    height=500,
    xaxis_title="Lag days",
    xaxis2_title="Lag days",
    yaxis=shared_axis,
    yaxis2=dict(shared_axis),
)

fig.show()