In [None]:
import os

import pandas as pd
import numpy as np

import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots

from statsmodels.tsa.stattools import adfuller
from datetime import date

In [None]:
# Switch the working directory to the parent folder; later cells open files
# through relative "resources/..." paths.
# NOTE: this cell is not idempotent - every re-run climbs one more level, so
# restart the kernel before executing it again.
os.chdir("..")
os.getcwd()

# Load data

In [None]:
# Read the full dataset, then keep only the calendar columns plus every
# measurement whose column name starts with "OSLO".
origin_df = pd.read_csv("resources/weather_prediction_dataset.csv")
base_columns = ["DATE", "MONTH"]
oslo_columns = [name for name in origin_df.columns if name.startswith("OSLO")]
columns = base_columns + oslo_columns
# The last row is dropped (NOTE(review): the reason is not stated in this
# notebook - confirm). Explicit .loc/.iloc replaces the chained `[...][:-1]`
# indexing, and .copy() gives an independent frame, so the column assignments
# made in later cells never write into a slice of another DataFrame (which can
# be flagged with SettingWithCopyWarning on pandas without copy-on-write).
origin_df = origin_df.loc[:, columns].iloc[:-1].copy()

In [None]:
# Summary statistics of the selected columns.
origin_df.describe()

In [None]:
# Column dtypes and non-null counts of the selected columns.
origin_df.info()

In [None]:
# Peek at the first rows of the filtered frame.
origin_df.head()

# Basic preprocessing

In [None]:
# Derive YEAR and DAY from the textual form of DATE (first four and last two
# characters), then rebuild DATE as `datetime.date` objects from
# YEAR / MONTH / DAY.
# Everything is vectorised: the previous version built the dates in a Python
# loop with three `.iloc` lookups per row.
date_text = origin_df["DATE"].astype(str)
origin_df["YEAR"] = date_text.str[:4].astype("int64")
origin_df["DAY"] = date_text.str[-2:].astype("int64")
# pd.to_datetime assembles timestamps from year/month/day components;
# `.dt.date` converts them to plain `datetime.date` values, the same type the
# row-by-row `date(...)` construction produced.
origin_df["DATE"] = pd.to_datetime(
    {
        "year": origin_df["YEAR"],
        "month": origin_df["MONTH"],
        "day": origin_df["DAY"],
    }
).dt.date

origin_df.head()

# Basic visualization

In [None]:
def basic_time_plot(df: pd.DataFrame, col: str):
    """Show a line chart of one column of ``df`` against its ``DATE`` column.

    Args:
        df: frame containing a ``DATE`` column and the column named by ``col``.
        col: name of the column drawn on the y axis; also used as the title.

    The figure is displayed immediately and nothing is returned.
    """
    line_options = {"x": "DATE", "y": col, "title": col, "height": 300}
    px.line(df, **line_options).show()

In [None]:
"""You can observe each column"""

# Plot one series at a time; uncomment a line below to inspect another column.
basic_time_plot(origin_df, "OSLO_temp_mean")
# basic_time_plot(origin_df, "OSLO_temp_min")
# basic_time_plot(origin_df, "OSLO_temp_max")

# Stationarity
- diff = current value - mean value of the same calendar month
- it looks like an additive model: monthly mean temperature + diff

In [None]:
# Work on a copy so the values written into stationary_df do not modify origin_df.
stationary_df = origin_df.copy()

In [None]:
# Remove the seasonal component: from every OSLO measurement subtract the mean
# of that measurement over all rows that share the same calendar MONTH.
# Both operands are read from `origin_df`, so re-running this cell always gives
# the same result instead of subtracting the monthly mean a second time.
# `transform("mean")` broadcasts each month's mean back onto the original rows,
# which replaces the per-row `.iloc` lookup and the intermediate Python lists.
month_groups = origin_df.groupby("MONTH")
for col in oslo_columns:
    stationary_df[col] = origin_df[col] - month_groups[col].transform("mean")

In [None]:
# Overlay the raw mean temperature with its de-seasonalised counterpart.
fig = go.Figure()

fig.add_trace(
    go.Scattergl(
        name="Raw temp",
        x=origin_df['DATE'],
        y=origin_df['OSLO_temp_mean'],
        mode='lines'
    )
)
fig.add_trace(
    go.Scattergl(
        # stationary_df holds "value - monthly mean", so the legend states the
        # operands in that order (the old label had them reversed).
        name="Raw temp - monthly mean temp",
        x=stationary_df['DATE'],
        y=stationary_df['OSLO_temp_mean'],
        mode='lines'
    )
)

# Title and axis labels let the figure stand alone when the notebook is skimmed.
fig.update_layout(
    title="OSLO_temp_mean: raw vs. monthly mean removed",
    xaxis_title="DATE",
    yaxis_title="OSLO_temp_mean",
)

fig.show()

In [None]:
# Augmented Dickey-Fuller test: element [1] of the result is the p-value.
# A p-value below 0.05 rejects the unit-root null hypothesis, which is read
# here as "the series is stationary".
adf_p_value = adfuller(stationary_df['OSLO_temp_mean'])[1]
print("Is stationary? ", adf_p_value < 0.05)

In [None]:
# Persist the de-seasonalised frame (path is relative to the current working
# directory). With the default index=True the row index is written as the
# first, unnamed CSV column - NOTE(review): confirm readers of this file expect it.
stationary_df.to_csv("resources/weather_prediction_stationary_dataset.csv")

# Trend (per year)
- no regular trend

In [None]:
# Median of OSLO_temp_mean per YEAR, drawn as a line to look for a long-term trend.
yearly_median_df = (
    origin_df.groupby("YEAR").agg({"OSLO_temp_mean": "median"}).reset_index()
)
# Build and show in two steps: `.show()` returns None, so chaining it onto the
# assignment left `fig` bound to None instead of the figure.
fig = px.line(
    yearly_median_df,
    x="YEAR",
    y="OSLO_temp_mean",
    width=800
)
fig.show()

# ML task (see next notebooks)
**Predict the mean temperature for each week of 2009**


## Strategies using only one feature (temp mean)
- moving mean/median with different time lags


## Strategies using multiple features in 1-dimensional vector: regression models
- linear regression
- decision tree
- ensemble (random forest, gradient boost, xgboost, lgbm)


## Strategies using multiple features in multiple-dimensional vector
- recurrent nn
- lstm nn