In [2]:
# Load Libraries

import os
import sys
import joblib
from datetime import datetime
import glob
import pandas as pd
import numpy as np
import plotly.express as px

import matplotlib.pyplot as plt
import seaborn as sns
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from statsmodels.tsa.seasonal import seasonal_decompose

import matplotlib.pyplot as plt
from statsmodels.tsa.ar_model import AutoReg
from statsmodels.tsa.arima.model import ARIMA
from statsmodels.tsa.statespace.sarimax import SARIMAX
from sklearn.metrics import mean_squared_error, mean_absolute_error
from pmdarima import auto_arima



In [7]:
# Put the project root (the parent of the current working directory) on
# sys.path so that `scripts` can be imported as a package. The root is added
# rather than the 'scripts' folder itself because the imports below use the
# `scripts.<module>` form.
project_root = os.path.abspath("..")
if project_root not in sys.path:
    sys.path.append(project_root)

# Project helpers used throughout the notebook.
# NOTE(review): the recorded run of this cell ended with
# "FileNotFoundError: ... 'config.yaml'" — presumably one of these modules
# opens config.yaml relative to the working directory at import time; confirm
# where the file is expected before relying on the cells below.
from scripts.wrangle import (
    wrangle,
    merge_monthly_files,
    load_combined_series,
)
from scripts.model_training import (
    train_sarima_model,
    save_model,
    load_latest_model,
    forecast_with_model,
)
from scripts.evaluate import evaluate_forecast, plot_forecast
from scripts.validate import load_latest_wfv_results, main


FileNotFoundError: [Errno 2] No such file or directory: 'config.yaml'

In [1]:
# Run the project's `wrangle` helper on a single monthly readings file and
# preview the result with .head().
# NOTE: `wrangle` comes from the `scripts.wrangle` import cell; that cell must
# have run successfully first, otherwise this raises NameError.
filepath = "../data/monthly_p2_readings/2023_11.csv"

df = wrangle(filepath=filepath)
df.head()

NameError: name 'wrangle' is not defined

In [None]:
# Combine multiple datasets: call merge_monthly_files() with its defaults
# (which files it reads is decided inside scripts.wrangle) and preview the result.

merge_monthly_files().head()

In [None]:
# Load the combined dataset; load_combined_series() is unpacked into the
# train and test splits used by the rest of the notebook.
train, test = load_combined_series()

# Report the shape of each split as a quick sanity check.
print(f"Train split: {train.shape}")
print(f"Test split: {test.shape}")

In [None]:
# Exploratory analysis of the training split. Everything below only reads `y`;
# each figure is built through the explicit Axes interface so that titles and
# labels are attached to the intended panel.
y = train

# --- PM2.5 over time ---------------------------------------------------------
fig, ax = plt.subplots(figsize=(15, 5))
ax.plot(y)
ax.set_title("PM2.5 Values over Time")
ax.set_xlabel("Years")
ax.set_ylabel("PM2.5")
plt.show()

# --- Rolling mean / standard deviation ---------------------------------------
# NOTE(review): the labels below describe a 30-day (720-hour) window, while the
# window is 120 observations. The two agree only if the series has one reading
# every 6 hours — confirm against load_combined_series.
window = 120
rolling_mean = y.rolling(window=window).mean()
rolling_std = y.rolling(window=window).std()

fig, ax = plt.subplots(figsize=(15, 5))
ax.plot(rolling_mean, label="Rolling mean 30d")
ax.plot(rolling_std, label="Rolling std 30d")
ax.set_title("Rolling Average and Standard Deviation over 30 days or 720 hours")
ax.set_xlabel("Years")
ax.set_ylabel("PM2.5")
ax.legend()
plt.show()

# --- Distribution: histogram and boxplot side by side ------------------------
fig, (ax_hist, ax_box) = plt.subplots(1, 2, figsize=(15, 5))

# Histogram (with KDE overlay): the y-axis is the per-bin count.
sns.histplot(y, kde=True, bins=30, color="skyblue", ax=ax_hist)
ax_hist.set_title("PM2.5 Distribution")
ax_hist.set_ylabel("Frequency [count]")

# Horizontal boxplot: the values run along the x-axis and there is no count
# axis, so label x with the measured quantity and leave y unlabelled (the
# previous "Frequency [count]" y-label was carried over from the histogram).
sns.boxplot(y, color="lightcoral", orient="h", ax=ax_box)
ax_box.set_title("PM2.5 Boxplot: There appears to be Outliers")
ax_box.set_xlabel("PM2.5")
ax_box.set_ylabel("")

plt.show()

# --- Seasonal decomposition --------------------------------------------------
# Additive decomposition with a seasonal period of 120 observations (the same
# length as the rolling window above).
period = 120
decomposition = seasonal_decompose(y, model="additive", period=period)

# decomposition.plot() builds and returns its own figure; title that figure
# directly instead of relying on it being pyplot's "current" figure.
decomposition_fig = decomposition.plot()
decomposition_fig.suptitle("Seasonal Decomposition (Additive Model)", fontsize=14)
decomposition_fig.tight_layout()
plt.show()

# --- Autocorrelation (ACF) ---------------------------------------------------
fig, ax = plt.subplots(figsize=(14, 5))
plot_acf(y, ax=ax)
# plot_acf sets its own title; override it after the call.
ax.set_title("Autocorrelation (ACF)")
fig.tight_layout()
plt.show()

# --- Partial autocorrelation (PACF) ------------------------------------------
fig, ax = plt.subplots(figsize=(14, 5))
plot_pacf(y, ax=ax)
ax.set_title("Partial Autocorrelation (PACF)")
fig.tight_layout()
plt.show()

# testing the created Scripts

In [None]:
# Train the model through the project helper, called with its defaults.
# NOTE(review): presumably fits a SARIMA model on the training data loaded
# inside scripts.model_training — confirm there.
model = train_sarima_model()

In [None]:
# Display the summary of the model trained in the previous cell.
model.summary()

In [None]:
# Persist the trained model with the project helper; no path is passed, so the
# destination is decided inside save_model.
save_model(model)

In [None]:
# Reload a saved model via load_latest_model(). This rebinds `model`, replacing
# the in-memory object from the training cell; print its type as a sanity check.
model = load_latest_model()
print(type(model))

In [None]:
# Forecast with the loaded model and preview the result. Only the model is
# passed, so the horizon and any other settings come from
# forecast_with_model's defaults.
y_pred = forecast_with_model(model)
y_pred.head()

In [None]:
# Load the latest walk-forward validation results file and preview it.
wfv_df = load_latest_wfv_results()

wfv_df.head()

In [None]:
# Evaluate the model: score the "y_pred" column of the walk-forward results
# against the test split. The two values returned by evaluate_forecast are
# unpacked as (mse, mae).

mse, mae = evaluate_forecast(test, wfv_df["y_pred"])

print(f"MSE: {mse}")
print(f"MAE: {mae}")

# Plot the evaluation: pass the same test split and predictions to plot_forecast.

plot_forecast(test, wfv_df["y_pred"])

In [None]:
# Test the `main` pipeline from scripts.validate on the train/test splits.
# NOTE: this rebinds wfv_df, mse and mae, replacing the values produced by the
# earlier cells.
wfv_df, mse, mae = main(train, test)

print(f"MSE: {mse}")
print(f"MAE: {mae}")

# Preview the results frame returned by the pipeline.
wfv_df.head()