In [13]:
# Type hints
from typing import Tuple

# Data manipulation
import pandas as pd
import numpy as np

# Data visualization
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.io as pio
import seaborn as sn
pd.options.plotting.backend = 'plotly'


# Time series analysis algorithms
from statsmodels.tsa.statespace.sarimax import SARIMAX
from random import random
from prophet import Prophet

# Our data pre-processing
from utils.generate_dataset import *


In [5]:
# dataset = get_dataset(save=False)
dataset = pd.read_csv('data_v/dataset.csv')

In [None]:
dataset.info()

In [None]:
dataset.plot(width=1100)

In [None]:
corrMatrix = dataset.corr()
px.imshow(corrMatrix, title="Correlogram of dataset", width=1100)

In [None]:
temperature_series, precipitation_series, pm25_series, pm10_series = extract_time_series(dataset)

In [6]:
SPLIT_INDEX_TRAIN = 1000
SPLIT_INDEX_TEST = 100

In [14]:
# print(pm25_series_for_prophet.head())
pm25_series_for_prophet = dataset.loc[:, ["Date/time", "PM2.5 [ug/m3]"]]
pm25_series_for_prophet.rename(columns={"Date/time":"ds", "PM2.5 [ug/m3]":"y"}, inplace=True)
df = pd.DataFrame(pm25_series_for_prophet[-SPLIT_INDEX_TRAIN:-SPLIT_INDEX_TEST])
df_display = pd.DataFrame(pm25_series_for_prophet[-SPLIT_INDEX_TRAIN:-(SPLIT_INDEX_TEST-1)])
df_test = pd.DataFrame(pm25_series_for_prophet[-SPLIT_INDEX_TEST:])
# df = pm25_series_for_prophet.tail(100)
df["ds"] = pd.to_datetime(df["ds"])

In [15]:
#remove timezone from column
df["ds"] = df["ds"].dt.tz_localize(None)

In [16]:
#Fit the model to the dataframe
m = Prophet()
m.fit(df)

INFO:prophet:Disabling yearly seasonality. Run prophet with yearly_seasonality=True to override this.
INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this.


<prophet.forecaster.Prophet at 0x7fb818ea4b50>

In [17]:
#Helper method creating a placeholder for predictions
future = m.make_future_dataframe(periods=200)[:100]

In [18]:
#Make predictions based on the training above
forecast = m.predict(future)

In [21]:

fig = go.Figure()
fig.add_trace(go.Scatter(x=list(df_display["ds"])[-SPLIT_INDEX_TEST:], y=df_display["y"][-SPLIT_INDEX_TEST:],
                    mode='lines',
                    name='Before value'))
fig.add_trace(go.Scatter(x=list(df_test["ds"]), y=df_test["y"],
                    mode='lines',
                    name='Truth value'))
fig.add_trace(go.Scatter(x=list(df_test["ds"]), y=forecast["yhat"],
                    mode='lines',
                    name='Prediction'))
fig.update_layout(width=1920, height=1000, yaxis_title="PM2.5")

fig.show()


In [24]:
def split_into_train_test_sets(X: np.array) -> Tuple[np.array, np.array]:
    train_size = int(0.9 * len(X))
    test_size = len(X)  - train_size
    

    train_set, test_set  = X[0: train_size], X[train_size: train_size + test_size]
    
    print(f"""
    Total dataset length: {len(X)}
    
    Train set shape: {train_set.shape} ({train_set.shape[0] / len(X) * 100:0.2f}% of dataset)
    Test set shape: {test_set.shape} ({test_set.shape[0] / len(X) * 100:0.2f}% of dataset)
    """)

    return train_set,  test_set

In [25]:
# Clement qui membete a faire des nouvelles fonction
train_set, test_set = split_into_train_test_sets(X=pm25_series.values[-1000:])

model = SARIMAX(train_set, order=(1, 1, 1), seasonal_order=(0, 0, 0, 0), enforce_invertibility=False, enforce_stationarity=False)
model_fit = model.fit(disp=False)


    Total dataset length: 1000
    
    Train set shape: (900,) (90.00% of dataset)
    Test set shape: (100,) (10.00% of dataset)
    


In [None]:
PREDICTION_NUMBER = 100

In [None]:
train_set, test_set = split_into_train_test_sets(X=pm25_series.values[-1000:])
predictions = []
for i in range(PREDICTION_NUMBER):
    model = SARIMAX(train_set, order=(4, 1, 4), seasonal_order=(1, 0, 0, 12), enforce_invertibility=False,enforce_stationarity=False)
    model_fit = model.fit(disp=False)
    yhat = model_fit.forecast(1)
    predictions.append(yhat[0])
    train_set = np.append(train_set, yhat[0])

In [26]:
train_set, test_set = split_into_train_test_sets(X=pm25_series.values[-1000:])
model = SARIMAX(train_set)
model_fit = model.fit(train_set)
yhat = model_fit.forecast(100)


    Total dataset length: 1000
    
    Train set shape: (900,) (90.00% of dataset)
    Test set shape: (100,) (10.00% of dataset)
    

invalid value encountered in sqrt


Maximum Likelihood optimization failed to converge. Check mle_retvals



ValueError: On entry to DLASCL parameter number 4 had an illegal value

In [None]:


fig = go.Figure()
fig.add_trace(go.Scatter(x=list(dataset["Date/time"])[-110:-100], y=train_set,
                    mode='lines',
                    name='Before value'))
fig.add_trace(go.Scatter(x=list(dataset["Date/time"])[-100:], y=test_set,
                    mode='lines',
                    name='Truth value'))
fig.add_trace(go.Scatter(x=list(dataset["Date/time"])[-100:], y=np.array(predictions).flatten(),
                    mode='lines',
                    name='Prediction'))
# fig.add_trace(go.Scatter(x=list(dataset["Date/time"])[-100:], y=yhat[:20],
#                     mode='lines',
#                     name='Prediction auto'))
fig.update_layout(width=1100, yaxis_title="PM2.5")

fig.show()
