In [None]:
import pandas as pd
import numpy as np
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from statsmodels.tsa.stattools import adfuller, acf, pacf
from statsmodels.tsa.arima.model import ARIMA
import matplotlib.pyplot as plt
from statsmodels.tsa.statespace.sarimax import SARIMAX
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_squared_error
import numpy as np
import pandas as pd

In [None]:
# Location of the raw CSV, kept in one named constant so it is easy to repoint.
# NOTE(review): this is an absolute, machine-specific path — consider a path
# relative to the notebook before sharing it.
DATA_PATH = 'G:/python_time_serise/1test.csv'

df = pd.read_csv(DATA_PATH)
df.head()

In [None]:
# Assemble a single datetime column from the separate year/month/day columns.
calendar_parts = df[['year', 'month', 'day']]
df['Date'] = pd.to_datetime(calendar_parts)
df.head()

In [None]:
# Report the time span covered by the Date column.
date_values = df['Date']
earliest_date, latest_date = date_values.min(), date_values.max()

print(f"Earliest date: {earliest_date}")
print(f"Latest date: {latest_date}")

In [None]:
# Report the size of the loaded frame.
num_rows, num_cols = len(df.index), len(df.columns)
print(f"Number of rows: {num_rows}")
print(f"Number of columns: {num_cols}")

In [None]:
def run_adf_test(series, name):
    """Run an Augmented Dickey-Fuller test and print its key figures.

    Missing values are dropped from ``series`` before the test is run.

    Parameters
    ----------
    series
        Series-like object exposing ``dropna()``; passed to ``adfuller``.
    name
        Label used in the printed header.

    Returns
    -------
    None
        The statistic, p-value and critical values are printed only.
    """
    adf_output = adfuller(series.dropna())
    # Positions 0, 1 and 4 hold the statistic, the p-value and the
    # critical-value mapping respectively.
    statistic, p_value, critical_values = adf_output[0], adf_output[1], adf_output[4]

    print(f"ADF Test Results for {name}:")
    print(f"ADF Statistic: {statistic}")
    print(f"p-value: {p_value}")
    print("Critical Values:")
    for level in critical_values:
        print(f"\t{level}: {critical_values[level]}")
    print("\n")

# Run the ADF test on column A2 and print the results.
run_adf_test(df['A2'], 'A2')


# SARIMA

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from statsmodels.tsa.statespace.sarimax import SARIMAX
from statsmodels.tsa.stattools import adfuller
from pmdarima import auto_arima

In [None]:
# Quick look at the raw A2 values before any modelling.
series_ax = df['A2'].plot(figsize=(12, 6))
series_ax.set_title('Time Series Data')
plt.show()

In [None]:
def adf_test(series, title=''):
    """
    Run an Augmented Dickey-Fuller test on a series and print a report.

    The report lists the test statistic, p-value, number of lags used,
    number of observations used and the critical values, followed by a
    verdict that compares the p-value against 0.05. Missing values are
    dropped before testing. Nothing is returned; the report is printed.
    """
    print(f'Augmented Dickey-Fuller Test: {title}')
    result = adfuller(series.dropna(), autolag='AIC')

    # First four entries of the result, in order, then one row per critical value.
    labels = ['ADF Test Statistic', 'p-value', '# Lags Used', 'Number of Observations Used']
    report = dict(zip(labels, result[0:4]))
    report.update({f'Critical Value ({key})': val for key, val in result[4].items()})
    print(pd.Series(report).to_string())

    rejects_null = result[1] <= 0.05
    if rejects_null:
        verdict = (
            "Strong evidence against the null hypothesis",
            "Reject the null hypothesis",
            "Data has no unit root and is stationary",
        )
    else:
        verdict = (
            "Weak evidence against the null hypothesis",
            "Fail to reject the null hypothesis",
            "Data has a unit root and is non-stationary",
        )
    for line in verdict:
        print(line)

# Stationarity report for every column of interest (currently only A2).
for column in ('A2',):
    adf_test(df[column], title=column)


In [None]:
# First difference of every column in the frame; any row containing a NaN
# (including the first row, which has no predecessor) is then removed.
# NOTE(review): this differences the whole frame rather than only A2, and
# df_diff is not used by the later cells visible here — confirm it is needed.
df_diff = df.diff(periods=1).dropna(axis=0, how='any')

In [None]:
from statsmodels.stats.diagnostic import acorr_ljungbox

# Ljung-Box test for autocorrelation in A2 at lag 12.
# Missing values are dropped first, as the ADF tests and the final model fit
# in this notebook do, so that NaNs in the column cannot turn the test
# statistic into NaN.
ljung_box_test = acorr_ljungbox(df['A2'].dropna(), lags=[12], return_df=True)
print(ljung_box_test)

In [None]:
def find_best_sarima(series, seasonal_period=12):
    """Search for SARIMA orders with ``auto_arima`` and return the winner.

    Parameters
    ----------
    series
        Data handed to ``auto_arima`` unchanged.
    seasonal_period
        Passed to ``auto_arima`` as ``m`` (default 12).

    Returns
    -------
    tuple
        ``(order, seasonal_order)`` read from the fitted model.

    Notes
    -----
    Both differencing orders are fixed (``d=1``, ``D=1``); only the AR/MA
    terms are searched, within the bounds listed below, using the stepwise
    search with tracing enabled.
    """
    search_options = dict(
        start_p=0, start_q=0, max_p=5, max_q=5,
        start_P=0, max_P=2, start_Q=0, max_Q=2,
        m=seasonal_period, seasonal=True,
        d=1, D=1,
        trace=True,
        error_action='ignore', suppress_warnings=True, stepwise=True,
    )
    fitted = auto_arima(series, **search_options)
    return fitted.order, fitted.seasonal_order

# Promote the Date column to the index.
# Guarded so that re-running this cell is a no-op instead of a KeyError:
# after the first run 'Date' is the index and no longer a column.
if 'Date' in df.columns:
    df.set_index('Date', inplace=True)

# Run the order search for each target column (currently only A2) and report
# the selected orders; the last result stays bound for the cells below.
for col in ('A2',):
    sarima_orders = find_best_sarima(df[col])
    best_order, best_seasonal_order = sarima_orders
    print(f"Best (p,d,q) for {col}: {best_order}")
    print(f"Best seasonal (P,D,Q,s) for {col}: {best_seasonal_order}")


In [None]:
# Fit a SARIMAX model with hand-picked example orders:
# (p, d, q) for the non-seasonal part and (P, D, Q, s) for the seasonal part.
example_order = (1, 1, 0)
example_seasonal_order = (2, 1, 1, 12)

model = SARIMAX(df['A2'], order=example_order, seasonal_order=example_seasonal_order)
results = model.fit()
print(results.summary())


In [None]:
# Time-ordered cross-validation (TimeSeriesSplit, 5 folds) of the selected
# SARIMA orders, scored by mean squared error on each held-out fold.
# Missing values are dropped up front, as the final model fit does, so that a
# NaN landing in a test fold cannot make mean_squared_error raise.
cv_series = df['A2'].dropna()

tscv = TimeSeriesSplit(n_splits=5)
mse_scores = []

for train_index, test_index in tscv.split(cv_series):
    train, test = cv_series.iloc[train_index], cv_series.iloc[test_index]
    model = SARIMAX(train, order=best_order, seasonal_order=best_seasonal_order)
    model_fit = model.fit(disp=False)
    # Forecast exactly as many steps as the held-out fold contains.
    forecast = model_fit.forecast(steps=len(test))
    mse = mean_squared_error(test, forecast)
    mse_scores.append(mse)

print("MSE scores:", mse_scores)
print("Average MSE:", np.mean(mse_scores))

In [None]:
import pandas as pd
import numpy as np
from statsmodels.tsa.statespace.sarimax import SARIMAX
from pmdarima import auto_arima

# Fit the SARIMA model on the non-missing A2 observations.
observed = df['A2'].dropna()
model = SARIMAX(observed, order=best_order, seasonal_order=best_seasonal_order)
model_fit = model.fit(disp=False)


forecast_steps = 12
# One forecast serves all three levels: the confidence level only affects the
# interval requested from conf_int(), not the forecast itself. The three names
# are kept because later cells refer to them.
forecast_90 = forecast_95 = forecast_98 = model_fit.get_forecast(steps=forecast_steps)

ci_90 = forecast_90.conf_int(alpha=0.10)
ci_95 = forecast_95.conf_int(alpha=0.05)
ci_98 = forecast_98.conf_int(alpha=0.02)

# Anchor on the last observation the model was actually fitted on, so the
# "Last Observed" row is never a NaN and the forecast dates follow it directly.
last_observed_date = observed.index[-1]

# Create forecast dates
forecast_dates = pd.date_range(start=last_observed_date + pd.Timedelta(days=1), periods=forecast_steps, freq='D')

confidence_levels = ['90%', '95%', '98%']
forecast_values = [
    forecast_90.predicted_mean.values,
    forecast_95.predicted_mean.values,
    forecast_98.predicted_mean.values
]

# The table is date-major: each forecast date appears once per confidence
# level. Stacking the per-level arrays as columns and flattening row by row
# yields the values in that same order, so every value sits on the row of its
# own date (concatenating the per-level arrays would be level-major instead).
date_column = [date.strftime('%Y-%m-%d') for date in forecast_dates for _ in confidence_levels]
level_column = confidence_levels * forecast_steps
value_column = np.column_stack(forecast_values).ravel().tolist()

# Create the table data
table_data = {
    'Date': [last_observed_date.strftime('%Y-%m-%d')] + date_column,
    'Confidence Level': ['Last Observed'] + level_column,
    'Value': [observed.iloc[-1]] + value_column
}

table = pd.DataFrame(table_data)
print(table)


In [None]:
# Observed series plus the forecast drawn once per confidence level, each with
# its own shaded interval, all on a single set of axes.
fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(df.index, df['A2'], label='Observed')

# (forecast, interval, legend label, extra line options, band colour)
interval_layers = [
    (forecast_90, ci_90, 'Forecast (90% CI)', {}, 'k'),
    (forecast_95, ci_95, 'Forecast (95% CI)', {'linestyle': '--'}, 'g'),
    (forecast_98, ci_98, 'Forecast (98% CI)', {'linestyle': ':'}, 'r'),
]
for layer_forecast, layer_ci, layer_label, line_options, band_color in interval_layers:
    ax.plot(forecast_dates, layer_forecast.predicted_mean, label=layer_label, **line_options)
    ax.fill_between(forecast_dates, layer_ci.iloc[:, 0], layer_ci.iloc[:, 1], color=band_color, alpha=0.1)

ax.set_xlabel('Date')
ax.set_ylabel('A2 Values')
ax.set_title('SARIMA Forecast with Confidence Intervals')
ax.legend()
plt.show()

In [None]:
def _plot_single_interval(forecast, interval, level, band_color, **line_options):
    """Show the observed A2 series with one forecast and its shaded interval.

    ``level`` is the text used in the legend label and title (e.g. '90%');
    ``line_options`` are forwarded to the forecast line. A new figure is
    created and shown on every call.
    """
    plt.figure(figsize=(12, 6))
    plt.plot(df.index, df['A2'], label='Observed')
    plt.plot(forecast_dates, forecast.predicted_mean, label=f'Forecast ({level} CI)', **line_options)
    plt.fill_between(forecast_dates, interval.iloc[:, 0], interval.iloc[:, 1], color=band_color, alpha=0.1)
    plt.xlabel('Date')
    plt.ylabel('A2 Values')
    plt.title(f'SARIMA Forecast with {level} Confidence Interval')
    plt.legend()
    plt.show()


# One figure per confidence level, in increasing order of coverage.
_plot_single_interval(forecast_90, ci_90, '90%', 'k')
_plot_single_interval(forecast_95, ci_95, '95%', 'g', linestyle='--')
_plot_single_interval(forecast_98, ci_98, '98%', 'r', linestyle=':')

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from statsmodels.tsa.statespace.sarimax import SARIMAX
from pmdarima import auto_arima

# 30-step forecast from the already-fitted model, with its 98% interval.
forecast_steps = 30  # Number of steps to forecast
forecast_98 = model_fit.get_forecast(steps=forecast_steps)
ci_98 = forecast_98.conf_int(alpha=0.02)

# Forecast dates start the day after the last row of the frame.
last_observed_date = df.index[-1]
first_forecast_date = last_observed_date + pd.Timedelta(days=1)
forecast_dates = pd.date_range(start=first_forecast_date, periods=forecast_steps, freq='D')

# The first and second interval columns are the lower and upper bounds.
lower_98 = ci_98.iloc[:, 0]
upper_98 = ci_98.iloc[:, 1]

table_data = dict([
    ('Date', forecast_dates.strftime('%Y-%m-%d')),
    ('Forecasted Value', forecast_98.predicted_mean.values),
    ('Lower 98% CI', lower_98),
    ('Upper 98% CI', upper_98),
])

table = pd.DataFrame(table_data)
print(table)


In [None]:
# Observed series followed by the current forecast and its shaded 98% interval.
fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(df.index, df['A2'], label='Observed', color='blue')
ax.plot(forecast_dates, forecast_98.predicted_mean, label='Forecast (98% CI)', color='orange')

lower_bound, upper_bound = ci_98.iloc[:, 0], ci_98.iloc[:, 1]
ax.fill_between(forecast_dates, lower_bound, upper_bound, color='r', alpha=0.1)

ax.set_xlabel('Date')
ax.set_ylabel('A2 Values')
ax.set_title('SARIMA Forecast with 98% Confidence Interval')
ax.legend()
plt.show()