In [1]:
import pandas as pd
%matplotlib inline
import cufflinks as cf
from statsmodels.tsa.stattools import acf, pacf, kpss
from statsmodels.tsa.arima.model import ARIMA
from sklearn.metrics import mean_squared_error, mean_absolute_error, mean_absolute_percentage_error

from plotly import __version__
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot

cf.go_offline()

In [2]:
entire_df = pd.read_csv('target-hospital-admissions.csv')
US_df = entire_df[entire_df["location_name"] == "US"]
US_df = US_df[['date','value']] # Since all the values are in the US, only need date and value columns
US_df['date'] = pd.to_datetime(US_df['date']) # Ensuring that the strings in date column are DateTime objects
US_df.set_index('date', inplace=True) # Eliminate the leading entry column by setting date as the index column
US_df

Unnamed: 0_level_0,value
date,Unnamed: 1_level_1
2024-04-27,2337
2024-04-20,2860
2024-04-13,3957
2024-04-06,4951
2024-03-30,5445
...,...
2022-03-12,2223
2022-03-05,1889
2022-02-26,1669
2022-02-19,1512


In [3]:
US_df["value"].iplot(kind = 'line', xTitle = "Date", yTitle = "Count", title = "Count of Influenza Cases in US Over Time")

In [4]:
training_df = US_df.loc["2023-11-11": "2023-12-02"]
training_df

Unnamed: 0_level_0,value
date,Unnamed: 1_level_1
2023-12-02,5752
2023-11-25,4240
2023-11-18,3422
2023-11-11,2695


In [5]:
p_values = [1, 2, 3, 4]
d_values = [0, 1]
q_values = [1, 2, 3, 4]

# index = ["2023-12-09", "2023-12-16", "2023-12-23", "2023-12-30"]

In [26]:
order_df = pd.DataFrame(columns=["AIC", "FIT MSE", "FIT MAE", "FIT MAPE", "FORECAST MSE", "FORECAST MAE", "FORECAST MAPE"])

real_forecast = US_df["value"].loc["2023-12-09": "2023-12-30"]

real_training_data = training_df["value"]

import warnings
warnings.filterwarnings("ignore")

for p in p_values:
    for d in d_values:
        for q in q_values:
            index = f"({p},{d},{q})"
            
            #Creating model based on looped p, d, or q
            model = ARIMA(training_df["value"], order = (p, d, q))
            fit = model.fit()
            
            #Predicting for fit MSE, MAE, MAPE
            
            train_pred = fit.predict(start=1, end=4, typ='levels')
            #fit_mse = mean_squared_error(real_training_data[1:], train_pred[1:])
            
            #Forecasting for forecast MSE, MAE, MAPE
            forecast = fit.forecast(steps = 4)
            new_index = ["2023-12-09", "2023-12-16", "2023-12-23", "2023-12-30"]
            forecast.index = new_index # Changing index and ensuring it is DateTime object
            forecast.index = pd.to_datetime(forecast.index)
            
            
            data = {"AIC": fit.aic, "FORECAST MSE": mean_squared_error(y_true = real_forecast,y_pred = forecast), "FORECAST MAE": mean_absolute_error(y_true=real_forecast, y_pred=forecast), "FORECAST MAPE": mean_absolute_percentage_error(y_true=real_forecast, y_pred=forecast),
                   "FIT MSE": mean_squared_error(y_true=real_training_data[1:], y_pred=train_pred[1:]), "FIT MAE": mean_absolute_error(y_true=real_training_data[1:], y_pred=train_pred[1:]), "FIT MAPE": mean_absolute_percentage_error(y_true=real_training_data[1:], y_pred=train_pred[1:])
                   }
            
            order_df.loc[index] = data

In [27]:
order_df

Unnamed: 0,AIC,FIT MSE,FIT MAE,FIT MAPE,FORECAST MSE,FORECAST MAE,FORECAST MAPE
"(1,0,1)",74.545302,124012.5,291.349086,0.078266,129631600.0,9869.35521,0.683319
"(1,0,2)",76.536316,506181.1,674.786356,0.207561,115066300.0,9209.842691,0.632448
"(1,0,3)",77.19307,481158.5,612.120083,0.192642,115244900.0,9187.257072,0.627803
"(1,0,4)",78.924232,396602.8,588.19722,0.174205,118808600.0,9297.745597,0.632777
"(1,1,1)",56.446298,10.81638,2.643119,0.000685,140664900.0,10611.257476,0.761351
"(1,1,2)",57.660991,343678.0,473.685767,0.160321,124077600.0,9817.244102,0.692282
"(1,1,3)",57.958663,550564.6,565.786516,0.195924,121344700.0,9716.556541,0.686753
"(1,1,4)",67.963154,7136052.0,2193.566253,0.656863,234491800.0,14609.472458,1.139708
"(2,0,1)",72.946344,560074.8,604.41921,0.157226,123638100.0,9052.740306,0.581762
"(2,0,2)",75.137768,842544.4,857.731779,0.241024,112667500.0,8625.061708,0.557593


In [31]:
#Prebuilt dfs to sort values
aic_sorted_df = order_df.sort_values(by="AIC")
fit_mse_sorted_df = order_df.sort_values(by="FIT MSE")
fit_mae_sorted_df = order_df.sort_values(by="FIT MAE")
fit_mape_sorted_df = order_df.sort_values(by="FIT MAPE")
forecast_mse_sorted_df = order_df.sort_values(by="FORECAST MSE")
forecast_mae_sorted_df = order_df.sort_values(by="FORECAST MAE")
forecast_mape_sorted_df = order_df.sort_values(by="FORECAST MAPE")


In [11]:
optimized_model = ARIMA(training_df["value"], order=(4,1,3))
optimized_fit = optimized_model.fit()

forecast = optimized_fit.forecast(steps = 4)
new_index = ["2023-12-09", "2023-12-16", "2023-12-23", "2023-12-30"]
forecast.index = new_index # Changing index and ensuring it is DateTime object
forecast.index = pd.to_datetime(forecast.index)

In [12]:
arima_df = pd.DataFrame(index = training_df.index)
new_index = arima_df.index.append(forecast.index)
arima_df = arima_df.reindex(new_index)
arima_df["Training Data"] = training_df.iloc[:, 0]

arima_df["Real Data"] = US_df.loc[forecast.index]
arima_df["ARIMA Prediction"] = forecast
arima_df.sort_index(inplace=True)
arima_df

Unnamed: 0,Training Data,Real Data,ARIMA Prediction
2023-11-11,2695.0,,
2023-11-18,3422.0,,
2023-11-25,4240.0,,
2023-12-02,5752.0,,
2023-12-09,,7178.0,5608.736786
2023-12-16,,9886.0,4318.477894
2023-12-23,,15134.0,3362.458255
2023-12-30,,21030.0,2808.291294


In [13]:
arima_df.iplot(kind="scatter")