Forecast the Airlines Passengers data set. Prepare a document for each model explaining how many dummy variables you have created and the RMSE value for each model. Finally, state which model you will use for forecasting.

In [None]:
#importing libraries
import pandas as pd
import numpy as np
from numpy import sqrt

from pandas import Grouper
from pandas import DataFrame

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.metrics import mean_squared_error
from pandas.plotting import lag_plot
import statsmodels.formula.api as smf
from statsmodels.graphics.tsaplots import plot_acf

from statsmodels.tsa.seasonal import seasonal_decompose
## seasonal_decompose lets us plot the different components of a time series:
# trend, level, seasonal component and residual

from statsmodels.tsa.holtwinters import SimpleExpSmoothing # SES
from statsmodels.tsa.holtwinters import Holt # Holts Exponential Smoothing
from statsmodels.tsa.holtwinters import ExponentialSmoothing

In [None]:
# Colab-only step: files.upload() asks the user to pick local file(s) and
# returns them to the notebook session.
# NOTE(review): presumably this is where "Airlines+Data.xlsx" (read in the next
# cell) comes from — confirm; outside Colab this import is unavailable.
from google.colab import files
uploaded=files.upload()

In [None]:
Airlines=pd.read_excel("Airlines+Data.xlsx")


In [None]:
Airlines

# EDA

In [None]:
Airlines.shape

In [None]:
Airlines.info()

In [None]:
Airlines.describe()

In [None]:
Airlines.isnull().sum().sum()
## no nan values in the data set


In [None]:
Airlines[Airlines.duplicated()].shape

In [None]:
Airlines[Airlines.duplicated()] #no duplicate values in dataset

In [None]:
# Use the 'Month' column as the index so the time-based steps below
# (resampling, forecasting by date) operate on it.
# Reassign instead of inplace=True, and guard on the column, so that re-running
# this cell after 'Month' has already become the index does not raise KeyError.
if 'Month' in Airlines.columns:
    Airlines = Airlines.set_index('Month')

# Visualization

In [None]:
Airlines.plot()  # line plot against the Month index; original note said "multivariate seasonality" (presumably multiplicative seasonality — TODO confirm from the plot)

In [None]:
## Histogram of the values in Airlines

Airlines.hist()
plt.show()

In [None]:
## Density plot (kernel density estimate) of the values in Airlines
Airlines.plot(kind='kde')
plt.show()

In [None]:
## Lag plot: each observation plotted against the previous one (lag_plot default lag)
lag_plot(Airlines)
plt.show()

In [None]:
# Autocorrelation plot of the series for lags up to 30

plot_acf(Airlines,lags=30)
plt.show()

# UpSampling

In [None]:
# Resample on the Month index with rule 'M' and average the values in each bin,
# then print the first 32 rows.
upsampled = Airlines.resample(rule='M').mean()
print(upsampled.head(n=32))

In [None]:
# Linearly interpolate any NaN values left after the monthly resampling,
# then preview the first 15 rows and plot the result.
interpolated = upsampled.interpolate(method='linear')
print(interpolated.head(n=15))
interpolated.plot()
plt.show()

In [None]:
interpolated

# Transformations

In [None]:
# Line plot of the raw series in slot 211 (top) of a 2x1 subplot grid.
# NOTE(review): the matching histogram (slot 212) is drawn in a separate cell;
# if each cell renders its own figure the two panels will not appear together — confirm.
plt.subplot(211)
plt.plot(Airlines)

In [None]:
# Histogram of the raw series in slot 212 (bottom) of a 2x1 subplot grid.
# NOTE(review): the matching line plot (slot 211) is drawn in a separate cell;
# if each cell renders its own figure the two panels will not appear together — confirm.
plt.subplot(212)
plt.hist(Airlines)
plt.show()

# Square Root Transform

In [None]:
# Copy the raw values into a fresh one-column frame and replace the
# Passengers column with its square root.
dataframe = DataFrame(Airlines.values, columns=['Passengers'])
dataframe['Passengers'] = sqrt(dataframe['Passengers'])

In [None]:
# Plot the square-root-transformed series held in `dataframe`.
# (This cell used to plot Airlines['Passengers'], i.e. the untransformed data,
# so the effect of the transform was never shown.)
# line plot
plt.subplot(211)
plt.plot(dataframe['Passengers'])
# histogram
plt.subplot(212)
plt.hist(dataframe['Passengers'])
plt.show()

# Log Transform

In [None]:
from numpy import log
## importing the log library

In [None]:
# Copy the raw values into a fresh one-column frame and replace the
# Passengers column with its natural log.
dataframe = DataFrame(Airlines.values, columns=['Passengers'])
dataframe['Passengers'] = log(dataframe['Passengers'])

# Top panel: the transformed series; bottom panel: its histogram.
transformed = dataframe['Passengers']
plt.subplot(211)
plt.plot(transformed)
plt.subplot(212)
plt.hist(transformed)
plt.show()

In [None]:
# Hold out the last 15 rows for testing; the first 81 rows are used for training.
Train = interpolated.iloc[:81]
Test = interpolated.iloc[-15:]

# Moving Average

In [None]:
# Original series overlaid with rolling means for window sizes 2, 8, 14 and 20.
plt.figure(figsize=(12, 4))
interpolated.Passengers.plot(label="org")
for i in range(2, 24, 6):
    rolling_mean = interpolated["Passengers"].rolling(i).mean()
    rolling_mean.plot(label=str(i))
plt.legend(loc='best')

# Time series decomposition plot

In [None]:
# Decompose the Passengers series with seasonal_decompose (default model)
# and plot the resulting components.
decompose_ts_add = seasonal_decompose(interpolated["Passengers"])
decompose_ts_add.plot()
plt.show()

# ACF plots and PACF plots

In [None]:
import statsmodels.graphics.tsaplots as tsa_plots

# Autocorrelation and partial autocorrelation of Passengers up to lag 14.
for correlation_plot in (tsa_plots.plot_acf, tsa_plots.plot_pacf):
    correlation_plot(interpolated.Passengers, lags=14)
plt.show()

# Evaluation Metric MAPE


In [None]:
def MAPE(pred,org):
    """Return the mean absolute percentage error of ``pred`` against ``org``.

    Each error is taken relative to the corresponding ``org`` value and
    expressed as a percentage before averaging.
    """
    relative_error = (pred - org) / org
    return np.mean(np.abs(relative_error) * 100)

# Simple Exponential Method

In [None]:
# Simple exponential smoothing on the training series with the level
# smoothing fixed at 0.2; predict over the test dates and report the MAPE.
train_passengers = Train["Passengers"]
ses_model = SimpleExpSmoothing(train_passengers).fit(smoothing_level=0.2)
pred_ses = ses_model.predict(start=Test.index[0], end=Test.index[-1])
MAPE(pred_ses, Test.Passengers)

# Holt method

In [None]:
# Holt's method (level + trend) with level smoothing 0.1 and trend smoothing 0.2.
# `smoothing_trend` is the current statsmodels keyword; the `smoothing_slope`
# spelling used before is deprecated and rejected by newer releases.
hw_model = Holt(Train["Passengers"]).fit(smoothing_level=0.1, smoothing_trend=0.2)
pred_hw = hw_model.predict(start = Test.index[0],end = Test.index[-1])
MAPE(pred_hw,Test.Passengers)

# Holt-Winters exponential smoothing with additive seasonality and additive trend


In [None]:
# Holt-Winters with additive trend and additive seasonality over a 12-period cycle.
# `smoothing_trend` is the current statsmodels keyword; the `smoothing_slope`
# spelling used before is deprecated and rejected by newer releases.
hwe_model_add_add = ExponentialSmoothing(Train["Passengers"],seasonal="add",trend="add",seasonal_periods=12).fit(smoothing_level=0.1, smoothing_trend=0.2)
pred_hwe_add_add = hwe_model_add_add.predict(start = Test.index[0],end = Test.index[-1])
MAPE(pred_hwe_add_add,Test.Passengers)

# Holt-Winters exponential smoothing with multiplicative seasonality and additive trend

In [None]:
# Holt-Winters with additive trend and multiplicative seasonality over a 12-period cycle.
# `smoothing_trend` is the current statsmodels keyword; the `smoothing_slope`
# spelling used before is deprecated and rejected by newer releases.
hwe_model_mul_add = ExponentialSmoothing(Train["Passengers"],seasonal="mul",trend="add",seasonal_periods=12).fit(smoothing_level=0.1, smoothing_trend=0.2)
pred_hwe_mul_add = hwe_model_mul_add.predict(start = Test.index[0],end = Test.index[-1])
MAPE(pred_hwe_mul_add,Test.Passengers)

In [None]:
# RMSE of the multiplicative-seasonality Holt-Winters predictions on the test rows.
mse_hwe_mul_add = mean_squared_error(pred_hwe_mul_add, Test.Passengers)
rmse_hwe_mul_add = sqrt(mse_hwe_mul_add)
rmse_hwe_mul_add

# Final Model by combining train and test

In [None]:
# Refit the additive Holt-Winters model on the full series (train + test),
# letting statsmodels optimise the smoothing parameters.
# seasonal_periods is 12 to match the monthly data and the models evaluated
# above; the previous value of 10 was inconsistent with both.
hwe_model_add_add = ExponentialSmoothing(interpolated["Passengers"],seasonal="add",trend="add",seasonal_periods=12).fit()


In [None]:
# Out-of-sample forecast for the 10 periods following the fitted series.
hwe_model_add_add.forecast(steps=10)

In [None]:
interpolated

In [None]:
# Move the Month index back into a regular column; rows get a 0..n-1 index.
interpolated = interpolated.reset_index()

In [None]:
interpolated['t'] = 1

In [None]:
interpolated

In [None]:
# Number the rows 1..n as the time index 't'.
# Assign the whole column in one step: the previous row-by-row write through
# interpolated['t'].iloc[i] was chained assignment, which triggers
# SettingWithCopyWarning and leaves the frame unchanged under pandas copy-on-write.
interpolated['t'] = np.arange(1, len(interpolated) + 1)

In [None]:
interpolated

In [None]:
# Squared time index, used later as the quadratic trend term.
interpolated['t_sq'] = interpolated['t'].pow(2)

In [None]:
interpolated

In [None]:
interpolated["month"] = interpolated.Month.dt.strftime("%b") # abbreviated month name extracted from the Month dates
interpolated["year"] = interpolated.Month.dt.strftime("%Y") # four-digit year extracted as a string

In [None]:
interpolated

In [None]:
months = pd.get_dummies(interpolated['month']) ## one dummy (indicator) column per distinct value of the month column

In [None]:
months

In [None]:
# Put the dummy columns back into calendar order (get_dummies returns them sorted by label).
month_order = ['Jan','Feb','Mar','Apr','May','Jun','Jul','Aug','Sep','Oct','Nov','Dec']
months = months.loc[:, month_order]

In [None]:
# Append the month dummy columns to the right of the feature frame.
Airlines1 = pd.concat((interpolated, months), axis="columns")

In [None]:
Airlines1

In [None]:
# Natural log of Passengers, the response for the exponential / multiplicative models.
Airlines1['log_passengers'] = np.log(Airlines1.Passengers)

In [None]:
Airlines1

In [None]:
# Heatmap of mean Passengers by year (rows) and month (columns).
plt.figure(figsize=(12,8))
heatmap_y_month = pd.pivot_table(data=Airlines1,values="Passengers",index="year",columns="month",aggfunc="mean",fill_value=0)
# pivot_table sorts the month labels alphabetically (Apr, Aug, Dec, ...);
# restore calendar order so the heatmap reads left to right through the year.
# Only months actually present are kept, so no all-NaN columns are introduced.
calendar_months = ['Jan','Feb','Mar','Apr','May','Jun','Jul','Aug','Sep','Oct','Nov','Dec']
heatmap_y_month = heatmap_y_month.reindex(columns=[m for m in calendar_months if m in heatmap_y_month.columns])
sns.heatmap(heatmap_y_month,annot=True,fmt="g")

In [None]:
# Boxplots of Passengers grouped by month (top panel) and by year (bottom panel).
plt.figure(figsize=(8, 6))
for position, group_col in ((211, "month"), (212, "year")):
    plt.subplot(position)
    sns.boxplot(x=group_col, y="Passengers", data=Airlines1)


In [None]:
# Line plot of Passengers against year on a wide (12x3) figure.
# NOTE(review): each year has several rows, so seaborn presumably aggregates
# them (mean with an error band by default) — confirm against the rendered plot.
plt.figure(figsize=(12,3))
sns.lineplot(x="year",y="Passengers",data=Airlines1)

# Splitting data

In [None]:
# First 81 rows for training, last 15 rows held out for testing.
Train = Airlines1.iloc[:81]
Test = Airlines1.iloc[-15:]

In [None]:
# Linear trend model: Passengers regressed on the time index t.
import statsmodels.formula.api as smf

linear_model = smf.ols('Passengers~t', data=Train).fit()
pred_linear = pd.Series(linear_model.predict(Test[['t']]))
# RMSE on the test rows
linear_errors = Test['Passengers'].to_numpy() - pred_linear.to_numpy()
rmse_linear = np.sqrt(np.mean(linear_errors ** 2))
rmse_linear

In [None]:
# Exponential trend model: log(Passengers) regressed on t.
Exp = smf.ols('log_passengers~t', data=Train).fit()
pred_Exp = pd.Series(Exp.predict(Test[['t']]))
# Predictions are on the log scale; exponentiate before comparing with Passengers.
exp_errors = Test['Passengers'].to_numpy() - np.exp(pred_Exp).to_numpy()
rmse_Exp = np.sqrt(np.mean(exp_errors ** 2))
rmse_Exp

In [None]:
# Quadratic trend model: Passengers regressed on t and t squared.

Quad = smf.ols('Passengers~t+t_sq', data=Train).fit()
pred_Quad = pd.Series(Quad.predict(Test[["t", "t_sq"]]))
quad_errors = Test['Passengers'].to_numpy() - pred_Quad.to_numpy()
rmse_Quad = np.sqrt(np.mean(quad_errors ** 2))
rmse_Quad

In [None]:
# Additive seasonality: Passengers regressed on 11 month dummies (Dec is the baseline).

season_cols = ['Jan','Feb','Mar','Apr','May','Jun','Jul','Aug','Sep','Oct','Nov']
add_sea = smf.ols('Passengers~Jan+Feb+Mar+Apr+May+Jun+Jul+Aug+Sep+Oct+Nov', data=Train).fit()
pred_add_sea = pd.Series(add_sea.predict(Test[season_cols]))
add_sea_errors = Test['Passengers'].to_numpy() - pred_add_sea.to_numpy()
rmse_add_sea = np.sqrt(np.mean(add_sea_errors ** 2))
rmse_add_sea

In [None]:
# Additive seasonality with quadratic trend: t, t squared and 11 month dummies.

quad_season_cols = ['Jan','Feb','Mar','Apr','May','Jun','Jul','Aug','Sep','Oct','Nov','t','t_sq']
add_sea_Quad = smf.ols('Passengers~t+t_sq+Jan+Feb+Mar+Apr+May+Jun+Jul+Aug+Sep+Oct+Nov', data=Train).fit()
pred_add_sea_quad = pd.Series(add_sea_Quad.predict(Test[quad_season_cols]))
add_sea_quad_errors = Test['Passengers'].to_numpy() - pred_add_sea_quad.to_numpy()
rmse_add_sea_quad = np.sqrt(np.mean(add_sea_quad_errors ** 2))
rmse_add_sea_quad

In [None]:
## Multiplicative seasonality: log(Passengers) regressed on 11 month dummies.

Mul_sea = smf.ols('log_passengers~Jan+Feb+Mar+Apr+May+Jun+Jul+Aug+Sep+Oct+Nov', data=Train).fit()
pred_Mult_sea = pd.Series(Mul_sea.predict(Test))
# Predictions are on the log scale; exponentiate before comparing with Passengers.
mult_sea_errors = Test['Passengers'].to_numpy() - np.exp(pred_Mult_sea).to_numpy()
rmse_Mult_sea = np.sqrt(np.mean(mult_sea_errors ** 2))
rmse_Mult_sea

In [None]:
# Multiplicative additive seasonality: log(Passengers) regressed on t and 11 month dummies.

Mul_Add_sea = smf.ols('log_passengers~t+Jan+Feb+Mar+Apr+May+Jun+Jul+Aug+Sep+Oct+Nov', data=Train).fit()
pred_Mult_add_sea = pd.Series(Mul_Add_sea.predict(Test))
# Predictions are on the log scale; exponentiate before comparing with Passengers.
mult_add_sea_errors = Test['Passengers'].to_numpy() - np.exp(pred_Mult_add_sea).to_numpy()
rmse_Mult_add_sea = np.sqrt(np.mean(mult_add_sea_errors ** 2))
rmse_Mult_add_sea

In [None]:
# Comparing the results: one row per model, sorted by RMSE (lowest first).

rmse_by_model = {
    "rmse_linear": rmse_linear,
    "rmse_Exp": rmse_Exp,
    "rmse_Quad": rmse_Quad,
    "rmse_add_sea": rmse_add_sea,
    "rmse_add_sea_quad": rmse_add_sea_quad,
    "rmse_Mult_sea": rmse_Mult_sea,
    "rmse_Mult_add_sea": rmse_Mult_add_sea,
}
data = {
    "MODEL": pd.Series(list(rmse_by_model.keys())),
    "RMSE_Values": pd.Series(list(rmse_by_model.values())),
}
table_rmse = pd.DataFrame(data)
table_rmse.sort_values(['RMSE_Values'])

The Multiplicative Additive Seasonality model has the lowest RMSE of the models compared, so it is the best one to use for forecasting.