# ARIMA vs. SARIMA Time Series Modeling

# Load the data and libraries

In [1]:
import pandas as pd
import numpy as np

import statsmodels.api as sm
import statsmodels.stats.api as sms
import statsmodels.formula.api as smf

from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.graphics.tsaplots import plot_pacf, plot_acf
from statsmodels.tsa.statespace.sarimax import SARIMAX
from statsmodels.tsa.stattools import adfuller

from matplotlib import rc
from matplotlib import rcParams
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats

import itertools

In [2]:
# Read the Zillow CSV (path is relative to the current working directory)
data = pd.read_csv('./zillow_data.csv')

In [3]:
# Display the raw frame as loaded to inspect its columns and a sample of rows
data

Unnamed: 0,RegionID,RegionName,City,State,Metro,CountyName,SizeRank,1996-04,1996-05,1996-06,...,2017-07,2017-08,2017-09,2017-10,2017-11,2017-12,2018-01,2018-02,2018-03,2018-04
0,84654,60657,Chicago,IL,Chicago,Cook,1,334200.0,335400.0,336500.0,...,1005500,1007500,1007800,1009600,1013300,1018700,1024400,1030700,1033800,1030600
1,90668,75070,McKinney,TX,Dallas-Fort Worth,Collin,2,235700.0,236900.0,236700.0,...,308000,310000,312500,314100,315000,316600,318100,319600,321100,321800
2,91982,77494,Katy,TX,Houston,Harris,3,210400.0,212200.0,212200.0,...,321000,320600,320200,320400,320800,321200,321200,323000,326900,329900
3,84616,60614,Chicago,IL,Chicago,Cook,4,498100.0,500900.0,503100.0,...,1289800,1287700,1287400,1291500,1296600,1299000,1302700,1306400,1308500,1307000
4,93144,79936,El Paso,TX,El Paso,El Paso,5,77300.0,77300.0,77300.0,...,119100,119400,120000,120300,120300,120300,120300,120500,121000,121500
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14718,58333,1338,Ashfield,MA,Greenfield Town,Franklin,14719,94600.0,94300.0,94000.0,...,216800,217700,218600,218500,218100,216400,213100,209800,209200,209300
14719,59107,3293,Woodstock,NH,Claremont,Grafton,14720,92700.0,92500.0,92400.0,...,202100,208400,212200,215200,214300,213100,213700,218300,222700,225800
14720,75672,40404,Berea,KY,Richmond,Madison,14721,57100.0,57300.0,57500.0,...,121800,122800,124600,126700,128800,130600,131700,132500,133000,133400
14721,93733,81225,Mount Crested Butte,CO,,Gunnison,14722,191100.0,192400.0,193700.0,...,662800,671200,682400,695600,695500,694700,706400,705300,681500,664400


In [7]:
# Keep only the rows whose Metro is 'San Jose' and whose State is 'CA',
# then report which cities are covered and the size of the subset.
data_sj = data.loc[(data['Metro'] == 'San Jose') & (data['State'] == 'CA')]
print(data_sj['City'].unique())
print(data_sj.shape)

['San Jose' 'Santa Clara' 'Milpitas' 'Cupertino' 'Sunnyvale' 'Campbell'
 'Gilroy' 'Morgan Hill' 'Hollister' 'Mountain View' 'Palo Alto' 'Saratoga'
 'Los Gatos' 'Los Altos' 'Stanford' 'Lexington Hills' 'San Martin'
 'San Juan Bautista' 'Aromas']
(55, 272)


In [8]:
# Use pandas.melt() to convert the wide dataframe (one column per month) to a long one:
def melt_data(df):
    """Reshape a wide table to long format and average the values per month.

    Parameters
    ----------
    df : pandas.DataFrame
        Wide table containing the identifier columns 'RegionID', 'RegionName',
        'City', 'State', 'Metro', 'CountyName' and 'SizeRank'. Every other
        column is treated as a value column whose label is a 'YYYY-MM' string.

    Returns
    -------
    pandas.DataFrame
        Indexed by 'time' (timestamps parsed from the column labels) with a
        single 'value' column holding the mean of the non-missing values of
        all rows for that month.
    """
    id_columns = ['RegionID', 'RegionName', 'City', 'State', 'Metro',
                  'CountyName', 'SizeRank']
    melted = pd.melt(df, id_vars=id_columns, var_name='time')
    # The column labels look like '1996-04', so the matching format is '%Y-%m'.
    # A '%Y-%m-%d' format does not describe these labels and is enforced
    # strictly by recent pandas; `infer_datetime_format` is deprecated there.
    melted['time'] = pd.to_datetime(melted['time'], format='%Y-%m')
    # Missing observations must not take part in the monthly mean.
    melted = melted.dropna(subset=['value'])
    return melted.groupby('time').aggregate({'value': 'mean'})

In [9]:
# Collapse the raw wide table into one mean value per month.
# NOTE(review): this rebinds `data` from the raw frame to the aggregated one, so
# re-running the cell would pass the already-aggregated frame to melt_data, and
# the San Jose filter cell must run before this one because it reads the raw
# Metro/State columns.
data = melt_data(data)
data

Unnamed: 0_level_0,value
time,Unnamed: 1_level_1
1996-04-01,118299.123063
1996-05-01,118419.044139
1996-06-01,118537.423268
1996-07-01,118653.069278
1996-08-01,118780.254312
1996-09-01,118927.5285
1996-10-01,119120.520316
1996-11-01,119345.352236
1996-12-01,119685.084771
1997-01-01,120121.265712


In [10]:
# Number of non-missing monthly values in the aggregated frame
data['value'].count()

265

In [12]:
# For the San Jose metro area: reshape to long format but keep every row (no averaging)
def melt_data_new(df):
    """Reshape a wide table to long format, indexed by time, without aggregating.

    Parameters
    ----------
    df : pandas.DataFrame
        Wide table containing the identifier columns 'RegionID', 'RegionName',
        'City', 'State', 'Metro', 'CountyName' and 'SizeRank'. Every other
        column is treated as a value column whose label is a 'YYYY-MM' string.

    Returns
    -------
    pandas.DataFrame
        One row per (input row, month) pair that has a non-missing value,
        indexed by 'time' and carrying the identifier columns plus 'value'.
    """
    id_columns = ['RegionID', 'RegionName', 'City', 'State', 'Metro',
                  'CountyName', 'SizeRank']
    melted = pd.melt(df, id_vars=id_columns, var_name='time')
    # The column labels look like '1996-04', so the matching format is '%Y-%m'.
    # A '%Y-%m-%d' format does not describe these labels and is enforced
    # strictly by recent pandas; `infer_datetime_format` is deprecated there.
    melted['time'] = pd.to_datetime(melted['time'], format='%Y-%m')
    melted = melted.dropna(subset=['value'])
    # Return a new frame instead of mutating in place.
    return melted.set_index('time')

In [13]:
# Reshape the San Jose subset to long format and preview its first 100 rows
data_sj = melt_data_new(data_sj)
data_sj.head(100)

Unnamed: 0_level_0,RegionID,RegionName,City,State,Metro,CountyName,SizeRank,value
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1996-04-01,97991,95123,San Jose,CA,San Jose,Santa Clara,245,224500.0
1996-04-01,97952,95051,Santa Clara,CA,San Jose,Santa Clara,290,258700.0
1996-04-01,97940,95035,Milpitas,CA,San Jose,Santa Clara,364,231100.0
1996-04-01,97926,95014,Cupertino,CA,San Jose,Santa Clara,462,432400.0
1996-04-01,97980,95112,San Jose,CA,San Jose,Santa Clara,514,186800.0
1996-04-01,97549,94087,Sunnyvale,CA,San Jose,Santa Clara,517,361300.0
1996-04-01,97993,95125,San Jose,CA,San Jose,Santa Clara,537,286000.0
1996-04-01,97548,94086,Sunnyvale,CA,San Jose,Santa Clara,583,310400.0
1996-04-01,97920,95008,Campbell,CA,San Jose,Santa Clara,657,262100.0
1996-04-01,97931,95020,Gilroy,CA,San Jose,Santa Clara,852,238000.0
