In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd


## 1. Getting preprocessed data

In [2]:
data = pd.read_csv('data_set.csv',nrows = 264)
data = data.drop(data.columns[0],axis=1)
data.head(5)

Unnamed: 0,obs_date,mortgage_values,real_disposable_income,existing_home_sales,housing_starts,supply_of_housing,new_homes_supply,existing_houses_months_supply,production_price_index
0,2000-01-01,8.21,9309.1,5230.0,1636.0,4.3,4.641791,4.2,142.4
1,2000-02-01,8.325,9345.2,5120.0,1737.0,4.3,3.833333,4.7,142.7
2,2000-03-01,8.24,9370.3,5190.0,1604.0,4.3,3.556818,4.1,143.2
3,2000-04-01,8.1525,9418.3,5200.0,1626.0,4.4,3.910256,4.6,143.2
4,2000-05-01,8.515,9457.3,5110.0,1575.0,4.4,3.961039,4.6,142.2


In [3]:
csi = pd.read_csv('case-schiller-home-price-index.csv',names=['obs_date','cs_index'],header = 0,skiprows =156)
csi=csi.drop(labels = 264,axis=0)
csi.head(5)

Unnamed: 0,obs_date,cs_index
0,1/1/2000,100.0
1,2/1/2000,100.571
2,3/1/2000,101.466
3,4/1/2000,102.54
4,5/1/2000,103.701


In [4]:
final_data = data.copy()
final_data['csi'] = csi['cs_index']


In [5]:
final_data.head(5)

Unnamed: 0,obs_date,mortgage_values,real_disposable_income,existing_home_sales,housing_starts,supply_of_housing,new_homes_supply,existing_houses_months_supply,production_price_index,csi
0,2000-01-01,8.21,9309.1,5230.0,1636.0,4.3,4.641791,4.2,142.4,100.0
1,2000-02-01,8.325,9345.2,5120.0,1737.0,4.3,3.833333,4.7,142.7,100.571
2,2000-03-01,8.24,9370.3,5190.0,1604.0,4.3,3.556818,4.1,143.2,101.466
3,2000-04-01,8.1525,9418.3,5200.0,1626.0,4.4,3.910256,4.6,143.2,102.54
4,2000-05-01,8.515,9457.3,5110.0,1575.0,4.4,3.961039,4.6,142.2,103.701


## 2. Correlation between features and label

In [6]:
print("Correlation with csi")
final_data.corr()['csi']

Correlation with csi


mortgage_values                 -0.638130
real_disposable_income           0.857749
existing_home_sales              0.255072
housing_starts                   0.082479
supply_of_housing                0.135237
new_homes_supply                 0.137639
existing_houses_months_supply   -0.332432
production_price_index           0.801976
csi                              1.000000
Name: csi, dtype: float64

## 3.Stationarity

- Performing Augmented Dickey-Fuller Test and KPSS test to evaluate if data is stationary.
- Differenced values for each feature is checked.

### Normalizing Data

In [7]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
temp_data = final_data.iloc[0:,1:]
data_scaled = scaler.fit_transform(temp_data.values)
data_scaled = pd.DataFrame(data_scaled, index=final_data.index, columns=['mortgage_values', 'real_disposable_income',
       'existing_home_sales', 'housing_starts', 'supply_of_housing',
       'new_homes_supply', 'existing_houses_months_supply',
       'production_price_index', 'csi'])

In [8]:
data_scaled

Unnamed: 0,mortgage_values,real_disposable_income,existing_home_sales,housing_starts,supply_of_housing,new_homes_supply,existing_houses_months_supply,production_price_index,csi
0,2.348824,-1.621751,-0.152857,0.769671,-0.774633,-0.570101,-0.595712,-1.460915,-1.815957
1,2.432862,-1.602129,-0.284187,0.992233,-0.774633,-0.963665,-0.372384,-1.451978,-1.800280
2,2.370747,-1.588486,-0.200613,0.699156,-0.774633,-1.098275,-0.640378,-1.437084,-1.775707
3,2.306805,-1.562396,-0.188674,0.747635,-0.721487,-0.926218,-0.417049,-1.437084,-1.746220
4,2.571708,-1.541197,-0.296126,0.635252,-0.721487,-0.901497,-0.417049,-1.466873,-1.714344
...,...,...,...,...,...,...,...,...,...
259,-1.573580,1.862867,0.754515,0.630845,0.553999,0.515938,-1.310364,2.367947,2.820898
260,-1.531561,1.724970,0.981357,0.580162,0.288273,0.342884,-1.399695,2.379862,2.894615
261,-1.409157,1.713120,0.993297,0.584569,0.660290,0.873798,-1.399695,2.506824,2.956445
262,-1.409157,1.701543,1.160444,0.917311,0.235128,0.677060,-1.533693,2.634828,3.022942


### KPSS and AD FULLER test

#### 5% tolerence is used for both tests

In [9]:
from statsmodels.tsa.stattools import adfuller, kpss
result = {}
columns = data_scaled.columns[:-1]
for column in columns :
    x = data_scaled[column].to_numpy()
    x_diff = (x[1:]-x[:-1])/x[:-1]
    result_adf = adfuller(x)
    result_adf_diff = adfuller(x_diff)
    result_kpss = kpss(x)
    result_kpss_diff = kpss(x_diff)
    result[column] = {"ADF Test (original)" : result_adf[0] < result_adf[4]['5%'], "KPSS Test (original)" : result_kpss[0] < result_kpss[3]['5%'], "ADF Test (differencing)" : result_adf_diff[0] < result_adf_diff[4]['5%'], "KPSS Test (differencing)" : result_kpss_diff[0] < result_kpss_diff[3]['5%']}
pd.DataFrame(result.values(), index=result.keys())

look-up table. The actual p-value is smaller than the p-value returned.

look-up table. The actual p-value is greater than the p-value returned.

look-up table. The actual p-value is smaller than the p-value returned.

look-up table. The actual p-value is greater than the p-value returned.

look-up table. The actual p-value is greater than the p-value returned.

look-up table. The actual p-value is smaller than the p-value returned.

look-up table. The actual p-value is greater than the p-value returned.

look-up table. The actual p-value is greater than the p-value returned.

look-up table. The actual p-value is greater than the p-value returned.

look-up table. The actual p-value is greater than the p-value returned.

look-up table. The actual p-value is greater than the p-value returned.

look-up table. The actual p-value is greater than the p-value returned.

look-up table. The actual p-value is smaller than the p-value returned.

look-up table. The actual p-value is greater than t

Unnamed: 0,ADF Test (original),KPSS Test (original),ADF Test (differencing),KPSS Test (differencing)
mortgage_values,False,False,True,True
real_disposable_income,False,False,False,True
existing_home_sales,False,True,True,True
housing_starts,False,False,True,True
supply_of_housing,False,True,True,True
new_homes_supply,False,True,True,True
existing_houses_months_supply,False,False,True,True
production_price_index,False,False,True,True


- As seen above, most of the features fail both or one tests of stationarity. Stationarity is resolved by the method of differencing.but real_disposable_income fails AD Fuller test after differencing also.
- Trends like 2008 crises is difficult to model quantitatively.


In [10]:
data_ex = data_scaled.copy()
data_ex = data_ex.drop('real_disposable_income',axis=1)

## 4.Modelling

#### To capture long term trends ARIMA is used for modelling ( generally ARIMA is good at modelling time series data when features are differentiated)

In [12]:
from statsmodels.tsa.arima.model import ARIMA,ARIMAResults
arima = ARIMA(endog=data_scaled['csi'], order=(1,0,1), exog=data_scaled[data_scaled.columns[:-1]])
arima_fit = arima.fit()
arima_fit.summary()



0,1,2,3
Dep. Variable:,csi,No. Observations:,264.0
Model:,"ARIMA(1, 0, 1)",Log Likelihood,579.895
Date:,"Fri, 15 Apr 2022",AIC,-1135.79
Time:,01:50:09,BIC,-1092.878
Sample:,0,HQIC,-1118.547
,- 264,,
Covariance Type:,opg,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,0.0711,0.327,0.218,0.828,-0.569,0.711
mortgage_values,0.0226,0.008,2.867,0.004,0.007,0.038
real_disposable_income,-0.0042,0.002,-2.109,0.035,-0.008,-0.000
existing_home_sales,-0.0047,0.005,-0.945,0.344,-0.014,0.005
housing_starts,0.0027,0.003,0.933,0.351,-0.003,0.008
supply_of_housing,0.0031,0.006,0.538,0.591,-0.008,0.014
new_homes_supply,-0.0006,0.004,-0.143,0.886,-0.008,0.007
existing_houses_months_supply,-0.0196,0.006,-3.294,0.001,-0.031,-0.008
production_price_index,0.0392,0.013,2.971,0.003,0.013,0.065

0,1,2,3
Ljung-Box (L1) (Q):,37.91,Jarque-Bera (JB):,42.02
Prob(Q):,0.0,Prob(JB):,0.0
Heteroskedasticity (H):,1.67,Skew:,-0.21
Prob(H) (two-sided):,0.02,Kurtosis:,4.91


Observations:
- The features 'mortgage_values','existing_houses_month_supply' have p value less than the threshold,as expected,these statistics are not enough to conclude valid results.
- Data regarding the parameters like ***house affordability index*** ,etc.., are yet to examined..
- There is a large amount of personal bias in selection of variables for supply and demand which leads to unexpected results.