In [1]:
import pandas as pd
import matplotlib.pyplot as plt

In [65]:
import warnings

# Install a catch-all "ignore" filter so no warning of any category is shown
# for the remainder of the session (this also hides deprecation notices).
warnings.simplefilter("ignore")

In [66]:
# Load the raw neighbourhood data.
df = pd.read_csv("Neighbourhood_model_data.csv")

# Parse the assessment year (given as a bare year, '%Y') into a datetime.
# Missing values are not handled here; the original author's note states the
# data contains none.
df["TAX_ASSESSMENT_YEAR"] = pd.to_datetime(df["TAX_ASSESSMENT_YEAR"], format="%Y")

# df1: the same rows indexed by assessment year, with the year column removed
# from the regular columns.
df1 = df.set_index("TAX_ASSESSMENT_YEAR")

# Drop the column that is not required for modelling.
truncated_df = df1.drop(columns=["PREVIOUS_MEDIAN_PRICE"])
truncated_df

Unnamed: 0_level_0,NEIGHBOURHOOD_NAME,INTEREST_RATE,CURRENT_MEDIAN_PRICE
TAX_ASSESSMENT_YEAR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2007-01-01,Arbutus-Ridge,6.104167,1.01700
2008-01-01,Arbutus-Ridge,4.729167,1.22905
2009-01-01,Arbutus-Ridge,2.395833,1.23490
2010-01-01,Arbutus-Ridge,2.604167,1.22900
2011-01-01,Arbutus-Ridge,3.000000,1.51500
2012-01-01,Arbutus-Ridge,3.000000,1.90910
2013-01-01,Arbutus-Ridge,3.000000,1.82600
2014-01-01,Arbutus-Ridge,3.000000,1.82455
2015-01-01,Arbutus-Ridge,2.775000,1.98200
2016-01-01,Arbutus-Ridge,2.700000,2.36600


In [67]:
### constants
area_list = df.NEIGHBOURHOOD_NAME.unique()
area_list

array(['Arbutus-Ridge', 'Downtown', 'Dunbar-Southlands', 'Fairview',
       'Grandview-Woodland', 'Hastings-Sunrise',
       'Kensington-Cedar Cottage', 'Kerrisdale', 'Killarney', 'Kitsilano',
       'Marpole', 'Mount Pleasant', 'Oakridge', 'Renfrew-Collingwood',
       'Riley Park', 'Shaughnessy', 'South Cambie', 'Sunset',
       'Victoria-Fraserview', 'West End', 'West Point Grey'], dtype=object)

In [68]:
# Show the dtype of every column in truncated_df as a quick sanity check.
truncated_df.dtypes

NEIGHBOURHOOD_NAME       object
INTEREST_RATE           float64
CURRENT_MEDIAN_PRICE    float64
dtype: object

In [69]:
from statsmodels.tsa.vector_ar.var_model import VAR
from sklearn.metrics import mean_squared_error
from math import sqrt
import numpy as np

# Fraction of each neighbourhood's rows (taken from the start of its frame)
# used to fit the model; the remaining rows are held out for validation.
TRAIN_FRACTION = 0.8

# One RMSE per neighbourhood, in the same order as area_list.
rmse_list = []

for area in area_list:
    # Keep only this neighbourhood's rows, then drop the name column so that
    # only the series to be modelled remain.
    single_area_df = truncated_df.query('NEIGHBOURHOOD_NAME == @area')
    single_area_df = single_area_df.drop(['NEIGHBOURHOOD_NAME'], axis = 1)

    # Positional split: leading rows for training, trailing rows for validation.
    split_at = int(TRAIN_FRACTION * len(single_area_df))
    train = single_area_df[:split_at]
    valid = single_area_df[split_at:]

    # model training
    model = VAR(endog=train)
    model_fit = model.fit()

    # Forecast as many steps as there are validation rows. The training
    # values are passed directly instead of the deprecated `model_fit.y`
    # alias; forecast() only needs the most recent lag-order observations.
    prediction = model_fit.forecast(train.values, steps=len(valid))

    cols = single_area_df.columns

    # Build the forecast frame in one step. The previous element-wise
    # `pred.iloc[i][j] = ...` was chained assignment, which stops updating
    # `pred` under pandas Copy-on-Write, and `columns=[cols]` created
    # unintended MultiIndex columns.
    pred = pd.DataFrame(prediction, index=valid.index, columns=cols)

    col_name = 'CURRENT_MEDIAN_PRICE'

    # check rmse (arguments in sklearn's documented y_true, y_pred order)
    rmse_val = sqrt(mean_squared_error(valid[col_name], pred[col_name]))
    rmse_list.append(rmse_val)
    print('rmse value:', rmse_val)

rmse value: 0.5253942307912431
rmse value: 0.2388527291072487
rmse value: 0.5488239283275378
rmse value: 0.22561293221948228
rmse value: 0.23084112732777415
rmse value: 0.4866172878356836
rmse value: 0.2855534537662852
rmse value: 0.622271722667437
rmse value: 0.1925420730046522
rmse value: 0.30537735283517836
rmse value: 0.2598463926764876
rmse value: 0.2365249495597112
rmse value: 0.2713504568414278
rmse value: 0.3683659941493735
rmse value: 0.5155533068320779
rmse value: 0.5287589659955316
rmse value: 0.3677140067422959
rmse value: 0.33029015826471225
rmse value: 0.42702869227410617
rmse value: 0.23351075314915531
rmse value: 0.5102158065740894


In [70]:
# Arithmetic mean of the per-neighbourhood RMSE values collected in rmse_list.
average_rmse = np.asarray(rmse_list).mean()
average_rmse

0.36719268194959487