# Experiment on the housing prices dataset

Data obtained from:
https://my.spindices.com/indices/real-estate/sp-corelogic-case-shiller-20-city-composite-home-price-nsa-index

In [1]:
# execute if not installed
import sys
sys.path.insert(0, '../')

import mogptk

import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd

%reload_ext autoreload
%autoreload 2

In [2]:
# Plot configuration: seaborn context/style plus the default figure size
sns.set_context(context='paper', font_scale=1.3)
sns.set_style(style='ticks')
plt.rc('figure', figsize=(10, 5))

In [3]:
def test_errors(model):
    """
    Obtain test errors for model
    
    The function assumes the original data includes the test data
    """
    x_pred = {i:array[stop:] for i, array in enumerate(model.data.X_all)}
    
    y_pred, var_pred = model.predict(x_pred)
    
    n_channels = model.data.get_output_dims()
    
    mae = np.zeros(n_channels)
    mape = np.zeros(n_channels)
    mse = np.zeros(n_channels)
    
    for i in range(n_channels):
        idx = model.data.Y_all[i] != 0
        y_true = model.data.Y_all[i][stop:]
        
        mae[i] = np.abs(y_true - y_pred[i]).mean()
        mape[i] = np.abs((y_true - y_pred[i]) / y_true).mean()
        mse[i] = ((y_true - y_pred[i])**2).mean()
    return mae, mape, mse

In [4]:
def plot_experiment(model, ylims=None, names=None, title='Finance experiment'):
    """
    Plot training points, the full series and the model prediction per channel.

    One subplot is drawn per channel, laid out in two columns. Each subplot
    shows the training observations, all observations, the predictive mean
    and a band of two predictive standard deviations around the mean.

    Parameters
    ----------
    model :
        Object exposing ``predict(x_pred)`` and a ``data`` attribute with
        ``X``, ``Y``, ``X_all`` and ``Y_all`` (one entry per channel).
    ylims : sequence, optional
        Per-channel y-axis limits; ``ylims[i]`` is passed to ``set_ylim`` of
        the i-th subplot. Left automatic when omitted.
    names : sequence of str, optional
        Subplot titles, one per channel. Channels without a name are titled
        ``'Channel <i>'``.
    title : str
        Figure-level title.
    """
    x_train = model.data.X
    y_train = model.data.Y
    x_all = model.data.X_all
    y_all = model.data.Y_all
    # Predict on the model's own inputs instead of a notebook-level variable,
    # so the function works for any model passed in.
    x_pred = {i: array for i, array in enumerate(x_all)}

    mean_pred, var_pred = model.predict(x_pred)

    n_channels = len(x_train)
    n_named = 0 if names is None else len(names)
    titles = [names[i] if i < n_named else 'Channel {}'.format(i)
              for i in range(n_channels)]

    # Two columns and as many rows as needed; six channels give the 3x2 grid
    # with a (20, 15) figure.
    n_rows = max(1, (n_channels + 1) // 2)
    f, axarr = plt.subplots(n_rows, 2, sharex=True, squeeze=False,
                            figsize=(20, 5 * n_rows))
    axarr = axarr.reshape(-1)

    palette = sns.color_palette()

    for i in range(n_channels):
        ax = axarr[i]
        color = palette[i % len(palette)]
        std_pred = np.sqrt(var_pred[i])

        ax.plot(x_train[i][:, 0], y_train[i], '.k', label='Train')
        ax.plot(x_all[i][:, 0], y_all[i], '--', label='Test', c='gray')

        ax.plot(x_pred[i][:, 0], mean_pred[i], label='Pred', c=color)
        ax.fill_between(x_pred[i][:, 0].reshape(-1),
                        mean_pred[i] + 2 * std_pred,
                        mean_pred[i] - 2 * std_pred,
                        label='95% c.i',
                        color=color,
                        alpha=0.4)

        ax.legend(ncol=4)
        ax.set_title(titles[i])
        # Use the scalar last input value, not a length-1 array, as the upper limit.
        ax.set_xlim(-1, x_all[i][-1, 0])
        if ylims is not None:
            ax.set_ylim(ylims[i])
    plt.suptitle(title, y=1.02)
    plt.tight_layout()

### Load dataset

In [5]:
# load dataset
cols = ['Chicago', 'Phoenix', 'Los Angeles', 'San Diego', 'New York', 'San Francisco']

# NOTE(review): skiprows=7 and the dropped last 4 rows presumably skip the
# header/footer text of the downloaded export -- confirm against the CSV.
df = pd.read_csv('data/finance_datasets/housing_prices_usa/housing_dataset.csv',
                 skiprows=7,
                 names=cols)
df = df.iloc[:-4]
df.index = pd.to_datetime(df.index)

# Months elapsed since the first observation, computed from calendar
# year/month. Dividing a timedelta by np.timedelta64(1, 'M') is avoided
# because a month is not a fixed duration and recent pandas versions reject
# that unit. For the first-of-month dates shown below this yields the same
# 0.0, 1.0, 2.0, ... values as rounding the elapsed days.
first_date = df.index[0]
df['Month'] = np.asarray((df.index.year - first_date.year) * 12
                         + (df.index.month - first_date.month), dtype=float)
df.head()

Unnamed: 0,Chicago,Phoenix,Los Angeles,San Diego,New York,San Francisco,Month
2009-05-01,123.73,103.56,159.18,145.06,171.17,120.16,0.0
2009-06-01,124.99,104.73,160.9,147.31,172.34,124.7,1.0
2009-07-01,128.32,106.66,163.97,150.99,173.91,128.86,2.0
2009-08-01,130.55,108.41,166.62,153.34,175.13,132.47,3.0
2009-09-01,132.13,109.26,168.03,154.76,174.81,134.16,4.0


### Data

In [6]:
data = mogptk.Data()

# Index of the first held-out row: the last 10% of the rows form the test set.
# `stop` stays a notebook-level variable because test_errors reads it.
stop = int(len(df) * 0.9)
# Positional lookup: df is indexed by date, so an integer key must go through .iloc.
test_start_month = df['Month'].iloc[stop]

# Every channel shares the same input: the month counter as a column vector.
x = df['Month'].values.reshape(-1, 1)

for i, c in enumerate(cols):
    y = df[c].values
    # alternative target that was tried: y = np.log(df[c].values)

    data.add(x, y)

    # NOTE(review): presumably drops the observations of channel i from
    # test_start_month onward from the training set -- confirm against mogptk.
    data.remove_range(i, test_start_month, None)

### Model 

In [None]:
# Fit the MOSM model several times and record the test errors of every run.
n_trials = 2

error_shape = (n_trials, len(cols))
mosm_mae = np.zeros(error_shape)
mosm_mape = np.zeros(error_shape)
mosm_mse = np.zeros(error_shape)

for n in range(n_trials):
    # A new model is built on every trial; after the loop `model` holds the
    # last one, which the plotting cell below uses.
    model = mogptk.MOSM(data, Q=3)
    model.init_params('means')
    model.train(method='L-BFGS-B', maxiter=3000, tol=1e-50)

    # One row per trial, one column per channel.
    mae, mape, mse = test_errors(model)
    for store, values in ((mosm_mae, mae), (mosm_mape, mape), (mosm_mse, mse)):
        store[n, :] = values

In [None]:
# Mean and standard deviation of every metric across trials, one row per channel.
# test_errors returns MAPE as a fraction, so it is scaled by 100 here to match
# the 'MAPE%' column labels.
pd.DataFrame(np.c_[mosm_mae.mean(0), 100 * mosm_mape.mean(0), mosm_mse.mean(0),
                   mosm_mae.std(0), 100 * mosm_mape.std(0), mosm_mse.std(0)],
             columns=['MAE', 'MAPE%', 'MSE', 'MAE std', 'MAPE% std', 'MSE std'])

In [None]:
# Plot the model left over from the last trial of the training loop,
# titling each subplot with its city name.
plot_experiment(model, names=cols)