In [22]:
from sklearn.datasets import fetch_california_housing
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
import ssl
import pandas as pd
import pymc3 as pm 
import numpy as np
import arviz as az

import matplotlib.pyplot as plt
%matplotlib inline

import matplotlib
matplotlib.rcParams['font.size'] = 16
matplotlib.rcParams['figure.figsize'] = (9, 9)

import seaborn as sns

from IPython.core.pylabtools import figsize

from scipy.stats import percentileofscore
from scipy import stats

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

from sklearn.metrics import mean_squared_error, mean_absolute_error, median_absolute_error

import scipy

### THIS METHOD WAS TRIED BUT DID NOT WORK WELL. NOTED IN THE REPORT

In [None]:
def evaluate(X_train, X_test, y_train, y_test):
    model_name_list = ['Linear Regression', 'Random Forest']
    
    model_lr = LinearRegression()
    model_rf = RandomForestRegressor(n_estimators=50)  # n_estimator == number of trees has been chosen randomly 
    
    results = pd.DataFrame(columns=['mae', 'rmse'], index=model_name_list)

    for i, model in enumerate([model_lr, model_rf]):
        # y_train is now column vectore thus should be changed to row vecotor by using .values.ravel()
        model.fit(X_train, y_train.values.ravel())
        predictions = model.predict(X_test)
        
        mae = np.mean(abs(predictions-y_test.values.ravel()))
        rmse = np.sqrt(np.mean((predictions-y_test.values.ravel())**2))
        
        model_name = model_name_list[i]
        results.loc[model_name, :] = [mae, rmse]

    return results

In [None]:
def evaluate_trace(trace, X_test, y_test, model_results):
    
    var_dict = {}
    for variable in trace.varnames:
        var_dict[variable] = trace[variable]
        
    var_weights = pd.DataFrame(var_dict)
    
    var_means = var_weights.mean(axis=0)
    
    X_test['Intercept'] = 1
    
    names = X_test.columns[1:]
    X_test = X_test.loc[:, names]
    var_means = var_means[names]
    
    results = pd.DataFrame(index = X_test.index, columns=['estimate'])
    
    for row in X_test.iterrows():
        results.loc[row[0], 'estimate'] = np.dot(np.array(var_means), np.array(row[1]))
    
    actual = np.array(y_test)
    errors = results['estimate'] - actual.ravel()
    mae = np.mean(abs(errors))
    rmse = np.sqrt(np.mean(errors ** 2))
    
    print("mae", mae)
    print("rmse", rmse)
    
    model_results.loc['Bayesian LR', :] = [mae, rmse]
    
    return model_results

In [None]:
housing = fetch_california_housing()

In [None]:
scaler = StandardScaler()
housing.data = scaler.fit_transform(housing.data)

In [None]:
df = pd.DataFrame(data=housing.data, columns=housing.feature_names)
df_target = pd.DataFrame(data=housing.target, columns=['MedHouseVal'])

In [None]:
X_train, X_test, y_train, y_test = train_test_split(df, df_target, test_size=0.2, random_state=42)

In [None]:
results = evaluate(X_train, X_test, y_train, y_test)

In [None]:
X_train = pd.concat([y_train, X_train], axis=1)
X_test  = pd.concat([y_test,  X_test], axis=1)

In [32]:
X_train

Unnamed: 0,MedHouseVal,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
14196,1.030,-0.321654,0.346478,-0.166259,-0.190451,0.772251,0.059808,-1.367976,1.267645
8267,3.821,-0.030620,1.617807,-0.386181,-0.117472,-0.098440,-0.128306,-0.871699,0.703627
17445,1.726,0.150349,-1.957806,0.087641,-0.235400,-0.450778,-0.033453,-0.455012,-0.454356
14265,0.934,-1.014947,0.584852,-0.576442,-0.132670,-0.006602,0.088940,-1.377340,1.227714
2271,0.965,-0.166583,1.141059,0.339282,0.079205,-0.486983,-0.074203,0.537543,-0.114948
...,...,...,...,...,...,...,...,...,...
11284,2.292,1.315592,0.505394,0.282943,-0.359587,-0.677723,-0.003697,-0.867017,0.803453
11964,0.978,-0.431983,0.346478,0.581864,0.364661,0.289220,0.080261,-0.754652,1.067992
5390,2.221,-0.492832,0.584852,-0.582949,-0.035828,0.291870,0.025170,-0.749970,0.593818
860,2.835,0.973025,-1.083767,0.390584,-0.060554,0.310414,0.010422,0.912092,-1.193070


In [29]:
idxs = []
np_idxs = []

for i in range(X_train.shape[0]):
    np.array(idxs.append(i))
    
np_idxs = np.array(idxs)
samples = X_train.shape[0]

In [30]:
np_idxs

array([    0,     1,     2, ..., 16509, 16510, 16511])

In [35]:
with pm.Model() as hierarchical_model:
    # Hyperpriors for group nodes
    Intercept  = pm.Normal('Intercept', mu=0., sigma=1)
    HouseAge   = pm.Normal('HouseAge', mu=0., sigma=1)
    MedInc     = pm.Normal('MedInc', mu=0., sigma=1)
    AveRooms   = pm.Normal('AveRooms', mu=0., sigma=1)
    AveBedrms  = pm.Normal('AveBedrms', mu=0., sigma=1)
    Population = pm.Normal('Population', mu=0., sigma=1)
    AveOccup   = pm.Normal('AveOccup', mu=0., sigma=1)
    Latitude   = pm.Normal('Latitude', mu=0., sigma=1)
    Longitude  = pm.Normal('Longitude', mu=0., sigma=1)
    
    Intercept_sigma  = pm.HalfNormal('Intercept_sigma', 5.)
    MedInc_sigma     = pm.HalfNormal('MedInc_sigma', 5.)
    HouseAge_sigma   = pm.HalfNormal('HouseAge_sigma', 5.)
    AveRooms_sigma   = pm.HalfNormal('AveRooms_sigma', 5.)
    AveBedrms_sigma  = pm.HalfNormal('AveBedrms_sigma', 5.)
    Population_sigma = pm.HalfNormal('Population_sigma', 5.)
    AveOccup_sigma   = pm.HalfNormal('AveOccup_sigma', 5.)
    Latitude_sigma   = pm.HalfNormal('Latitude_sigma', 5.)
    Longitude_sigma  = pm.HalfNormal('Longitude_sigma', 5.)

    # ----------------------------------------------------------------------#
    intercept  = pm.Normal('intercept',  mu=Intercept,  sigma=Intercept_sigma,  shape=samples)
    medinc     = pm.Normal('medinc',     mu=MedInc,     sigma=MedInc_sigma,     shape=samples)
    houseavg   = pm.Normal('houseavg',   mu=HouseAge,   sigma=HouseAge_sigma,   shape=samples)
    avgrooms   = pm.Normal('avgrooms',   mu=AveRooms,   sigma=AveRooms_sigma,   shape=samples)
    avgbedrms  = pm.Normal('avgbedrms',  mu=AveBedrms,  sigma=AveBedrms_sigma,  shape=samples)
    population = pm.Normal('population', mu=Population, sigma=Population_sigma, shape=samples)
    aveoccup   = pm.Normal('aveoccup',   mu=AveOccup,   sigma=AveOccup_sigma,   shape=samples)
    latitude   = pm.Normal('latitude',   mu=Latitude,   sigma=Latitude_sigma,   shape=samples)
    longitude  = pm.Normal('longitude',  mu=Longitude,  sigma=Longitude_sigma,  shape=samples)

    # Model error
    eps = pm.HalfCauchy('eps', 5.)

    price_est = (intercept[np_idxs] + 
                 medinc[np_idxs]    * X_train.MedInc.values + 
                 houseavg[np_idxs]  * X_train.HouseAge.values + 
                 avgrooms[np_idxs]  * X_train.AveRooms.values + 
                 avgbedrms[np_idxs] * X_train.AveBedrms.values + 
                 population[np_idxs]* X_train.Population.values + 
                 aveoccup[np_idxs]  * X_train.AveOccup.values + 
                 latitude[np_idxs]  * X_train.Latitude.values + 
                 longitude[np_idxs] * X_train.Longitude.values  
                )

    # Data likelihood
    price_like = pm.Normal('MedHouseVal likelihood', mu=price_est,
                           sigma=eps, observed=X_train.MedHouseVal)

In [36]:
# with hierarchical_model:
#     hierarchical_trace = pm.sample(2000, tune=2000, target_accept=.9)

Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (2 chains in 2 jobs)
NUTS: [eps, longitude, latitude, aveoccup, population, avgbedrms, avgrooms, houseavg, medinc, intercept, Longitude_sigma, Latitude_sigma, AveOccup_sigma, Population_sigma, AveBedrms_sigma, AveRooms_sigma, HouseAge_sigma, MedInc_sigma, Intercept_sigma, Longitude, Latitude, AveOccup, Population, AveBedrms, AveRooms, MedInc, HouseAge, Intercept]


Sampling 2 chains for 2_000 tune and 2_000 draw iterations (4_000 + 4_000 draws total) took 10556 seconds.
The acceptance probability does not match the target. It is 0.7187652760431438, but should be close to 0.9. Try to increase the number of tuning steps.
There were 553 divergences after tuning. Increase `target_accept` or reparameterize.
The acceptance probability does not match the target. It is 0.7295334135215453, but should be close to 0.9. Try to increase the number of tuning steps.
The rhat statistic is larger than 1.4 for some parameters. The sampler did not converge.
The estimated number of effective samples is smaller than 200 for some parameters.


In [38]:
pm.traceplot(hierarchical_trace)



KeyboardInterrupt: 

Error in callback <function flush_figures at 0x7fdfb7a3fa60> (for post_execute):


KeyboardInterrupt: 

In [None]:
pm.summary(hierarchical_trace)



In [None]:
all_model_results = evaluate_trace(hierarchical_trace, X_test, y_test, results)
all_model_results