In [1]:
from matplotlib import pyplot as plt

import arviz as az
import numpy as np
import pandas as pd
import pymc3 as pm
from sklearn.metrics import mean_squared_error as mse

from custome import DataFrameHandler, CrossValSampler, get_Xy, BLinearRegression


def rmse(*args, **kwargs):
    """Return the root-mean-squared error.

    Every positional and keyword argument is forwarded unchanged to
    ``sklearn.metrics.mean_squared_error`` (imported above as ``mse``);
    the square root of its result is returned.
    """
    return mse(*args, **kwargs) ** 0.5

In [2]:
# Notebook-wide constants.

# Share of all rows that goes into the training part; the rest is the test part.
TRAIN_TO_ALL_DATA_RATION = 0.8

# Passed to `DataFrameHandler.prepare_dataset` as its second argument.
DROP_COLUMNS = [
    'sex',
    'smoker',
    'region',
]

# Passed to `DataFrameHandler.prepare_dataset` as its third argument.
CHECK_COLUMNS = [
    'female',
    'male',
    'southwest',
    'southeast',
    'northwest',
    'northeast',
]

# Columns for which `DataFrameHandler.count_statistics` is asked to collect statistics.
STATISTICS_COUNT_COLUMNS = [
    'age',
    'bmi',
    'children',
    'charges',
    'female',
    'male',
    'northeast',
    'northwest',
    'southeast',
    'southwest',
]

In [3]:
# A single handler instance performs every preprocessing step in this cell.
data_handler = DataFrameHandler()

# Read the whole dataset, shuffle its rows with a fixed seed and renumber
# the index from zero.
df = pd.read_csv('data/insurance.csv')
df = df.sample(n=len(df), random_state=7)
df = df.reset_index(drop=True)

# Divide the shuffled rows: the first `train_len` rows form the training
# part, everything after them forms the test part.
train_len = round(len(df) * TRAIN_TO_ALL_DATA_RATION)
train_df, test_df = df.iloc[:train_len], df.iloc[train_len:]

# Training part: prepare, collect statistics, then normalize with them.
train_df = data_handler.prepare_dataset(train_df, DROP_COLUMNS, CHECK_COLUMNS)
train_statistics = data_handler.count_statistics(train_df, STATISTICS_COUNT_COLUMNS)
train_df = data_handler.normalize_dataset(train_df, train_statistics)

# Test part: same preparation, but normalized with the statistics collected
# on the training part (no statistics are computed on the test rows).
test_df = data_handler.prepare_dataset(test_df, DROP_COLUMNS, CHECK_COLUMNS)
test_df = data_handler.normalize_dataset(test_df, train_statistics)

In [4]:
# Extract X and y from the prepared training frame, naming "charges" as the target.
X, y = get_Xy(
    train_df,
    target_name="charges",
)

In [None]:
# Build a sampler over (X, y); `val_size=300` is forwarded to it as-is.
train_val_sampler = CrossValSampler(X, y, val_size=300)

# Calling the sampler yields a (train, val) pair; each element unpacks
# into its inputs and targets.
train, val = train_val_sampler()
(X_train, y_train), (X_val, y_val) = train, val

# Fit the Bayesian linear regression, requesting two chains.
model = BLinearRegression(chains=2)
model.fit(X_train, y_train)

  return wrapped_(*args_, **kwargs_)
Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (2 chains in 2 jobs)
NUTS: [sigma, w]


In [None]:
# Summary table of the fitted trace, rounded to two decimals.
az.summary(
    model.trace,
    round_to=2,
)

In [None]:
# Trace plots of the fitted model (non-compact layout); the trailing `;`
# suppresses the textual repr of the returned axes.
az.plot_trace(
    model.trace,
    compact=False,
);

In [None]:
# Forest plot of the fitted trace with 95% HDI intervals.
az.plot_forest(
    model.trace,
    hdi_prob=0.95,
    figsize=(6, 4),
);

In [None]:
# Posterior plots with 95% HDI; a large figure and thicker lines are requested.
az.plot_posterior(
    model.trace,
    hdi_prob=0.95,
    figsize=(36, 18),
    lw=3,
);

In [None]:
# So the diagnostics above indicate that the model has converged.

In [None]:
# Let's look at the model in more depth.

In [None]:
# Build a new sampler over (X, y) and draw a train/validation pair from it.
train_val_sampler = CrossValSampler(X, y, val_size=300)
train, val = train_val_sampler()
(X_train, y_train), (X_val, y_val) = train, val

# This time the model is created with its default arguments.
model = BLinearRegression()
model.fit(X_train, y_train)

# Predictions and the model's confidence values for the validation inputs.
y_pred = model.predict(X_val)
y_conf_pred = model.predict_confidence(X_val)

# Validation RMSE; as the last expression it is shown as the cell output.
rmse(y_val, y_pred)

In [None]:
# Model architecture: render the graph of the wrapped PyMC3 model
# (read from the wrapper's private `_pm_model` attribute).
pymc_model = model._pm_model
pm.model_to_graphviz(pymc_model)

In [None]:
# Threshold that splits the validation predictions into two halves by their
# confidence value. It is computed from `y_conf_pred` rather than hard-coded,
# so it remains the actual median when the fitted model (and therefore
# `y_conf_pred`) changes from run to run.
th_median = float(np.median(y_conf_pred))

# Sanity check: share of samples strictly below the threshold; should be ~0.5.
print(len(np.where(y_conf_pred < th_median)[0]) / len(y_conf_pred))

In [None]:
# Confident predictions: validation samples whose confidence value lies
# below the median threshold (the lower 50%).
indexes = np.where(y_conf_pred < th_median)[0]

val_subset = y_val[indexes]
pred_subset = y_pred[indexes]

# The x positions are only evenly spaced placeholders: one grid for the full
# validation set and one for the selected subset.
x_all = np.linspace(0, 1, len(y_val))
x_subset = np.linspace(0, 1, len(val_subset))

plt.scatter(x_all, y_val, s=4.5, label='y_val')
plt.scatter(x_subset, pred_subset, s=30.5, label='y_pred_conf')
plt.scatter(x_subset, val_subset, s=30.5, label='y_val_conf')
plt.legend()
plt.grid();

# RMSE restricted to the confident subset.
print('RMSE: %.4f' % rmse(val_subset, pred_subset))

In [None]:
# Unconfident predictions: validation samples whose confidence value lies
# above the median threshold (the upper 50%).
indexes = np.where(y_conf_pred > th_median)[0]

val_subset = y_val[indexes]
pred_subset = y_pred[indexes]

# The x positions are only evenly spaced placeholders: one grid for the full
# validation set and one for the selected subset.
x_all = np.linspace(0, 1, len(y_val))
x_subset = np.linspace(0, 1, len(val_subset))

# The legend names the *unconfident* subset, since that is the subset
# plotted in this cell.
plt.scatter(x_all, y_val, s=4.5, label='y_val')
plt.scatter(x_subset, pred_subset, s=30.5, label='y_pred_unconf')
plt.scatter(x_subset, val_subset, s=30.5, label='y_val_unconf')
plt.legend()
# Explicit `True`: a bare `plt.grid()` merely toggles the current grid state.
plt.grid(True);

# RMSE restricted to the unconfident subset.
print('RMSE: %.4f' % rmse(val_subset, pred_subset))

In [None]:
# So we see that the score is worse on the unconfident samples.

In [None]:
# Let's try to investigate the differences in the distributions of features stratified by the reliability of the model.

In [None]:
# Distribution of `age`, stratified by the model's confidence.
# Values of `y_conf_pred` below the threshold are treated as the confident
# half, matching the "Confident prediction" scatter cell of this notebook.
#
# NOTE(review): `y_conf_pred` was computed for `X_val`, so these positions
# refer to rows of the validation split. Using them with `train_df.iloc`
# selects the same samples only if validation rows keep their `train_df`
# positions — confirm against `CrossValSampler`.
indexes = np.where(y_conf_pred > th_median)[0]            # less confident half
confident_indexes = np.where(y_conf_pred < th_median)[0]  # more confident half

fig, ax = plt.subplots()

# `label=` (not `legend=`) sets the legend text; `Series.hist(legend=...)` is
# only a flag and would label both histograms with the column name.
train_df.iloc[indexes].age.hist(
    bins=10, alpha=0.75, ax=ax, label='50% least confident'
)
train_df.iloc[confident_indexes].age.hist(
    bins=10, alpha=0.75, ax=ax, label='50% most confident'
)

ax.set_xlabel('age')
ax.legend()
# Explicit `True`: `hist` already switches the grid on, and a bare `grid()`
# call would toggle it back off.
ax.grid(True);

In [None]:
# Distribution of `bmi`, stratified by the model's confidence.
# Values of `y_conf_pred` below the threshold are treated as the confident
# half, matching the "Confident prediction" scatter cell of this notebook.
#
# NOTE(review): `y_conf_pred` was computed for `X_val`, so these positions
# refer to rows of the validation split. Using them with `train_df.iloc`
# selects the same samples only if validation rows keep their `train_df`
# positions — confirm against `CrossValSampler`.
indexes = np.where(y_conf_pred > th_median)[0]            # less confident half
confident_indexes = np.where(y_conf_pred < th_median)[0]  # more confident half

fig, ax = plt.subplots()

# `label=` (not `legend=`) sets the legend text; `Series.hist(legend=...)` is
# only a flag and would label both histograms with the column name.
train_df.iloc[indexes].bmi.hist(
    bins=10, alpha=0.75, ax=ax, label='50% least confident'
)
train_df.iloc[confident_indexes].bmi.hist(
    bins=10, alpha=0.75, ax=ax, label='50% most confident'
)

ax.set_xlabel('bmi')
ax.legend()
# Explicit `True`: `hist` already switches the grid on, and a bare `grid()`
# call would toggle it back off.
ax.grid(True);