# Titanic data set example

In [2]:
from bhad.utils import (discretize, mvt2mixture)
from bhad.model import BHAD
import numpy as np
import matplotlib.pyplot as plt
from importlib import reload
from sklearn.datasets import fetch_openml


In [3]:
# Download version 1 of the OpenML "titanic" dataset as pandas objects
# (feature frame X and target y) and preview the first rows.
X, y = fetch_openml(
    "titanic",
    version=1,
    as_frame=True,
    return_X_y=True,
    parser="pandas",
)
X.head()

Unnamed: 0,pclass,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,1,"Allen, Miss. Elisabeth Walton",female,29.0,0,0,24160,211.3375,B5,S,2.0,,"St Louis, MO"
1,1,"Allison, Master. Hudson Trevor",male,0.9167,1,2,113781,151.55,C22 C26,S,11.0,,"Montreal, PQ / Chesterville, ON"
2,1,"Allison, Miss. Helen Loraine",female,2.0,1,2,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"
3,1,"Allison, Mr. Hudson Joshua Creighton",male,30.0,1,2,113781,151.55,C22 C26,S,,135.0,"Montreal, PQ / Chesterville, ON"
4,1,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0,1,2,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"


In [4]:
# Remove the columns that are not used in this example, then keep only the
# rows without missing values in the remaining features.
X_cleaned = X.drop(columns=['body', 'cabin', 'name', 'ticket', 'boat']).dropna()
# Align the target with the surviving rows through their index labels.
y_cleaned = y.loc[X_cleaned.index]

X_cleaned.info(verbose=True)


<class 'pandas.core.frame.DataFrame'>
Int64Index: 684 entries, 0 to 1281
Data columns (total 8 columns):
 #   Column     Non-Null Count  Dtype   
---  ------     --------------  -----   
 0   pclass     684 non-null    int64   
 1   sex        684 non-null    category
 2   age        684 non-null    float64 
 3   sibsp      684 non-null    int64   
 4   parch      684 non-null    int64   
 5   fare       684 non-null    float64 
 6   embarked   684 non-null    category
 7   home.dest  684 non-null    object  
dtypes: category(2), float64(2), int64(3), object(1)
memory usage: 38.9+ KB


Partition the dataset into a training set and a test set:

In [5]:
from sklearn.model_selection import train_test_split

# Hold out 33% of the cleaned rows as a test set; the fixed random_state
# keeps the split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X_cleaned,
    y_cleaned,
    test_size=0.33,
    random_state=42,
)

# Shapes of the two partitions, one per line.
print(X_train.shape, X_test.shape, sep="\n")

# Label values and their counts in each partition, one per line.
print(
    np.unique(y_train, return_counts=True),
    np.unique(y_test, return_counts=True),
    sep="\n",
)


(458, 8)
(226, 8)
(array(['0', '1'], dtype=object), array([242, 216]))
(array(['0', '1'], dtype=object), array([122, 104]))


## Model explanation:

Retrieve local model explanations. Here, all numeric and categorical columns are specified explicitly.

In [6]:
# Split the feature names by dtype so they can be passed to BHAD explicitly.
# 'number' is the documented selector for all numeric dtypes, so columns stored
# in narrower or unsigned numeric types are not silently left out.
numeric_cols = list(X_train.select_dtypes(include='number').columns)
cat_cols = list(X_train.select_dtypes(include=['object', 'category']).columns)

# Every column must land in one of the two lists; a column of another dtype
# (e.g. bool or datetime) would otherwise be dropped from the model unnoticed.
assert len(numeric_cols) + len(cat_cols) == X_train.shape[1], \
    "some columns are neither numeric nor object/category"
print(len(cat_cols+numeric_cols))


8


In [7]:
from sklearn.pipeline import Pipeline

# Two-step pipeline: discretize the features first, then fit the BHAD
# detector on the discretized output.
# With nbins=None the fit log reports a bin count chosen per numeric feature.
pipe = Pipeline(
    steps=[
        ('discrete', discretize(nbins=None, verbose=True)),
        (
            'model',
            BHAD(
                contamination=0.01,
                numeric_features=numeric_cols,
                cat_features=cat_cols,
            ),
        ),
    ]
)

# Fit both steps on the training partition and score it in the same call.
y_pred_train = pipe.fit_predict(X_train)


-- Bayesian Histogram-based Anomaly Detector (BHAD) --

Reseting index of input dataframe.
Input shape: (458, 8)
Used 5 numeric feature(s) and 3 categorical feature(s).
Determining optimal number of bins for numeric features
Feature pclass using 49 bins
Determining optimal number of bins for numeric features
Feature age using 5 bins
Determining optimal number of bins for numeric features
Feature sibsp using 49 bins
Determining optimal number of bins for numeric features
Feature parch using 49 bins
Determining optimal number of bins for numeric features
Feature fare using 16 bins
Finished 'fit' in 0.0780 secs
Finished 'transform' in 0.0038 secs
Fit BHAD on discretized data.
Input shape: (458, 8)
One-hot encoding categorical features.
Finished 'fit' in 0.0097 secs
Finished 'transform' in 0.0526 secs
Matrix dimension after one-hot encoding: (458, 319)
Finished training.
Score input data.


In [8]:
from bhad import explainer

# Build the explainer from the two fitted pipeline steps: the BHAD model and
# the discretizer that produced its input. fit() returns the object that is
# then used to generate the local explanations.
local_expl = explainer.Explainer(
    pipe.named_steps['model'],
    pipe.named_steps['discrete'],
).fit()

--- BHAD Model Explainer ---

Using fitted BHAD and fitted discretizer.
Marginal cdfs estimated using train set of shape (458, 8)


In [9]:
# get_explanation() is called without a data argument; judging by the logged
# count (458 observations) it explains the data the pipeline scored last,
# which at this point is the training set passed to fit_predict.
# The second return value is not used in this notebook.
df_train, _ = local_expl.get_explanation()
df_train.shape

Create local explanations for 458 observations.


  0%|          | 0/458 [00:00<?, ?it/s]

Finished 'get_explanation' in 0.2168 secs


(458, 9)

In [10]:
# Score the held-out partition with the pipeline fitted on the training set.
# NOTE(review): this call also seems to determine which data the next
# get_explanation() call explains (presumably the model keeps the last scored
# frame) - confirm against the bhad implementation.
y_pred_test = pipe.predict(X_test)
X_test.shape

Reseting index of input dataframe.
Finished 'transform' in 0.3337 secs

Score input data.
Apply fitted one-hot encoder.
Finished 'transform' in 0.0301 secs


(226, 8)

In [11]:
# Same no-argument call as for the training set, but the log now reports 226
# observations: the explanations refer to the most recently scored data
# (X_test, via pipe.predict).
# NOTE(review): this makes the cell order-dependent - it has to run after the
# predict cell to yield test-set explanations.
df_test, _ = local_expl.get_explanation()
df_test.shape

Create local explanations for 226 observations.


  0%|          | 0/226 [00:00<?, ?it/s]

Finished 'get_explanation' in 0.1049 secs


(226, 9)

In [12]:
# Column dtypes of the raw (undiscretized) test features.
X_test.dtypes

pclass          int64
sex          category
age           float64
sibsp           int64
parch           int64
fare          float64
embarked     category
home.dest      object
dtype: object

In [13]:
df_test # features plus an 'explanation' text column; pandas abbreviates the 226 rows in the rendered output

Unnamed: 0,pclass,sex,age,sibsp,parch,fare,embarked,home.dest,explanation
0,2.0,male,36.0,1.0,2.0,27.7500,S,"Bournmouth, England","home.dest (Perc.: 0.0): Bournmouth, England\np..."
1,1.0,male,49.0,1.0,1.0,110.8833,C,"Haverford, PA","home.dest (Perc.: 0.0): Haverford, PA\nfare (C..."
2,1.0,male,24.0,1.0,0.0,82.2667,S,"Minneapolis, MN","home.dest (Perc.: 0.0): Minneapolis, MN\nfare ..."
3,1.0,female,49.0,1.0,0.0,76.7292,C,"New York, NY","home.dest (Perc.: 0.0): New York, NY\nfare (Cu..."
4,3.0,female,10.0,5.0,2.0,46.9000,S,"Wiltshire, England Niagara Falls, NY",sibsp (Cumul.perc.: 1.0): 5.0\nhome.dest (Perc...
...,...,...,...,...,...,...,...,...,...
221,2.0,female,24.0,0.0,0.0,13.0000,S,"London / Montreal, PQ","home.dest (Perc.: 0.0): London / Montreal, PQ\..."
222,1.0,male,50.0,0.0,0.0,26.0000,S,London,home.dest (Perc.: 0.0): London
223,2.0,female,34.0,0.0,0.0,13.0000,S,"Brooklyn, NY","home.dest (Perc.: 0.0): Brooklyn, NY\nsex (Per..."
224,2.0,female,29.0,1.0,0.0,26.0000,S,"Cornwall / Spokane, WA","home.dest (Perc.: 0.0): Cornwall / Spokane, WA..."


In [14]:
# Print the explanation text of every test observation, labelled with its
# row position in df_test.
for obs, ex in enumerate(df_test['explanation'].values):
    print(f'\nObs. {obs}:\n', ex)


Obs. 0:
 home.dest (Perc.: 0.0): Bournmouth, England
parch (Cumul.perc.: 0.97): 2.0

Obs. 1:
 home.dest (Perc.: 0.0): Haverford, PA
fare (Cumul.perc.: 0.92): 110.88

Obs. 2:
 home.dest (Perc.: 0.0): Minneapolis, MN
fare (Cumul.perc.: 0.88): 82.27

Obs. 3:
 home.dest (Perc.: 0.0): New York, NY
fare (Cumul.perc.: 0.85): 76.73
sex (Perc.: 0.0): female

Obs. 4:
 sibsp (Cumul.perc.: 1.0): 5.0
home.dest (Perc.: 0.0): Wiltshire, England Niagara Falls, NY
parch (Cumul.perc.: 0.97): 2.0
pclass (Cumul.perc.: 1.0): 3.0
sex (Perc.: 0.0): female

Obs. 5:
 home.dest (Perc.: 0.0): Sweden Worcester, MA
sibsp (Cumul.perc.: 1.0): 4.0
age (Cumul.perc.: 0.04): 3.0
parch (Cumul.perc.: 0.97): 2.0
pclass (Cumul.perc.: 1.0): 3.0

Obs. 6:
 home.dest (Perc.: 0.0): Warwick, England

Obs. 7:
 home.dest (Perc.: 0.0): Huntington, WV

Obs. 8:
 home.dest (Perc.: 0.0): Paris, France
sex (Perc.: 0.0): female

Obs. 9:
 embarked (Perc.: 0.0): Q
home.dest (Perc.: 0.0): Co Cork, Ireland Charlestown, MA
pclass (Cumul.perc.