Imports

In [1]:
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, accuracy_score
from interpret.blackbox import LimeTabular
from interpret import show
import pandas as pd

pd.set_option('display.max_columns', None)
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import RandomOverSampler

Load and preprocess data

In [2]:
# Original kaggle data; cabin encodes deck and room
kaggle_titanic = pd.read_csv('original_kaggle_titanic.csv')
kaggle_titanic

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [3]:
# Adapted data, deck is taken from the cabin information but the room information is dropped
titanic = pd.read_csv('processed_titanic.csv')
titanic

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.2500,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.9250,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1000,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.0500,S,Third,man,True,,Southampton,no,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,0,2,male,27.0,0,0,13.0000,S,Second,man,True,,Southampton,no,True
887,1,1,female,19.0,0,0,30.0000,S,First,woman,False,B,Southampton,yes,True
888,0,3,female,,1,2,23.4500,S,Third,woman,False,,Southampton,no,False
889,1,1,male,26.0,0,0,30.0000,C,First,man,True,C,Cherbourg,yes,True


In [4]:
# Load dataset
class DataLoader:
    def __init__(self, dataset_name, include_cabin=True):
        """A lot of cabin information is missing which is why many cases
        are excluded if the cabin information stays in the dataset.
        Standardization is not necessary for the RandomForest we use later."""
        self.data = None
        if dataset_name not in ["original_kaggle_titanic", "processed_titanic"]:
            raise NameError("The dataset_name must be either original_kaggle_titanic or processed_titanic!")
        self.cabin = include_cabin
        self.dataset = dataset_name

    def load_dataset(self):
        self.data = pd.read_csv(self.dataset + ".csv")

    def preprocess_data(self):

        # Drop irrelevant columns
        if self.dataset == 'original_kaggle_titanic':
            self.data.drop(["PassengerId", 'Ticket', 'Name'], axis=1, inplace=True)
            if not self.cabin:
                self.data.drop(['Cabin'], axis=1, inplace=True)
        elif self.dataset == 'processed_titanic':
            self.data.drop(["alive", 'pclass', 'embarked', 'adult_male', 'who'], axis=1,
                           inplace=True)  #who is woman, man, child
            if not self.cabin:
                self.data.drop(['deck'], axis=1, inplace=True)

        # Drop all missing values
        self.data = self.data.dropna(axis=0)

        # Reorder columns to move survived to the end
        cols = self.data.columns.tolist()
        cols.append(cols.pop(0))  # Survived is at index 0 after dropping the PassengerId
        self.data = self.data[cols]

        # One-hot encode all categorical columns
        if self.dataset == 'original_kaggle_titanic':
            self.data = self.data.astype({'Pclass': 'category'})
            categorical_cols = ["Pclass", "Sex", "Embarked"]
            if self.cabin:
                categorical_cols.append('Cabin')
                self.data = self.data.astype({'Cabin': 'category'})

        elif self.dataset == 'processed_titanic':
            categorical_cols = ["sex", "class", "embark_town", "alone"]
            self.data = self.data.astype({'sex': 'category', 'class': 'category',
                                          'embark_town': 'category',
                                          'alone': 'category'})
            if self.cabin:
                categorical_cols.append("deck")
                self.data = self.data.astype({'deck': 'category'})
        encoded = pd.get_dummies(self.data[categorical_cols],
                                 prefix=categorical_cols)

        # Update data with new columns
        self.data = pd.concat([encoded, self.data], axis=1)
        self.data.drop(categorical_cols, axis=1, inplace=True)

    def get_data_split(self):
        X = self.data.iloc[:, :-1]
        y = self.data.iloc[:, -1]
        return train_test_split(X, y, test_size=0.2, random_state=2021)

    def oversample(self, X_train, y_train):
        oversample = RandomOverSampler(sampling_strategy='minority')
        # Convert to numpy and oversample
        x_np = X_train.to_numpy()
        y_np = y_train.to_numpy()
        x_np, y_np = oversample.fit_resample(x_np, y_np)
        # Convert back to pandas
        x_over = pd.DataFrame(x_np, columns=X_train.columns)
        y_over = pd.Series(y_np, name=y_train.name)
        return x_over, y_over

Load data, split the data for evaluation, oversample and fit a blackbox random forest model

In [5]:
def forest(dataset_name, include_cabin=True):
    """Load data, split the data for evaluation, oversample and fit a blackbox random forest model."""
    data_loader = DataLoader(dataset_name=dataset_name,
                             include_cabin=include_cabin)  #original_kaggle_titanic or processed_titanic
    data_loader.load_dataset()
    data_loader.preprocess_data()

    # Split the data for evaluation
    X_train, X_test, y_train, y_test = data_loader.get_data_split()

    # Oversample the data to account for minorities
    X_train, y_train = data_loader.oversample(X_train, y_train)

    # Fit blackbox model random forest
    rf = RandomForestClassifier()
    rf.fit(X_train, y_train)
    y_pred = rf.predict(X_test)
    accuracy = accuracy_score(y_test, rf.predict(X_test))
    f1 = f1_score(y_test, y_pred, average='macro')

    return rf.predict_proba, X_train, X_test, y_train, y_test, X_train.shape, X_test.shape, f1, accuracy

Training the random forests and writing their features in an overall DataFrame

In [6]:
forest_training = pd.DataFrame(
    index=['original cabin', 'original no_cabin', 'preprocessed cabin', 'preprocessed no_cabin'],
    columns=['prediction probabilities', 'X_train_data', 'X_test_data', 'y_train_data',
             'y_test_data', 'X_train shape', 'X_test shape', 'F1 Score', 'Accuracy'])

forest_training.loc['original cabin'] = forest(
    dataset_name='original_kaggle_titanic',
    include_cabin=True)
forest_training.loc['original no_cabin'] = forest(
    dataset_name='original_kaggle_titanic',
    include_cabin=False)
forest_training.loc['preprocessed cabin'] = forest(
    dataset_name='processed_titanic',
    include_cabin=True)
forest_training.loc['preprocessed no_cabin'] = forest(
    dataset_name='processed_titanic',
    include_cabin=False)

# Print shape, f1 and accuracy scores
forest_training[['X_train shape', 'X_test shape', 'F1 Score', 'Accuracy']]

  arr_value = np.asarray(value)
  arr_value = np.asarray(value)
  arr_value = np.asarray(value)
  arr_value = np.asarray(value)


Unnamed: 0,X_train shape,X_test shape,F1 Score,Accuracy
original cabin,"(194, 145)","(37, 145)",0.741259,0.783784
original no_cabin,"(688, 12)","(143, 12)",0.787876,0.79021
preprocessed cabin,"(198, 21)","(37, 21)",0.716113,0.756757
preprocessed no_cabin,"(688, 14)","(143, 14)",0.794632,0.797203


LIME

In [7]:
for index, row in forest_training.iterrows():
    # Initialize Lime for Tabular data
    # Train dataset is needed to create perturbations to create the new data

    forest_training.loc[index, 'lime'] = LimeTabular(
        predict_fn=forest_training.loc[index, 'prediction probabilities'],
        data=forest_training.loc[index, 'X_train_data'],
        random_state=1)

    # Get local explanations by fitting new models
    forest_training.loc[index, 'lime_local'] = forest_training.loc[index, 'lime'].explain_local(
        forest_training.loc[index, 'X_test_data'][-4:],
        forest_training.loc[index, 'y_test_data'][-4:],
        name='LIME')



Visualize the LIME results using the interpret library. Choose one of 'original cabin', 'original no_cabin', 'preprocessed cabin' or 'preprocessed no_cabin' as index in the loc to
 the results for the respective model.

In [8]:
show(forest_training.loc['original cabin', 'lime_local'])  # comes from interpret library

The dash_html_components package is deprecated. Please replace
`import dash_html_components as html` with `from dash import html`
  import dash_html_components as html
The dash_core_components package is deprecated. Please replace
`import dash_core_components as dcc` with `from dash import dcc`
  import dash_core_components as dcc
The dash_table package is deprecated. Please replace
`import dash_table` with `from dash import dash_table`

Also, if you're using any of the table format helpers (e.g. Group), replace 
`from dash_table.Format import Group` with 
`from dash.dash_table.Format import Group`
  import dash_table as dt


In [9]:
show(forest_training.loc['original no_cabin', 'lime_local'])  # comes from interpret library

In [10]:
show(forest_training.loc['preprocessed cabin', 'lime_local'])  # comes from interpret library

In [11]:
show(forest_training.loc['preprocessed no_cabin', 'lime_local'])  # comes from interpret library