This notebook contains a personnal study of naive bayes methods applied to the Titanic classification problem.

Links:
- www.kaggle.com/c/titanic

# Notebook Setup

In [1]:
import numpy as np
import pandas as pd
pd.plotting.register_matplotlib_converters()
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set_style("darkgrid")
print("Library import and setup Complete")

Library import and setup Complete


In [2]:
rng = np.random.RandomState(42)
print("Prayer to RNGesus Sent")

Prayer to RNGesus Sent


# Loading the Datasets

In [3]:
train_data = pd.read_csv("../input/titanic/train.csv", index_col="PassengerId")
test_data = pd.read_csv("../input/titanic/test.csv", index_col="PassengerId")

train_data.head()

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


# Data Preprocessing

## Predictors Selection

In [4]:
# Separate target from predictors
target = "Survived"
y = train_data[target]
X_full = train_data.drop([target], axis=1)


# Drop columns with missing values (simplest approach)
cols_with_na = [col for col in X_full.columns if X_full[col].isnull().any()]
X_reduced = X_full.drop(cols_with_na, axis=1)

# "Cardinality" means the number of unique values in a column
# Select categorical columns with relatively low cardinality (convenient but arbitrary)
def has_low_cardinality(df, col):
    is_categorical = (df[col].dtype == "object")
    has_low_card = (df[col].nunique() < 10)
    return (is_categorical and has_low_card)

categorical_cols = [col for col in X_reduced.columns if has_low_cardinality(X_reduced, col)]

# Select numerical columns
numerical_cols = [col for col in X_reduced.columns if X_reduced[col].dtype in ['int64', 'float64']]

# Keep selected columns only
my_cols = categorical_cols + numerical_cols

X = X_reduced[my_cols].copy()
X_test = test_data[my_cols].copy()

print("Predictors are : ", my_cols)
print("Numerical cols : ", numerical_cols)
print("Categorical cols : ", categorical_cols)


Predictors are :  ['Sex', 'Pclass', 'SibSp', 'Parch', 'Fare']
Numerical cols :  ['Pclass', 'SibSp', 'Parch', 'Fare']
Categorical cols :  ['Sex']


In [5]:
X.head()

Unnamed: 0_level_0,Sex,Pclass,SibSp,Parch,Fare
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,male,3,1,0,7.25
2,female,1,1,0,71.2833
3,female,3,0,0,7.925
4,female,1,1,0,53.1
5,male,3,0,0,8.05


## Preprocessing Pipeline

In [6]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

# Preprocessing for numerical data
numerical_transformer = SimpleImputer(strategy='constant')

# Preprocessing for categorical data
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
     ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Bundle preprocessing for numerical and categorical data
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

In [7]:
# for data validation
from sklearn.metrics import accuracy_score, plot_confusion_matrix
from sklearn.model_selection import train_test_split, cross_val_score, cross_validate

from sklearn.naive_bayes import GaussianNB
# Gaussian Naive-Bayes with no calibration
clf = GaussianNB()

pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', clf)
])


from sklearn.model_selection import RepeatedStratifiedKFold
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=10)

scores = cross_val_score(
    pipeline, X, y,
    cv=cv,
    scoring="accuracy",
    n_jobs=-1,
    verbose=1
)
# print(f"Using cross-validation = {scores}")

score_mean = scores.mean()
score_std = scores.std()

print(f"Score mean = {score_mean} +- {score_std/(scores.size)**0.5}")
print(f"Score variance = {score_std}")


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Score mean = 0.7948589263420723 +- 0.003609316993511247
Score variance = 0.03609316993511247


[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:    2.4s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    2.6s finished


In [8]:
score_clf = accuracy_score(y_test, preds_clf)
print(f"Model has score = {score_clf}")

NameError: name 'y_test' is not defined

In [None]:
preds_clf