In [1]:
import pandas as pandas
import numpy as numpy

data = pandas.read_csv("data/data.csv")
data.info()



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1309 entries, 0 to 1308
Data columns (total 14 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   pclass     1309 non-null   int64 
 1   survived   1309 non-null   int64 
 2   name       1309 non-null   object
 3   sex        1309 non-null   object
 4   age        1309 non-null   object
 5   sibsp      1309 non-null   int64 
 6   parch      1309 non-null   int64 
 7   ticket     1309 non-null   object
 8   fare       1309 non-null   object
 9   cabin      1309 non-null   object
 10  embarked   1309 non-null   object
 11  boat       1309 non-null   object
 12  body       1309 non-null   object
 13  home.dest  1309 non-null   object
dtypes: int64(4), object(10)
memory usage: 143.3+ KB


A good starting point for any analysis is to become familiar with the information your dataset contains. If you haven't done so, please read the data point definitions in my paper to better understand what  In this case, we've got a number of 'object'-type attributes like ticket, fare, cabin, etc. The object datatype essentially means that it can be one of many datatypes (numerical, categorical, string of text), so it doesn't tell us too much. The other datatype is int64, which is a whole number (e.g. 1, 2, 3, 500). 

In [2]:
data.head()

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,1,1,"Allen, Miss. Elisabeth Walton",female,29.0,0,0,24160,211.3375,B5,S,2,?,"St Louis, MO"
1,1,1,"Allison, Master. Hudson Trevor",male,0.9167,1,2,113781,151.55,C22 C26,S,11,?,"Montreal, PQ / Chesterville, ON"
2,1,0,"Allison, Miss. Helen Loraine",female,2.0,1,2,113781,151.55,C22 C26,S,?,?,"Montreal, PQ / Chesterville, ON"
3,1,0,"Allison, Mr. Hudson Joshua Creighton",male,30.0,1,2,113781,151.55,C22 C26,S,?,135,"Montreal, PQ / Chesterville, ON"
4,1,0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0,1,2,113781,151.55,C22 C26,S,?,?,"Montreal, PQ / Chesterville, ON"


By looking at the first few observations of our dataset, we can start assessing what needs to be done in order to train our binary classification model. A few issues that can be seen in these first few observation: Missing data is indicated with a '?' character that wouldn't make sense to most algorithms and the body property seems to be missing a lot of values (although this is an assumption).

Let's test this assumption by returning a list of objects that are empty. To do so, we will first have to replace the '?' character (that indicates a missing value in this dataset for some reason) to type NaN, the type that most algorithms associate with a missing value (quite similar to null). We can then use a built-in pandas method that returns a list of all properties and how many null values they have. 


In [3]:
# Replace the '?' character with a numpy
# nan value in order to determine how many
# values are left empty
data.replace('?', numpy.nan, inplace= True)

# Return the amount of null (empty)
# values per column 
print(data.isnull().sum())

pclass          0
survived        0
name            0
sex             0
age           263
sibsp           0
parch           0
ticket          0
fare            1
cabin        1014
embarked        2
boat          823
body         1188
home.dest     564
dtype: int64


As we can see, there's a few values that are missing from this dataset: Some age values, and a massive part of the cabin, boat, body and home.dest values. As a rule of thumb, if more than 60% of a property's data is missing, we drop it. If it is less than that, we can attempt to predict or correct them. 

In this step, we transform categorical data (gender: male, female) to a numerical representation (male = 0, female = 1). We call this codification, and it necessary because our binary classifier doesn't deal with textual data, but rather with numbers. We then predict some missing values for age, embarked and fare. We will not yet drop any columns. 

In [4]:

# We update the age and fare attributes of the dataset from type 
# 'object' to type 'float64', which indicates it is (possibly) a 
# fractional number (e.g. 1.2, 3.45, but also 5 or 20). This step makes
# calculations further along the way easier. 
data = data.astype({"age": numpy.float64, "fare": numpy.float64})

# We fill in missing data points from the properties age, embarked
# and fare. We use a the median of age and fare because they are numerical,
# and use the mode (most common category) for the embarked property. 
data['age'].fillna(data['age'].median(), inplace = True)
data['embarked'].fillna(data['embarked'].mode()[0], inplace = True)
data['fare'].fillna(data['fare'].median(), inplace = True)


# Because gender only consists of either 'male' or 'female', it makes sense
# to replace it with a binary value that makes classification easier. 
data.replace({'male': 1, 'female': 0}, inplace=True)
data.head()


Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,1,1,"Allen, Miss. Elisabeth Walton",0,29.0,0,0,24160,211.3375,B5,S,2.0,,"St Louis, MO"
1,1,1,"Allison, Master. Hudson Trevor",1,0.9167,1,2,113781,151.55,C22 C26,S,11.0,,"Montreal, PQ / Chesterville, ON"
2,1,0,"Allison, Miss. Helen Loraine",0,2.0,1,2,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"
3,1,0,"Allison, Mr. Hudson Joshua Creighton",1,30.0,1,2,113781,151.55,C22 C26,S,,135.0,"Montreal, PQ / Chesterville, ON"
4,1,0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",0,25.0,1,2,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"


As becomes clear from the samples above, we have successfully transformed our data to a numerical representation (save for tables that we will likely drop later on). 

It is now time to determine which of our properties hold a correlation to our classification goal: determine whether someone survived.  

In [5]:
# Return a list of dataset properties and their correlation with the
# property passed into the method (in this case the 'survived' property)
data.corr().abs()[["survived"]]

Unnamed: 0,survived
pclass,0.312469
survived,1.0
sex,0.528693
age,0.043818
sibsp,0.027825
parch,0.08266
fare,0.244414


What each value tells us is the degree of correlation. In this case, the sex of a passenger has a significant correlation towards the 'survived' property. Another important factor is the class in which the passenger was traveling (1st, 2nd, 3d class), and how much they paid for their trip (the fare).

This is extremely valuable information, because it gives us an idea which parts of our dataset will matter for solving our specific problem (survivability). It will also help with the next step of training our classification model: feature engineering. In feature engineering, we use domain knowledge to create new dataset features (a.k.a. properties) in order to boost our model results. 

A good example comes from combining the sibling/spouses feature with the parch property. We create a new feature called 'relatives', that contains the absolute amount of relatives a person has on board.


In [6]:

# Combine the
data['relatives'] = data.apply (lambda row: int((row['sibsp'] + row['parch']) > 0), axis=1)
data.corr().abs()[["survived"]]


Unnamed: 0,survived
pclass,0.312469
survived,1.0
sex,0.528693
age,0.043818
sibsp,0.027825
parch,0.08266
fare,0.244414
relatives,0.201719


After creating our new feature (named 'relatives'), the next step is to assess how useful it is in classifying survivability. In the correlation index above, we can see that our new 'relatives' feature correlates much more strongly with the 'survived' property than the 'sibsp' and 'parch' combined. 

In [7]:

# Assign a sub-set of our data as main data (we remove 'sibsp' and 'parch' ) and drop all
# numpy NaN values so that we are left with a clean dataset
data = data[['sex', 'pclass','age','relatives','fare','survived']].dropna()
data.corr().abs()[["survived"]]

Unnamed: 0,survived
sex,0.528693
pclass,0.312469
age,0.043818
relatives,0.201719
fare,0.244414
survived,1.0


In [8]:
# Dataset is now ready for machine learning, import
# required libraries
from sklearn.model_selection import train_test_split

# Create four variables and assign all data equally
# Two are of training, two are for testing (to validate accuracy)
x_train, x_test, y_train, y_test = train_test_split(data[['sex','pclass','age','relatives','fare']], data.survived, test_size=0.2, random_state=0)




In [9]:
# Normalize all inputs so that all data
# is scaled equally
from sklearn.preprocessing import StandardScaler

sc = StandardScaler()

# Create the two variables with normalised data
X_train = sc.fit_transform(x_train)
X_test = sc.transform(x_test)



In [10]:
# Could use a neural net:  https://code.visualstudio.com/docs/python/data-science-tutorial
# There's various ML algorithms to apply, in order to help
# you choose there's a chart to help you choose here: https://scikit-learn.org/stable/tutorial/machine_learning_map/index.html

from sklearn.model_selection import train_test_split
from sklearn import svm, tree
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn import preprocessing

classifiers = []
model = GaussianNB()
classifiers.append(model)

model2 = svm.SVC()
classifiers.append(model2)

model3 = tree.DecisionTreeClassifier()
classifiers.append(model3)

model4 = RandomForestClassifier()
classifiers.append(model4)



In [11]:
for clf in classifiers:
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    print("Accuracy of %s is %s"%(clf, acc))
    cm = confusion_matrix(y_test, y_pred)
    #print("Confusion Matrix of %s is %s"%(clf, cm))

Accuracy of GaussianNB(priors=None, var_smoothing=1e-09) is 0.7748091603053435
Accuracy of SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False) is 0.7824427480916031
Accuracy of DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best') is 0.7519083969465649
Accuracy of RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
             