In [83]:
#import the titanic.csv file in order to make a decision tree

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier


# Read the passenger records from the CSV file into a DataFrame.
titanic = pd.read_csv('titanic.csv')

# Preview the first five rows to confirm the columns loaded as expected.
print(titanic.head(n=5))

   id pclass    age   sex survived
0   1    1st  adult  male      yes
1   2    1st  adult  male      yes
2   3    1st  adult  male      yes
3   4    1st  adult  male      yes
4   5    1st  adult  male      yes


In [84]:
# Tally how often each value occurs in every attribute, one table per
# attribute, with a blank line between consecutive tables.
print(*(titanic[column].value_counts()
        for column in ('survived', 'pclass', 'age', 'sex')),
      sep='\n\n')

survived
no     1490
yes     711
Name: count, dtype: int64

pclass
crew    885
3rd     706
1st     325
2nd     285
Name: count, dtype: int64

age
adult    2092
child     109
Name: count, dtype: int64

sex
male      1731
female     470
Name: count, dtype: int64


In [90]:
#compute the entropy after splitting the data according to each attribute
#for a decision tree intended to determine whether they survived or not

#compute the weighted average entropy of the target attribute after a split
#input is a list of tuples, one per value of the split attribute, where each tuple contains
#the number of survivors and non-survivors in that subset of the data
def info_func(p_tuples):
    """Return the weighted average entropy (in bits) of a candidate split.

    Parameters
    ----------
    p_tuples : iterable of iterables of numbers
        One inner sequence per branch of the split. Each inner sequence holds
        the class counts observed in that branch, e.g. ``[n_yes, n_no]``.

    Returns
    -------
    float
        ``sum_i (|S_i| / |S|) * H(S_i)`` where ``H`` is the Shannon entropy
        computed with ``log2``. Returns ``0.0`` when there are no branches or
        every count is zero.
    """
    # Materialise the input so it can be walked twice even if it is a generator.
    groups = [list(group) for group in p_tuples]
    sizes = [sum(group) for group in groups]
    total = sum(sizes)

    # Nothing observed at all: there is no uncertainty to measure.
    if total == 0:
        return 0.0

    info = 0.0
    for group, size in zip(groups, sizes):
        # An empty branch carries zero weight; skipping it also avoids 0/0.
        if size == 0:
            continue

        entropy = 0.0
        for count in group:
            # By convention 0 * log2(0) == 0, so only that single term is
            # dropped; the other classes in the branch still contribute.
            if count == 0:
                continue
            p = count / size
            entropy -= p * np.log2(p)

        info += (size / total) * entropy

    return info

In [86]:
# Entropy of `survived` after partitioning every passenger by `pclass`.
counts = titanic[['pclass', 'survived']].value_counts()
print(counts, '\n')

print('Entropy after pclass split:')
# One [survivors, non-survivors] pair per class, built from the joint counts.
print(info_func([[counts[level][outcome] for outcome in ('yes', 'no')]
                 for level in ('crew', '1st', '2nd', '3rd')]), '\n')

pclass  survived
crew    no          673
3rd     no          528
crew    yes         212
1st     yes         203
3rd     yes         178
2nd     no          167
1st     no          122
2nd     yes         118
Name: count, dtype: int64 

Entropy after pclass split:
0.8483634692722222 



In [87]:
# Entropy of `survived` after partitioning every passenger by `age`.

counts = titanic[['age', 'survived']].value_counts()
print(counts, '\n')

print('Entropy after age split:')
# One [survivors, non-survivors] pair per age group.
print(info_func([[counts[level][outcome] for outcome in ('yes', 'no')]
                 for level in ('adult', 'child')]), '\n')

age    survived
adult  no          1438
       yes          654
child  yes           57
       no            52
Name: count, dtype: int64 

Entropy after age split:
0.9012406875470709 



In [88]:
# Entropy of `survived` after partitioning every passenger by `sex`.
counts = titanic[['sex', 'survived']].value_counts()
print(counts, '\n')

print('Entropy after sex split:')
# One [survivors, non-survivors] pair per sex.
print(info_func([[counts[level][outcome] for outcome in ('yes', 'no')]
                 for level in ('male', 'female')]), '\n')


sex     survived
male    no          1364
        yes          367
female  yes          344
        no           126
Name: count, dtype: int64 

Entropy after sex split:
0.7652602113304224 



As seen above, the 'sex' attribute gives the lowest weighted entropy after splitting (0.765, versus 0.848 for 'pclass' and 0.901 for 'age'), so it will be the first split performed:

titanic_male = titanic[titanic['sex'] == 'male']

In [96]:
# Partition the passengers by the root split attribute (`sex`).
titanic_male = titanic.loc[titanic['sex'].eq('male')]
titanic_female = titanic.loc[titanic['sex'].eq('female')]

In [101]:
# Within the male branch, compare the two remaining attributes.
counts = titanic_male[['pclass', 'survived']].value_counts()

print('Entropy on sex==male after pclass split:')
# One [survivors, non-survivors] pair per class among male passengers.
print(info_func([[counts[level][outcome] for outcome in ('yes', 'no')]
                 for level in ('crew', '1st', '2nd', '3rd')]), '\n')

counts = titanic_male[['age', 'survived']].value_counts()

print('Entropy on sex==male after age split:')
# One [survivors, non-survivors] pair per age group among male passengers.
print(info_func([[counts[level][outcome] for outcome in ('yes', 'no')]
                 for level in ('adult', 'child')]), '\n')

Entropy on sex==male after pclass split:
0.7334350137077876 

Entropy on sex==male after age split:
0.7372563536552104 



Therefore, when sex==male the next best attribute to split on is pclass (entropy 0.7334), which narrowly beats the age attribute (0.7373).

In [103]:
# Within the female branch, compare the two remaining attributes.
counts = titanic_female[['pclass', 'survived']].value_counts()

print('Entropy on sex==female after pclass split:')
# One [survivors, non-survivors] pair per class among female passengers.
print(info_func([[counts[level][outcome] for outcome in ('yes', 'no')]
                 for level in ('crew', '1st', '2nd', '3rd')]), '\n')

counts = titanic_female[['age', 'survived']].value_counts()

print('Entropy on sex==female after age split:')
# One [survivors, non-survivors] pair per age group among female passengers.
print(info_func([[counts[level][outcome] for outcome in ('yes', 'no')]
                 for level in ('adult', 'child')]), '\n')

Entropy on sex==female after pclass split:
0.6196328041731174 

Entropy on sex==female after age split:
0.8343071565467435 



Therefore, when sex==female the next best attribute to split on is pclass (entropy 0.6196), which beats the age attribute (0.8343) by a fair margin.

Therefore the tree will look like the following, where words in [] are nodes (the attribute being tested) and unbracketed words are edges (the attribute values):

                            ['sex']
                           /       \
                        female      male
                         /            \
                   ['pclass']       ['pclass']