In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [2]:
# Load the passenger table from a CSV file located next to the notebook.
data = pd.read_csv("./titanic.csv")

In [3]:
# Preview the first rows to check which columns were parsed.
data.head()

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,3.0,0.0,"O'Donoghue, Ms. Bridget",female,,0.0,0.0,364856,7.75,,Q,,,
1,2.0,0.0,"Morley, Mr. Henry Samuel (""Mr Henry Marshall"")",male,39.0,0.0,0.0,250655,26.0,,S,,,
2,2.0,1.0,"Smith, Miss. Marion Elsie",female,40.0,0.0,0.0,31418,13.0,,S,9,,
3,3.0,1.0,"Goldsmith, Mrs. Frank John (Emily Alice Brown)",female,31.0,1.0,1.0,363291,20.525,,S,C D,,"Strood, Kent, England Detroit, MI"
4,3.0,1.0,"McCoy, Miss. Agnes",female,,2.0,0.0,367226,23.25,,Q,16,,


In [4]:
# (rows, columns) of the raw frame.
data.shape

(1009, 14)

In [5]:
# Per-column dtype and non-null count; used to spot columns with missing values.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1009 entries, 0 to 1008
Data columns (total 14 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   pclass     1009 non-null   float64
 1   survived   1009 non-null   float64
 2   name       1009 non-null   object 
 3   sex        1009 non-null   object 
 4   age        812 non-null    float64
 5   sibsp      1009 non-null   float64
 6   parch      1009 non-null   float64
 7   ticket     1009 non-null   object 
 8   fare       1008 non-null   float64
 9   cabin      229 non-null    object 
 10  embarked   1008 non-null   object 
 11  boat       374 non-null    object 
 12  body       98 non-null     float64
 13  home.dest  582 non-null    object 
dtypes: float64(7), object(7)
memory usage: 110.5+ KB


In [6]:
# Columns left out of the model inputs. Per data.info() above, cabin, boat,
# body and home.dest have many missing values; name, ticket and embarked are
# object-typed columns.
columns_to_drop = ['name',  'ticket', 'cabin', 'embarked','boat', 'body', 'home.dest']

In [7]:
# Rebinds `data` to the reduced frame.
# NOTE: running this cell a second time raises KeyError, because the listed
# columns are no longer present in `data`.
data = data.drop(columns=columns_to_drop)

In [8]:
# Confirm which columns remain and which still contain missing values.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1009 entries, 0 to 1008
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   pclass    1009 non-null   float64
 1   survived  1009 non-null   float64
 2   sex       1009 non-null   object 
 3   age       812 non-null    float64
 4   sibsp     1009 non-null   float64
 5   parch     1009 non-null   float64
 6   fare      1008 non-null   float64
dtypes: float64(6), object(1)
memory usage: 55.3+ KB


In [9]:
# Alternative to LabelEncoder: pandas category codes give an integer per
# distinct value. The result is only displayed here, not assigned to `data`.
data['sex'].astype('category').cat.codes

0       0
1       1
2       0
3       0
4       0
       ..
1004    1
1005    0
1006    0
1007    1
1008    1
Length: 1009, dtype: int8

In [10]:
from sklearn.preprocessing import LabelEncoder
# Replace the string-valued `sex` column with integer codes
# (the preview below shows female -> 0, male -> 1).
le = LabelEncoder()
data['sex'] = le.fit_transform(data['sex'])

In [11]:
# Check the encoded `sex` column.
data.head()

Unnamed: 0,pclass,survived,sex,age,sibsp,parch,fare
0,3.0,0.0,0,,0.0,0.0,7.75
1,2.0,0.0,1,39.0,0.0,0.0,26.0
2,2.0,1.0,0,40.0,0.0,0.0,13.0
3,3.0,1.0,0,31.0,1.0,1.0,20.525
4,3.0,1.0,0,,2.0,0.0,23.25


In [12]:
# Impute each column with a statistic of that same column. A bare
# data.fillna(<mean age>) fills every NaN in the frame, so the single missing
# `fare` value would be set to the mean age as well.
data = data.fillna({
    'age': round(data['age'].mean(), 1),
    'fare': data['fare'].median(),  # median: not pulled up by the few very large fares
})

In [13]:
# round(data['age'].mean(), 1)

In [14]:
# Verify that no column has missing values left after imputation.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1009 entries, 0 to 1008
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   pclass    1009 non-null   float64
 1   survived  1009 non-null   float64
 2   sex       1009 non-null   int64  
 3   age       1009 non-null   float64
 4   sibsp     1009 non-null   float64
 5   parch     1009 non-null   float64
 6   fare      1009 non-null   float64
dtypes: float64(6), int64(1)
memory usage: 55.3 KB


In [15]:
# Display the cleaned frame (pandas truncates the middle rows in the output).
data

Unnamed: 0,pclass,survived,sex,age,sibsp,parch,fare
0,3.0,0.0,0,29.8,0.0,0.0,7.7500
1,2.0,0.0,1,39.0,0.0,0.0,26.0000
2,2.0,1.0,0,40.0,0.0,0.0,13.0000
3,3.0,1.0,0,31.0,1.0,1.0,20.5250
4,3.0,1.0,0,29.8,2.0,0.0,23.2500
...,...,...,...,...,...,...,...
1004,1.0,1.0,1,40.0,0.0,0.0,31.0000
1005,3.0,0.0,0,37.0,0.0,0.0,9.5875
1006,1.0,1.0,0,23.0,1.0,0.0,113.2750
1007,3.0,1.0,1,12.0,1.0,0.0,11.2417


In [17]:
# Model inputs and the prediction target.
input_cols = ['pclass', 'sex', 'age', 'sibsp', 'parch', 'fare']
output_cols = ['survived']

# Both selections use a list of labels, so Y is a one-column DataFrame
# (shape (n, 1)) rather than a Series.
X = data[input_cols]
Y = data[output_cols]

print(X.shape, Y.shape)

(1009, 6) (1009, 1)


In [29]:
def entropy(col):
    """Return the Shannon entropy, in bits, of the values in ``col``.

    ``col`` is an array-like exposing ``.shape``; each distinct value
    contributes ``-p * log2(p)`` where ``p`` is its relative frequency.
    An empty ``col`` yields 0.0.
    """
    _, freqs = np.unique(col, return_counts=True)
    total = float(col.shape[0])

    bits = 0.0
    for freq in freqs:
        prob = freq / total
        bits -= prob * np.log2(prob)

    return bits

In [38]:
# Number of rows available for training and testing.
X.shape[0]

1009

In [40]:
# Reminder of the cleaned frame's layout before writing the split helpers.
data.head()

Unnamed: 0,pclass,survived,sex,age,sibsp,parch,fare
0,3.0,0.0,0,29.8,0.0,0.0,7.75
1,2.0,0.0,1,39.0,0.0,0.0,26.0
2,2.0,1.0,0,40.0,0.0,0.0,13.0
3,3.0,1.0,0,31.0,1.0,1.0,20.525
4,3.0,1.0,0,29.8,2.0,0.0,23.25


In [41]:
def divide_data(data,fkey,fval):
    """Split ``data`` in two by thresholding column ``fkey`` at ``fval``.

    Parameters
    ----------
    data : pandas.DataFrame
        Rows to split.
    fkey : column label
        Column whose values are compared with the threshold.
    fval : scalar
        Threshold value.

    Returns
    -------
    (left, right) : tuple of pandas.DataFrame
        ``right`` holds the rows where ``data[fkey] > fval``; ``left`` holds
        every other row (including rows where the value is NaN). Both keep the
        original index labels, column order and dtypes.
    """
    # One vectorised comparison replaces the per-row DataFrame.append loop,
    # which was quadratic and no longer exists in pandas >= 2.0.
    goes_right = data[fkey] > fval
    left = data[~goes_right]
    right = data[goes_right]
    return left, right

In [54]:
# left, right = divide_data(data[:10], 'sex', 0.5)
# print(left)
# print(right)

In [58]:
def information_gain(data, fkey, fval):
    """Entropy reduction of ``data['survived']`` when splitting ``fkey`` at ``fval``.

    The split is produced by ``divide_data``. The result is the parent entropy
    minus the size-weighted entropies of the two parts. If either part is
    empty the sentinel -1000 is returned so that such a split is never
    preferred.
    """
    left, right = divide_data(data, fkey, fval)

    n_rows = data.shape[0]
    n_left, n_right = left.shape[0], right.shape[0]
    w_left = n_left / n_rows
    w_right = n_right / n_rows

    # A one-sided split separates nothing.
    if not (n_left and n_right):
        return -1000 # minimum information gain

    parent_entropy = entropy(data['survived'])
    children_entropy = w_left*entropy(left['survived']) + w_right*entropy(right['survived'])
    return parent_entropy - children_entropy

In [65]:
# Information gain of a split at the column mean, for every input feature.
for feature in X.columns:
    print(feature)
    threshold = data[feature].mean()
    gain = information_gain(data, feature, threshold)
    print(gain)

pclass
0.055456910002982474
sex
0.19274737190850932
age
0.001955929827451075
sibsp
0.006492394392888956
parch
0.01975608012294816
fare
0.04242793401428169


### Decision Tree Class

In [78]:
class DecisionTree:
    """Binary decision-tree node that splits one feature at its mean value.

    A node is either internal (``left`` and ``right`` are child
    ``DecisionTree`` nodes) or a leaf (both children are ``None`` and
    ``target`` holds the label ``'survived'`` or ``'dead'``).
    """

    # constructor
    def __init__(self, depth=0, max_depth = 5):
        """Create an untrained node.

        depth     -- level of this node in the tree (the root is 0).
        max_depth -- nodes whose depth is >= max_depth become leaves.
        """
        self.left = None   # child for rows whose feature value is not above fval
        self.right = None  # child for rows whose feature value is above fval
        self.fkey = None   # name of the feature this node splits on
        self.fval = None   # split threshold (mean of fkey over the node's rows)
        self.max_depth = max_depth
        self.depth = depth
        self.target = None # o/p var: label returned by a leaf

    def _make_leaf(self, data):
        """Label this node 'survived' if the mean of ``survived`` is >= 0.5, else 'dead'."""
        if data['survived'].mean() >= 0.5:
            self.target = 'survived'
        else:
            self.target = 'dead'

    def train(self, data):
        """Recursively grow the tree from ``data``.

        ``data`` is a DataFrame containing the six feature columns listed
        below and a ``survived`` column. At each node the feature whose
        mean-threshold split has the highest information gain is chosen.
        """
        features = ['pclass', 'sex', 'age', 'sibsp', 'parch', 'fare']
        info_gain = [information_gain(data, f, data[f].mean()) for f in features]

        self.fkey = features[np.argmax(info_gain)]
        self.fval = data[self.fkey].mean()

        print("DT is choosing feature - ", self.fkey)

        # split the data
        left, right = divide_data(data, self.fkey, self.fval)

        # ====================================
        #       STOPPING CONDITIONS
        # ====================================

        # CASE 1 - the best split leaves one side empty, so it separates nothing
        # CASE 2 - the maximum depth has been reached
        one_sided = left.shape[0] == 0 or right.shape[0] == 0
        if one_sided or self.depth >= self.max_depth:
            self._make_leaf(data)
            return

        # ====================================
        #       RECURSION CASE
        # ====================================

        self.left = DecisionTree(depth= self.depth+1, max_depth= self.max_depth)
        self.left.train(left)

        self.right = DecisionTree(depth= self.depth+1 , max_depth= self.max_depth)
        self.right.train(right)

    def predict(self, test):
        """Return the label for one sample.

        ``test`` is any mapping-like row (e.g. a pandas Series) indexable by
        feature name. Returns the ``target`` of the leaf the sample ends in.
        """
        # A leaf answers directly; checking this first means a leaf never has
        # to index ``test`` with its own (possibly unset) split feature.
        if self.left is None and self.right is None:
            return self.target

        branch = self.right if test[self.fkey] > self.fval else self.left
        if branch is None:
            # only one child present: fall back to this node's own label
            return self.target
        return branch.predict(test)
        

In [79]:
# Positional 70/30 split (no shuffling). The cut point is derived from the
# frame itself rather than a hard-coded row count, so it stays correct if the
# number of rows changes.
split = int(0.7 * data.shape[0])

train_data = data[:split]
test_data = data[split:]

In [80]:
# Size of the training portion.
train_data.shape

(706, 7)

In [81]:
# Size of the held-out portion.
test_data.shape

(303, 7)

In [82]:
# Root node of the hand-written tree; nodes at depth 3 become leaves.
model = DecisionTree(max_depth=3)

In [83]:
# Fit on the training rows; train() prints the chosen feature for every node it visits.
model.train(train_data)

DT is choosing feature -  sex
DT is choosing feature -  pclass
DT is choosing feature -  parch
DT is choosing feature -  fare
DT is choosing feature -  fare
DT is choosing feature -  sibsp
DT is choosing feature -  fare
DT is choosing feature -  fare
DT is choosing feature -  fare
DT is choosing feature -  parch
DT is choosing feature -  sibsp
DT is choosing feature -  fare
DT is choosing feature -  fare
DT is choosing feature -  age
DT is choosing feature -  pclass


In [91]:
# Split feature and threshold of the root and of its two children.
for node in (model, model.left, model.right):
    print(node.fkey)
    print(node.fval)

sex
0.6543909348441926
pclass
2.1475409836065573
fare
26.18499199134197


In [98]:
# One held-out row, as passed to model.predict().
test_data.iloc[1]

pclass       2.0
survived     1.0
sex          0.0
age         18.0
sibsp        0.0
parch        1.0
fare        23.0
Name: 707, dtype: float64

In [102]:
# Classify every held-out row by walking the trained tree.
predictions = [
    model.predict(test_data.iloc[row])
    for row in range(test_data.shape[0])
]

In [105]:
# Map the tree's string labels onto the 0/1 coding of the `survived` column.
# An explicit mapping is used because LabelEncoder numbers the labels it sees
# in sorted order: if every prediction were 'survived', it would be coded 0.
# This also leaves `le` (the fitted `sex` encoder) untouched.
label_to_code = {'dead': 0, 'survived': 1}
predictions = np.array([label_to_code[label] for label in predictions])

In [106]:
# Integer-coded predictions for the held-out rows.
predictions

array([0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1,
       1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0,
       1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0,
       1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1,
       1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1,
       1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1,

In [111]:
# True labels as a plain integer array, so they compare element-wise with `predictions`.
y_test = test_data['survived'].astype('int').values

In [114]:
# Accuracy: share of held-out rows whose predicted label equals the true label
(predictions == y_test).sum() / y_test.shape[0]

0.759075907590759

## Sklearn Decision Tree

In [153]:
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor

In [164]:
# Fix the seed: scikit-learn permutes the features at every split, so equally
# good splits can be resolved differently from run to run without it.
sk_tree = DecisionTreeClassifier(criterion='entropy', random_state=42)

In [165]:
# Fit on the same training rows as the hand-written tree (no depth limit is set here).
sk_tree.fit(train_data[input_cols], train_data[output_cols])

DecisionTreeClassifier(criterion='entropy')

In [166]:
# Predictions for the held-out rows, cast to int.
sk_tree_prediction = sk_tree.predict(test_data[input_cols]).astype('int')

### Scores

In [167]:
# Accuracy on the rows the tree was fitted on.
sk_tree.score(train_data[input_cols], train_data[output_cols])

0.9773371104815864

In [168]:
# Accuracy on the held-out rows; the gap to the training score above suggests
# the depth-unlimited tree overfits.
sk_tree.score(test_data[input_cols], test_data[output_cols])

0.7458745874587459

In [169]:
# Relative importance of each input feature, in the order of `input_cols`.
sk_tree.feature_importances_

array([0.10189117, 0.23460211, 0.26847122, 0.03818108, 0.0459865 ,
       0.31086792])

In [170]:
# Feature names, in the same order as the importances shown above.
input_cols

['pclass', 'sex', 'age', 'sibsp', 'parch', 'fare']