# Preprocessing Titanic Dataset

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
data=pd.read_csv("train.csv")

In [3]:
data.head(n=10)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,,S
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C


In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [5]:
columns_to_drop=["PassengerId","Name","Ticket","Cabin","Embarked"]
data_clean=data.drop(columns_to_drop,axis=1)

In [6]:
data_clean.head(n=5)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare
0,0,3,male,22.0,1,0,7.25
1,1,1,female,38.0,1,0,71.2833
2,1,3,female,26.0,0,0,7.925
3,1,1,female,35.0,1,0,53.1
4,0,3,male,35.0,0,0,8.05


In [7]:
#Convert Sex column(categorical data) to Numerical data
from sklearn.preprocessing import LabelEncoder
le=LabelEncoder()

In [8]:
data_clean["Sex"]=le.fit_transform(data_clean["Sex"])

In [9]:
data_clean.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare
0,0,3,1,22.0,1,0,7.25
1,1,1,0,38.0,1,0,71.2833
2,1,3,0,26.0,0,0,7.925
3,1,1,0,35.0,1,0,53.1
4,0,3,1,35.0,0,0,8.05


In [10]:
#Fill all the NAN entries in Age column with mean value
data_clean=data_clean.fillna(data_clean["Age"].mean())
data_clean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  891 non-null    int64  
 1   Pclass    891 non-null    int64  
 2   Sex       891 non-null    int32  
 3   Age       891 non-null    float64
 4   SibSp     891 non-null    int64  
 5   Parch     891 non-null    int64  
 6   Fare      891 non-null    float64
dtypes: float64(2), int32(1), int64(4)
memory usage: 45.4 KB


In [11]:
data_clean.loc[2] #data in 2nd row

Survived     1.000
Pclass       3.000
Sex          0.000
Age         26.000
SibSp        0.000
Parch        0.000
Fare         7.925
Name: 2, dtype: float64

In [12]:
input_cols=["Pclass","Sex","Age","SibSp","Parch","Fare"]
output_cols=["Survived"] #Gives binary prediction

In [13]:
X=data_clean[input_cols]
Y=data_clean[output_cols]

In [14]:
print(X.shape,Y.shape)

(891, 6) (891, 1)


In [15]:
#Define Entropy and Information Gain
def entropy(col):
    counts=np.unique(col,return_counts=True)
    
    N=float(col.shape[0]) #Total entries in that columns
    print(counts)
    ent=0.0
    for ix in counts[1]: #Iterate over Frequency
        p=ix/N
        ent+=(-1.0*p*np.log2(p))
    return ent

In [16]:
col=np.array([1,1,1,1,0,1,0])
entropy(col)

(array([0, 1]), array([2, 5], dtype=int64))


0.863120568566631

In [17]:
def divide_data(x_data,fkey,fval):
    x_right=pd.DataFrame([],columns=x_data.columns) #data is empty list
    x_left=pd.DataFrame([],columns=x_data.columns)
    
    for ix in range(x_data.shape[0]):
        val=x_data[fkey].loc[ix]
        
        if val>fval:
            x_right=x_right.append(x_data.loc[ix])
            
        else:
            x_left=x_left.append(x_data.loc[ix])
            
    return x_left,x_right

In [21]:
x_left,x_right=divide_data(data_clean[:10],'Sex',0.5)
print(x_left)
print(x_right)
#We got two tables due to data splitting around fkey(1 for male and 0 for female)

   Survived  Pclass  Sex   Age  SibSp  Parch     Fare
1       1.0     1.0  0.0  38.0    1.0    0.0  71.2833
2       1.0     3.0  0.0  26.0    0.0    0.0   7.9250
3       1.0     1.0  0.0  35.0    1.0    0.0  53.1000
8       1.0     3.0  0.0  27.0    0.0    2.0  11.1333
9       1.0     2.0  0.0  14.0    1.0    0.0  30.0708
   Survived  Pclass  Sex        Age  SibSp  Parch     Fare
0       0.0     3.0  1.0  22.000000    1.0    0.0   7.2500
4       0.0     3.0  1.0  35.000000    0.0    0.0   8.0500
5       0.0     3.0  1.0  29.699118    0.0    0.0   8.4583
6       0.0     1.0  1.0  54.000000    0.0    0.0  51.8625
7       0.0     3.0  1.0   2.000000    3.0    1.0  21.0750


In [22]:
def information_gain(x_data,fkey,fval):
    left,right=divide_data(x_data,fkey,fval)
    
    # % of total samples on left and right are
    l=float(left.shape[0])/x_data.shape[0]
    r=float(right.shape[0])/x_data.shape[0]
    
    #All examples come to one side if reached leaf node or no examples left
    if left.shape[0]==0 or right.shape[0]==0:
        return -100000 #Min Information Gain
    
    i_gain=entropy(x_data.Survived)-(l*entropy(left.Survived)+r*entropy(right.Survived))
    
    return i_gain

In [23]:
#Test Function
for fx in X.columns:
    print(fx)
    print(information_gain(data_clean,fx,data_clean[fx].mean()))

Pclass
(array([0, 1], dtype=int64), array([549, 342], dtype=int64))
(array([0., 1.]), array([177, 223], dtype=int64))
(array([0., 1.]), array([372, 119], dtype=int64))
0.07579362743608165
Sex
(array([0, 1], dtype=int64), array([549, 342], dtype=int64))
(array([0., 1.]), array([ 81, 233], dtype=int64))
(array([0., 1.]), array([468, 109], dtype=int64))
0.2176601066606142
Age
(array([0, 1], dtype=int64), array([549, 342], dtype=int64))
(array([0., 1.]), array([228, 156], dtype=int64))
(array([0., 1.]), array([321, 186], dtype=int64))
0.001158644038169343
SibSp
(array([0, 1], dtype=int64), array([549, 342], dtype=int64))
(array([0., 1.]), array([398, 210], dtype=int64))
(array([0., 1.]), array([151, 132], dtype=int64))
0.009584541813400071
Parch
(array([0, 1], dtype=int64), array([549, 342], dtype=int64))
(array([0., 1.]), array([445, 233], dtype=int64))
(array([0., 1.]), array([104, 109], dtype=int64))
0.015380754493137694
Fare
(array([0, 1], dtype=int64), array([549, 342], dtype=int64))


In [24]:
 class DecisionTree:
    
    #Constructor
    def __init__(self,depth=0,max_depth=5):
        self.left = None
        self.right = None
        self.fkey = None
        self.fval = None
        self.max_depth = max_depth
        self.depth = depth
        self.target = None
        
    def train(self,X_train):
        
        features = ['Pclass','Sex','Age','SibSp', 'Parch', 'Fare']
        info_gains = []
        
        for ix in features:
            i_gain = information_gain(X_train,ix,X_train[ix].mean())
            info_gains.append(i_gain)
            
        self.fkey = features[np.argmax(info_gains)]
        self.fval = X_train[self.fkey].mean()
        print("Making Tree Features is",self.fkey)
        
        #Split Data
        data_left,data_right = divide_data(X_train,self.fkey,self.fval)
        data_left = data_left.reset_index(drop=True)
        data_right = data_right.reset_index(drop=True)
         
        #Truly a left node
        if data_left.shape[0]  == 0 or data_right.shape[0] ==0:
            if X_train.Survived.mean() >= 0.5:
                self.target = "Survive"
            else:
                self.target = "Dead"
            return
        #Stop earyly when depth >=max depth
        if(self.depth>=self.max_depth):
            if X_train.Survived.mean() >= 0.5:
                self.target = "Survive"
            else:
                self.target = "Dead"
            return
        
        #Recursive Case
        self.left = DecisionTree(depth=self.depth+1,max_depth=self.max_depth)
        self.left.train(data_left)
        
        self.right = DecisionTree(depth=self.depth+1,max_depth=self.max_depth)
        self.right.train(data_right)
        
        #You can set the target at every node
        if X_train.Survived.mean() >= 0.5:
            self.target = "Survive"
        else:
            self.target = "Dead"
        return
    
    def predict(self,test):
        if test[self.fkey]>self.fval:
            #go to right
            if self.right is None:
                return self.target
            return self.right.predict(test)
        else:
            if self.left is None:
                return self.target
            return self.left.predict(test)

                 

In [25]:
d=DecisionTree()
d.train(data_clean)

(array([0, 1], dtype=int64), array([549, 342], dtype=int64))
(array([0., 1.]), array([177, 223], dtype=int64))
(array([0., 1.]), array([372, 119], dtype=int64))
(array([0, 1], dtype=int64), array([549, 342], dtype=int64))
(array([0., 1.]), array([ 81, 233], dtype=int64))
(array([0., 1.]), array([468, 109], dtype=int64))
(array([0, 1], dtype=int64), array([549, 342], dtype=int64))
(array([0., 1.]), array([228, 156], dtype=int64))
(array([0., 1.]), array([321, 186], dtype=int64))
(array([0, 1], dtype=int64), array([549, 342], dtype=int64))
(array([0., 1.]), array([398, 210], dtype=int64))
(array([0., 1.]), array([151, 132], dtype=int64))
(array([0, 1], dtype=int64), array([549, 342], dtype=int64))
(array([0., 1.]), array([445, 233], dtype=int64))
(array([0., 1.]), array([104, 109], dtype=int64))
(array([0, 1], dtype=int64), array([549, 342], dtype=int64))
(array([0., 1.]), array([464, 216], dtype=int64))
(array([0., 1.]), array([ 85, 126], dtype=int64))
Making Tree Features is Sex
(array

(array([1.]), array([17], dtype=int64))
(array([1.]), array([9], dtype=int64))
(array([1.]), array([8], dtype=int64))
(array([1.]), array([17], dtype=int64))
(array([1.]), array([7], dtype=int64))
(array([1.]), array([10], dtype=int64))
(array([1.]), array([17], dtype=int64))
(array([1.]), array([8], dtype=int64))
(array([1.]), array([9], dtype=int64))
(array([1.]), array([17], dtype=int64))
(array([1.]), array([12], dtype=int64))
(array([1.]), array([5], dtype=int64))
Making Tree Features is Age
(array([0., 1.]), array([ 1, 13], dtype=int64))
(array([0., 1.]), array([1, 7], dtype=int64))
(array([1.]), array([6], dtype=int64))
(array([0., 1.]), array([ 1, 13], dtype=int64))
(array([1.]), array([5], dtype=int64))
(array([0., 1.]), array([1, 8], dtype=int64))
(array([0., 1.]), array([ 1, 13], dtype=int64))
(array([0., 1.]), array([1, 9], dtype=int64))
(array([1.]), array([4], dtype=int64))
(array([0., 1.]), array([ 1, 13], dtype=int64))
(array([0., 1.]), array([1, 9], dtype=int64))
(arra

(array([0.]), array([7], dtype=int64))
(array([0.]), array([1], dtype=int64))
(array([0.]), array([6], dtype=int64))
(array([0.]), array([7], dtype=int64))
(array([0.]), array([4], dtype=int64))
(array([0.]), array([3], dtype=int64))
(array([0.]), array([7], dtype=int64))
(array([0.]), array([3], dtype=int64))
(array([0.]), array([4], dtype=int64))
(array([0.]), array([7], dtype=int64))
(array([0.]), array([4], dtype=int64))
(array([0.]), array([3], dtype=int64))
Making Tree Features is Age
(array([0., 1.]), array([468, 109], dtype=int64))
(array([0., 1.]), array([168,  62], dtype=int64))
(array([0., 1.]), array([300,  47], dtype=int64))
(array([0., 1.]), array([468, 109], dtype=int64))
(array([0., 1.]), array([308,  68], dtype=int64))
(array([0., 1.]), array([160,  41], dtype=int64))
(array([0., 1.]), array([468, 109], dtype=int64))
(array([0., 1.]), array([361,  73], dtype=int64))
(array([0., 1.]), array([107,  36], dtype=int64))
(array([0., 1.]), array([468, 109], dtype=int64))
(arr

(array([0., 1.]), array([11,  3], dtype=int64))
(array([0., 1.]), array([9, 3], dtype=int64))
(array([0.]), array([2], dtype=int64))
(array([0., 1.]), array([11,  3], dtype=int64))
(array([0., 1.]), array([8, 3], dtype=int64))
(array([0.]), array([3], dtype=int64))
(array([0., 1.]), array([11,  3], dtype=int64))
(array([0., 1.]), array([6, 3], dtype=int64))
(array([0.]), array([5], dtype=int64))
Making Tree Features is Age
(array([0., 1.]), array([4, 3], dtype=int64))
(array([0.]), array([1], dtype=int64))
(array([0., 1.]), array([3, 3], dtype=int64))
(array([0., 1.]), array([4, 3], dtype=int64))
(array([0., 1.]), array([2, 1], dtype=int64))
(array([0., 1.]), array([2, 2], dtype=int64))
(array([0., 1.]), array([4, 3], dtype=int64))
(array([0., 1.]), array([2, 3], dtype=int64))
(array([0.]), array([2], dtype=int64))
(array([0., 1.]), array([4, 3], dtype=int64))
(array([0., 1.]), array([2, 3], dtype=int64))
(array([0.]), array([2], dtype=int64))
(array([0., 1.]), array([4, 3], dtype=int6

(array([0., 1.]), array([28,  7], dtype=int64))
(array([0., 1.]), array([19,  1], dtype=int64))
(array([0., 1.]), array([9, 6], dtype=int64))
(array([0., 1.]), array([28,  7], dtype=int64))
(array([0., 1.]), array([19,  4], dtype=int64))
(array([0., 1.]), array([9, 3], dtype=int64))
Making Tree Features is Parch
(array([0., 1.]), array([5, 5], dtype=int64))
(array([0., 1.]), array([3, 3], dtype=int64))
(array([0., 1.]), array([2, 2], dtype=int64))
(array([0., 1.]), array([5, 5], dtype=int64))
(array([0., 1.]), array([2, 5], dtype=int64))
(array([0.]), array([3], dtype=int64))
(array([0., 1.]), array([5, 5], dtype=int64))
(array([0., 1.]), array([2, 5], dtype=int64))
(array([0.]), array([3], dtype=int64))
(array([0., 1.]), array([5, 5], dtype=int64))
(array([0.]), array([3], dtype=int64))
(array([0., 1.]), array([2, 5], dtype=int64))
Making Tree Features is SibSp
(array([0., 1.]), array([21,  1], dtype=int64))
(array([0., 1.]), array([14,  1], dtype=int64))
(array([0.]), array([7], dtyp

# Train-Validation-Test Set Split

In [26]:
split=int(0.7*data_clean.shape[0])
train_data=data_clean[:split]
test_data=data_clean[split:]


In [27]:
test_data=test_data.reset_index(drop=True)

In [28]:
print(train_data.shape,test_data.shape)

(623, 7) (268, 7)


In [29]:
dt = DecisionTree()
dt.train(train_data)

(array([0, 1], dtype=int64), array([377, 246], dtype=int64))
(array([0., 1.]), array([123, 157], dtype=int64))
(array([0., 1.]), array([254,  89], dtype=int64))
(array([0, 1], dtype=int64), array([377, 246], dtype=int64))
(array([0., 1.]), array([ 58, 173], dtype=int64))
(array([0., 1.]), array([319,  73], dtype=int64))
(array([0, 1], dtype=int64), array([377, 246], dtype=int64))
(array([0., 1.]), array([245, 151], dtype=int64))
(array([0., 1.]), array([132,  95], dtype=int64))
(array([0, 1], dtype=int64), array([377, 246], dtype=int64))
(array([0., 1.]), array([267, 148], dtype=int64))
(array([0., 1.]), array([110,  98], dtype=int64))
(array([0, 1], dtype=int64), array([377, 246], dtype=int64))
(array([0., 1.]), array([304, 170], dtype=int64))
(array([0., 1.]), array([73, 76], dtype=int64))
(array([0, 1], dtype=int64), array([377, 246], dtype=int64))
(array([0., 1.]), array([318, 161], dtype=int64))
(array([0., 1.]), array([59, 85], dtype=int64))
Making Tree Features is Sex
(array([0.

(array([0., 1.]), array([ 2, 20], dtype=int64))
(array([0., 1.]), array([ 2, 14], dtype=int64))
(array([1.]), array([6], dtype=int64))
(array([0., 1.]), array([ 2, 20], dtype=int64))
(array([0., 1.]), array([ 1, 13], dtype=int64))
(array([0., 1.]), array([1, 7], dtype=int64))
Making Tree Features is Parch
(array([1.]), array([11], dtype=int64))
(array([1.]), array([8], dtype=int64))
(array([1.]), array([3], dtype=int64))
(array([1.]), array([11], dtype=int64))
(array([1.]), array([9], dtype=int64))
(array([1.]), array([2], dtype=int64))
(array([1.]), array([11], dtype=int64))
(array([1.]), array([8], dtype=int64))
(array([1.]), array([3], dtype=int64))
Making Tree Features is Age
(array([1.]), array([23], dtype=int64))
(array([1.]), array([18], dtype=int64))
(array([1.]), array([5], dtype=int64))
(array([1.]), array([23], dtype=int64))
(array([1.]), array([14], dtype=int64))
(array([1.]), array([9], dtype=int64))
(array([1.]), array([23], dtype=int64))
(array([1.]), array([22], dtype=i

(array([0., 1.]), array([9, 2], dtype=int64))
(array([0., 1.]), array([4, 1], dtype=int64))
(array([0., 1.]), array([5, 1], dtype=int64))
(array([0., 1.]), array([9, 2], dtype=int64))
(array([0., 1.]), array([3, 1], dtype=int64))
(array([0., 1.]), array([6, 1], dtype=int64))
(array([0., 1.]), array([9, 2], dtype=int64))
(array([0., 1.]), array([7, 1], dtype=int64))
(array([0., 1.]), array([2, 1], dtype=int64))
(array([0., 1.]), array([9, 2], dtype=int64))
(array([0.]), array([5], dtype=int64))
(array([0., 1.]), array([4, 2], dtype=int64))
Making Tree Features is Fare
(array([0.]), array([4], dtype=int64))
(array([0.]), array([2], dtype=int64))
(array([0.]), array([2], dtype=int64))
(array([0.]), array([4], dtype=int64))
(array([0.]), array([2], dtype=int64))
(array([0.]), array([2], dtype=int64))
(array([0.]), array([4], dtype=int64))
(array([0.]), array([2], dtype=int64))
(array([0.]), array([2], dtype=int64))
Making Tree Features is Age
(array([0., 1.]), array([319,  73], dtype=int64

(array([0., 1.]), array([2, 5], dtype=int64))
(array([0., 1.]), array([1, 4], dtype=int64))
(array([0., 1.]), array([1, 1], dtype=int64))
(array([0., 1.]), array([2, 5], dtype=int64))
(array([0., 1.]), array([1, 5], dtype=int64))
(array([0.]), array([1], dtype=int64))
(array([0., 1.]), array([2, 5], dtype=int64))
(array([0., 1.]), array([2, 4], dtype=int64))
(array([1.]), array([1], dtype=int64))
(array([0., 1.]), array([2, 5], dtype=int64))
(array([1.]), array([3], dtype=int64))
(array([0., 1.]), array([2, 2], dtype=int64))
Making Tree Features is SibSp
(array([0., 1.]), array([8, 1], dtype=int64))
(array([0., 1.]), array([4, 1], dtype=int64))
(array([0.]), array([4], dtype=int64))
(array([0., 1.]), array([8, 1], dtype=int64))
(array([0.]), array([3], dtype=int64))
(array([0., 1.]), array([5, 1], dtype=int64))
(array([0., 1.]), array([8, 1], dtype=int64))
(array([0., 1.]), array([6, 1], dtype=int64))
(array([0.]), array([2], dtype=int64))
(array([0., 1.]), array([8, 1], dtype=int64))


(array([0., 1.]), array([7, 6], dtype=int64))
(array([0., 1.]), array([5, 5], dtype=int64))
(array([0., 1.]), array([2, 1], dtype=int64))
(array([0., 1.]), array([7, 6], dtype=int64))
(array([1.]), array([5], dtype=int64))
(array([0., 1.]), array([7, 1], dtype=int64))
(array([0., 1.]), array([7, 6], dtype=int64))
(array([0., 1.]), array([4, 2], dtype=int64))
(array([0., 1.]), array([3, 4], dtype=int64))
(array([0., 1.]), array([7, 6], dtype=int64))
(array([0., 1.]), array([5, 1], dtype=int64))
(array([0., 1.]), array([2, 5], dtype=int64))
(array([0., 1.]), array([7, 6], dtype=int64))
(array([0., 1.]), array([3, 5], dtype=int64))
(array([0., 1.]), array([4, 1], dtype=int64))
Making Tree Features is Age
(array([0., 1.]), array([15,  2], dtype=int64))
(array([0., 1.]), array([13,  1], dtype=int64))
(array([0., 1.]), array([2, 1], dtype=int64))
(array([0., 1.]), array([15,  2], dtype=int64))
(array([0., 1.]), array([8, 2], dtype=int64))
(array([0.]), array([7], dtype=int64))
(array([0., 1.

In [30]:
print(dt.fkey)
print(dt.fval)
print(dt.left.fkey)
print(dt.right.fkey)

Sex
0.6292134831460674
Pclass
Fare


In [31]:
y_pred = []
for ix in range(test_data.shape[0]):
    y_pred.append(dt.predict(test_data.loc[ix]))

In [32]:
y_pred

['Dead',
 'Dead',
 'Dead',
 'Dead',
 'Survive',
 'Dead',
 'Dead',
 'Dead',
 'Dead',
 'Dead',
 'Dead',
 'Dead',
 'Survive',
 'Dead',
 'Dead',
 'Dead',
 'Dead',
 'Dead',
 'Survive',
 'Dead',
 'Dead',
 'Survive',
 'Dead',
 'Dead',
 'Dead',
 'Dead',
 'Survive',
 'Dead',
 'Survive',
 'Dead',
 'Survive',
 'Survive',
 'Dead',
 'Dead',
 'Survive',
 'Dead',
 'Dead',
 'Dead',
 'Dead',
 'Dead',
 'Dead',
 'Dead',
 'Dead',
 'Dead',
 'Dead',
 'Dead',
 'Survive',
 'Survive',
 'Survive',
 'Dead',
 'Dead',
 'Dead',
 'Dead',
 'Dead',
 'Dead',
 'Dead',
 'Dead',
 'Survive',
 'Dead',
 'Dead',
 'Dead',
 'Dead',
 'Dead',
 'Dead',
 'Dead',
 'Dead',
 'Survive',
 'Survive',
 'Survive',
 'Dead',
 'Dead',
 'Dead',
 'Dead',
 'Dead',
 'Survive',
 'Dead',
 'Dead',
 'Survive',
 'Dead',
 'Survive',
 'Dead',
 'Dead',
 'Dead',
 'Survive',
 'Dead',
 'Survive',
 'Dead',
 'Survive',
 'Dead',
 'Dead',
 'Dead',
 'Dead',
 'Dead',
 'Survive',
 'Survive',
 'Dead',
 'Dead',
 'Survive',
 'Dead',
 'Dead',
 'Dead',
 'Survive',
 'De

In [33]:
y_actual=test_data[output_cols]
print(y_actual)

     Survived
0           0
1           0
2           0
3           0
4           1
..        ...
263         0
264         1
265         0
266         1
267         0

[268 rows x 1 columns]


In [34]:
le=LabelEncoder()
y_pred=le.fit_transform(y_pred)

In [35]:
print(y_pred)

[0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 1 0 0 0 0 1 0 1 0 1 1 0 0 1 0 0
 0 0 0 0 0 0 0 0 0 1 1 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 1 1 0 0 0 0 0
 1 0 0 1 0 1 0 0 0 1 0 1 0 1 0 0 0 0 0 1 1 0 0 1 0 0 0 1 0 1 1 0 0 1 0 0 0
 0 0 0 0 0 0 0 1 1 0 0 0 1 1 1 0 1 0 0 0 1 0 0 0 0 1 0 0 0 1 0 1 0 1 0 0 0
 0 1 0 1 0 0 0 0 1 1 1 0 0 0 0 1 0 1 0 0 0 0 0 0 0 1 0 0 0 0 1 1 0 0 0 0 1
 0 1 0 0 0 0 0 0 1 0 0 0 1 0 0 1 0 0 0 0 0 1 0 1 0 0 0 1 0 0 0 0 0 0 1 0 0
 0 0 0 0 1 0 0 1 1 1 1 1 0 1 0 0 0 1 0 0 1 1 0 0 0 0 1 0 0 1 1 0 0 0 1 1 0
 0 0 0 0 0 1 0 0 0]


In [36]:
y_pred=np.array(y_pred).reshape((-1,1))
print(y_pred.shape)
print(y_actual.shape)

(268, 1)
(268, 1)


In [37]:
acc=np.sum(y_pred==y_actual)/y_pred.shape[0]

In [38]:
acc=np.sum(np.array(y_pred)==np.array(y_actual))/y_pred.shape[0]

In [39]:
print(acc)

0.8171641791044776


# Decision Tree using Sklearn

In [40]:
from sklearn.tree import DecisionTreeClassifier
sk_tree=DecisionTreeClassifier(criterion='entropy',max_depth=5)

In [41]:
sk_tree.fit(train_data[input_cols],train_data[output_cols])
sk_tree.predict(test_data[input_cols])

array([0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1,
       0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1,
       1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0,
       1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0,
       1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1,
       0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0,
       1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0,
       1, 1, 0, 0], dtype=int64)

In [42]:
sk_tree.score(test_data[input_cols],test_data[output_cols])

0.8283582089552238

# Visualising Decision Tree

In [47]:
import pydotplus

In [48]:
from sklearn.externals.six import StringIO
from IPython.display import Image
from sklearn.tree import export_graphviz

In [49]:
dot_data = StringIO()
export_graphviz(sk_tree,out_file=dot_data,filled=True,rounded=True)

In [50]:
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
Image(graph.create_png())

InvocationException: GraphViz's executables not found

InvocationException: GraphViz's executables not found