# Data Preprocessing

In [2]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.preprocessing import LabelEncoder

In [3]:
# Load the Titanic training set; the path is relative to the notebook's working directory
data = pd.read_csv('datasets/titanic/train.csv')

In [4]:
data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [5]:
# Remove the columns that this notebook does not use as model features
cols_to_drop = ['PassengerId', 'Name', 'Ticket', 'Cabin', 'Embarked']
clean_data = data.drop(columns=cols_to_drop)

In [6]:
clean_data.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare
0,0,3,male,22.0,1,0,7.25
1,1,1,female,38.0,1,0,71.2833
2,1,3,female,26.0,0,0,7.925
3,1,1,female,35.0,1,0,53.1
4,0,3,male,35.0,0,0,8.05


In [7]:
# The model works on numeric values, so the categorical Sex column is label-encoded.
# LabelEncoder numbers the classes in sorted order, which gives female -> 0 and male -> 1
# (compare the head() outputs before and after this cell).
le = LabelEncoder()
clean_data['Sex'] = le.fit_transform(clean_data['Sex'])

In [8]:
clean_data.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare
0,0,3,1,22.0,1,0,7.25
1,1,1,0,38.0,1,0,71.2833
2,1,3,0,26.0,0,0,7.925
3,1,1,0,35.0,1,0,53.1
4,0,3,1,35.0,0,0,8.05


In [9]:
clean_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  891 non-null    int64  
 1   Pclass    891 non-null    int64  
 2   Sex       891 non-null    int32  
 3   Age       714 non-null    float64
 4   SibSp     891 non-null    int64  
 5   Parch     891 non-null    int64  
 6   Fare      891 non-null    float64
dtypes: float64(2), int32(1), int64(4)
memory usage: 45.4 KB


In [10]:
# info() above shows that Age is the only column with missing values (714 of 891 present).
# Impute the column mean into Age only: a DataFrame-wide fillna(scalar) would also write
# the Age mean into missing cells of any other column.
clean_data = clean_data.fillna({'Age': clean_data['Age'].mean()})

In [11]:
clean_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  891 non-null    int64  
 1   Pclass    891 non-null    int64  
 2   Sex       891 non-null    int32  
 3   Age       891 non-null    float64
 4   SibSp     891 non-null    int64  
 5   Parch     891 non-null    int64  
 6   Fare      891 non-null    float64
dtypes: float64(2), int32(1), int64(4)
memory usage: 45.4 KB


In [12]:
# Feature columns fed to the model and the label column to predict
input_cols = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare']
output_cols = ['Survived']

# Both selections use a list of column names, so X and y are DataFrames (y has a single column)
X = clean_data[input_cols]
y = clean_data[output_cols]

# Decision Tree Custom Implementation

In [13]:
# Shannon entropy of the value distribution inside a single column
def entropy(col):
    """Return the entropy, in bits, of the values found in ``col``.

    ``col`` must expose ``shape`` (for example a NumPy array or a pandas
    Series). The result is ``-sum(p * log2(p))`` where ``p`` is the relative
    frequency of each distinct value in the column.
    """
    # Only the occurrence count of each distinct value matters, not the values themselves
    _, occurrences = np.unique(col, return_counts=True)
    total = float(col.shape[0])

    # Relative frequency of every distinct value
    probabilities = [n / total for n in occurrences]

    # Summed left to right from 0.0, then negated to obtain a non-negative entropy
    return -sum((p * np.log2(p) for p in probabilities), 0.0)

# Splits the rows of a frame around a threshold on one feature
def divide_data(x_data, fkey, fval):
    """Partition ``x_data`` by comparing column ``fkey`` against ``fval``.

    Parameters
    ----------
    x_data : pandas.DataFrame
        Rows to split.
    fkey : str
        Name of the column the comparison is made on.
    fval : scalar
        Threshold value.

    Returns
    -------
    (x_left, x_right) : tuple of pandas.DataFrame
        ``x_right`` holds the rows where ``x_data[fkey] > fval``; ``x_left``
        holds every other row (including rows whose value is NaN, because the
        comparison is False for them). Both keep the original columns, dtypes,
        index labels and row order.
    """
    # One vectorised comparison replaces the row-by-row DataFrame.append loop:
    # append was removed in pandas 2.0, and the loop mixed positional (.iloc)
    # with label-based (.loc) access, so it only worked on a 0..n-1 index.
    goes_right = x_data[fkey] > fval

    return x_data[~goes_right], x_data[goes_right]

# Returns the information gain obtained by splitting the data on one feature at a threshold
def information_gain(x_data, fkey, fval, target='Survived'):
    """Information gain of splitting ``x_data`` on ``fkey`` at ``fval``.

    Parameters
    ----------
    x_data : pandas.DataFrame
        Rows to evaluate; must contain ``fkey`` and ``target``.
    fkey : str
        Feature column the split is made on.
    fval : scalar
        Threshold passed to ``divide_data``.
    target : str, optional
        Name of the label column whose entropy is measured. Defaults to
        ``'Survived'``, the column this function originally hard-coded.

    Returns
    -------
    float
        Entropy of the label before the split minus the size-weighted entropy
        of the two parts. ``0.0`` when ``x_data`` has no rows.
    """
    total = x_data.shape[0]
    # An empty frame cannot be split; return 0.0 instead of dividing by zero
    if total == 0:
        return 0.0

    left, right = divide_data(x_data, fkey, fval)

    # Fraction of the rows that ended up in each part
    l = float(left.shape[0]) / total
    r = float(right.shape[0]) / total

    # Label entropy before the split
    hs = entropy(x_data[target])

    igain = hs - (l * entropy(left[target]) + r * entropy(right[target]))
    return igain

In [14]:
# Report the gain of a mean-threshold split for every input feature: name first, gain on the next line
for f in X.columns:
    print(f, information_gain(clean_data, f, clean_data[f].mean()), sep='\n')

Pclass
0.07579362743608165
Sex
0.2176601066606142
Age
0.001158644038169343
SibSp
0.009584541813400071
Parch
0.015380754493137694
Fare
0.042140692838995464


In [40]:
class DecisionTree:
    """Binary decision tree node for the Titanic ``Survived`` label.

    Each node splits on the feature with the highest information gain, using
    the mean of that feature in the node's data as the threshold. Rows whose
    value is greater than the threshold go to the right child, all other rows
    go to the left child.

    Attributes set by ``fit``:
        left, right: child ``DecisionTree`` nodes, or ``None`` on a leaf.
        fkey, fval: feature name and threshold chosen at this node.
        target: majority label of the node's data, ``"Survived"`` or ``"Dead"``.
    """

    def __init__(self, depth=0, max_depth=5):
        """Create an unfitted node.

        depth: level of this node in the tree (the root is 0).
        max_depth: level at which nodes stop splitting and become leaves.
        """
        self.left = None
        self.right = None
        self.fkey = None
        self.fval = None
        self.max_depth = max_depth
        self.depth = depth
        self.target = None

    def fit(self, X_train):
        """Grow the subtree rooted at this node from ``X_train``.

        ``X_train`` is a DataFrame holding the six feature columns listed
        below plus the ``Survived`` label column. Nothing is returned; the
        node's attributes are updated in place.
        """
        features = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare']

        # A refit must not keep children that were grown from earlier data
        self.left = None
        self.right = None

        # Information gain of a mean-threshold split for every feature
        info_gains = [
            information_gain(X_train, ix, X_train[ix].mean())
            for ix in features
        ]

        # The best feature becomes this node's test; its threshold is the column mean
        self.fkey = features[int(np.argmax(info_gains))]
        self.fval = X_train[self.fkey].mean()

        # Majority label, stored on every node (not only on leaves).
        # A survival rate of exactly 0.5 counts as "Survived".
        if X_train.Survived.mean() >= 0.5:
            self.target = "Survived"
        else:
            self.target = "Dead"

        # Base case: depth limit reached, so this node stays a leaf
        if self.depth >= self.max_depth:
            return

        data_left, data_right = divide_data(X_train, self.fkey, self.fval)

        # Base case: the split puts every row on one side, nothing left to separate
        if data_left.shape[0] == 0 or data_right.shape[0] == 0:
            return

        # Each child sees its rows re-numbered from 0
        data_left = data_left.reset_index(drop=True)
        data_right = data_right.reset_index(drop=True)

        # Recursive case: children sit one level deeper and inherit max_depth.
        # Without passing it on, every child would fall back to the default limit.
        self.left = DecisionTree(depth=self.depth + 1, max_depth=self.max_depth)
        self.left.fit(data_left)

        self.right = DecisionTree(depth=self.depth + 1, max_depth=self.max_depth)
        self.right.fit(data_right)
        return

    def predict(self, test):
        """Return ``"Survived"`` or ``"Dead"`` for one row.

        ``test`` is indexed by feature name (for example a DataFrame row).
        The row is routed down the tree with the same ``> fval`` rule used in
        ``fit``; when the chosen branch has no child, this node's majority
        label is returned.
        """
        if test[self.fkey] > self.fval:
            branch = self.right
        else:
            branch = self.left

        if branch is None:
            return self.target
        return branch.predict(test)
        

In [41]:
dt = DecisionTree()

In [42]:
# Manual 70/30 hold-out split in row order (no shuffling): the first 70% of rows train the tree, the rest evaluate it.
# NOTE(review): sklearn's train_test_split also accepts DataFrames, so it could be used here to get a shuffled split.
split = int(0.7*clean_data.shape[0])
train_data = clean_data[:split]
test_data = clean_data[split:]
# Re-number the test rows from 0 so the prediction loop below can address them with .loc[i]
test_data = test_data.reset_index(drop=True)

In [43]:
dt.fit(train_data)

In [44]:
# Predict a label for every held-out row, in row order
y_pred = [dt.predict(test_data.loc[i]) for i in range(test_data.shape[0])]

In [45]:
y_pred[:10]

['Dead',
 'Dead',
 'Dead',
 'Dead',
 'Survived',
 'Dead',
 'Dead',
 'Dead',
 'Dead',
 'Dead']

In [46]:
# Ground-truth labels of the held-out rows, kept as a one-column DataFrame of 0/1 values.
# NOTE(review): y_pred holds the strings "Survived"/"Dead", so the two need a common encoding before they can be compared.
y_actual = test_data[output_cols]

In [47]:
data[split:][:10]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
623,624,0,3,"Hansen, Mr. Henry Damsgaard",male,21.0,0,0,350029,7.8542,,S
624,625,0,3,"Bowen, Mr. David John ""Dai""",male,21.0,0,0,54636,16.1,,S
625,626,0,1,"Sutton, Mr. Frederick",male,61.0,0,0,36963,32.3208,D50,S
626,627,0,2,"Kirkland, Rev. Charles Leonard",male,57.0,0,0,219533,12.35,,Q
627,628,1,1,"Longley, Miss. Gretchen Fiske",female,21.0,0,0,13502,77.9583,D9,S
628,629,0,3,"Bostandyeff, Mr. Guentcho",male,26.0,0,0,349224,7.8958,,S
629,630,0,3,"O'Connell, Mr. Patrick D",male,,0,0,334912,7.7333,,Q
630,631,1,1,"Barkworth, Mr. Algernon Henry Wilson",male,80.0,0,0,27042,30.0,A23,S
631,632,0,3,"Lundahl, Mr. Johan Svensson",male,51.0,0,0,347743,7.0542,,S
632,633,1,1,"Stahelin-Maeglin, Dr. Max",male,32.0,0,0,13214,30.5,B50,C


In [48]:
y_actual[:10]

Unnamed: 0,Survived
0,0
1,0
2,0
3,0
4,1
5,0
6,0
7,1
8,0
9,1
