In [78]:
#@title Importing Packages
import random
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import warnings
warnings.simplefilter("ignore")

In [101]:
#@title Functions
def Sex_Encoder(data):
  data[data == 'male'] = 1 
  data[data == 'female'] = 0
  return data 

def Embarked_Encoder(data):
  data[data == 'C'] = 2
  data[data == 'Q'] = 1 
  data[data == 'S'] = 0 
  return data

def prepro(data):
  # Filling empty numerical values with median
  data['Age'] = data['Age'].fillna(data['Age'].median())
  data['Fare'] = data['Fare'].fillna(data['Fare'].median())
  # Filling empty embarked with S
  data['Embarked'] = data['Embarked'].fillna('S')
  # Replacing categorical sex with integer values (0 for Female and 1 for Male)
  data['Sex'] = Sex_Encoder(data['Sex'])
  # Replacing categorical embarked with integer values (0 for S, 1 for Q, and 2 for C)
  data['Embarked'] = Embarked_Encoder(data['Embarked'])

def hb(p):
  if ((p == 0) | (p == 1)): return 0
  else: return (-p * np.log2(p) -(1-p)*np.log2(1-p)) 

def Entropy(x, y):
  p1 = sum(np.array(x == 0)) / len(x)  
  H1 = hb(sum(np.array((x == 0)) & (y == 0)) / len(x))
  p2 = 1 - p1
  H2 = hb(sum(np.array((x == 1)) & (y == 0)) / len(x))
  H = p1 * H1 + p2 * H2
  return H

def split_line_finder(X, Label):
  _ , feature_num = X.shape
  Features = X.columns
  H_lines = []; Features_lines = []
  for j in range(len(Features)):
    unique_values = np.sort(np.unique(X[Features[j]]))
    H = []; lines = []
    if len(unique_values) != 1:
      for i in range(len(unique_values)-1):
        lines.append((unique_values[i] + unique_values[i+1]) / 2)
        x = np.zeros(len(X[Features[j]]))
        x[X[Features[j]] <= lines[i]] = 0
        x[X[Features[j]] > lines[i]] = 1
        H.append(Entropy(x, Label))
    else:
      lines.append(unique_values[0])
      x = np.zeros(len(X[Features[j]]))
      x[X[Features[j]] <= lines[0]] = 0
      x[X[Features[j]] > lines[0]] = 1
      H.append(Entropy(x, Label))
    ind = np.argsort(H)
    H_lines.append(H[ind[0]])
    Features_lines.append(lines[ind[0]])
  ind = np.argsort(H_lines)
  best_line = Features_lines[ind[0]]
  feature = Features[ind[0]]  
  return best_line, feature

def splitter(X, Label, split_line, feature):
  X_branch1 = X[X[feature] <= split_line].drop(feature, axis=1)
  Y_branch1 = Label[X[feature] <= split_line]
  X_branch2 = X[X[feature] > split_line].drop(feature, axis=1)
  Y_branch2 = Label[X[feature] > split_line]
  level = 7 - len(X_branch1.columns)
  return X_branch1, X_branch2, Y_branch1, Y_branch2, level

def decision_tree(X, Label, depth, max_depth):
  if (depth == max_depth) or (sum(Label) == len(Label)) or (sum(Label) == 0) or (len(X.columns) == 0): 
    # Leaf node
    leaf_label = int(sum(Label) >= len(Label)/2)
    node = Node(label = leaf_label)
  else: 
    split_line, feature = split_line_finder(X, Label)
    X_branch1, X_branch2, Y_branch1, Y_branch2, level = splitter(X, Label, split_line, feature)
    branch1_node = decision_tree(X_branch1, Y_branch1, level, max_depth)
    branch2_node = decision_tree(X_branch2, Y_branch2, level, max_depth)
    node = Node(feature, split_line, branch1_node, branch2_node)
  return node

def classifier(tree, x):
  if tree.label != None:
    Pred_Label = tree.label
  else:
    if x[tree.feature] <= tree.thr:
      Pred_Label = classifier(tree.branch1_subtree, x)
    else:
      Pred_Label = classifier(tree.branch2_subtree, x)
  return Pred_Label

def Predict(X, tree):
  Pred_Labels = []
  n , _= X.shape
  for i in range(n):
    Pred_Labels.append(classifier(tree, X.iloc[i]))
  return np.array(Pred_Labels)

def Decision_Tree_metrics(True_Labels, Pred_Labels):
  acc = sum(True_Labels == Pred_Labels) / len(True_Labels) * 100
  confusion = np.matrix([[sum(np.array((True_Labels == 0) & (Pred_Labels == 0))), sum(np.array((True_Labels == 1) & (Pred_Labels == 0)))],
                         [sum(np.array((True_Labels == 0) & (Pred_Labels == 1))), sum(np.array((True_Labels == 1) & (Pred_Labels == 1)))]])
  return acc, confusion

def random_forest(X, Label, n_trees, N):
  trees = []
  for i in range(n_trees):
    rnd = random.sample(range(N),100)
    X_rnd = X.iloc[rnd]
    Y_rnd = Label.iloc[rnd]
    trees.append(decision_tree(X_rnd, Y_rnd, depth = 0, max_depth = 7))
  return trees

def random_forest_classifier(trees, x):
  Pred_Labels = []
  for i in range(len(trees)):
    Pred_Labels.append(classifier(trees[i], x))
  Pred_Label = sum(Pred_Labels) >= len(Pred_Labels)/2
  return Pred_Label

def random_forest_Predict(X, trees):
  Pred_Labels = []
  n , _= X.shape
  for i in range(n):
    Pred_Labels.append(random_forest_classifier(trees, X.iloc[i]))
  return np.array(Pred_Labels)

In [6]:
#@title Adding Dataset
Data = pd.read_csv("titanic-train.csv")
prepro(Data)
X_train, X_test, Y_train, Y_test = train_test_split(Data[Data.columns[Data.columns != 'Survived']], Data['Survived'], test_size = 0.2)

In [7]:
#@title Node Class
class Node():
  def __init__(self = None, feature = None, thr = None, branch1_subtree = None, branch2_subtree = None, label = None):
    self.feature = feature
    self.thr = thr
    self.branch1_subtree = branch1_subtree
    self.branch2_subtree = branch2_subtree
    # for the leaf nodes
    self.label = label

In [68]:
#@title Maximum Depth = 3
tree = decision_tree(X_train, Y_train, depth = 0, max_depth = 3)
Pred_Labels = Predict(X_test, tree)
accuracy, confusion = Decision_Tree_metrics(Y_test.values, Pred_Labels)
accuracy, confusion
print('Maximum Depth : 3\n accuracy =' , accuracy,'%\n confusion matrix:\n' ,confusion, '\n')


Maximum Depth : 3
 accuracy = 79.3296089385475 %
 confusion matrix:
 [[109  35]
 [  2  33]] 



In [72]:
#@title Maximum Depth = 4
tree = decision_tree(X_train, Y_train, depth = 0, max_depth = 4)
Pred_Labels = Predict(X_test, tree)
accuracy, confusion = Decision_Tree_metrics(Y_test.values, Pred_Labels)
accuracy, confusion
print('Maximum Depth : 4\n accuracy =' , accuracy,'%\n confusion matrix:\n' ,confusion, '\n')


Maximum Depth : 4
 accuracy = 81.00558659217877 %
 confusion matrix:
 [[109  32]
 [  2  36]] 



In [71]:
#@title Maximum Depth = 5
tree = decision_tree(X_train, Y_train, depth = 0, max_depth = 5)
Pred_Labels = Predict(X_test, tree)
accuracy, confusion = Decision_Tree_metrics(Y_test.values, Pred_Labels)
accuracy, confusion
print('Maximum Depth : 5\n accuracy =' , accuracy,'%\n confusion matrix:\n' ,confusion, '\n')


Maximum Depth : 5
 accuracy = 81.56424581005587 %
 confusion matrix:
 [[110  32]
 [  1  36]] 



In [70]:
#@title Maximum Depth = 6
tree = decision_tree(X_train, Y_train, depth = 0, max_depth = 6)
Pred_Labels = Predict(X_test, tree)
accuracy, confusion = Decision_Tree_metrics(Y_test.values, Pred_Labels)
accuracy, confusion
print('Maximum Depth : 6\n accuracy =' , accuracy,'%\n confusion matrix:\n' ,confusion, '\n')


Maximum Depth : 6
 accuracy = 82.12290502793296 %
 confusion matrix:
 [[110  31]
 [  1  37]] 



In [69]:
#@title Maximum Depth = 7
tree = decision_tree(X_train, Y_train, depth = 0, max_depth = 7)
Pred_Labels = Predict(X_test, tree)
accuracy, confusion = Decision_Tree_metrics(Y_test.values, Pred_Labels)
accuracy, confusion
print('Maximum Depth : 7\n accuracy =' , accuracy,'%\n confusion matrix:\n' ,confusion, '\n')


Maximum Depth : 7
 accuracy = 81.56424581005587 %
 confusion matrix:
 [[109  31]
 [  2  37]] 



In [99]:
#@title Random Forest
trees = random_forest(X_train, Y_train, n_trees = 10, N = 100)
Pred_Labels = random_forest_Predict(X_test, trees)
accuracy, confusion = Decision_Tree_metrics(Y_test.values, Pred_Labels)
accuracy, confusion
print('Random Forest with 10 trees : \n accuracy =' , accuracy,'%\n confusion matrix:\n' ,confusion, '\n')

Random Forest with 10 trees : 
 accuracy = 82.68156424581005 %
 confusion matrix:
 [[109  29]
 [  2  39]] 

