In [55]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import enum
import time
import random
import multiprocessing as mp
from sklearn import tree

In [5]:
def get_dataframe(file_name):
  df = pd.read_csv(file_name, low_memory=False)
  nans(df)
  df.Age = df.Age/np.max(df.Age)
  df.Fare = np.log1p(df.Fare)
  df.Sex = pd.Categorical(df.Sex)
  df.Pclass = pd.Categorical(df.Pclass)
  df.Embarked = pd.Categorical(df.Embarked)
  return df
def nans(df):
  if not df.isnull().sum().any():
    return
  df.fillna(df.mode().iloc[0] ,inplace=True)
  return df


In [6]:
df = get_dataframe("train.csv")
df.drop(columns=["PassengerId", "Ticket", "Cabin", "Name"],inplace=True)

In [7]:
df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,male,0.275,1,0,2.110213,S
1,1,1,female,0.475,1,0,4.280593,C
2,1,3,female,0.325,0,0,2.188856,S
3,1,1,female,0.4375,1,0,3.990834,S
4,0,3,male,0.4375,0,0,2.202765,S


In [79]:
class myException(Exception):
  pass
class Random_Splitter:
  def __init__(self,df,split_amount=0.2):
    self.df = df
    self.split_amount = split_amount
  def get_dfs(self):
    remove= int(len(self.df) * self.split_amount)
    num = random.sample(range(len(self.df)), remove)
    validation = self.df.iloc[num].copy()
    test = self.df.drop(index=num)
    return (validation.reset_index(drop=True),test.reset_index(drop=True))
test =Random_Splitter(df)
validation,train = test.get_dfs()

In [80]:
def return_split(df, name, split):
  temp =df[name]
  return (df.loc[~(temp == split)], df.loc[(temp == split)]) if temp.dtype.name == 'category' else (df.loc[~(temp <= split)], df.loc[(temp <= split)])
def return_greatest(input):
  result= (None,float('-inf'))
  for x in input:
    _, high = result
    result = result if high >= input[x] else (x,input[x])
  return result
class Node:
  dependent = None
  def __init__(self,df=None, dep=None, min_sample=50,dependent = None):
    self.df= df
    self.entropy = 0
    self.min_sample = min_sample
    self.split = (None, None)
    self.col = None
    self.left_node = None
    self.right_node = None
    self.dep = dep
    if(not Node.dependent and dependent):
      Node.dependent = dependent
  def __repr__(self):
    return f'class: {self.dep}'
  def ret_output(self):
    return self.dep
  @staticmethod
  def get_score(dependent_var:pd.Series)->float:
    split = dependent_var.value_counts()
    sum = len(dependent_var)
    result = 0.0
    for x in split:
      prob = x / sum
      result -= prob*np.log2(prob)
    return result
  def find_diff(self, left, right):
      if not self.entropy:
        self.entropy = Node.get_score(self.df[Node.dependent])
      sum = len(self.df[Node.dependent])
      return self.entropy-(((left.count()/sum) * Node.get_score(left)) + ((right.count()/sum) * Node.get_score(right)))
  def get_entropy_split(self, name):
    if(len(np.unique(self.df[name].values)) == 1):
      return np.nan
    result = {}
    for num in df[name].unique():
      left, right = return_split(self.df, name, num)
      if (len(left) >= self.min_sample and len(right) >= self.min_sample):
        result[num] = self.find_diff(left[Node.dependent], right[Node.dependent])
    return return_greatest(result) if result else np.nan
  def split_on(self, name, num):
    left, right = return_split(self.df,name,num)
    self.col = name
    self.left_node = Node(left.copy(), 1, self.min_sample)
    self.right_node = Node(right.copy(),0 , self.min_sample)
    self.split = (
        True if self.df[name].name == 'category' else False,
        num
    )
    assert(len(left) >= self.min_sample and len(right) >= self.min_sample)
    return
  def set_split_number(self):
    return
  def predict(self, number):
    cat, num = self.split
    result = None
    if (cat == None and num ==None ):
      return False
    if cat:
      result = self.right_node if num == number else self.left_node
    else:
      result = self.right_node if num <= number else self.left_node
    return result


In [81]:
class Queue:
  def __init__(self):
    self.data = []
  def enqueue(self, *args):
    self.data.extend(args)
  def dequeue(self):
    return self.data.pop(0) if len(self.data) >0 else None
  def empty(self):
    return bool(len(self.data))
class Decision_Tree:
  def __init__(self,columns, df, min_sample= 50):
    self.root_node = Node(df, dependent="Survived", min_sample=min_sample)
    self.entropies = pd.DataFrame({"num_split": [None] * len(columns) , "entropy": [np.nan] * len(columns)},index=columns)
    self.build_tree()
  def get_best_split(self,node:Node)-> pd.Series:
    for col_name in self.entropies.index:
      self.entropies.loc[col_name] = node.get_entropy_split(col_name)
    return self.entropies.loc[:,'entropy']
  def predict(self, row):
    que = Queue()
    que.enqueue(self.root_node)
    result = None
    while((temp:= que.dequeue()) and temp.col):
        p = temp.predict(row[temp.col])
        if(p.ret_output() != None):
          que.enqueue(p)
          result = p.ret_output()
    return result
  def MSE(self, df)->float:
    answers = pd.Series({"Survived" :[np.nan] * len(df.Survived)})
    actual = df.Survived
    input =  df.drop("Survived",axis=1)
    answers = input.apply(self.predict, axis=1)
  def accuracy(self, validation_set):
    correct_valid = validation_set.Survived
    predictions = validation_set.drop("Survived",axis=1).apply(self.predict,axis=1)
    arr = (correct_valid == predictions).value_counts().loc[True]
    return (arr/len(validation_set)) * 100
  def build_tree(self):
    que = Queue()
    que.enqueue(self.root_node)
    while((temp:= que.dequeue())):
      name = self.get_best_split(temp)
      if (not name.isna().all()):
        temp.split_on(name.idxmax(), self.entropies.loc[name.idxmax()].num_split)
        que.enqueue(temp.left_node, temp.right_node)


In [117]:
from statistics import mean
class random_forest_tree:
    def __init__(self, df,number_of_trees=25, leave_out=0.25, min_samples=50):
        self.number_of_trees = number_of_trees 
        self.min_samples = min_samples 
        self.splitter = Random_Splitter(df, leave_out)
        self.trees = self.build_forest()
    def build_forest(self):
        trees = []
        for _ in range(self.number_of_trees):
            __, train = self.splitter.get_dfs()
            temp = Decision_Tree(train.columns[train.columns != "Survived"], train, min_sample =self.min_samples) 
            trees.append(temp)        
        return trees
    def predict(self, row):
        results = [tree.predict(row) for tree in self.trees]
        return 1 if mean(results) >= 0.5 else 0 
    def accuracy(self, validation_set):
        correct_valid = validation_set.Survived
        predictions = validation_set.drop("Survived",axis=1).apply(self.predict,axis=1)
        arr = (correct_valid == predictions).value_counts().loc[True]
        return (arr/len(validation_set)) * 100

In [118]:
temp = random_forest_tree(train)

In [119]:
temp.accuracy(validation)

71.34831460674157

Survived           1
Pclass             3
Sex           female
Age            0.275
SibSp              0
Parch              0
Fare        2.171907
Embarked           S
Name: 17, dtype: object

In [25]:
import dis