# Tree Methods

## Imported Libraries

In [1]:
import pandas as pd
import numpy as np
import math
import matplotlib

## Data Extraction

In [2]:
trainingData = pd.read_csv("../Data/train.csv")
trainingData.hist(bins = 10)

array([[<matplotlib.axes._subplots.AxesSubplot object at 0x100DC3B0>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x03F70450>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x111686F0>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x1118B7D0>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x111AA8B0>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x111CB990>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x111EBAF0>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x1120D5B0>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x1120DB50>]],
      dtype=object)

## Binning Analog Data

In [3]:
AgeBins = pd.IntervalIndex.from_tuples([(0,15),(15,30),(30,45),(45,60),(60,75),(75,90),(90,105),(105,120)])
TicketBins = pd.IntervalIndex.from_tuples([(0,50000), (50000,100000), (100000,150000),(150000,200000),(200000, 250000), (250000, 300000), (300000, 350000)])
CostBins = pd.IntervalIndex.from_tuples([(-1, 50),(50,100), (100,150), (150, 200), (200, 250), (250, 300), (300,350), (350,400), (400, 450), (450, 500), (500, 550)])

trainingData["AgeGroup"] = pd.cut(trainingData['Age'], bins=AgeBins).cat.add_categories(pd.Interval(-2,-1)).fillna(pd.Interval(-2,-1))
trainingData['TicketGroup'] = pd.cut(pd.to_numeric(trainingData['Ticket'], errors="coerce"), bins=TicketBins).cat.add_categories(pd.Interval(-2,-1)).fillna(pd.Interval(-2,-1))
trainingData['CostGroup'] = pd.cut(trainingData['Fare'], bins=CostBins).cat.add_categories(pd.Interval(-2,-1)).fillna(pd.Interval(-2,-1))

## Tree Data Structure

In [4]:
class Tree:
    
    def __init__(self, feature, children):
        self.feature = feature
        self.children = children
        
    def getFeature(self):
        return self.feature
    
    def search(self,datapoint):
        if self.children is None:
            return self.getFeature()
        point = datapoint.get(self.feature)
        for child in self.children:
            if child[0] == point:
                return child[1].search(datapoint)
        

## Tree Structure Test

In [5]:
leaf1 = Tree(0, None)
leaf2 = Tree(1, None)
leaf3 = Tree(1, None)

branch = Tree("test2", [["b1",leaf1], ["b2",leaf2]])
root = Tree("test1", [["a1",branch], ["a2",leaf3]])

EXdata = pd.Series(data=["a1","b1"],index=["test1","test2"])

assert 0 == root.search(EXdata)

## Information Gain Functions

In [6]:
def Entropy(dataSet):
    result1 = dataSet[dataSet['Survived'] == 1].size/dataSet.size
    if result1 == 0 or result1 == 1:
        return 0
    result1 = result1*np.log2(result1)
    result2 = dataSet[dataSet['Survived'] == 0].size/dataSet.size
    result2 =  result2*np.log2(result2)
    return -1* (result1 + result2)

In [7]:
def InformationGain(dataSet, feature):
    
    totalE = Entropy(dataSet)
    sumE = 0
    
    values, counts = np.unique(dataSet[feature], return_counts = True)
    for val in values:
        sumE += (dataSet[dataSet[feature] == val].size)/dataSet.size*Entropy(dataSet[dataSet[feature] == val])
    return totalE - sumE
        

0.06112152898423162


## Tree Builder

In [70]:
def BuildTree(depth,features,dataset):
    
    print(features)
    
    values, counts = np.unique(dataset["Survived"], return_counts = True)
    if counts[0] == dataset.size:
        return Tree(values[0], None)
    
        
    
    best = None
    bestIG = 0
    for feat in features:
        IG = InformationGain(dataset, feat)
        if best == None or bestIG < IG:
            best = feat
            bestIG = IG
    
    children = []
    if depth == 1 or len(features) == 1:
        for val in np.unique(dataset[best]):
            children.append([val,Tree(dataset[dataset[best] == val].mode().loc[0, "Survived"], None)])
    else:
        subset = features.copy()
        subset.remove(best)
        for val in np.unique(dataset[best]):
            
            children.append([val, BuildTree(depth-1, subset, dataset[dataset[best] == val])])
                            
    return Tree(best, children)



## Testing Tree Builder

In [71]:
testTree = BuildTree(3, ["AgeGroup", "CostGroup", "TicketGroup"], trainingData)
testTree.search(trainingData.loc[4])

['AgeGroup', 'CostGroup', 'TicketGroup']
['AgeGroup', 'TicketGroup']
['AgeGroup']
['AgeGroup']
['AgeGroup']
['AgeGroup']
['AgeGroup']
['AgeGroup']
['AgeGroup']
['AgeGroup', 'TicketGroup']
['AgeGroup']
['AgeGroup']
['AgeGroup']
['AgeGroup']
['AgeGroup', 'TicketGroup']
['TicketGroup']
['TicketGroup']
['TicketGroup']
['TicketGroup']
['TicketGroup']
['AgeGroup', 'TicketGroup']
['AgeGroup']
['AgeGroup']
['AgeGroup']
['AgeGroup', 'TicketGroup']
['TicketGroup']
['TicketGroup']
['TicketGroup']
['TicketGroup']
['TicketGroup']
['AgeGroup', 'TicketGroup']
['TicketGroup']
['TicketGroup']
['AgeGroup', 'TicketGroup']
['TicketGroup']


0.0