In [78]:
import numpy as np

class Sample:
    "Represenation of data samples"
    def __init__(self, positive, values, identity):
        self.positive = positive
        self.attribute = dict(zip(attributes, values))
        self.identity = identity

class Attribute:
    "Label for each attribute"
    def __init__(self, name, values):
        self.name = name
        self.values = values

    def __repr__(self):
        return self.name

# import data
workfile = 'breast-cancer-wisconsin.data'
f = open(workfile, 'r')
    
# index 0 = data ID
# index 1 to 9 features (feature values 1-10)
# index 10 = class (2 or 4)

attributes = (
                Attribute('Clump Thickness', (1, 2, 3, 4, 5, 6, 7, 8, 9, 10)),
                Attribute('Uniformity of Cell Size', (1, 2, 3, 4, 5, 6, 7, 8, 9, 10)),
                Attribute('Uniformity of Cell Shape', (1, 2, 3, 4, 5, 6, 7, 8, 9, 10)),
                Attribute('Marginal Adhesion', (1, 2, 3, 4, 5, 6, 7, 8, 9, 10)),
                Attribute('Single Epithelial Cell Size', (1, 2, 3, 4, 5, 6, 7, 8, 9, 10)),
                Attribute('Bare Nuclei', (1, 2, 3, 4, 5, 6, 7, 8, 9, 10)),
                Attribute('Bland Chromatin', (1, 2, 3, 4, 5, 6, 7, 8, 9, 10)),
                Attribute('Normal Nucleoli', (1, 2, 3, 4, 5, 6, 7, 8, 9, 10)),
                Attribute('Mitoses', (1, 2, 3, 4, 5, 6, 7, 8, 9, 10))
              )
CounterW = 0
data = []
for i in range(699):
    try:
        a = f.readline()
        b = a.split(',')
        dataID = int(b[0])
        dataFeature = tuple(map(int, b[1:9]))
        dataClass = int(b[10])
        if dataClass == 4:      #  4 for malignant
            dataClass = True
        else:                   #  2 for benign
            dataClass = False
        data.append(Sample(dataClass, dataFeature, dataID))
    except ValueError:          # Some data points had '?' in them instead of an int
        CounterW = CounterW + 1
        
print('Complete Data points: ', len(data))
print('inComplete Data points: ', CounterW)

Complete Data points:  683
inComplete Data points:  16


In [76]:
## Psuedo Code
'''
Precondition: A training set S := (x1, y1), . . . ,(xn, yn), features F, and number
of trees in forest B.
1 function RandomForest(S , F)
    2 H ← ∅
    3 for i ∈ 1, . . . , B do
        4 S(i) ← A bootstrap sample from S
        5 hi ← RandomizedTreeLearn(S(i), F)
        6 H ← H ∪ {hi}
    7 end for
    8 return H
9 end function

10 function RandomizedTreeLearn(S , F)
    11 At each node:
        12 f ← very small subset of F
        13 Split on best feature in f
    14 return The learned tree
15 end function
'''
import numpy as np
import math

def RandomForest(S, F):
    B = 10                     # number of trees in forest
    H = []
    for i in range(B):
        S_i = BootstrapSample(S)
        h_i = RandomizedTreeLearn(S_i, F)
        H.append(h_i)
    return H

def RandomizedTreeLearn(S, F, maxdepth = 5): # here S is a subsample 
    def buildBranch(dataset, default, attributes):
        if not dataset:
            return TreeLeaf(default)
        if allPositive(dataset):
            return TreeLeaf(True)
        if allNegative(dataset):
            return TreeLeaf(False)
        return RandomizedTreeLearn(dataset, attributes, maxdepth-1)

    default = mostCommon(S)
    if maxdepth < 1:
        return TreeLeaf(default)
    f = FeatureSubsample(F) # feature subsample
    a = bestFeature(S, f)
    attributesLeft = [x for x in attributes if x != a]
    branches = [(v, buildBranch(select(S, a, v), default, attributesLeft))
                for v in a.values]
    return TreeNode(a, dict(branches), default)

def BootstrapSample(S):
    soS = 10                   # size of subsample
    return np.random.choice(S, soS).tolist()

def FeatureSubsample(F):
    soS = 2                    # size of subsample, should be very small
    return np.random.choice(F, soS).tolist()

class TreeNode:
    def __init__(self, attribute, branches, default):
        self.attribute = attribute
        self.branches = branches
        self.default = default

    def __repr__(self):
        accum = str(self.attribute) + '('
        for x in sorted(self.branches):
            accum += str(self.branches[x])
        return accum + ')'
    
class TreeLeaf:
    def __init__(self, cvalue):
        self.cvalue = cvalue

    def __repr__(self):
        if self.cvalue:
            return '+'
        return '-'

def bestFeature(S, features):
    gains = [(averageGain(S, f), f) for f in features]
    return max(gains, key=lambda x: x[0])[1]

def mostCommon(S):
    pCount = len([x for x in S if x.positive])
    nCount = len([x for x in S if not x.positive])
    return pCount > nCount

def averageGain(dataset, attribute):
    weighted = 0.0
    for v in attribute.values:
        subset = select(dataset, attribute, v)
        weighted += entropy(subset) * len(subset)
    return entropy(dataset) - weighted/len(dataset)

def entropy(dataset):
    n = len(dataset)
    nPos = len([x for x in dataset if x.positive])
    nNeg = n - nPos
    if nPos == 0 or nNeg == 0:
        return 0.0
    return -float(nPos)/n * log2(float(nPos)/n) + \
        -float(nNeg)/n * log2(float(nNeg)/n)
    
def select(dataset, attribute, value):
    print('value', value)
    print('attribute', attribute)
    #print(dataset[0].attribute[attribute])
    return [x for x in dataset if x.attribute[attribute] == value]

def log2(x):
    return math.log(x, 2)

def allPositive(dataset):
    return all([x.positive for x in dataset])


def allNegative(dataset):
    return not any([x.positive for x in dataset])


In [77]:
S = data
F = attributes
H = RandomForest(S,F)
print('Done')

value 1
attribute Uniformity of Cell Size
value 2
attribute Uniformity of Cell Size
value 3
attribute Uniformity of Cell Size
value 4
attribute Uniformity of Cell Size
value 5
attribute Uniformity of Cell Size
value 6
attribute Uniformity of Cell Size
value 7
attribute Uniformity of Cell Size
value 8
attribute Uniformity of Cell Size
value 9
attribute Uniformity of Cell Size
value 10
attribute Uniformity of Cell Size
value 1
attribute Uniformity of Cell Size
value 2
attribute Uniformity of Cell Size
value 3
attribute Uniformity of Cell Size
value 4
attribute Uniformity of Cell Size
value 5
attribute Uniformity of Cell Size
value 6
attribute Uniformity of Cell Size
value 7
attribute Uniformity of Cell Size
value 8
attribute Uniformity of Cell Size
value 9
attribute Uniformity of Cell Size
value 10
attribute Uniformity of Cell Size
value 1
attribute Uniformity of Cell Size
value 2
attribute Uniformity of Cell Size
value 3
attribute Uniformity of Cell Size
value 4
attribute Uniformity of 

KeyError: Mitoses