In [4]:
import numpy as np

class Sample:
    """Representation of one data sample.

    A sample stores its class label, a mapping from attribute label to the
    value observed for that attribute, and an identifier.
    """
    def __init__(self, xclass, values, identity, labels=None):
        """Create a sample.

        xclass   -- class label of the sample
        values   -- attribute values, in the same order as the labels
        identity -- identifier of the sample
        labels   -- attribute labels to pair with `values`; when omitted the
                    module-level `attributes` sequence is used, which keeps
                    the original three-argument call working unchanged
        """
        if labels is None:
            # Fall back to the notebook-wide attribute definitions.
            labels = attributes
        self.xclass = xclass
        # zip() stops at the shorter input, so surplus labels/values are dropped.
        self.attribute = dict(zip(labels, values))
        self.identity = identity

    def getClass(self):
        """Return the class label."""
        return self.xclass

    def getAttributes(self):
        """Return the label -> value mapping."""
        return self.attribute

    def getAttributeValue(self,attribute):
        """Return the value stored for `attribute` (KeyError if unknown)."""
        return self.attribute[attribute]

    def getIdentity(self):
        """Return the sample identifier."""
        return self.identity

class Attribute:
    """A named attribute together with the values it may take."""

    def __init__(self, name, values):
        # Keep both pieces exactly as supplied by the caller.
        self.name, self.values = name, values

    def getValues(self):
        """Return the collection of values given at construction."""
        return self.values

    def __repr__(self):
        # The attribute is displayed simply by its name.
        return self.name

# Import data.
# Each row of the file is comma separated:
#   index 0      = data ID
#   index 1 to 9 = features (feature values 1-10)
#   index 10     = class (2 or 4)
workfile = 'breast-cancer-wisconsin.data'

attributes = (
                Attribute('Clump Thickness', range(1,11)),
                Attribute('Uniformity of Cell Size', range(1,11)),
                Attribute('Uniformity of Cell Shape', range(1,11)),
                Attribute('Marginal Adhesion', range(1,11)),
                Attribute('Single Epithelial Cell Size', range(1,11)),
                Attribute('Bare Nuclei', range(1,11)),
                Attribute('Bland Chromatin', range(1,11)),
                Attribute('Normal Nucleoli', range(1,11)),
                Attribute('Mitoses', range(1,11))
              )
CounterW = 0        # rows skipped because they could not be parsed
data = []           # successfully parsed Sample objects

# The context manager closes the file even if parsing fails, and iterating
# over the handle reads every row instead of relying on a fixed row count.
with open(workfile, 'r') as f:
    for a in f:
        if not a.strip():
            continue                # ignore blank lines
        b = a.split(',')
        try:
            dataID = int(b[0])
            dataFeature = tuple(map(int, b[1:10]))
            dataClass = int(b[10]) == 4     # 4 for malignant, 2 for benign
        except (ValueError, IndexError):
            # Some data points had '?' in them instead of an int; rows that
            # are too short are treated as incomplete as well.
            CounterW = CounterW + 1
            continue
        data.append(Sample(dataClass, dataFeature, dataID))

print('Complete Data points: ', len(data))
print('inComplete Data points: ', CounterW)

Complete Data points:  683
inComplete Data points:  16


In [5]:
## Pseudocode
'''
Precondition: A training set S := (x1, y1), . . . ,(xn, yn), features F, and number
of trees in forest B.
1 function RandomForest(S , F)
    2 H ← ∅
    3 for i ∈ 1, . . . , B do
        4 S(i) ← A bootstrap sample from S
        5 hi ← RandomizedTreeLearn(S(i), F)
        6 H ← H ∪ {hi}
    7 end for
    8 return H
9 end function

10 function RandomizedTreeLearn(S , F)
    11 At each node:
        12 f ← very small subset of F
        13 Split on best feature in f
    14 return The learned tree
15 end function
'''
import numpy as np
import math

def RandomForest(S, F, B=10):
    """Learn a forest of randomized decision trees.

    S -- training samples
    F -- features available for splitting
    B -- number of trees in the forest (defaults to 10)

    Returns a list with B trees; each tree is learned from its own
    bootstrap sample of S.
    """
    # Draw the bootstrap sample first, then learn the tree from it, once per tree.
    return [RandomizedTreeLearn(BootstrapSample(S), F) for _ in range(B)]

def RandomizedTreeLearn(S, F, maxdepth = 5): # here S is a subsample
    """Recursively learn a randomized decision tree.

    S        -- non-empty list of samples to learn from
    F        -- features still available for splitting on this path
    maxdepth -- remaining number of split levels allowed

    At each node a small random subset of F is drawn and the node splits on
    the feature of that subset with the highest gain. Returns a TreeNode, or
    a TreeLeaf holding the most common class of S when no split is possible.
    """
    def buildBranch(dataset, default, attributes):
        "Build the subtree for the samples that share one attribute value"
        if not dataset:
            # No sample has this value: predict the parent's majority class.
            return TreeLeaf(default)
        allClass,theClass = allAnyClass(dataset)
        if allClass:
            return TreeLeaf(theClass)
        return RandomizedTreeLearn(dataset, attributes, maxdepth-1)

    default = mostCommon(S)
    # Stop when the depth budget is used up or no feature is left to split on.
    if maxdepth < 1 or len(F) == 0:
        return TreeLeaf(default)
    f = FeatureSubsample(F) # feature subsample
    a = bestFeature(S, f)
    # Remove the chosen feature from the features passed in (F), not from the
    # global attribute list, so it is not offered again further down this path.
    attributesLeft = [x for x in F if x != a]
    branches = [(v, buildBranch(select(S, a, v), default, attributesLeft))
                for v in a.getValues()]
    return TreeNode(a, dict(branches), default)

def BootstrapSample(S, soS=10):
    """Draw a bootstrap sample from S.

    S   -- sequence to sample from
    soS -- size of the subsample (defaults to 10)

    Returns a list of soS elements drawn from S uniformly at random WITH
    replacement, so the same element may appear more than once.
    """
    return np.random.choice(S, soS).tolist()

def FeatureSubsample(F, soS=2):
    """Pick a small random subset of the features in F.

    F   -- sequence of candidate features
    soS -- size of the subsample, should be very small (defaults to 2)

    Returns a list of distinct features. Sampling is done without
    replacement so the same feature is never returned twice, and the size is
    capped at len(F) so a short feature list does not raise. An empty F
    yields an empty list.
    """
    size = min(soS, len(F))
    if size == 0:
        return []
    return np.random.choice(F, size, replace=False).tolist()

class TreeNode:
    """Inner node of a decision tree.

    Holds the attribute the node splits on, a mapping from attribute value
    to subtree, and a default class.
    """

    def __init__(self, attribute, branches, default):
        self.attribute, self.branches, self.default = attribute, branches, default

    def __repr__(self):
        # Attribute label followed by the subtrees, in sorted key order,
        # concatenated without separators and wrapped in parentheses.
        label = str(self.attribute)
        children = ''.join(str(self.branches[key]) for key in sorted(self.branches))
        return label + '(' + children + ')'
    
class TreeLeaf:
    """Terminal node of a decision tree, carrying a single class value."""

    def __init__(self, cvalue):
        # The class value reported for any sample that reaches this leaf.
        self.cvalue = cvalue

    def __repr__(self):
        # A leaf prints as the string form of its class value.
        return str(self.cvalue)

def bestFeature(S, features):
    """Return the feature in `features` with the largest average gain on S.

    When several features share the largest gain, the one that comes first
    in `features` is returned.
    """
    scored = []
    for feature in features:
        scored.append((averageGain(S, feature), feature))
    # max() keeps the first of several equally good entries.
    _, winner = max(scored, key=lambda pair: pair[0])
    return winner

def mostCommon(S):
    """Return the class label that occurs most often among the samples in S."""
    distinct = list(set([sample.getClass() for sample in S]))
    # One count per distinct label, in the same order as `distinct`.
    counts = []
    for label in distinct:
        counts.append(len([sample for sample in S if sample.getClass() == label]))
    # argmax picks the first label holding the highest count.
    return distinct[np.argmax(counts)]

def averageGain(dataset, attribute):
    """Return the information gain obtained by splitting `dataset` on `attribute`.

    The gain is the entropy of the whole dataset minus the size-weighted
    mean entropy of the subsets selected by each value of the attribute.
    """
    remainder = 0.0
    for value in attribute.values:
        part = select(dataset, attribute, value)
        # Weight each subset's entropy by the number of samples it holds.
        remainder += len(part) * entropy(part)
    base = entropy(dataset)
    return base - remainder / len(dataset)

def entropy(dataset):
    """Return the base-2 entropy of the class labels in `dataset`.

    An empty dataset yields 0.
    """
    total = len(dataset)
    result = 0
    for label in list(set([sample.getClass() for sample in dataset])):
        count = len([sample for sample in dataset if sample.getClass() == label])
        if count != 0:
            # Subtract p * log2(p) for this label's share of the samples.
            share = float(count) / total
            result -= share * math.log(share, 2)
    return result
    
def select(dataset, attribute, value):
    """Return the samples of `dataset` whose `attribute` equals `value`.

    The relative order of the samples is kept.
    """
    matching = []
    for sample in dataset:
        if sample.getAttributeValue(attribute) == value:
            matching.append(sample)
    return matching

def allFromClass(dataset,c):
    """Return True when every sample in `dataset` has class `c`.

    An empty dataset yields True.
    """
    uniform = True
    # Every sample is inspected; there is deliberately no early exit.
    for sample in dataset:
        if not (sample.getClass() == c):
            uniform = False
    return uniform

def allAnyClass(dataset):
    """Report whether all samples in `dataset` share a single class.

    Returns (True, that_class) when they do and (False, 0) otherwise.
    The dataset must contain at least one sample.
    """
    # The first sample's class is the only possible common class.
    candidate = dataset[0].getClass()
    if not allFromClass(dataset, candidate):
        return (False,0)
    return (True,candidate)

In [6]:
# Train a forest on the parsed samples using the full attribute list.
# NOTE(review): no random seed is set in the visible cells, and the sampling
# helpers draw from np.random, so the forest presumably differs between runs.
S = data
F = attributes
H = RandomForest(S,F)
print('Done')

Done


In [7]:
def classify(tree, sample):
    """Classify a sample using the given decision tree.

    tree   -- a TreeNode or TreeLeaf
    sample -- object whose `attribute` mapping holds a value for every
              attribute the tree splits on

    Walks from the root to a leaf and returns that leaf's class value. If a
    node has no branch for the sample's attribute value, the node's default
    class is returned instead of raising KeyError.
    """
    node = tree
    while not isinstance(node, TreeLeaf):
        value = sample.attribute[node.attribute]
        if value not in node.branches:
            # Value never seen when the node was built: use its majority class.
            return node.default
        node = node.branches[value]
    return node.cvalue

def classifyForest(forest, sample):
    """Classify a sample by majority vote over the trees of a forest.

    Every tree classifies the sample; the class with the most votes wins.
    """
    votes = [classify(tree, sample) for tree in forest]
    tally = [(label, votes.count(label)) for label in set(votes)]
    # Stable ascending sort by vote count; the winner is the final entry.
    tally.sort(key=lambda pair: pair[1])
    winner, _ = tally[-1]
    return winner
    

def check(tree, testdata):
    """Return the fraction of `testdata` that `tree` classifies correctly."""
    hits = sum(1 for sample in testdata
               if classify(tree, sample) == sample.getClass())
    return float(hits) / len(testdata)

def checkForest(forest, testdata):
    """Return the fraction of `testdata` that `forest` classifies correctly."""
    hits = sum(1 for sample in testdata
               if classifyForest(forest, sample) == sample.getClass())
    return float(hits) / len(testdata)

In [8]:
# Fraction of samples in `data` that the forest H classifies correctly.
# H was trained from this same `data` list, so this is a training-set score,
# not an estimate on held-out samples.
print(checkForest(H,data))

0.7613469985358712
