In [1]:
import math
import copy
import random
import numpy as np 
import statistics
import matplotlib.pyplot as plt

In [2]:
class Node:
    """One node of a decision tree.

    Attributes:
        label: the value given at construction. The tree builder in this
            file stores a splitting attribute name on internal nodes and a
            class label on leaves.
        children: dict of child nodes; it starts empty.
    """

    def __init__(self, label):
        self.children = {}
        self.label = label

    def isLeaf(self):
        """Return True when the node has no children."""
        return not len(self.children)

In [3]:
def entropy(data):
    """Weighted Shannon entropy (base 2) of the 'y' labels in ``data``.

    Each row is a mapping with a 'y' label and a numeric 'weights' entry.
    A label's probability is its summed weight divided by the total weight
    returned by ``_weighted(data)``. Returns 0 for empty input.
    """
    if not len(data):
        return 0

    # Accumulate the weight mass carried by each distinct label.
    mass_by_label = {}
    for row in data:
        key = row['y']
        mass_by_label[key] = mass_by_label.get(key, 0.0) + row['weights']

    total = _weighted(data)
    result = 0.0
    for mass in mass_by_label.values():
        p = mass / total
        result -= math.log(p, 2) * p

    return result

def gini_index(data):
    """Weighted Gini impurity of the 'y' labels in ``data``.

    Each row is a mapping with a 'y' label and a numeric 'weights' entry.
    With p_k the weight share of label k, the result is ``1 - sum(p_k ** 2)``:
    0 for a pure set, larger for more mixed sets.

    Returns 0 for empty input or when the total weight is zero.
    """
    if len(data) == 0:
        return 0

    counts = dict()
    for row in data:
        label = row['y']
        counts[label] = counts.get(label, 0.0) + row['weights']

    norm = float(_weighted(data))
    if norm == 0:
        # No weight mass at all: treat as having no impurity instead of
        # dividing by zero.
        return 0

    squared_sum = 0.0
    for count in counts.values():
        ratio = count / norm
        squared_sum += ratio ** 2

    # Gini *impurity* is one minus the sum of squared shares; returning the
    # raw sum would rank pure nodes as the worst ones.
    return 1 - squared_sum

def ME(data):
    """Weighted majority error of the 'y' labels in ``data``.

    Computes one minus the largest label share, where a label's share is
    its summed 'weights' divided by ``_weighted(data)``. Returns 0 for
    empty input.
    """
    if not len(data):
        return 0

    # Weight mass per distinct label.
    mass_by_label = {}
    for row in data:
        key = row['y']
        mass_by_label[key] = mass_by_label.get(key, 0.0) + row['weights']

    total = _weighted(data)
    top_share = 0.0
    for mass in mass_by_label.values():
        share = mass / float(total)
        if share > top_share:
            top_share = share

    return 1 - top_share



def info_gain(data, gain_type, attribute, vals):
    """Weighted gain obtained by splitting ``data`` on ``attribute``.

    gain = impurity(data) - sum_v (weight(S_v) / weight(data)) * impurity(S_v)

    where S_v is the subset of rows whose ``attribute`` equals ``v``.

    Args:
        data: list of row mappings with 'y' and 'weights' entries.
        gain_type: 0 for entropy, 1 for Gini index, 2 for majority error.
        attribute: key of the attribute to split on.
        vals: iterable of values the attribute can take.

    Returns:
        The gain as a float; 0.0 when ``data`` carries no weight.

    Raises:
        ValueError: if ``gain_type`` is not 0, 1 or 2.
    """
    impurity_by_type = {0: entropy, 1: gini_index, 2: ME}
    if gain_type not in impurity_by_type:
        raise ValueError('unknown gain_type: %r (expected 0, 1 or 2)' % (gain_type,))
    impurity = impurity_by_type[gain_type]

    total_weight = float(_weighted(data))
    if total_weight == 0:
        # Nothing to split: avoids a division by zero below.
        return 0.0

    h = impurity(data)
    new_h = 0.0
    for val in vals:
        sub_data = set_subdata(data, attribute, val)
        if len(sub_data) == 0:
            continue
        ratio = _weighted(sub_data) / total_weight
        # Each branch contributes its OWN impurity; using the parent's
        # impurity here would make every attribute's gain collapse to ~0.
        new_h += ratio * impurity(sub_data)

    return h - new_h

In [4]:
def majority_label(data):
    """Return the 'y' label carrying the largest total 'weights' in ``data``.

    Ties go to the label that appears first in ``data``. Empty input makes
    ``max`` raise ValueError.
    """
    mass_by_label = {}
    for row in data:
        key = row['y']
        mass_by_label[key] = mass_by_label.get(key, 0.0) + row['weights']

    return max(mass_by_label, key=mass_by_label.get)

In [5]:
def WID3(data, gain_type, attributes, labels, max_depth, depth):
    """Build a weighted ID3 decision tree recursively.

    Args:
        data: non-empty list of row mappings with 'y' and 'weights' entries
            plus one entry per attribute.
        gain_type: impurity selector forwarded to ``select_feature``.
        attributes: dict mapping attribute name -> list of possible values.
            It is not modified.
        labels: kept for interface compatibility. The purity test is derived
            from ``data`` itself, so the result no longer depends on what
            the caller passes here (a dict, a set, ...).
        max_depth: recursion stops when ``depth == max_depth``.
        depth: depth of the node being built (0 for the root).

    Returns:
        The root ``Node`` of the (sub)tree.

    Raises:
        ValueError: if ``data`` is empty.
    """
    if len(data) == 0:
        raise ValueError('WID3 requires at least one training row')

    # Purity is a property of the rows, so compute it from them rather than
    # trusting the size of the ``labels`` argument.
    distinct_labels = {row['y'] for row in data}
    if len(distinct_labels) <= 1 or len(attributes) == 0 or depth == max_depth:
        return Node(majority_label(data))

    # Recursion: split on the attribute with the highest gain.
    max_attr = select_feature(data, gain_type, attributes)
    root = Node(max_attr)

    # Attributes still available below this node. A shallow filtered copy is
    # enough because the value lists are only read, never mutated.
    sub_attributes = {name: values for name, values in attributes.items()
                      if name != max_attr}

    for v in attributes[max_attr]:
        sub_data = set_subdata(data, max_attr, v)

        if len(sub_data) == 0:
            # No training rows take this value: fall back to the parent's
            # majority label.
            root.children[v] = Node(majority_label(data))
        else:
            sub_labels = {row['y'] for row in sub_data}
            root.children[v] = WID3(sub_data, gain_type, sub_attributes,
                                    sub_labels, max_depth, depth + 1)

    return root

In [6]:
def select_feature(data, gain_type, attributes):
    """Return the attribute name with the highest gain on ``data``.

    Args:
        data: list of row mappings.
        gain_type: impurity selector forwarded to ``info_gain``.
        attributes: dict mapping attribute name -> list of possible values.

    Returns:
        The name with the maximal gain; ties go to the name that comes first
        in ``attributes``.

    Raises:
        ValueError: if ``attributes`` is empty.
    """
    if len(attributes) == 0:
        raise ValueError('select_feature requires at least one attribute')

    gain_x = {name: info_gain(data, gain_type, name, values)
              for name, values in attributes.items()}

    # One arg-max after all gains are known, instead of one per iteration.
    return max(gain_x, key=gain_x.get)

In [7]:
def set_subdata(data, attribute, val):
    """Return the rows of ``data`` whose ``attribute`` entry equals ``val``.

    The result is a new list holding the same row objects, in their
    original order.
    """
    return [row for row in data if row[attribute] == val]

In [8]:
def _weighted(data):
    
    length = 0.0
    for row in data:
        length += row['weights']
    
    return length

def get_label(row, root):
    """Walk the tree from ``root`` and return the label of the leaf reached.

    At each internal node the node's ``label`` is used as a key into
    ``row``, and the value found there selects the child to descend into.
    A value with no matching child raises KeyError.
    """
    node = root
    while True:
        if node.isLeaf():
            return node.label
        node = node.children[row[node.label]]

def weighted_error(data, root):
    """Return the total 'weights' of the rows that the tree misclassifies.

    A row is misclassified when ``get_label(row, root)`` differs from its
    'y' entry. Returns 0.0 for empty input.
    """
    misclassified = 0.0
    for row in data:
        predicted = get_label(row, root)
        if predicted == row['y']:
            continue
        misclassified += row['weights']

    return misclassified

In [9]:
def ID3_RF(data, attributes, labels, size):
    """Train one random-forest tree on a random subset of the attributes.

    Args:
        data: training rows (typically a bootstrap sample).
        attributes: dict mapping attribute name -> list of possible values.
        labels: forwarded unchanged to ``WID3``.
        size: number of attributes to draw with ``rf_features``; also passed
            to ``WID3`` as the maximum depth.

    Returns:
        The root node returned by ``WID3`` (entropy gain, ``gain_type`` 0).
    """
    attr_subset = rf_features(attributes, size)
    # Train on the sampled attributes; passing the full ``attributes`` dict
    # here would make the random feature selection a no-op.
    return WID3(data, 0, attr_subset, labels, size, 0)

In [10]:
def rf_features(attributes, size):
    """Draw a random subset of ``size`` distinct attributes.

    Args:
        attributes: dict mapping attribute name -> list of possible values.
        size: number of attributes wanted.

    Returns:
        A new dict with the chosen names mapped to the same value lists as
        in ``attributes``. If ``size`` exceeds the number of attributes, all
        of them are returned; a non-positive ``size`` yields an empty dict.
    """
    names = list(attributes.keys())
    # Clamp so the draw always terminates: rejection sampling would spin
    # forever when more names are requested than exist.
    k = max(0, min(size, len(names)))
    return {name: attributes[name] for name in random.sample(names, k)}

In [11]:
def RF(data, attributes, labels, T, size):
    """Train ``T`` trees, each on its own bootstrap sample of ``data``.

    Each bootstrap sample has ``len(data)`` rows drawn with replacement via
    ``random.choice`` and is handed to ``ID3_RF`` together with
    ``attributes``, ``labels`` and ``size``. Returns the list of trees.
    """
    forest = []
    n_rows = len(data)
    for _ in range(T):
        bootstrap = []
        for _ in range(n_rows):
            bootstrap.append(random.choice(data))
        forest.append(ID3_RF(bootstrap, attributes, labels, size))

    return forest

In [12]:
def _predict(data, DT):
    h_rate = 0

    for row in data:
        prediction = 0.0
        
        for dt in DT:
            label = get_label(row, dt)
            label = 1 if label == 'yes' else -1
            prediction += label

        if row['y'] == 'yes' and prediction > 0:
            h_rate += 1
        if row['y'] == 'no' and prediction < 0:
            h_rate += 1
    
    return h_rate/len(data),prediction


In [13]:
# Attribute name -> list of values the tree builder branches on.
# NOTE(review): a later cell replaces every float-parsable column with a 0/1
# median indicator; confirm these value domains still match the data then.
attributes = {
    'X1': [0, 1],
    'X2': [1, 2],
    'X3': list(range(7)),
    'X4': list(range(4)),
    'X5': [0, 1],
}
# X6..X11 share the domain -2..9 (a separate list object per attribute).
attributes.update(('X{}'.format(i), list(range(-2, 10))) for i in range(6, 12))
# X12..X23 are binary.
attributes.update(('X{}'.format(i), [0, 1]) for i in range(12, 24))

In [14]:
# Notebook-level constants shared by the cells below.
if __name__ == '__main__':
    
    # Column names, in file order; the loading cells use them as the keys of
    # each row dict. The last column, 'y', is the class label.
    columns = ['X1','X2','X3','X4','X5','X6','X7','X8','X9','X10','X11','X12','X13','X14','X15','X16','X17','X18','X19','X20','X21','X22','X23','y']
    # Passed as the `labels` argument to RF.
    # NOTE(review): this is a one-key dict, so len(labels) == 1; check that
    # against how WID3 uses `labels` in its stopping test.
    labels = {'Y': [0, 1]}
    # NOTE(review): not referenced by any other cell visible in this
    # notebook; presumably meant to restrict the median conversion - confirm.
    numeric_features = ['X1', 'X5', 'X12', 'X13', 'X14', 'X15', 'X16', 'X17', 'X18', 'X19', 'X20', 'X21', 'X22', 'X23']

In [15]:
# Load the training set: one dict per line of train.csv, with the
# comma-separated fields keyed positionally by `columns`.
train_data = []
with open('train.csv', 'r') as f:
    for line in f:
        terms = line.strip().split(',')
        example = {columns[i]: term for i, term in enumerate(terms)}
        train_data.append(example)

In [16]:
# Load the test set: one dict per line of test.csv, with the
# comma-separated fields keyed positionally by `columns`.
test_data = []
with open('test.csv', 'r') as f:
    for line in f:
        terms = line.strip().split(',')
        example = {columns[i]: term for i, term in enumerate(terms)}
        test_data.append(example)

In [17]:
# Find the columns to binarise: every column whose value in the first
# training row parses as a float gets an entry in `medians`.
# NOTE(review): this selects every float-parsable column, not only the ones
# listed in `numeric_features` - confirm that is intended.
medians = dict()
for attr in columns:
    try:
        float(train_data[0][attr])
    except ValueError:
        continue
    medians[attr] = 0.0

# Replace each placeholder with the median of that column over the
# training rows.
for attr in medians.keys():
    medians[attr] = statistics.median([float(row[attr]) for row in train_data])

In [18]:
# Overwrite each column that has a median with a 0/1 indicator: 1 when the
# value is strictly above the training median, else 0. The same training
# medians are applied to train and test rows. The rows are modified in
# place, so this cell must run exactly once after loading.
for key, value in medians.items():
    for dataset in (train_data, test_data):
        for row in dataset:
            row[key] = int(float(row[key]) > value)

In [19]:
# Give every row a uniform weight of 1/N within its own dataset.
train_size, test_size = len(train_data), len(test_data)
for dataset, n_rows in ((train_data, train_size), (test_data, test_size)):
    for row in dataset:
        row['weights'] = 1 / n_rows
    

In [None]:
# Evaluate forests of 1..500 trees (feature-subset size 2).
# The forest is grown one tree at a time: trees are drawn independently, so
# the first T trees are distributed exactly like a freshly trained forest of
# size T, and this avoids retraining 0+1+...+499 trees from scratch.
# Accuracy for each forest size is kept in `train_rates` / `test_rates`
# instead of being overwritten on every iteration.
dt = []
train_rates = []
test_rates = []
for T in range(1, 501):
    dt.extend(RF(train_data, attributes, labels, 1, 2))
    h_rate = _predict(train_data, dt)
    h_test = _predict(test_data, dt)
    # _predict returns (accuracy, last-row vote); only the accuracy is kept.
    train_rates.append(h_rate[0])
    test_rates.append(h_test[0])

In [None]:
def without_replacement(data, size):
    """Return ``size`` rows drawn uniformly from ``data`` without replacement.

    ``data`` is left untouched. The returned list holds the original row
    objects (not copies), so callers must not mutate them if ``data`` is
    reused.

    Raises:
        ValueError: if ``size`` is negative or larger than ``len(data)``.
    """
    # random.sample selects distinct positions, which is exactly sampling
    # without replacement, and does not need a copy of the whole dataset.
    return random.sample(data, size)

# Train 100 forests of 500 trees, each on 1000 training rows sampled
# without replacement. `pred` ends up as a list of forests (lists of trees).
pred = []
for i in range(100):
    sample = without_replacement(train_data, 1000)
    # RF requires the feature-subset size as its last argument; 2 matches
    # the value used in the forest-size sweep cell above.
    DT = RF(sample, attributes, labels, 500, 2)
    pred.append(DT)

In [None]:
# Bias and variance of a single tree: for every test row, take the first
# tree of each forest in `pred`, encode its prediction as +1 ('yes') / -1,
# then average squared bias and variance over the test rows.
single_bias = 0.0
single_var = 0.0
sum_single_bias = 0.0
sum_single_var = 0.0
for row in test_data:
    predictions = []
    for DT in pred:
        first_tree = DT[0]
        predictions.append(1 if get_label(row, first_tree) == 'yes' else -1)
    avg = sum(predictions) / len(predictions)
    y = 1 if row['y'] == 'yes' else -1

    single_bias = (y - avg) ** 2
    single_var = np.var(predictions)

    sum_single_bias += single_bias
    sum_single_var += single_var

n_test_rows = len(test_data)
avg_single_bias = sum_single_bias / n_test_rows
avg_single_var = sum_single_var / n_test_rows

In [None]:
# Random-forest bias and variance: for every test row, each forest in
# `pred` predicts by majority vote over ALL of its trees (+1 for 'yes',
# -1 otherwise); squared bias and variance of those forest predictions are
# then averaged over the test rows.
rf_bias = 0.0
rf_var = 0.0
sum_rf_bias = 0.0
sum_rf_var = 0.0
for row in test_data:
    predictions = []
    for DT in pred:
        votes = 0
        for dt in DT:
            votes += 1 if get_label(row, dt) == 'yes' else -1
        # Forest prediction is the sign of the vote total; a tie counts
        # as -1.
        predictions.append(1 if votes > 0 else -1)
    avg = sum(predictions) / len(predictions)
    y = 1 if row['y'] == 'yes' else -1
    rf_bias = pow(y - avg, 2)
    rf_var = np.var(predictions)

    # Accumulate this cell's own per-row values.
    sum_rf_bias += rf_bias
    sum_rf_var += rf_var

avg_rf_bias = sum_rf_bias / len(test_data)
avg_rf_var = sum_rf_var / len(test_data)