In [1]:
import math
import copy
import numpy as np 
import statistics
import matplotlib.pyplot as plt

In [2]:
class Node:
    """A decision-tree node.

    ``label`` holds whatever the caller passes in; ``children`` maps a key to
    a child ``Node`` and starts out empty.
    """

    def __init__(self, label):
        self.children = {}
        self.label = label

    def isLeaf(self):
        """Return True when the node has no children."""
        return not self.children

In [3]:
def get_total(data):
    """Return the sum of ``row['weights']`` over all rows in ``data``.

    Returns 0.0 for an empty ``data``.
    """
    weight_sum = 0.0
    # Plain left-to-right accumulation, one addition per row.
    for example in data:
        weight_sum = weight_sum + example['weights']
    return weight_sum

In [4]:
def set_subset(data, attribute, val):
    """Return the rows of ``data`` whose ``row[attribute]`` equals ``val``.

    The result is a new list holding the same row objects, in their
    original order.
    """
    return [example for example in data if example[attribute] == val]

In [5]:
def set_label(row, dt):
    """Walk the tree ``dt`` for ``row`` and return the label of the leaf reached.

    At every non-leaf node the node's label is used as a key into ``row``,
    and the value found there selects the child to descend into. A value
    with no matching child raises ``KeyError``.
    """
    node = dt
    while True:
        if node.isLeaf():
            return node.label
        node = node.children[row[node.label]]

In [6]:
def entropy(data):
    """Weighted base-2 entropy of the ``'y'`` labels in ``data``.

    Each row contributes ``row['weights']`` to the mass of its label
    ``row['y']``; the label distribution is those masses divided by the
    total weight.

    Returns 0 when ``data`` is empty or when the total weight is not
    positive. Labels whose accumulated weight is zero contribute nothing
    (the 0 * log 0 = 0 convention) instead of raising a math domain error.
    """
    if len(data) == 0:
        return 0

    counts = dict()
    total = 0.0
    for row in data:
        label = row['y']
        weight = row['weights']
        counts[label] = counts.get(label, 0.0) + weight
        # Accumulate row by row, in the same order a separate pass would.
        total += weight

    if total <= 0:
        # No probability mass to distribute; treat like the empty case.
        return 0

    result = 0.0
    for count in counts.values():
        if count <= 0:
            # math.log2(0) would raise ValueError.
            continue
        p = count / total
        result += -p * math.log2(p)

    return result

def gini_index(data):
    """Weighted Gini index of the ``'y'`` labels in ``data``.

    Computes ``1 - sum(p_label ** 2)`` where ``p_label`` is the share of the
    total ``'weights'`` carried by rows with that label. Returns 0 for an
    empty ``data``.
    """
    if len(data) == 0:
        return 0

    # Accumulated weight per label.
    mass_by_label = {}
    for example in data:
        key = example['y']
        mass_by_label[key] = mass_by_label.get(key, 0.0) + example['weights']

    norm = get_total(data)
    squares = 0.0
    for mass in mass_by_label.values():
        share = mass / norm
        squares += share ** 2

    return 1 - squares

def ME(data):
    """Return ``1 - max_label(p_label)`` for the ``'y'`` labels in ``data``.

    ``p_label`` is the share of the total ``'weights'`` carried by rows with
    that label. Returns 0 for an empty ``data``.
    """
    if len(data) == 0:
        return 0

    # Accumulated weight per label.
    mass_by_label = {}
    for example in data:
        key = example['y']
        mass_by_label[key] = mass_by_label.get(key, 0.0) + example['weights']

    norm = get_total(data)
    largest_share = 0.0
    for mass in mass_by_label.values():
        share = mass / norm
        if share > largest_share:
            largest_share = share

    return 1 - largest_share

In [7]:
def info_gain(data,gain_type,attribute, vals):
    """Gain obtained by splitting ``data`` on ``attribute``.

    gain = impurity(data) - sum_v (weight(S_v) / weight(data)) * impurity(S_v)

    where ``S_v`` is the subset of rows with ``row[attribute] == v`` for each
    ``v`` in ``vals``, and weights are sums of ``row['weights']``.

    Parameters
    ----------
    data : list of rows (mappings with at least ``attribute``, 'y', 'weights')
    gain_type : 0 -> ``entropy``, 1 -> ``gini_index``, 2 -> ``ME``
    attribute : key of the attribute to split on
    vals : iterable of candidate values of ``attribute``

    Returns
    -------
    float
        The gain; 0.0 when ``data`` carries no weight.

    Raises
    ------
    ValueError
        If ``gain_type`` is not 0, 1 or 2.
    """
    if gain_type == 0:
        measure = entropy
    elif gain_type == 1:
        measure = gini_index
    elif gain_type == 2:
        measure = ME
    else:
        raise ValueError("gain_type must be 0 (entropy), 1 (gini) or 2 (ME), got %r" % (gain_type,))

    total = get_total(data)
    if total == 0:
        # Nothing to split: avoids dividing by a zero total weight.
        return 0.0

    # Weighted impurity remaining after the split. Each subset is scored with
    # its OWN impurity; weighting the parent's impurity instead would make
    # every attribute's gain collapse to roughly zero.
    remainder = 0.0
    for val in vals:
        sub_set = set_subset(data, attribute, val)
        if len(sub_set) == 0:
            continue
        remainder += (get_total(sub_set) / total) * measure(sub_set)

    return measure(data) - remainder

In [8]:
def select_feature(data,gain_type, attributes):
    """Return the attribute name with the largest ``info_gain`` on ``data``.

    ``attributes`` maps attribute name -> list of its possible values. Ties
    are resolved in favour of the attribute that comes first in
    ``attributes``' iteration order.

    Raises
    ------
    ValueError
        If ``attributes`` is empty (there is nothing to choose from).
    """
    if len(attributes) == 0:
        raise ValueError("select_feature() needs at least one attribute")

    gain_x = {
        ln: info_gain(data, gain_type, ln, lv)
        for ln, lv in attributes.items()
    }

    # Pick the winner once, after every gain is known.
    return max(gain_x, key=gain_x.get)

In [9]:
def majority_label(data):
    """Return the ``'y'`` label carrying the largest total ``'weights'``.

    Ties go to the label that appears first in ``data``. An empty ``data``
    raises ``ValueError`` (from ``max`` on an empty collection).
    """
    # Accumulated weight per label, in order of first appearance.
    tally = {}
    for example in data:
        key = example['y']
        tally[key] = tally.get(key, 0.0) + example['weights']

    return max(tally, key=tally.get)

In [10]:
def ID3(data,gain_type, attributes, labels, max_depth,depth,size):
    """Recursively build a decision tree over ``data`` and return its root ``Node``.

    Parameters
    ----------
    data : list of rows (mappings with the attribute keys, 'y' and 'weights')
    gain_type : impurity selector forwarded to ``select_feature``
    attributes : dict mapping attribute name -> list of its possible values;
        not modified
    labels : set of labels present in ``data``; not modified
    max_depth : recursion stops when ``depth`` equals this value
    depth : depth of the node being built
    size : accepted and forwarded to recursive calls, otherwise unused here

    Returns
    -------
    Node
        A leaf whose label is a class label, or an internal node whose label
        is the chosen attribute and whose ``children`` has one entry per
        value of that attribute.
    """
    # Out of attributes or at the depth limit: predict the weighted majority.
    if (len(attributes) == 0) or depth==max_depth:
        return Node(majority_label(data))

    # Pure node. Read the single label without popping it, so the caller's
    # set is left intact.
    if (len(labels) == 1):
        return Node(next(iter(labels)))

    max_attr = select_feature(data,gain_type,attributes)
    root = Node(max_attr)

    # Every child sees the same remaining attributes. The table is never
    # mutated below, so one shallow copy without the chosen attribute is
    # enough.
    sub_attributes = {
        name: domain for name, domain in attributes.items() if name != max_attr
    }

    # Majority label of this node, computed lazily for empty branches.
    fallback_label = None
    fallback_ready = False

    for v in attributes[max_attr]:
        sub_set = set_subset(data, max_attr, v)

        if len(sub_set) == 0:
            # No example takes this value: fall back to this node's majority.
            if not fallback_ready:
                fallback_label = majority_label(data)
                fallback_ready = True
            root.children[v] = Node(fallback_label)
        else:
            # Labels actually present in the subset.
            sub_labels = {row['y'] for row in sub_set}
            root.children[v] = ID3(sub_set, gain_type,sub_attributes, sub_labels, max_depth, depth+1,size)

    return root

In [11]:
def cls(data, gain_type,attributes, labels,max_depth,depth,T,size):
    """Train ``T`` trees, each on its own bootstrap sample of ``data``.

    For every tree, ``len(data)`` rows are drawn from ``data`` uniformly at
    random with replacement and passed to ``ID3`` together with the remaining
    arguments unchanged.

    Returns
    -------
    list
        The ``T`` trained trees (empty when ``T`` is 0).

    Note: relies on the ``random`` module being imported at notebook level
    before this function is called.
    """
    DT=[]
    for _ in range(T):
        # Bootstrap sample: same size as data, drawn with replacement.
        random_data = [random.choice(data) for _ in range(len(data))]
        # Fit on the sample (not on the full data) so the trees differ.
        DT.append(ID3(random_data, gain_type,attributes, labels, max_depth,depth,size))

    return DT

In [12]:
def prediction(data, DT):
    """Return the fraction of rows in ``data`` that the ensemble ``DT`` gets right.

    Every tree votes +1 when ``set_label`` yields 'yes' and -1 for any other
    label. A row counts as correct when its ``'y'`` is 'yes' and the vote sum
    is positive, or its ``'y'`` is 'no' and the vote sum is negative. A vote
    sum of exactly zero (including an empty ``DT``) is never counted as
    correct.
    """
    correct = 0

    for example in data:
        vote = 0.0
        for tree in DT:
            vote += 1 if set_label(example, tree) == 'yes' else -1

        truth = example['y']
        if (truth == 'yes' and vote > 0) or (truth == 'no' and vote < 0):
            correct += 1

    return correct / len(data)

In [13]:
# Possible values of every feature, keyed by column name (X1 .. X23, in order).
attributes = {
    'X1': [0, 1],
    'X2': [1, 2],
    'X3': list(range(0, 7)),
    'X4': list(range(0, 4)),
    'X5': [0, 1],
    # X6 .. X11 share the domain -2 .. 9.
    **{'X{}'.format(k): list(range(-2, 10)) for k in range(6, 12)},
    # X12 .. X23 are two-valued.
    **{'X{}'.format(k): [0, 1] for k in range(12, 24)},
}

In [14]:
if __name__ == '__main__':

    # Column names in file order: X1 .. X23 followed by the label column 'y'.
    columns = ['X{}'.format(idx) for idx in range(1, 24)] + ['y']
    # The two class labels.
    labels = {"yes", "no"}
    # X1, X5 and X12 .. X23.
    numeric_features = ['X1', 'X5'] + ['X{}'.format(idx) for idx in range(12, 24)]

In [15]:
# Read the training data: one dict per line of train.csv, keyed by column name.
train_data = []
with open('train.csv', 'r') as f:
    for line in f:
        terms = line.strip().split(',')
        # Pair each field with the column name at the same position;
        # values are kept as the raw strings read from the file.
        example = {columns[pos]: term for pos, term in enumerate(terms)}
        train_data.append(example)

In [16]:
# Read the test data: one dict per line of test.csv, keyed by column name.
test_data = []
with open('test.csv', 'r') as f:
    for line in f:
        terms = line.strip().split(',')
        # Pair each field with the column name at the same position;
        # values are kept as the raw strings read from the file.
        example = {columns[pos]: term for pos, term in enumerate(terms)}
        test_data.append(example)

In [17]:
# Median of each listed column over the training rows (values parsed as float).
medians = {
    key: statistics.median([float(row[key]) for row in train_data])
    for key in (
        'X1', 'X5',
        'X12', 'X13', 'X14', 'X15', 'X16', 'X17',
        'X18', 'X19', 'X20', 'X21', 'X22', 'X23',
    )
}

In [18]:
medians

{'X1': 140000.0,
 'X5': 34.0,
 'X12': 22330.0,
 'X13': 21339.0,
 'X14': 20039.0,
 'X15': 18940.5,
 'X16': 18107.5,
 'X17': 17036.0,
 'X18': 2100.0,
 'X19': 2000.0,
 'X20': 1702.5,
 'X21': 1500.0,
 'X22': 1500.0,
 'X23': 1500.0}

In [19]:
# Binarize the columns listed in `medians`: 1 if the value is strictly above
# the training median, else 0. The training medians are applied to both sets.
# NOTE: not idempotent - re-running this cell would threshold the 0/1 values
# against the medians again; reload the data first.
for key, value in medians.items():
    for row in train_data:
        row[key] = 1 if float(row[key]) > value else 0

    for row in test_data:
        row[key] = 1 if float(row[key]) > value else 0

# The remaining feature columns still hold the strings read from the CSV,
# while `attributes` lists their possible values as ints. Cast them so the
# equality test in set_subset and the children lookup in set_label can match.
for key in attributes:
    if key in medians:
        continue
    for row in train_data:
        row[key] = int(float(row[key]))

    for row in test_data:
        row[key] = int(float(row[key]))

In [20]:
# Give every row a uniform weight of 1 / (number of rows in its own dataset).
train_size = len(train_data)
test_size = len(test_data)
for dataset, n_rows in ((train_data, train_size), (test_data, test_size)):
    for row in dataset:
        row['weights'] = 1/n_rows
    

In [21]:
def update_weight(data):
    """Set ``row['weights']`` to ``1 / len(data)`` for every row, in place.

    Returns None. An empty ``data`` raises ``ZeroDivisionError``.
    """
    uniform = 1 / len(data)
    for example in data:
        example['weights'] = uniform

In [22]:
import random

In [None]:
# Error of the ensemble as a function of its size.
# The ensemble is grown one tree at a time instead of retraining T trees from
# scratch for every T, and the curve starts at one tree: an empty ensemble
# casts no votes, so prediction() would count every row as wrong.
N_TREES = 100
SEED = 0
random.seed(SEED)  # reproducible sampling across Restart & Run All

train_T=[]
test_T=[]
trees = []
for T in range(1, N_TREES + 1):
    # Arguments: gain_type=0, max_depth=1, depth=0, T=1 (one new tree), size=2.
    trees.extend(cls(train_data,0, attributes, labels, 1,0,1,2))
    train_T.append(1 - prediction(train_data, trees))
    test_T.append(1 - prediction(test_data, trees))

x1 = list(range(1, N_TREES + 1))
y1=train_T
y2=test_T
fig, ax = plt.subplots(figsize = (6,4))
ax.plot(x1,y1, label='train error', c='black')
ax.plot(x1,y2, label='test error', c='red')
ax.legend()
ax.set_title("Error per iteration for RF")
ax.set_xlabel('iteration')
ax.set_ylabel('error')

plt.show()