In [40]:
import math
import copy
import numpy as np 
import statistics
import matplotlib.pyplot as plt

In [2]:
class Node:
    """A single node of the decision tree.

    ``label`` is whatever the builder stored for this node (the splitting
    attribute for an internal node, the predicted class for a leaf) and
    ``children`` maps an attribute value to the child node reached by it.
    """

    def __init__(self, label):
        self.label = label
        # attribute value -> child Node; stays empty for a leaf
        self.children = {}

    def isLeaf(self):
        """Return True when the node has no children."""
        return not self.children

In [28]:
def entropy(data):
    """Return the weighted Shannon entropy (in bits) of the labels in ``data``.

    Args:
        data: sequence of rows; each row maps ``'y'`` to its class label and
            ``'weights'`` to its example weight.

    Returns:
        0 for an empty data set, 0.0 when the total weight is zero, otherwise
        ``-sum(p * log2(p))`` where ``p`` is each label's share of the total
        weight.
    """
    if len(data) == 0:
        return 0

    # accumulate the weight mass carried by every label
    counts = dict()
    for row in data:
        label = row['y']
        counts[label] = counts.get(label, 0.0) + row['weights']

    norm = sum(counts.values())
    if norm <= 0:
        # every weight is zero (e.g. after underflow): no information to measure
        return 0.0

    h = 0.0
    for mass in counts.values():
        ratio = mass / norm
        # a zero-weight class contributes nothing; log2(0) would raise otherwise
        if ratio > 0:
            h -= ratio * math.log2(ratio)

    return h

def gini_index(data):
    """Return the weighted Gini impurity ``1 - sum(p**2)`` of the labels in ``data``.

    Args:
        data: sequence of rows; each row maps ``'y'`` to its class label and
            ``'weights'`` to its example weight.

    Returns:
        0 for an empty data set, 0.0 when the total weight is zero, otherwise
        the impurity, where ``p`` is each label's share of the total weight
        (0.0 for a pure set, larger for more mixed sets).
    """
    if len(data) == 0:
        return 0

    # accumulate the weight mass carried by every label
    counts = dict()
    for row in data:
        label = row['y']
        counts[label] = counts.get(label, 0.0) + row['weights']

    norm = sum(counts.values())
    if norm <= 0:
        # every weight is zero: treat the set as pure instead of dividing by zero
        return 0.0

    squared = 0.0
    for mass in counts.values():
        ratio = mass / norm
        squared += ratio ** 2

    # impurity, not purity: a pure set must score 0 so that gains are non-negative
    return 1 - squared

def ME(data):
    """Return the weighted majority error of the labels in ``data``.

    Args:
        data: sequence of rows; each row maps ``'y'`` to its class label and
            ``'weights'`` to its example weight.

    Returns:
        0 for an empty data set, 0.0 when the total weight is zero, otherwise
        ``1 - max(p)`` where ``p`` is each label's share of the total weight.
    """
    if len(data) == 0:
        return 0

    # accumulate the weight mass carried by every label
    counts = dict()
    for row in data:
        label = row['y']
        counts[label] = counts.get(label, 0.0) + row['weights']

    norm = sum(counts.values())
    if norm <= 0:
        # every weight is zero: nothing can be misclassified, avoid dividing by zero
        return 0.0

    largest = 0.0
    for mass in counts.values():
        largest = max(largest, mass / norm)

    return 1 - largest



def info_gain(data, gain_type, attribute, vals):
    """Return the impurity reduction obtained by splitting ``data`` on ``attribute``.

    Args:
        data: sequence of rows (mappings) carrying ``'y'`` and ``'weights'``.
        gain_type: impurity measure selector: 0 calls ``entropy``, 1 calls
            ``gini_index``, 2 calls ``ME``.
        attribute: key of the column to split on.
        vals: iterable of the values ``attribute`` may take.

    Returns:
        ``impurity(data) - sum(w_v * impurity(subset_v))`` where ``w_v`` is the
        share of the total weight that falls in the subset with value ``v``.
        Returns 0.0 when ``data`` carries no weight at all.

    Raises:
        ValueError: if ``gain_type`` is not 0, 1 or 2.
    """
    measures = {0: entropy, 1: gini_index, 2: ME}
    if gain_type not in measures:
        raise ValueError("gain_type must be 0 (entropy), 1 (gini) or 2 (majority error)")
    impurity = measures[gain_type]

    h = impurity(data)

    # loop-invariant: the total weight of the parent set
    total = _weighted(data)
    if total == 0:
        return 0.0

    new_h = 0.0
    for val in vals:
        sub_data = set_subdata(data, attribute, val)
        ratio = _weighted(sub_data) / float(total)
        # weight each subset by ITS OWN impurity; using the parent's impurity
        # here would make every attribute's gain collapse to ~0
        new_h += ratio * impurity(sub_data)

    return h - new_h

In [4]:
def majority_label(data):
    """Return the label whose rows carry the largest total weight.

    Each row contributes ``row['weights']`` to the tally of ``row['y']``.
    When several labels tie, the one encountered first in ``data`` wins.
    """
    weight_by_label = {}
    for example in data:
        key = example['y']
        weight_by_label[key] = weight_by_label.get(key, 0.0) + example['weights']

    return max(weight_by_label, key=weight_by_label.get)

In [36]:
def WID3(data, gain_type, attributes, labels, max_depth, depth):
    """Recursively build a weighted ID3 decision tree and return its root ``Node``.

    Args:
        data: sequence of rows (mappings) carrying the attribute columns plus
            ``'y'`` and ``'weights'``.
        gain_type: impurity selector forwarded to ``select_feature``.
        attributes: mapping of attribute name -> list of its possible values;
            it is not modified.
        labels: collection of the distinct labels present in ``data``.
        max_depth: depth at which growth stops.
        depth: depth of the node being built (callers start at 0).

    Returns:
        A leaf ``Node`` labelled with the weighted majority label when only one
        label remains, no attributes are left, or ``depth == max_depth``;
        otherwise an internal ``Node`` labelled with the chosen attribute and
        one child per value of that attribute.
    """
    if (len(labels) == 1) or (len(attributes) == 0) or depth == max_depth:
        return Node(majority_label(data))

    # choose the split attribute for this node
    max_attr = select_feature(data, gain_type, attributes)
    root = Node(max_attr)

    # The value lists are never mutated, so a shallow copy without the used
    # attribute is enough and can be shared by all children.
    sub_attributes = {
        name: values for name, values in attributes.items() if name != max_attr
    }

    for v in attributes[max_attr]:
        sub_data = set_subdata(data, max_attr, v)

        if len(sub_data) == 0:
            # no example takes this value: fall back to the parent's majority
            root.children[v] = Node(majority_label(data))
        else:
            # distinct labels that survive in this branch
            sub_labels = {row['y'] for row in sub_data}
            root.children[v] = WID3(
                sub_data, gain_type, sub_attributes, sub_labels, max_depth, depth + 1
            )

    return root

In [6]:
def select_feature(data, gain_type, attributes):
    """Return the attribute name with the highest ``info_gain`` on ``data``.

    Args:
        data: sequence of rows (mappings) carrying ``'y'`` and ``'weights'``.
        gain_type: impurity selector forwarded to ``info_gain``.
        attributes: mapping of attribute name -> list of its possible values.

    Returns:
        The best-scoring attribute name; on ties the first one in the
        iteration order of ``attributes`` wins.

    Raises:
        ValueError: if ``attributes`` is empty.
    """
    if not attributes:
        raise ValueError("select_feature() needs at least one candidate attribute")

    gain_x = {
        name: info_gain(data, gain_type, name, values)
        for name, values in attributes.items()
    }

    # pick the winner once, after every gain is known
    return max(gain_x, key=gain_x.get)

In [7]:
def set_subdata(data, attribute, val):
    """Return a new list with the rows of ``data`` whose ``attribute`` equals ``val``.

    The rows themselves are not copied; the result references the same row
    objects, in their original order.
    """
    return [record for record in data if record[attribute] == val]

In [8]:
def _weighted(data):
    
    length = 0.0
    for row in data:
        length += row['weights']
    
    return length

def get_label(row, root):
    """Walk the tree from ``root`` using the values in ``row`` and return the leaf label.

    At every internal node the node's ``label`` names the attribute to read
    from ``row``; that value selects the child to descend into.
    """
    node = root
    while True:
        if node.isLeaf():
            return node.label
        node = node.children[row[node.label]]

def weighted_error(data, root):
    """Return the total weight of the rows in ``data`` that the tree misclassifies.

    A row is misclassified when ``get_label(row, root)`` differs from
    ``row['y']``; its ``row['weights']`` is then added to the result.
    """
    missed = 0.0
    for example in data:
        predicted = get_label(example, root)
        mismatch = predicted != example['y']
        if mismatch:
            missed += example['weights']
    return missed

In [9]:
def AdaBoost(data, attributes, labels, T, gain_type=0):
    """Train ``T`` depth-1 trees with AdaBoost and return them with their votes.

    Args:
        data: sequence of rows (mappings) carrying the attribute columns plus
            ``'y'`` and ``'weights'``.  The ``'weights'`` entries are updated
            IN PLACE after every round.
        attributes: mapping of attribute name -> list of its possible values.
        labels: collection of the distinct labels in ``data``.
        T: number of boosting rounds.
        gain_type: impurity selector passed to ``WID3`` (default 0).

    Returns:
        ``(DT, alphas)``: the list of trained trees and the list of their
        vote weights, both of length ``T``.
    """
    alphas = []
    DT = []
    # keeps the error strictly inside (0, 1) so that alpha stays finite
    eps = 1e-10

    for _ in range(0, T):
        # gain_type must be passed positionally before attributes
        dt = WID3(data, gain_type, attributes, labels, max_depth=1, depth=0)
        DT.append(dt)

        # vote weight of this round's tree
        error = weighted_error(data, dt)
        error = min(max(error, eps), 1.0 - eps)
        alpha = 0.5 * math.log((1 - error) / error)
        alphas.append(alpha)

        # update weights: boost the misclassified rows, shrink the others
        norm = 0.0
        for row in data:
            label = get_label(row, dt)

            if label != row['y']:
                w_new = row['weights'] * math.exp(alpha)
            else:
                w_new = row['weights'] * math.exp(-alpha)

            row['weights'] = w_new
            norm += w_new

        # normalize weights so they sum to 1 again
        for row in data:
            row['weights'] /= norm

    return DT, alphas

In [10]:
def _predict(data, DT, alphas):
    """Return the fraction of rows in ``data`` that the boosted ensemble classifies correctly.

    Args:
        data: sequence of rows (mappings) whose ``'y'`` is ``'yes'`` or ``'no'``.
        DT: list of trained trees.
        alphas: list of vote weights, paired with ``DT`` by position.

    Returns:
        Hit rate in [0, 1]; 0.0 for empty ``data``.  Every tree votes +alpha
        for ``'yes'`` and -alpha otherwise; a total of exactly 0 counts as a
        miss for either class.
    """
    if len(data) == 0:
        return 0.0

    h_rate = 0

    for row in data:
        prediction = 0.0

        for dt, alpha in zip(DT, alphas):
            # use the tree of the current pair
            label = get_label(row, dt)
            label = 1 if label == 'yes' else -1
            prediction += label * alpha

        if row['y'] == 'yes' and prediction > 0:
            h_rate += 1
        if row['y'] == 'no' and prediction < 0:
            h_rate += 1

    return h_rate / len(data)


In [11]:
# Candidate split attributes and the values each one may take.
# Columns listed with [0, 1] are numeric columns after binarization
# (0 = not above the training median, 1 = above it).
attributes = {
    'age': [0, 1],
    'job': [
        'admin.', 'unknown', 'unemployed', 'management',
        'housemaid', 'entrepreneur', 'student', 'blue-collar',
        'self-employed', 'retired', 'technician', 'services',
    ],
    'marital': ['married', 'divorced', 'single'],
    'education': ['unknown', 'secondary', 'primary', 'tertiary'],
    'default': ['yes', 'no'],
    'balance': [0, 1],
    'housing': ['yes', 'no'],
    'loan': ['yes', 'no'],
    'contact': ['unknown', 'telephone', 'cellular'],
    'day': [0, 1],
    'month': [
        'jan', 'feb', 'mar', 'apr', 'may', 'jun',
        'jul', 'aug', 'sep', 'oct', 'nov', 'dec',
    ],
    'duration': [0, 1],
    'campaign': [0, 1],
    'pdays': [0, 1],
    'previous': [0, 1],
    'poutcome': ['unknown', 'other', 'failure', 'success'],
}

In [12]:
# Defined unconditionally: the loading cells below use `columns` at top level,
# so hiding these names behind a `__main__` guard only breaks non-script runs.

# Column order of the CSV files; 'y' is the class label column.
columns = ['age', 'job', 'marital', 'education', 'default', 'balance', 'housing', 'loan', 'contact', 'day', 'month', 'duration', 'campaign', 'pdays', 'previous', 'poutcome', 'y']
# The two class labels.
labels = {'yes', 'no'}
# Columns holding numeric values in the raw files.
numeric_attrs = ['age', 'balance', 'day', 'duration', 'campaign', 'pdays', 'previous']

In [13]:
# Load the training set: one dict per line of train.csv, keyed by column name.
train_data = []
with open('train.csv', 'r') as f:
    for line in f:
        terms = line.strip().split(',')
        # the i-th field of the line belongs to the i-th entry of `columns`
        example = {columns[idx]: term for idx, term in enumerate(terms)}
        train_data.append(example)

In [14]:
# Load the test set: one dict per line of test.csv, keyed by column name.
test_data = []
with open('test.csv', 'r') as f:
    for line in f:
        terms = line.strip().split(',')
        # the i-th field of the line belongs to the i-th entry of `columns`
        example = {columns[idx]: term for idx, term in enumerate(terms)}
        test_data.append(example)

In [15]:
# Thresholds for turning numeric columns into binary ones:
# the median of every numeric column, measured on the training set only.
medians = dict.fromkeys(
    ['age', 'balance', 'day', 'duration', 'campaign', 'pdays', 'previous'], 0.0
)

for key in list(medians):
    medians[key] = statistics.median([float(row[key]) for row in train_data])


In [16]:
# Replace every numeric value by 1 when it exceeds the training median of its
# column and by 0 otherwise. Rows are modified in place, so this cell must be
# run exactly once after loading the data.
for key, value in medians.items():
    for dataset in (train_data, test_data):
        for row in dataset:
            row[key] = int(float(row[key]) > value)

In [17]:
# Start every example with a uniform weight: 1/N within its own data set.
train_size = len(train_data)
test_size = len(test_data)

for dataset, size in ((train_data, train_size), (test_data, test_size)):
    for row in dataset:
        row['weights'] = 1 / size
    
# Reset the weights of a data set to uniform
def weights_updated(example):
    """Set ``row['weights']`` of every row in ``example`` to ``1 / len(example)``, in place."""
    uniform = 1 / len(example)
    for record in example:
        record['weights'] = uniform

In [38]:
def _error(train_data, test_data, attributes, labels, T):
    """Run ``T`` AdaBoost rounds and record each round's tree error on both data sets.

    Every round trains a depth-1 tree with ``WID3`` (gain type 0) on the
    current training weights, measures its weighted error on ``train_data``
    and ``test_data``, and then re-weights the training rows.

    Args:
        train_data: training rows; their ``'weights'`` are updated IN PLACE.
        test_data: test rows; their ``'weights'`` are read but never changed.
        attributes: mapping of attribute name -> list of its possible values.
        labels: collection of the distinct labels.
        T: number of boosting rounds.

    Returns:
        ``(train_errors, test_errors)``: two lists of length ``T`` holding the
        weighted error of the tree trained in each round.
    """
    train_errors = []
    test_errors = []
    # keeps the error strictly inside (0, 1) so that alpha stays finite
    eps = 1e-10

    for _ in range(0, T):
        dt = WID3(train_data, 0, attributes, labels, 1, 0)

        # errors of this round's tree, recorded as measured
        train_err = weighted_error(train_data, dt)
        test_err = weighted_error(test_data, dt)

        train_errors.append(train_err)
        test_errors.append(test_err)

        # a perfect (0) or fully wrong (1) tree would otherwise divide by
        # zero or take log(0)
        clipped = min(max(train_err, eps), 1.0 - eps)
        alpha = 0.5 * math.log((1 - clipped) / clipped)

        # update weights: boost the misclassified rows, shrink the others
        norm = 0.0
        for row in train_data:
            label = get_label(row, dt)

            if label != row['y']:
                w_new = row['weights'] * math.exp(alpha)
            else:
                w_new = row['weights'] * math.exp(-alpha)

            row['weights'] = w_new
            norm += w_new

        # normalize weights so they sum to 1 again
        for row in train_data:
            row['weights'] /= norm

    return train_errors, test_errors

In [None]:
# Run 500 boosting rounds and plot the training and test error of every round.
train_e, test_e = _error(train_data, test_data, attributes, labels, 500)
t = list(range(1, 501))

fig, ax = plt.subplots(figsize=(6, 4))
ax.plot(t, test_e, c='blue', alpha=0.3, label='test error')
ax.plot(t, train_e, c='red', alpha=0.3, label='training error')
ax.legend()
ax.set(
    title="Error per iteration for Adaboost",
    xlabel='iteration',
    ylabel='error',
)

plt.show()