In [1]:
from collections import defaultdict
import numpy as np
from random import randrange


"""
YOU MUST!!!
Read all the lines of the code provided to you and understand what it does!
"""



class DecisionNode(object):
    """
    One node of a decision tree.

    Every constructor argument is optional and is stored unchanged on the
    instance, so both of these are valid:

        node = DecisionNode()
        node = DecisionNode(column=3, value="Car")

    Attributes:
        column: index of the feature this node splits on.
        value: the value the feature is compared against.
        false_branch: child node followed when the split test fails.
        true_branch: child node followed when the split test passes.
        current_results: label counts of the data that reached this node.
        is_leaf: True when the node has no children to descend into.
        results: stored as given; not read by this class.
    """

    def __init__(self, column=None, value=None, false_branch=None,
                 true_branch=None, current_results=None, is_leaf=False,
                 results=None):
        # Definition of the split performed at this node.
        self.column, self.value = column, value
        # Sub-trees for the two outcomes of the split.
        self.false_branch, self.true_branch = false_branch, true_branch
        # Label bookkeeping and leaf marker.
        self.current_results, self.is_leaf, self.results = (
            current_results, is_leaf, results)


def dict_of_values(data):
    """
    Count how often each label occurs in the last column of data.

    param data: a 2D Python list representing the data. Last column of data is Y.
    return: a plain dict mapping each distinct Y value to its number of rows

    for example
    dict_of_values([[1,'yes'],[1,'no'],[1,'yes'],[1,'yes']])
    returns {'yes': 3, 'no': 1}
    """
    counts = {}
    for row in data:
        label = row[-1]
        counts[label] = counts.get(label, 0) + 1
    return counts


def divide_data(data, feature_column, feature_val):
    """
    Split data into two parts according to one feature and one target value.

    For numeric feature_val (Python int/float or any numpy integer/floating
    scalar) data1 receives every row whose value in feature_column is
    >= feature_val. For every other type (e.g. str, numpy.str_) data1 receives
    the rows whose value is == feature_val. data2 always receives the
    remaining rows, so every input row ends up in exactly one of the parts.

    param data: a 2D Python list representing the data. Last column of data is Y.
    param feature_column: an integer index of the feature/column.
    param feature_val: can be int, float, a numpy scalar, or string
    return: a tuple (data1, data2) of two 2D python lists
    """
    # isinstance (rather than an exact type match) also accepts numpy scalar
    # types such as np.int64 / np.float32, which would otherwise fall through
    # every branch and leave both parts empty.
    is_numeric = isinstance(feature_val, (int, float, np.integer, np.floating))

    data1 = []
    data2 = []
    for datum in data:
        cell = datum[feature_column]
        if is_numeric:
            goes_first = cell >= feature_val
        else:
            goes_first = cell == feature_val

        if goes_first:
            data1.append(datum)
        else:
            data2.append(datum)

    return data1, data2


def gini_impurity(data1, data2):
    """
    Size-weighted gini impurity of a two-way split.

    The last column of each data list is the label Y. The value returned is

        N1*sum(p_k1 * (1-p_k1)) + N2*sum(p_k2 * (1-p_k2))

    where N1 is the number of rows in data1 and p_k1 is the fraction of those
    rows whose label is k (likewise N2 and p_k2 for data2). An empty part
    contributes 0.

    param data1: A 2D python list
    param data2: A 2D python list
    return: a number - gini_impurity
    """
    def weighted_part(rows):
        # Returns N * sum(p_k * (1 - p_k)) for a single part.
        size = len(rows)
        label_counts = dict_of_values(rows)
        impurity = 0
        if size != 0:
            impurity = sum((count / size) * (1.0 - (count / size))
                           for count in label_counts.values())
        return size * impurity

    return weighted_part(data1) + weighted_part(data2)

def get_split(dataset):
    """
    Search every (feature column, observed value) pair for the best split.

    Each value that occurs in a feature column is tried as a split value;
    the candidate with the lowest gini_impurity wins, and the earliest
    candidate is kept on ties.

    param dataset: a 2D list whose last column is Y
    return: tuple (column, value, gini, (data1, data2)) for the best split
    """
    best = (None, None, 1e10, None)
    feature_count = len(dataset[0]) - 1

    for column in range(feature_count):
        for row in dataset:
            candidate = row[column]
            parts = divide_data(dataset, column, candidate)
            score = gini_impurity(parts[0], parts[1])
            # Strict comparison: the first candidate reaching a score is kept.
            if score < best[2]:
                best = (column, candidate, score, parts)

    return best


def build_tree(data, current_depth=0, max_depth=1e10):
    """
    Recursively build a decision tree for data.

    In the general case the function
    1: finds the best feature and value of that feature to divide the data
       into two parts (see get_split),
    2: recursively builds a tree for each part, giving t1 and t2, and returns
       DecisionNode(..., true_branch=t1, false_branch=t2).

    A leaf (is_leaf=True) is returned instead when
    - data is empty (the leaf then carries no label counts),
    - the depth limit has been reached,
    - all rows share the same Y,
    - or no split lowers the gini impurity (this includes data that has no
      feature columns at all).

    param data: A 2D python list whose last column is Y
    param current_depth: an integer, depth of the node being built. Used to
        limit the number of layers in the tree
    param max_depth: the maximal depth of the tree; it is passed on unchanged
        to every recursive call
    return: an object of class DecisionNode
    """
    if len(data) == 0:
        return DecisionNode(is_leaf=True)

    label_counts = dict_of_values(data)

    # Depth limit reached or node already pure: stop here. The node must be
    # flagged as a leaf because it has no column/branches to descend into.
    if current_depth >= max_depth or len(label_counts) == 1:
        return DecisionNode(current_results=label_counts, is_leaf=True)

    # Gini number of the data before dividing.
    self_gini = gini_impurity(data, [])

    best_column, best_value, best_gini, best_split = get_split(data)

    # best_split is None when there was no candidate split at all; otherwise
    # stop when the best split is no improvement over not splitting.
    if best_split is None or abs(self_gini - best_gini) < 1e-10:
        return DecisionNode(current_results=label_counts, is_leaf=True)

    # Propagate max_depth so the limit applies to the whole tree, not only
    # to the root call.
    t1 = build_tree(best_split[0], current_depth=current_depth + 1,
                    max_depth=max_depth)
    t2 = build_tree(best_split[1], current_depth=current_depth + 1,
                    max_depth=max_depth)

    return DecisionNode(current_results=label_counts, column=best_column,
                        value=best_value, true_branch=t1, false_branch=t2)
    



def print_tree(tree, indent='^'):
    """
    Print a text rendering of tree to standard output.

    A leaf is printed as its current_results. Any other node is printed as
    its split ("Column <column> : <value>? ") followed by the True and False
    sub-trees, each prefixed with indent; the indent grows by two spaces per
    level.

    param tree: the node to print (needs the DecisionNode attributes)
    param indent: prefix printed in front of the branch labels
    """
    if tree.is_leaf:
        print(str(tree.current_results))
        return

    # The split criterion of this node.
    print('Column ' + str(tree.column) + ' : ' + str(tree.value) + '? ')

    # Both branches, one level deeper.
    deeper = indent + '  '
    for label, branch in (('True->', tree.true_branch),
                          ('False->', tree.false_branch)):
        print(indent + label, end="")
        print_tree(branch, deeper)


In [2]:
class DecisionTree(object):
    """
    DecisionTree class, that represents one Decision Tree

    :param max_tree_depth: maximum depth for this tree.
    """
    def __init__(self, max_tree_depth):
        self.max_depth = max_tree_depth
        # Root node of the fitted tree; stays None until fit() is called.
        self.tree = None

    def fit(self, X, Y):
        """
        Build the tree from the training data and store its root in self.tree.

        :param X: 2 dimensional python list or numpy 2 dimensional array
        :param Y: 1 dimensional python list or numpy 1 dimensional array
        """
        # build_tree expects the labels as the last column of the data.
        data = np.column_stack((X, Y))

        self.tree = build_tree(data, max_depth=self.max_depth)

    def predict(self, X):
        """
        Predict one label per row of X.

        :param X: 2 dimensional python list or numpy 2 dimensional array
        :return: Y - 1 dimension python list with labels, same length as X
        :raises AttributeError: if the tree has not been fitted yet
        """
        if getattr(self, 'tree', None) is None:
            raise AttributeError('DecisionTree is not fitted; call fit() first')
        return [self._predict_row(row) for row in X]

    def _predict_row(self, row):
        """Walk one row down the tree and return the majority label of the node reached."""
        node = self.tree

        # A node without a split column cannot be descended any further, so
        # it is treated like a leaf even if it is not flagged as one.
        while not node.is_leaf and node.column is not None:
            cell = row[node.column]
            # Same rule as the split itself: numeric split values are
            # thresholds (>=), everything else is matched by equality.
            if isinstance(node.value, (int, float, np.integer, np.floating)):
                take_true = cell >= node.value
            else:
                take_true = cell == node.value
            node = node.true_branch if take_true else node.false_branch

        counts = node.current_results
        if not counts:
            raise ValueError('reached a node without label counts')

        # Most frequent label at this node (first one seen wins ties).
        label = max(counts, key=counts.get)
        try:
            # Numeric labels are reported as plain ints.
            return int(label)
        except (TypeError, ValueError):
            # Labels that are not numbers are returned unchanged.
            return label


In [3]:
class RandomForest(object):
    """
    A forest of decision trees, each trained on a random subsample.

    :param num_trees: Number of trees in the random forest
    :param max_tree_depth: maximum depth for each of the trees in the forest.
    :param ratio_per_tree: ratio of points to use to train each of
        the trees.
    """
    def __init__(self, num_trees, max_tree_depth, ratio_per_tree=0.5):
        self.num_trees = num_trees
        self.max_tree_depth = max_tree_depth
        self.ratio_per_tree = ratio_per_tree
        # Filled with the fitted trees by fit().
        self.trees = None

    def fit(self, X, Y):
        """
        Train num_trees trees, each on its own subsample of (X, Y).

        :param X: 2 dimensional python list or numpy 2 dimensional array
        :param Y: 1 dimensional python list or numpy 1 dimensional array
        """
        # Start from an empty forest; any earlier fit is discarded.
        forest = []
        self.trees = forest

        for _ in range(self.num_trees):
            sample = subsample(X, Y, self.ratio_per_tree)
            learner = DecisionTree(self.max_tree_depth)
            learner.fit(*sample)
            forest.append(learner)

    def predict(self, X):
        """
        Predict one label per row of X by combining the trees' predictions.

        :param X: 2 dimensional python list or numpy 2 dimensional array
        :return: Y - 1 dimension python list with one label per row of X
        """
        labels = []
        for row in X:
            labels.append(bagging_predict(self.trees, row))
        return labels

def subsample(dataset, responses, ratio):
    """
    Draw a random subsample, with replacement, of rows and their responses.

    round(len(dataset) * ratio) positions are drawn with randrange; the row
    and the response at each drawn position are returned together.

    return: tuple (samplex, sampley) of two python lists
    """
    n_sample = round(len(dataset) * ratio)

    # Draw all positions first, then gather rows and responses for them.
    picks = []
    while len(picks) < n_sample:
        picks.append(randrange(len(dataset)))

    samplex = [dataset[pos] for pos in picks]
    sampley = [responses[pos] for pos in picks]
    return samplex, sampley


def bagging_predict(trees, row):
    '''
    Param: trees -- list of trees(forest)
    Param: row -- a row from X
    Process: every tree predicts the row; return 1 when at least as many
             trees predicted [1] as predicted anything else, otherwise 0
    '''
    votes = [tree.predict([row]) for tree in trees]
    in_favour = sum(1 for vote in votes if vote == [1])
    against = len(votes) - in_favour
    # Ties (including an empty forest) are resolved in favour of 1.
    return 1 if in_favour >= against else 0

In [4]:
'''
    This module computes a logistic regression model.
    It uses stochastic gradient descent (SGD).
'''

def column_means(dataset):
    """
    Param: dataset matrix of our features
    The first element of each row in dataset is a 1, so column 0 is skipped
    and its entry in the result is left at 0.
    Process: calculate the mean of every other column
    """
    n_rows = float(len(dataset))
    n_cols = len(dataset[0])
    return [0] + [sum([row[col] for row in dataset]) / n_rows
                  for col in range(1, n_cols)]


def column_stdevs(dataset, means):
    """
    Param: dataset matrix of our features
    Param: means is a vector of mean values for each column
    Process: calculate the standard deviation of every column except
             column 0, using len(dataset) - 1 as the divisor; the entry
             for column 0 is 0.0
    """
    n_cols = len(dataset[0])

    # Sum of squared deviations per column; column 0 keeps the value 0.
    squared_sums = [0] * n_cols
    for col in range(1, n_cols):
        squared_sums[col] = sum([(row[col] - means[col]) ** 2
                                 for row in dataset])

    return [(total / (float(len(dataset) - 1))) ** 0.5
            for total in squared_sums]


def standardize_dataset(dataset, means, stdevs):
    """
    Param: dataset matrix of our features
    Param: means is a vector of mean values for each column
    Param: stdevs is a vector of std for each column
    Process: standardize dataset in place; every entry except the one in
             column 0 becomes (entry - mean) / stdev. Nothing is returned.
    """
    for row in dataset:
        for col in range(1, len(row)):
            centre, spread = means[col], stdevs[col]
            row[col] = (row[col] - centre) / spread
def norm(a):
    """
    Param: a is a vector
    Process: calculate the Euclidean norm of 'a' (square root of the sum of
             the squared entries)
    """
    vector = np.array(a)
    sum_of_squares = np.sum(vector ** 2)
    return sum_of_squares ** 0.5


def rescaleBeta(beta, means, std):
    """
    Param: beta is a vector of our hypothesis (numpy array, changed in place)
    Param: means is a vector of mean values for each column
    Param: std is a vector of std for each column
    Process: rescale beta in place: beta[0] is reduced by the sum of
             means[i]*beta[i]/std[i] over i >= 1, then every beta[i] with
             i >= 1 is divided by std[i]. Nothing is returned.
    """
    # The intercept must be adjusted first, while the other entries still
    # hold their values from before the division below.
    shift = 0
    for i in range(1, len(beta)):
        shift = shift + (means[i] * beta[i]) / float(std[i])
    beta[0] = beta[0] - shift

    for i in range(1, beta.shape[0]):
        beta[i] = beta[i] / float(std[i])

def sigmoid(s):
    """
    Param: s is a number i.e int or float
    Process: evaluate the sigmoid function 1 / (1 + e^(-s)) at 's'
    """
    decay = np.exp(-s)
    return 1.0 / (1 + decay)

def normalized_gradient(X, Y, beta, lyabdaVector):
    """
    Gradient of the regularized logistic loss, divided by the number of rows.

    The data part is sum_i -Y[i] * X[i] * (1 - sigmoid(Y[i] * X[i].beta)).
    The regularization part is 2 * lyabdaVector * beta taken element-wise,
    so each coefficient is penalized only by its own entry of lyabdaVector
    (an entry of 0 leaves that coefficient unpenalized).

    :param X: data matrix (2 dimensional np.array)
    :param Y: response variables (np.array with one value per row of X;
        a column vector of shape (N, 1) is accepted as well)
    :param beta: value of beta (1 dimensional np.array)
    :param lyabdaVector: per-coefficient regularization weights
        (1 dimensional np.array, same length as beta)
    :return: normalized gradient, i.e. gradient normalized according to data
    """
    X = np.asarray(X, dtype=float)
    # Flatten so that (N,) and (N, 1) label arrays behave the same and do not
    # broadcast against the margins into an (N, N) matrix.
    labels = np.asarray(Y, dtype=float).reshape(-1)
    beta = np.asarray(beta, dtype=float)
    penalties = np.asarray(lyabdaVector, dtype=float)

    N = X.shape[0]
    margins = labels * X.dot(beta)

    # 1 - sigmoid(m) == 1 / (1 + exp(m)); going through logaddexp avoids
    # overflow in exp for large margins.
    miss_weight = np.exp(-np.logaddexp(0.0, margins))

    gradient = -X.T.dot(labels * miss_weight)
    # Element-wise product: d/d(beta_j) of sum_j lambda_j * beta_j**2.
    gradient = gradient + 2 * penalties * beta

    return gradient / float(N)
def gradient_descent(X, Y, epsilon=1e-6, l=1, step_size=1e-4, max_steps=1000):
    """
    Fit beta by gradient steps that each use a single training sample.

    X is copied as float and standardized before the iterations; the
    returned beta is rescaled with the column means and deviations
    afterwards. beta starts from random values and the samples are visited
    in a shuffled order that repeats cyclically.

    :param X: data matrix (2 dimensional np.array)
    :param Y: response variables (1 dimensional np.array)
    :param l: regularization parameter lambda
    :param epsilon: approximation strength; iteration stops once the
        relative change of beta in one step drops below it
    :param step_size: initial step size of the updates
    :param max_steps: maximum number of iterations before algorithm will
        terminate.
    :return: value of beta (1 dimensional np.array)
    """
    X = X.astype(float)
    means = column_means(X)
    std = column_stdevs(X, means)
    standardize_dataset(X, means, std)

    # Per-coefficient penalty: 0 for the first column, l / std**2 otherwise.
    lyabdaVector = np.array([0] + [l / std[j] ** 2 for j in range(1, len(std))])

    beta = np.random.random(X.shape[1])
    n = X.shape[0]
    visit_order = np.arange(n)
    np.random.shuffle(visit_order)

    for s in range(max_steps):
        # Take the next sample of the shuffled order (wrapping around).
        pick = visit_order[s % n]
        sample_X = X[pick:pick + 1]
        sample_Y = Y[pick:pick + 1]
        gradient = normalized_gradient(sample_X, sample_Y, beta, lyabdaVector)

        # Gradient step; keep the change for the convergence test.
        updated = beta - step_size * gradient
        dif_beta = updated - beta
        beta = updated
        step_size = step_size - 0.0000000000000000000001

        relative_change = norm(dif_beta) / norm(beta)
        if relative_change < epsilon:
            print('Converged, iterations:simple gradient ', s, '!!!')
            break

    rescaleBeta(beta, means, std)
    return beta

def loss(X, Y, beta):
    """
    Compute the logistic loss: the sum over all rows i of
    log(1 + exp(-Y[i] * X[i].dot(beta))).
    """
    total = 0
    for i in range(X.shape[0]):
        exponent = -Y[i] * X[i].dot(beta)
        total = total + np.log(1 + np.exp(exponent))
    return total

def logistic_predict(Xtrain, ytrain, Xtest, ytest):
    """
    Train logistic regression on the training data and label the test rows.

    Param: Xtrain feature rows used to train the model
    Param: ytrain response vector for Xtrain, with labels 0 and 1
    Param: Xtest feature rows to predict labels for
    Param: ytest response vector for Xtest; accepted for interface
           compatibility only, it has no influence on the result
    Return: python list with one predicted label (0 or 1) per row of Xtest
    """
    Xtrain = np.array(Xtrain)
    # np.array copies, so relabelling below does not touch the caller's data.
    ytrain = np.array(ytrain)
    Xtest = np.array(Xtest)

    # Prepend the constant column of ones that carries the intercept.
    Xtrain = np.column_stack((np.ones(len(Xtrain)), Xtrain))
    Xtest = np.column_stack((np.ones(len(Xtest)), Xtest))

    # The model works with labels in {-1, 1}: map 0 -> -1.
    ytrain[ytrain == 0] = -1

    beta = gradient_descent(Xtrain, ytrain, epsilon=1e-6, l=1, step_size=1e-2, max_steps=2500)

    # A positive linear response means class 1, anything else class 0.
    responses = Xtest.dot(beta)
    return [1 if resp > 0 else 0 for resp in responses]


In [5]:
def accuracy_score(Y_true, Y_predict):
    """
    Param: Y_true real labels
    Param: Y_predict predicted labels
    Process: Calculate accuracy_score, i.e. the fraction of positions at
             which the predicted label equals the real one
    """
    total = len(Y_true)
    hits = sum(1 for pos in range(total) if Y_true[pos] == Y_predict[pos])
    return hits / total

def evaluate_performance():
    '''
    Evaluate the performance of a decision tree, a random forest and
    logistic regression with one trial of 10-fold cross validation on the
    data in 'SPECTF.dat' (comma separated; column 0 is used as the label,
    the remaining columns as features).

    The first 10 rows of the data are printed as a side effect.

    Return:
      a 3x2 matrix giving the performance with the following entries:
      stats[0,0] = mean accuracy of decision tree
      stats[0,1] = std deviation of decision tree accuracy
      stats[1,0] = mean accuracy of random forest
      stats[1,1] = std deviation of random forest accuracy
      stats[2,0] = mean accuracy of logistic regression
      stats[2,1] = std deviation of logistic regression accuracy
    '''

    # Load Data
    filename = 'SPECTF.dat'
    data = np.loadtxt(filename, delimiter=',')
    X = data[:, 1:]
    # Labels as a column vector of shape (rows, 1).
    y = np.array([data[:, 0]]).T
    # NOTE: this value is overwritten below before it is used.
    n = X.shape[0]
    
    
    print('part of data (data[0:10]) = \n {}'.format(data[:10]))
    k_folds = 10
    # Largest multiple of k_folds that fits; rows beyond it are never used,
    # which lets np.split cut the indices into equally sized folds.
    n = int(X.shape[0]/k_folds)* k_folds
    
    all_accuracies_tree = list()
    all_accuracies_randforest = list()
    all_accuracies_log = list()


    
    # A single trial is run; the trial number doubles as the numpy seed.
    for trial in range(1):
        
        idx = np.arange(n)
        np.random.seed(trial)
        np.random.shuffle(idx)
        indexes = np.split(idx, k_folds)
        
        for i in range(k_folds):
            
            # Copy the list of folds so that removing the test fold does not
            # change 'indexes' for the following iterations.
            train_set = list(indexes)

            # Fold i is the test fold; all remaining folds form the training set.
            a = train_set[i]
            train_set.pop(i)
                
                
            Xtest = X[a]
            ytest = y[a]
               
            # Flatten the remaining folds into plain lists of rows / labels.
            Xtrain = []
            ytrain = []
            for ff in train_set:
                for row1, row2 in zip(X[ff], y[ff]):
                    Xtrain.append(row1)
                    ytrain.append(row2)
                
            
            # train the decision tree
            classifier = DecisionTree(100)
            classifier.fit(Xtrain, ytrain)
            y_pred = classifier.predict(Xtest)
            accuracy1 = accuracy_score(ytest, y_pred)
            all_accuracies_tree.append(accuracy1)

            
            #train the random forest
            classifier = RandomForest(10, 50, 0.1)
            classifier.fit(Xtrain, ytrain)
            y_pred = classifier.predict(Xtest)
            accuracy2 = accuracy_score(ytest, y_pred)
            all_accuracies_randforest.append(accuracy2)


            # train by logistic regression
            y_pred = logistic_predict(Xtrain, ytrain, Xtest, ytest)
            accuracy3 = accuracy_score(ytest, y_pred)
            all_accuracies_log.append(accuracy3)


    # summarize the per-fold test accuracies of each model
    meanDecisionTreeAccuracy = np.mean(all_accuracies_tree)
    stddevDecisionTreeAccuracy = np.std(all_accuracies_tree)
    
    meanRandomForestAccuracy = np.mean(all_accuracies_randforest)
    stddevRandomForestAccuracy = np.std(all_accuracies_randforest)
    
    
    
    meanLogisticRegressionAccuracy = np.mean(all_accuracies_log)
    stddevLogisticRegressionAccuracy = np.std(all_accuracies_log)
    

    # one row per model: [mean accuracy, std deviation of accuracy]
    stats = np.zeros((3, 2))
    stats[0, 0] = meanDecisionTreeAccuracy
    stats[0, 1] = stddevDecisionTreeAccuracy
    stats[1, 0] = meanRandomForestAccuracy
    stats[1, 1] = stddevRandomForestAccuracy
    stats[2, 0] = meanLogisticRegressionAccuracy
    stats[2, 1] = stddevLogisticRegressionAccuracy
    return stats


# Do not modify from HERE...
# Script entry point: run the evaluation and print, for each model, the mean
# accuracy followed by its standard deviation in parentheses.
if __name__ == "__main__":
    stats = evaluate_performance()
    print("Decision Tree Accuracy = ", stats[0, 0], " (", stats[0, 1], ")")
    print("Random Forest Tree Accuracy = ", stats[1, 0], " (", stats[1, 1], ")")
    print("Logistic Reg. Accuracy = ", stats[2, 0], " (", stats[2, 1], ")")




# ...to HERE.

part of data (data[0:10]) = 
 [[  1.  59.  52.  70.  67.  73.  66.  72.  61.  58.  52.  72.  71.  70.
   77.  66.  65.  67.  55.  61.  57.  68.  66.  72.  74.  63.  64.  56.
   54.  67.  54.  76.  74.  65.  67.  66.  56.  62.  56.  72.  62.  74.
   74.  64.  67.]
 [  1.  72.  62.  69.  67.  78.  82.  74.  65.  69.  63.  70.  70.  72.
   74.  70.  71.  72.  75.  66.  65.  73.  78.  74.  79.  74.  69.  69.
   70.  71.  69.  72.  70.  62.  65.  65.  71.  63.  60.  69.  73.  67.
   71.  56.  58.]
 [  1.  71.  62.  70.  64.  67.  64.  79.  65.  70.  69.  72.  71.  68.
   65.  61.  61.  73.  71.  75.  74.  80.  74.  54.  47.  53.  37.  77.
   68.  72.  59.  72.  68.  60.  60.  73.  70.  66.  65.  64.  55.  61.
   41.  51.  46.]
 [  1.  69.  71.  70.  78.  61.  63.  67.  65.  59.  59.  66.  69.  71.
   75.  65.  58.  60.  55.  62.  59.  67.  66.  74.  74.  64.  60.  57.
   54.  70.  73.  69.  76.  62.  64.  61.  61.  66.  65.  72.  73.  68.
   68.  59.  63.]
 [  1.  70.  66.  61.  66.  61.  5