In [1]:
import numpy as np
from collections import defaultdict
from numpy.distutils.system_info import numarray_info


class DecisionNode(object):
    """
    DecisionNode is a building block for Decision Trees.
    DecisionNode is a python class representing a node in our decision tree
    """
    def __init__(self,
                 column=None,
                 value=None,
                 false_branch=None,
                 true_branch=None,
                 current_results=None,
                 is_leaf=False,
                 results=None):
        self.column = column
        self.value = value
        self.false_branch = false_branch
        self.true_branch = true_branch
        self.current_results = current_results
        self.is_leaf = is_leaf
        self.results = results
        
def dict_of_values(data):
    """
    param data: a 2D Python list representing the data. Last column of data is Y.
    return: returns a python dictionary showing how many times each value appears in Y
    """
    results = defaultdict(int)
    for row in data:
        r = row[len(row) - 1]
        results[r] += 1
    return dict(results)

def divide_data(data, feature_column, feature_val):
    """
    This function takes the data and divides it in two parts by a line. A line
    is defined by the feature we are considering (feature_column) and the target 
    value. The function returns a tuple (data1, data2) which are the desired parts of the data.
    For int or float types of the value, data1 have all the data with values >= feature_val
    in the corresponding column and data2 should have rest.
    For string types, data1 should have all data with values == feature val and data2 should 
    have the rest.

    param data: a 2D Python list representing the data. Last column of data is Y.
    param feature_column: an integer index of the feature/column.
    param feature_val: can be int, float, or string
    return: a tuple of two 2D python lists
    """
    data1 = []
    data2 = []
    features = [row[feature_column] for row in data]
    if type(feature_val) == int or type(feature_val) == float:
        for i,elem in enumerate(features):
            if elem >= feature_val:
                data1.append(data[i])
            else:
                data2.append(data[i])
    elif type(feature_val) == str:
        for i,elem in enumerate(features):
            if elem == feature_val:
                data1.append(data[i])
            else:
                data2.append(data[i])
                
    return (data1, data2)

def gini_impurity(data1, data2):
    """
    Given two 2D lists of compute their gini_impurity index. 
    Last column of the data lists is the Y. Lets assume y1 is y of data1 and y2 is y of data2.
    gini_impurity shows how diverse the values in y1 and y2 are.
    gini impurity is given by 

    N1*sum(p_k1 * (1-p_k1)) + N2*sum(p_k2 * (1-p_k2))

    where N1 is number of points in data1
    p_k1 is fraction of points that have y value of k in data1
    same for N2 and p_k2

    param data1: A 2D python list
    param data2: A 2D python list
    return: a number - gini_impurity 
    """
    N1 = len(data1)
    N2 = len(data2)
    Y_dict1 = dict_of_values(data1)
    Y_dict2 = dict_of_values(data2)
    p_k1 = [value/N1 for value in Y_dict1.values()]
    p_k2 = [value/N2 for value in Y_dict2.values()]
    sum1 = N1 * sum([i*(1-i) for i in p_k1])
    sum2 = N2 * sum([i*(1-i) for i in p_k2])
    return sum1 + sum2

def build_tree(data, current_depth=0, max_depth=1e10):
    """
    build_tree is a recursive function.
    What it does in the general case is:
    1: find the best feature and value of the feature to divide the data into
    two parts
    2: divide data into two parts with best feature, say data1 and data2
        recursively call build_tree on data1 and data2. This should give as two 
        trees say t1 and t2. Then the resulting tree should be 
        DecisionNode(...... true_branch=t1, false_branch=t2)

    In case all the points in the data have same Y we should not split any more, and return that node
    
    param data: param data: A 2D python list
    param current_depth: an integer. This is used if we want to limit the numbr of layers in the tree
    param max_depth: an integer - the maximal depth of the representing
    return: an object of class DecisionNode
    """
    depth = 0
    if len(data) == 0:
        return DecisionNode(is_leaf=True)

    if(current_depth == max_depth):
        return DecisionNode(current_results=dict_of_values(data))

    if(len(dict_of_values(data)) == 1):
        return DecisionNode(current_results=dict_of_values(data), is_leaf=True)
    #This calculates gini number for the data before dividing
    self_gini = gini_impurity(data, [])
    #Below are the attributes of the best division that we need to find. 
    #We need to update these when we find a division which is better
    best_gini = 1e10
    best_column = None
    best_value = None
    #best_split is tuple (data1,data2) which shows the two datas for the best divison so far
    best_split = None    
    
    for col_index in range(len(data[0])-1):
        feature_set = set({data[i][col_index] for i in range(len(data))})
        for feature in feature_set:
            (data1_temp, data2_temp) = divide_data(data, feature_column=col_index, feature_val=feature)
            if gini_impurity(data1_temp, data2_temp) < best_gini:
                data1 = data1_temp
                data2 = data2_temp
                best_value = feature
                best_split = (data1, data2)
                best_column = col_index
                best_gini = gini_impurity(data1, data2)
    
    #if best_gini is no improvement from self_gini, we stop and return a node.
    if abs(self_gini - best_gini) < 1e-10:
        return DecisionNode(current_results=dict_of_values(data), is_leaf=True)
    else:
        tree1 = build_tree(data1, current_depth=depth+1, max_depth=1e10)
        tree2 = build_tree(data2, current_depth=depth+1, max_depth=1e10)
        return DecisionNode(column=best_column,value=best_value,
                            current_results=dict_of_values(data),true_branch=tree1, false_branch=tree2)        
    
def print_tree(tree, indent=''):
    if tree.is_leaf:
        print(str(tree.current_results))
    else:
        print('Column ' + str(tree.column) + ' : ' + str(tree.value) + '? ')
        
        # Print the branches
        print(indent + 'True->', end="")
        print_tree(tree.true_branch, indent + '  ')
        print(indent + 'False->', end="")
        print_tree(tree.false_branch, indent + '  ')

In [2]:
class DecisionTree(object):
    """
    DecisionTree class, that represents one Decision Tree

    :param max_tree_depth: maximum depth for this tree.
    """
    def __init__(self, max_tree_depth):
        self.max_depth = max_tree_depth

    def fit(self, X, Y):
        """
        :param X: 2 dimensional python list or numpy 2 dimensional array
        :param Y: 1 dimensional python list or numpy 1 dimensional array
        """
        X = np.array(X)
        Y = np.array(Y)
        our_data = np.column_stack((X,Y))
        our_data = our_data.tolist()
        self.tree = build_tree(our_data, 0,max_depth = self.max_depth)

    def predict(self, X):
        """
        :param X: 2 dimensional python list or numpy 2 dimensional array
        :return: Y - 1 dimension python list with labels
        """
        X = np.array(X)
        Y = []
        
        for i in range(X.shape[0]):
            tree = self.tree
            while not tree.is_leaf:
            
                if X[i][tree.column] >= tree.value:
                    tree = tree.true_branch
                else:
                    tree = tree.false_branch
            Y.append([elem for elem in tree.current_results.keys()])
            
        return Y


In [3]:
class RandomForest(object):
    """
    RandomForest a class, that represents Random Forests.

    :param num_trees: Number of trees in the random forest
    :param max_tree_depth: maximum depth for each of the trees in the forest.
    :param ratio_per_tree: ratio of points to use to train each of
        the trees.
    """
    def __init__(self, num_trees, max_tree_depth, ratio_per_tree=0.5):
        self.num_trees = num_trees
        self.max_tree_depth = max_tree_depth
        self.ratio_per_tree = ratio_per_tree
        self.trees = None

    def fit(self, X, Y):
        """
        :param X: 2 dimensional python list or numpy 2 dimensional array
        :param Y: 1 dimensional python list or numpy 1 dimensional array
        """
        self.trees = []
        for i in range(self.num_trees):
            idx = np.arange(int(X.shape[0]* self.ratio_per_tree))
            np.random.shuffle(idx)
            X_train = X[idx]
            Y_train = Y[idx]
            temp = DecisionTree(self.max_tree_depth)
            temp.fit(X_train.tolist(), Y_train)
            self.trees.append(temp)
        return self.trees

    def predict(self, X):
        """
        :param X: 2 dimensional python list or numpy 2 dimensional array
        :return: (Y, conf), tuple with Y being 1 dimension python
        list with labels, and conf being 1 dimensional list with
        confidences for each of the labels.
        """
        predicts = []
        Y = []
        for i in range(self.num_trees):
            predicts.append(self.trees[i].predict(X))

        for i in range(len(predicts[0])):
            max = predicts[0][i]
            for j in range(len(predicts)):
                if max < predicts[j][i]:
                    max = predicts[j][i]
            Y.append(max)
        conf = []
        for i in range(len(predicts[0])):
            q = 0.0
            for j in range(len(predicts)):
                if Y[i] == predicts[j][i]:
                    q += 1
            conf.append(q / len(predicts[0]))


        return (Y, conf)

In [4]:
'''
    This module compute logistic regression model
    It uses stohastic gradient descent(SGD)
'''

def column_means(dataset):
    """
    Param: dataset matirx of our futures
    The fisrt element of each of row in dataset is a 1
    Process: calculate column means
    """
    means = [0 for i in range(len(dataset[0]))]
    for i in range(1, len(dataset[0])):
        col_values = [row[i] for row in dataset]
        means[i] = sum(col_values) / float(len(dataset))
    return means


def column_stdevs(dataset, means):
    """
    Param: dataset matirx of our futures
    Param: means is a vector of mean values for each column
    Process: calculate column standard deviations
    """
    stdevs = [0 for i in range(len(dataset[0]))]
    for i in range(1, len(dataset[0])):
        variance = [(row[i]-means[i])**2 for row in dataset]
        stdevs[i] = sum(variance)
    stdevs = [(x/(float(len(dataset)-1)))**0.5 for x in stdevs]
    return stdevs


def standardize_dataset(dataset, means, stdevs):
    """
    Param: dataset matirx of our futures
    Param: means is a vector of mean values for each column
    Param: stdevs os avecor of std for each column
    Process:standardize dataset
    """
    for row in dataset:
        for i in range(1, len(row)):
            row[i] = (row[i] - means[i]) / stdevs[i]
def norm(a):
    """
    Param: a is a vector
    Process: calculate norm of 'a'
    """
    a = np.array(a)
    return (np.sum(a**2))**0.5


def rescaleBeta(beta, means, std):
    """
    Param: beta is a vector of our hypothesys
    Param: means is a vector of mean values for each column
    Param: stdevs os avecor of std for each column
    Process: rescale beta
    """
    beta[0] = beta[0] - sum([(means[i]*beta[i])/float(std[i]) for i in range(1, len(beta))])
    for i in range(1, beta.shape[0]):
        beta[i] = beta[i]/float(std[i])

def sigmoid(s):
    """
    Param: s is a number i.e int or float
    Process: calculate sigmoid function in this point('s')
    """
    return 1.0 / (1 + np.exp(-s))

def normalized_gradient(X, Y, beta, lyabdaVector):
    """
    :param X: data matrix (2 dimensional np.array)
    :param Y: response variables (1 dimensional np.array)
    :param beta: value of beta (1 dimensional np.array)
    :param l: regularization parameter lambda
    :return: normalized gradient, i.e. gradient normalized according to data
    """
    N = X.shape[0]
    gradient = np.zeros(X.shape[1])
    gradient = gradient.astype(float)
    for i in range(N):
        gradient += (-1)*Y[i]*X[i]*(1-sigmoid(Y[i]*X[i].dot(beta)))
    gradient += 2*lyabdaVector.dot(beta)

    return gradient/float(N)
def gradient_descent(X, Y, epsilon=1e-6, l=1, step_size=1e-4, max_steps=1000):
    """
    Implement gradient descent using full value of the gradient.
    :param X: data matrix (2 dimensional np.array)
    :param Y: response variables (1 dimensional np.array)
    :param l: regularization parameter lambda
    :param epsilon: approximation strength
    :param max_steps: maximum number of iterations before algorithm will
        terminate.
    :return: value of beta (1 dimensional np.array)
    """
    X = X.astype(float)
    means = column_means(X)
    std = column_stdevs(X, means)
    standardize_dataset(X, means, std)

    lyabdaVector = [0]
    for i in range(1, len(std)):
        lyabdaVector.append((l)/((std[i]))**2)
    lyabdaVector = np.array(lyabdaVector)
    beta = np.random.random(X.shape[1])
    n = X.shape[0]
    arange = np.arange(n)
    np.random.shuffle(arange)

    for s in range(max_steps):
        # for each training sample, compute the gradient
        index = arange[(s)%n]

        gradient = normalized_gradient(X[index:index+1], Y[index:index+1], beta, lyabdaVector)      
        # update the beta_temp
        prevBeta = beta
        beta = beta - step_size * gradient
        dif_beta = beta - prevBeta
        step_size = step_size - 0.0000000000000000000001
        
        
        if norm(dif_beta)/norm(beta) < epsilon:
            print('Converged, iterations:simple gradient ', s, '!!!')
            break

    rescaleBeta(beta, means, std)
    return beta

def loss(X, Y, beta):
    """
        Compute loss function
    """
    return  sum([np.log(1 + np.exp(-Y[i]*X[i].dot(beta))) for i in range(X.shape[0])])

def logistic_predict(Xtrain, ytrain, Xtest, ytest):
    """
    Param: Xtrain for train SGD model
    Param: ytrain response vector for Xtrain
    Param: Xtest for train SGD model
    Param: ytrain response vector for Xtrain
    """
    
    Xtrain = np.array(Xtrain)
    ytrain = np.array(ytrain)
    Xtest = np.array(Xtest)
    ytest = np.array(ytest)
    
    one_s1 = np.ones(len(Xtrain))
    one_s2 = np.ones(len(Xtest))

    
    for row1, row2 in zip(Xtrain, Xtest):
        row1 = np.array(row1)
        row2 = np.array(row2)

    
    Xtrain = np.column_stack((one_s1, Xtrain))
    Xtest = np.column_stack((one_s2, Xtest))

   

    # normalize ytrain and ytest -->[-1,1]
    for i in range(len(ytrain)):
        if ytrain[i] == 0:
            ytrain[i] = -1
    
    for i in range(len(ytest)):
        if ytest[i] == 0:
            ytest[i] = -1

    beta = gradient_descent(Xtrain, ytrain, epsilon=1e-6, l=1, step_size=1e-2, max_steps=2500)
    responses = Xtest.dot(beta)
    Y = []
    for resp in responses:
        if resp > 0:
            Y.append(1)
        else:
            Y.append(0)

    return Y

In [5]:
def accuracy_score(Y_true, Y_predict):
    """
    Param: Y_true real labels
    Param : Y_predict predicted lables
    Process: Calculate accuracy_score
    """
    correct = 0
    for i in range(len(Y_true)):
        if Y_true[i] == Y_predict[i]:
            correct += 1
    return correct/(len(Y_true))

def evaluate_performance():
    '''
    Evaluate the performance of decision trees and logistic regression,
    average over 1,000 trials of 10-fold cross validation

    Return:
      a matrix giving the performance that will contain the following entries:
      stats[0,0] = mean accuracy of decision tree
      stats[0,1] = std deviation of decision tree accuracy
      stats[1,0] = mean accuracy of logistic regression
      stats[1,1] = std deviation of logistic regression accuracy

    ** Note that your implementation must follow this API**
    '''

    # Load Data
    filename = 'SPECTF.dat'
    data = np.loadtxt(filename, delimiter=',')
    X = data[:, 1:]
    y = np.array([data[:, 0]]).T
    n = X.shape[0]
    
    k_folds = 10
    n = int(X.shape[0]/k_folds)* k_folds
    
    all_accuracies_tree = list()
    all_accuracies_randforest = list()
    all_accuracies_log = list()


    
    for trial in range(1):
        
        idx = np.arange(n)
        np.random.seed(trial)
        np.random.shuffle(idx)
        indexes = np.split(idx, k_folds)
        
        for i in range(k_folds):
            
            train_set = list(indexes)
            a = train_set[i]
            train_set.pop(i)
                
                
            Xtest = X[a]
            ytest = y[a]
               
            Xtrain = []
            ytrain = []
            for ff in train_set:
                for row1, row2 in zip(X[ff], y[ff]):
                    Xtrain.append(row1)
                    ytrain.append(row2)
                
            
            # train the decision tree
            classifier = DecisionTree(100)
            classifier.fit(Xtrain, ytrain)
            y_pred = classifier.predict(Xtest)
            accuracy1 = accuracy_score(ytest, y_pred)
            all_accuracies_tree.append(accuracy1)
            
            
            #train the random forest
            classifier1 = RandomForest(25, 50, 0.25)
            X_radfor = np.matrix(X)
            X_train_randfor = np.matrix(Xtrain)
            X_test_randfor = np.matrix(Xtest)
            Y_train_randfor = np.array(ytrain)
            Y_test_randfor = np.array(ytest)
            classifier1.fit(X_train_randfor, Y_train_randfor)
            y_pred2 = classifier1.predict(X_test_randfor)
            accuracy2 = accuracy_score(ytest, y_pred2[0])
            all_accuracies_randforest.append(accuracy2)


            # train by logostic regrresion
            y_pred = logistic_predict(Xtrain, ytrain, Xtest, ytest)
            accuracy3 = accuracy_score(ytest, y_pred)
            all_accuracies_log.append(accuracy3)


    # compute the training accuracy of the model
    meanDecisionTreeAccuracy = np.mean(all_accuracies_tree)
    stddevDecisionTreeAccuracy = np.std(all_accuracies_tree)
    
    meanRandomForestAccuracy = np.mean(all_accuracies_randforest)
    stddevRandomForestAccuracy = np.std(all_accuracies_randforest)
    
    
    
    meanLogisticRegressionAccuracy = np.mean(all_accuracies_log)
    stddevLogisticRegressionAccuracy = np.std(all_accuracies_log)
    

    # make certain that the return value matches the API specification
    stats = np.zeros((3, 2))
    stats[0, 0] = meanDecisionTreeAccuracy
    stats[0, 1] = stddevDecisionTreeAccuracy
    stats[1, 0] = meanRandomForestAccuracy
    stats[1, 1] = stddevRandomForestAccuracy
    stats[2, 0] = meanLogisticRegressionAccuracy
    stats[2, 1] = stddevLogisticRegressionAccuracy
    return stats


# Do not modify from HERE...
if __name__ == "__main__":
    stats = evaluate_performance()
    print("Decision Tree Accuracy_Mariam = ", stats[0, 0], " (", stats[0, 1], ")")
    print("Random Forest Tree Accuracy_Hayk = ", stats[1, 0], " (", stats[1, 1], ")")
    print("Logistic Reg. Accuracy_David = ", stats[2, 0], " (", stats[2, 1], ")")
# ...to HERE.

Decision Tree Accuracy_Mariam =  0.746153846154  ( 0.0624926031126 )
Random Forest Tree Accuracy_Hayk =  0.769230769231  ( 0.0516015687115 )
Logistic Reg. Accuracy_David =  0.823076923077  ( 0.0670599837468 )
