# In[8]:
import numpy as np
import pandas as pd
import random
from collections import defaultdict
import matplotlib.pyplot as plt

# In[9]:
class DecisionNode(object):
    """One node of a binary decision tree.

    The node is a plain attribute container; every constructor argument is
    stored unchanged under the attribute of the same name.

    :param column: index of the feature tested at this node
    :param value: value the feature is compared against
    :param false_branch: child node followed when the test fails
    :param true_branch: child node followed when the test passes
    :param current_results: mapping label -> number of training rows that
        reached this node
    :param is_leaf: True when the node is terminal
    :param results: stored as given; not read by the code in this file
    """

    def __init__(self, column=None, value=None, false_branch=None,
                 true_branch=None, current_results=None, is_leaf=False,
                 results=None):
        # Split definition.
        self.column, self.value = column, value
        # Children.
        self.false_branch, self.true_branch = false_branch, true_branch
        # Bookkeeping about the rows that reached this node.
        self.current_results, self.is_leaf = current_results, is_leaf
        self.results = results

def dict_of_values(data):
    """
    :param data: iterable of rows; the label is the last element of each row
    :return: plain dict mapping every label to the number of rows carrying it
    """
    counts = {}
    for row in data:
        label = row[-1]
        counts[label] = counts.get(label, 0) + 1
    return counts


def divide_data(data, feature_column, feature_val):
    """
    Split rows in two according to a test on a single feature.

    Numeric thresholds (Python or NumPy ints/floats) use ``>=``; every other
    value, including booleans, is treated as a category and uses ``==``.

    :param data: list of rows (indexable sequences)
    :param feature_column: index of the feature to test
    :param feature_val: threshold (numeric) or category (anything else)
    :return: tuple (rows passing the test, rows failing the test), each in
        the original row order
    """
    # isinstance instead of an exact type() match so that numeric subclasses
    # such as np.float64 are split by threshold rather than silently falling
    # into the equality branch. bool is excluded to keep it categorical.
    is_numeric = (
        isinstance(feature_val, (int, float, np.integer, np.floating))
        and not isinstance(feature_val, bool)
    )

    data1 = []
    data2 = []
    for row in data:
        if is_numeric:
            passes = row[feature_column] >= feature_val
        else:
            passes = row[feature_column] == feature_val
        (data1 if passes else data2).append(row)
    return data1, data2


def gini_impurity(data1, data2):
    """
    Size-weighted Gini impurity of a two-way split.

    :param data1: rows of the first part (label in the last position)
    :param data2: rows of the second part (label in the last position)
    :return: len(data1) * gini(data1) + len(data2) * gini(data2), where
        gini(part) is the sum over labels of p * (1 - p)
    """
    def weighted_impurity(rows):
        # An empty part has no labels, so the loop is skipped and no
        # division by zero can occur.
        size = len(rows)
        impurity = 0
        for count in dict_of_values(rows).values():
            impurity += (count / size * (1 - count / size))
        return size * impurity

    return weighted_impurity(data1) + weighted_impurity(data2)

def build_tree(data, current_depth=0, max_depth=1e10):
    """
    Recursively grow a decision tree by greedy minimisation of the weighted
    Gini impurity.

    :param data: list of rows; the last element of every row is the label
    :param current_depth: depth of the node being built (root is 0)
    :param max_depth: nodes at this depth are turned into leaves
    :return: DecisionNode that is the root of the (sub)tree built from data
    """
    if len(data) == 0:
        # Empty dict rather than None so that consumers can iterate it.
        return DecisionNode(current_results={}, is_leaf=True)

    label_counts = dict_of_values(data)

    # Depth limit reached or node already pure: this is a terminal node.
    # It must be flagged as a leaf, otherwise tree walkers try to follow
    # children that do not exist.
    if current_depth >= max_depth or len(label_counts) == 1:
        return DecisionNode(current_results=label_counts, is_leaf=True)

    self_gini = gini_impurity(data, [])

    best_gini = float('inf')
    best_column = None
    best_value = None
    best_split = None

    # Every value of every feature, in every row, is a candidate threshold.
    for row in data:
        for j in range(len(row) - 1):
            part_true, part_false = divide_data(data, j, row[j])
            # A split that leaves one side empty separates nothing and would
            # make the recursion loop on the very same data.
            if not part_true or not part_false:
                continue
            candidate_gini = gini_impurity(part_true, part_false)
            if candidate_gini < best_gini:
                best_gini = candidate_gini
                best_column = j
                best_value = row[j]
                best_split = (part_true, part_false)

    # Stop when no usable split exists or the best one does not reduce the
    # impurity of this node.
    if best_split is None or self_gini - best_gini < 1e-10:
        return DecisionNode(current_results=label_counts, is_leaf=True)

    return DecisionNode(current_results=label_counts, column=best_column, value=best_value,
                        true_branch=build_tree(best_split[0], current_depth + 1, max_depth),
                        false_branch=build_tree(best_split[1], current_depth + 1, max_depth))


def print_tree(tree, indent=''):
    """
    Print a textual representation of the tree to standard output.

    Terminal nodes print their label counts; internal nodes print the split
    test followed by the ``True->`` and ``False->`` subtrees.

    :param tree: node to print (root of the subtree)
    :param indent: prefix written before the branch markers of this node
    :return: nothing
    """
    # A node without children is terminal even if it is not flagged as a
    # leaf (this happens for nodes cut off by the depth limit); recursing
    # into its missing branches would raise AttributeError.
    has_no_children = tree.true_branch is None and tree.false_branch is None
    if tree.is_leaf or has_no_children:
        print(str(tree.current_results))
        return

    print('Column ' + str(tree.column) + ' : ' + str(tree.value) + '? ')

    # Print the branches
    print(indent + 'True->', end="")
    print_tree(tree.true_branch, indent + '  ')
    print(indent + 'False->', end="")
    print_tree(tree.false_branch, indent + '  ')

# In[10]:
def argmaxx(dict):
    """
    :param dict: mapping key -> comparable value
    :return: the key holding the largest value (the first such key in
        iteration order on ties), or None when the mapping is empty
    """
    # max() only replaces its current best on a strictly greater value, so
    # ties resolve to the first key encountered.
    return max(dict, key=lambda key: dict[key], default=None)

class DecisionTree(object):
    """Classifier wrapping a single tree grown with build_tree."""

    def __init__(self, max_tree_depth):
        """
        :param max_tree_depth: maximum depth handed to build_tree
        """
        self.max_depth = max_tree_depth
        self.tree = None  # set by fit()

    def fit(self, X, Y):
        """
        :param X: sequence of rows (each row a sequence of feature values)
        :param Y: sequence of labels, Y[i] belonging to X[i]
        :return: build tree, return tree
        """
        # Build fresh rows so the caller's X is left untouched; appending
        # to the original row lists would add another label on every call.
        data = [list(row) + [Y[i]] for i, row in enumerate(X)]
        self.tree = build_tree(data, 0, self.max_depth)
        return self.tree

    def predict(self, X):
        """
        :param X: sequence of rows with the same feature layout as in fit
        :return: 1 dimensional numpy array with one predicted label per row
            (None for a row when no label counts are available)
        """
        Y = []
        for sample in X:
            node = self.tree
            results = node.current_results
            # Walk down until a leaf, or a node without children, is hit.
            while (not node.is_leaf
                   and node.true_branch is not None
                   and node.false_branch is not None):
                # Reuse the training-time split rule so prediction routes a
                # sample exactly like fit routed the training rows.
                passed, _ = divide_data([sample], node.column, node.value)
                node = node.true_branch if passed else node.false_branch
                if not node.current_results:
                    # Child saw no training rows: fall back on the label
                    # counts of the closest ancestor that did.
                    break
                results = node.current_results

            Y.append(argmaxx(results) if results else None)
        return np.array(Y)

    def print(self):
        """Print the fitted tree to standard output."""
        print_tree(self.tree)

# In[11]:
def sigmoid(s):
    """
    Logistic function 1 / (1 + exp(-s)), evaluated without overflow.

    :param s: scalar or numpy array (array subclasses are preserved)
    :return: sigmoid of s, element-wise
    """
    # exp() is only ever applied to non-positive numbers:
    #   s >= 0:  1      / (1 + exp(-s))
    #   s <  0:  exp(s) / (1 + exp(s))
    # The naive exp(s) / (1 + exp(s)) overflows to inf / inf = nan for
    # large positive s.
    return np.exp(np.minimum(s, 0)) / (1. + np.exp(-np.abs(s)))


def normalized_data(X):
    """
    Mean-centre every column and scale it by its range (max - min).

    Columns whose values are all equal are returned unchanged. The input is
    never modified: the work is done on a floating point copy, which also
    keeps the result from being truncated when X holds integers.

    :param X: 2 dimensional numpy array / matrix (or nested list)
    :return: normalized data and means, maxes, mines of unnormalize data
        (the statistics are 1 dimensional arrays, one entry per column)
    """
    # subok=True keeps np.matrix inputs as np.matrix in the returned copy.
    X = np.array(X, dtype=float, copy=True, subok=True)
    # Plain ndarray view on the same memory, so column statistics come out
    # 1 dimensional even for matrices.
    values = X.view(np.ndarray)

    maxes = values.max(axis=0)
    mines = values.min(axis=0)
    means = values.mean(axis=0)

    spans = maxes - mines
    varying = spans != 0  # constant columns would divide by zero
    values[:, varying] = (values[:, varying] - means[varying]) / spans[varying]
    return X, means, maxes, mines

def P(y, x, beta):
    """
    :param y: label multiplied into the linear score
    :param x: feature vector (or matrix of feature rows)
    :param beta: weight vector
    :return: sigmoid(y * x.dot(beta.T))
    """
    linear_score = x.dot(beta.T)
    return sigmoid(y * linear_score)


def gradient_descent(X, Y, epsilon=1e-6, l=1, step_size=1e-1, max_steps=1000,
                     verbose=True):
    """
    Fit L2-regularised logistic regression by batch gradient descent.

    The minimised objective is
        sum_i log(1 + exp(-Y[i] * X[i].dot(beta))) + l / 2 * sum_{j>=1} beta[j]**2
    i.e. the weight of column 0 is not penalised.

    :param X: 2 dimensional array / matrix, one sample per row
    :param Y: 1 dimensional array of labels in {-1, +1}
    :param epsilon: stop as soon as every component of an update is smaller
        than this in absolute value
    :param l: L2 regularisation strength
    :param step_size: learning rate
    :param max_steps: upper bound on the number of updates
    :param verbose: print the step index and beta every 10 steps
    :return: beta, number of steps actually performed
    """
    X = np.asarray(X, dtype=float)
    Y = np.asarray(Y, dtype=float).ravel()

    beta = np.random.normal(0, 10, X.shape[1])

    # 1 for penalised weights, 0 for the unpenalised first weight.
    penalised = np.ones_like(beta)
    penalised[:1] = 0

    steps_taken = 0
    for s in range(max_steps):
        if verbose and s % 10 == 0:
            print(s, beta)

        margins = Y * X.dot(beta)
        # 1 - sigmoid(m) == exp(-log(1 + exp(m))); logaddexp keeps this
        # finite for arbitrarily large |m|.
        one_minus_p = np.exp(-np.logaddexp(0.0, margins))

        # Gradient of the objective. The data term carries a minus sign:
        # sum_i Y[i] * X[i] * (1 - p_i) is the gradient of the
        # log-likelihood, which has to be ascended, not descended.
        gradient = -X.T.dot(Y * one_minus_p) + l * penalised * beta

        update = step_size * gradient
        beta -= update
        steps_taken = s + 1

        if np.all(np.abs(update) < epsilon):
            break

    return beta, steps_taken


def logistic_predict(beta, X):
    """
    Predict labels in {-1, +1} with a fitted logistic regression.

    :param beta: weight vector
    :param X: 2 dimensional array, matrix or nested list, one sample per row
    :return: 1 dimensional numpy array with 1 where the predicted
        probability is at least 0.5 and -1 otherwise
    """
    # Convert to plain ndarrays so the result is 1 dimensional for matrix,
    # array and list inputs alike.
    scores = np.asarray(X, dtype=float).dot(np.asarray(beta, dtype=float).ravel())
    # sigmoid(score) >= 0.5 exactly when score >= 0; testing the score
    # itself avoids evaluating exp() on large values.
    return np.where(scores >= 0, 1, -1).ravel()

# In[12]:
class RandomForest(object):
    """
    RandomForest a class, that represents Random Forests.

    :param num_trees: Number of trees in the random forest
    :param max_tree_depth: maximum depth for each of the trees in the forest.
    :param ratio_per_tree: ratio of points to use to train each of
        the trees.
    """
    def __init__(self, num_trees, max_tree_depth, ratio_per_tree=0.5):
        self.num_trees = num_trees
        self.max_tree_depth = max_tree_depth
        self.ratio_per_tree = ratio_per_tree
        self.trees = None

    def fit(self, X, Y):
        """
        Train num_trees trees, each on a random subset (drawn without
        replacement) holding ratio_per_tree of the samples.

        :param X: 2 dimensional python list or numpy 2 dimensional array
        :param Y: 1 dimensional python list or numpy 1 dimensional array
        :return: list with the fitted trees
        """
        if not isinstance(X, np.ndarray):
            # dtype=object keeps mixed numeric / categorical values as they
            # are instead of coercing them to a common type.
            X = np.array(X, dtype=object)
        Y = np.asarray(Y)

        n_samples = X.shape[0]
        # At least one sample per tree whenever there is any data at all.
        sample_size = min(n_samples, max(1, int(self.ratio_per_tree * n_samples)))

        self.trees = []
        for _ in range(self.num_trees):
            idx = np.arange(n_samples)
            np.random.shuffle(idx)
            idx = idx[:sample_size]
            tree = DecisionTree(self.max_tree_depth)
            tree.fit(X[idx].tolist(), Y[idx])
            self.trees.append(tree)
        return self.trees

    def predict(self, X):
        """
        :param X: 2 dimensional python list or numpy 2 dimensional array
        :return: (Y, conf), tuple with Y being 1 dimension python
        list with labels, and conf being 1 dimensional list with
        confidences for each of the labels.

        The label of a sample is the majority vote of the trees (the label
        voted first wins ties) and its confidence is the fraction of trees
        that voted for it.
        """
        predicts = [tree.predict(X) for tree in self.trees]
        n_trees = len(predicts)

        Y = []
        conf = []
        # zip(*predicts) yields, per sample, the votes of all trees.
        for votes in zip(*predicts):
            counts = {}
            for label in votes:
                counts[label] = counts.get(label, 0) + 1
            winner = max(counts, key=counts.get)
            Y.append(winner)
            conf.append(counts[winner] / n_trees)

        return (Y, conf)

# In[14]:
def accuracy_score(Y_true, Y_predict):
    """
    :param Y_true: sequence of reference labels
    :param Y_predict: sequence of predicted labels, indexed like Y_true
    :return: fraction of positions at which both sequences agree
    """
    hits = sum(1 for position in range(len(Y_true))
               if Y_true[position] == Y_predict[position])
    return hits / len(Y_true)

def evaluate_performance(num_trials=3, filename='SPECTF.dat'):
    '''
    Evaluate the performance of a decision tree, a random forest and
    logistic regression on random 90% / 10% train / test splits of the
    data set.

    ** Note that your implementation must follow this API**

    :param num_trials: number of random splits to average over
    :param filename: CSV file with the label in the first column and the
        features in the remaining columns
    :return: numpy array of shape (3, 2); rows are decision tree, random
        forest and logistic regression, columns are the mean and the
        standard deviation of the test accuracy over the trials
    '''

    # Load Data
    # NOTE(review): read_csv takes the first line of the file as column
    # names by default - confirm the file really has a header row,
    # otherwise the first sample is silently dropped.
    df = pd.read_csv(filename)
    Y = np.array(df.iloc[:, 0])
    X = np.matrix(df.iloc[:, 1:])

    # Logistic regression works with labels in {-1, +1}.
    Y_logistic = Y.copy()
    Y_logistic[Y_logistic == 0] = -1

    n, d = X.shape
    tree_accuracies = []
    logistic_accuracies = []
    for_accuracies = []
    for trial in range(num_trials):

        idx = np.arange(n)
        np.random.shuffle(idx)
        X = X[idx]
        Y = Y[idx]
        Y_logistic = Y_logistic[idx]

        cross_val_n = int(n * 0.1)
        train_n = n - cross_val_n
        Xtrain = X[0:train_n, :]
        Xtest = X[train_n:, :]
        ytrain = Y[0:train_n]
        ytest = Y[train_n:]

        # train the decision tree
        classifier = DecisionTree(100)
        classifier.fit(Xtrain.tolist(), ytrain)
        y_pred = classifier.predict(Xtest.tolist())
        accuracy = accuracy_score(ytest, y_pred)
        tree_accuracies.append(accuracy)

        # train the random forest
        ForestClassifier = RandomForest(10, 100)
        ForestClassifier.fit(Xtrain, ytrain)
        yPredForest = ForestClassifier.predict(Xtest.tolist())[0]
        forest_accuracy = accuracy_score(ytest, yPredForest)
        for_accuracies.append(forest_accuracy)

        # train the logistic regression
        # Normalise a floating point copy: X itself must stay untouched
        # because the trees of the following trials are trained on it, and
        # an integer matrix would truncate the normalised values.
        XNormalize = normalized_data(X.astype(float))[0]
        XtrainNorm = XNormalize[0:train_n, :]
        XtestNorm = XNormalize[train_n:, :]
        Y_logistic_train = Y_logistic[0:train_n]
        Y_logistic_test = Y_logistic[train_n:]
        beta = gradient_descent(XtrainNorm, Y_logistic_train, epsilon=1e-6, l=1, step_size=1e-1, max_steps=100)[0]
        ypred_logistic = logistic_predict(beta, XtestNorm)
        log_accuracy = accuracy_score(Y_logistic_test, ypred_logistic)
        logistic_accuracies.append(log_accuracy)

        print("accuracy = ", accuracy)
        print("logistic_accuracies = ", logistic_accuracies)
        print("for_accuracies = ", for_accuracies)

    print("tree_accuracies = ", tree_accuracies)
    print("logistic_accuracies = ", logistic_accuracies)

    # make certain that the return value matches the API specification
    stats = np.zeros((3, 2))
    stats[0, 0] = np.mean(tree_accuracies)
    stats[0, 1] = np.std(tree_accuracies)
    stats[1, 0] = np.mean(for_accuracies)
    stats[1, 1] = np.std(for_accuracies)
    stats[2, 0] = np.mean(logistic_accuracies)
    stats[2, 1] = np.std(logistic_accuracies)
    return stats


# Script entry point: run the evaluation and print, for every model, the
# first and second column of the returned stats row.
# Do not modify from HERE...
if __name__ == "__main__":
    #print("exav1")
    stats = evaluate_performance()
    #print("exav2")
    print("Decision Tree Accuracy = ", stats[0, 0], " (", stats[0, 1], ")")
    print("Random Forest Tree Accuracy = ", stats[1, 0], " (", stats[1, 1], ")")
    print("Logistic Reg. Accuracy = ", stats[2, 0], " (", stats[2, 1], ")")
# ...to HERE.

# Notebook output: KeyboardInterrupt