In [1]:
import numpy as np
import pandas as pd
import random
import logging
import csv

In [2]:
class DTreeNode:
    """Internal decision-tree node that routes a sample on a single feature.

    A sample whose value at index ``feature`` is strictly below ``threshold``
    is delegated to ``left``; every other sample is delegated to ``right``.
    Both children start as ``None`` and must be attached by the caller before
    ``predict`` is used.
    """

    def __init__(self, feature, threshold):
        self.feature = feature      # index into the sample vector
        self.threshold = threshold  # split point compared with ``<``
        self.left = None            # subtree for x[feature] < threshold
        self.right = None           # subtree for all remaining samples

    def predict(self, x):
        """Return the prediction of the child subtree selected for ``x``."""
        branch = self.left if x[self.feature] < self.threshold else self.right
        return branch.predict(x)

    def _is_initialized(self):
        """Return ``True`` only when both children have been attached.

        The children are compared against ``None`` explicitly so the result
        is always a real ``bool`` and does not depend on the truthiness of
        the child objects themselves.
        """
        return self.left is not None and self.right is not None


In [3]:
class DTreeLeaf:
    """Terminal decision-tree node that always answers with one stored label."""

    def __init__(self, y):
        # Label handed back for every sample that reaches this leaf.
        self.y = y

    def predict(self, _):
        """Return the stored label; the sample argument is ignored."""
        return self.y


In [4]:
class DTreeForest:
    """Ensemble that predicts by majority vote over its member trees."""

    def __init__(self, dTrees):
        # Any objects exposing ``predict(x)``; each must return a label that
        # ``np.bincount`` accepts (a non-negative integer).
        self.dTrees = dTrees

    def predict(self, x):
        """Return the label predicted most often for the sample ``x``.

        Ties are resolved in favour of the smallest label, because
        ``argmax`` reports the first maximum of the vote tally.
        """
        votes = []
        for tree in self.dTrees:
            votes.append(tree.predict(x))
        tally = np.bincount(votes)
        return tally.argmax()


In [5]:
def findEntropy(x):
    """Return the Shannon entropy, in bits, of the label array ``x``.

    ``x`` is passed to ``np.bincount``, so it must hold non-negative
    integer-like values.
    """
    counts = np.bincount(x)
    # Labels that never occur are dropped first so log2 is never fed a zero.
    observed = counts[counts > 0]
    p = observed / len(x)
    return -(p * np.log2(p)).sum()

In [6]:
def buildDTree(x, y, n_features_sampled=None):
    """Recursively grow a decision tree by greedy information-gain splitting.

    Every candidate feature is split at the mean of its column: rows strictly
    below the mean go left, rows at or above it go right. The feature with
    the largest strictly positive information gain becomes a ``DTreeNode``
    and both partitions are grown recursively. When no candidate yields a
    positive gain, a ``DTreeLeaf`` holding the most frequent label is
    returned.

    Args:
        x: 2-D array indexed as ``x[:, feature]``.
        y: Label array aligned with the rows of ``x``; must be accepted by
            ``np.bincount``.
        n_features_sampled: If truthy, the number of features drawn at random
            (without replacement) as split candidates at each node, clamped
            to ``[1, n_features]``. Otherwise every feature is a candidate.

    Returns:
        The root of the grown subtree (``DTreeNode`` or ``DTreeLeaf``).
    """
    n_samples, n_features = x.shape
    parent_entropy = findEntropy(y)

    # A fresh candidate set is drawn for this node; the recursive calls below
    # draw their own.
    if n_features_sampled:
        n_candidates = min(n_features, max(1, n_features_sampled))
        candidates = np.random.choice(n_features, n_candidates, replace=False)
    else:
        candidates = np.arange(n_features)

    best_gain = 0
    best_split = None
    for feature in candidates:
        column = x[:, feature]
        threshold = np.mean(column)

        goes_left = column < threshold
        goes_right = column >= threshold
        y_left, y_right = y[goes_left], y[goes_right]

        # Entropy of the two partitions, each weighted by its share of rows.
        weighted_entropy = (len(y_left) / n_samples * findEntropy(y_left)
                            + len(y_right) / n_samples * findEntropy(y_right))
        gain = parent_entropy - weighted_entropy

        # Strict comparison: the first feature to reach a given gain wins.
        if gain > best_gain:
            best_gain = gain
            best_split = (feature, threshold, goes_left, goes_right)

    # No split improved on the parent: stop and emit the majority label.
    if best_split is None:
        return DTreeLeaf(np.argmax(np.bincount(y)))

    feature, threshold, goes_left, goes_right = best_split
    node = DTreeNode(feature, threshold)
    node.left = buildDTree(x[goes_left], y[goes_left], n_features_sampled)
    node.right = buildDTree(x[goes_right], y[goes_right], n_features_sampled)
    return node


In [7]:
def shuffle(a, b):
    """Reorder two equally long arrays with one shared random permutation.

    The same index permutation is applied to both inputs, so element ``i`` of
    the first result still corresponds to element ``i`` of the second.

    Args:
        a: Array supporting integer-array indexing.
        b: Array supporting integer-array indexing, same length as ``a``.

    Returns:
        Tuple ``(a_shuffled, b_shuffled)``.

    Raises:
        ValueError: If ``a`` and ``b`` differ in length. Without this check a
            longer ``b`` would silently lose its trailing entries.
    """
    if len(a) != len(b):
        raise ValueError('array lengths do not match')
    idx = np.random.permutation(len(a))
    return a[idx], b[idx]

In [13]:
def findAccuracy(y_true, y_pred):
    """Return the fraction of predictions that equal the true labels.

    Args:
        y_true: Array-like of reference labels.
        y_pred: Array-like of predicted labels, same shape as ``y_true``.

    Returns:
        Number of matching positions divided by ``len(y_true)``.

    Raises:
        ValueError: If the two inputs differ in shape. Without this check
            NumPy would broadcast them and count comparisons that do not
            correspond to any sample.
    """
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    if y_true.shape != y_pred.shape:
        raise ValueError('array shapes do not match')
    return np.sum(np.equal(y_true, y_pred)) / len(y_true)


In [14]:
# Fixed seed so the shuffle, the bootstrap draws and the per-node feature
# sampling are the same on every Restart & Run All.
SEED = 42
np.random.seed(SEED)

# Labelled data: every column but the last is a feature, the last is the label.
df = np.array(pd.read_csv('data2.csv', header=None))

x, y = df[:, :-1], df[:, -1].astype(np.bool_)
x, y = shuffle(x, y)

# First half of the shuffled rows trains the forest; the rest is held out.
# NOTE(review): x_val / y_val are not consumed by any cell visible here.
split = len(x) // 2

x_train, y_train = x[:split], y[:split]
x_val, y_val = x[split:], y[split:]

# Read the column count directly instead of unpacking into `_`, which would
# shadow IPython's last-output variable.
n_features = x.shape[1]
n_features_sampled = int(np.sqrt(n_features))
forest_size = 25
dTrees = []

for tree_idx in range(forest_size):
    # Bootstrap: draw `split` training rows with replacement for this tree.
    sampled_idx = np.random.randint(0, high=split, size=split)
    x_bootstrap, y_bootstrap = x_train[sampled_idx], y_train[sampled_idx]
    dTree = buildDTree(x_bootstrap, y_bootstrap, n_features_sampled=n_features_sampled)
    dTrees.append(dTree)

dForest = DTreeForest(dTrees)


In [15]:
# Unlabelled test rows: every column is treated as a feature.
df2 = np.array(pd.read_csv('test2.csv', header=None))

# NOTE(review): the training features loaded from data2.csv are used without
# this bool cast - confirm the features are 0/1 so both paths agree.
x_test = df2[:].astype(np.bool_)

# One forest vote per row, stored as float64.
y_test = np.array([int(dForest.predict(row)) for row in x_test], dtype=np.float64)
y_test

array([1., 0., 1., 1.])

In [16]:
# The csv module writes its own line terminators, so the file is opened with
# newline="" to stop the text layer translating them a second time (which
# would produce "\r\r\n" on platforms that map "\n" to "\r\n").
with open("sd4175_extracredit.out", "w", newline="") as op:
    write_buffer = csv.writer(op, delimiter=" ")

    # Single row: all predictions separated by one space.
    write_buffer.writerow(y_test)

