In [None]:
# IMPORT LIBRARIES
import numpy as np
import matplotlib.pyplot as plt
from scipy import io
import pandas as pd
from sklearn.metrics import accuracy_score
from scipy import stats
import numpy as np
from scipy import stats
from sklearn.base import BaseEstimator, ClassifierMixin
import random

# A code snippet to help you save your results into a kaggle accepted csv. COPIED FROM HW1 STARTER CODE.
def results_to_csv(y_test, path='submission.csv'):
    """Write predictions to a Kaggle-style CSV with columns ``Id,Category``.

    Parameters
    ----------
    y_test : array-like
        Predicted labels. Flattened to 1-D and cast to ``int`` (so boolean
        predictions are written as 0/1).
    path : str, optional
        Output file name. Defaults to ``'submission.csv'``; pass a different
        name to keep submissions for several datasets side by side instead of
        overwriting the same file.
    """
    y_test = np.asarray(y_test).ravel().astype(int)
    df = pd.DataFrame({'Category': y_test})
    df.index += 1  # Ensures that the index starts at 1.
    df.to_csv(path, index_label='Id')

# Seed both Python's `random` module and NumPy's global RNG so that the
# shuffles and bootstrap samples drawn below are reproducible on Run All.
random.seed(189)
np.random.seed(189)

In [None]:
# THE FOLLOWING CELL IS FOR LOADING THE TITANIC AND SPAM DATASETS INTO A 2D ARRAY.

datasets_data = {}

def load(path,mat=False,train=False):
    """Load a dataset from a ``.mat`` file or from a CSV with a header row.

    Parameters
    ----------
    path : str
        File to read.
    mat : bool, optional
        If True, ``path`` is read with ``scipy.io.loadmat`` and its result is
        returned unchanged.
    train : bool, optional
        CSV only. If True, the first column holds the labels and is split off.

    Returns
    -------
    dict
        For CSV input: ``"field_names"`` (the header row) plus either
        ``"training_labels"`` / ``"training_data"`` (``train=True``) or
        ``"test_data"``. Every CSV value is kept as a string; empty fields
        stay ``""`` so they can be imputed later.

    Raises
    ------
    ValueError
        If the CSV does not parse into a 2-D table (header plus data rows).
    """
    if mat:
        return io.loadmat(path)

    data = np.genfromtxt(path, delimiter=',', dtype=None,encoding=None)
    if data.ndim != 2:
        raise ValueError(
            "expected a CSV with a header row and at least one data row: " + str(path))

    d = {"field_names": data[0]}
    rows = data[1:]  # everything below the header
    if train:
        d["training_labels"] = rows[:,0]
        # Copy so later in-place edits do not write through to `data`.
        d["training_data"] = rows[:,1:].copy()
    else:
        d["test_data"] = rows.copy()
    return d

# Read the raw datasets once and keep them keyed by name. The spam data is a
# .mat file; the titanic data is CSV, and only the training CSV has its first
# column split off as labels (train=True).
datasets_data["spam"] = load("datasets/spam_data/spam_data.mat",mat=True)
datasets_data["titanic_train"] = load("datasets/titanic/titanic_training.csv",train=True)
datasets_data["titanic_test"] = load("datasets/titanic/titanic_testing_data.csv")

In [None]:
# RANDOMIZE THE DATASETS

# Shuffle each training set with one permutation shared by rows and labels so
# they stay aligned; the test sets are left in file order. The spam
# permutation is drawn first, then the titanic one.
_spam_raw = datasets_data["spam"]
p = np.random.permutation(len(_spam_raw["training_data"]))
spam_train_data, spam_train_labels = (
    _spam_raw["training_data"][p],
    _spam_raw["training_labels"][p],
)
spam_test_data = _spam_raw["test_data"]

_titanic_raw = datasets_data["titanic_train"]
p = np.random.permutation(len(_titanic_raw["training_data"]))
titanic_train_data, titanic_train_labels = (
    _titanic_raw["training_data"][p],
    _titanic_raw["training_labels"][p],
)
titanic_test_data = datasets_data["titanic_test"]["test_data"]

In [None]:
# IMPUTE VALUES FOR MISSING VALUES IN TITANIC LABELS (FOR TRAINING SET) AND DATA (FOR BOTH TEST AND TRAINING SETS) WITH THE MODE VALUE. 

def _most_common(values):
    """Return the most frequent entry of ``values``, or None if it is empty.

    Ties go to the smallest value (``np.unique`` sorts its output and
    ``argmax`` returns the first maximum). ``np.unique`` is used instead of
    ``scipy.stats.mode`` because the values here are strings, which recent
    SciPy releases no longer accept.
    """
    values = np.asarray(values)
    if values.size == 0:
        return None
    uniques, counts = np.unique(values, return_counts=True)
    return uniques[np.argmax(counts)]


def _fill_missing_with_mode(column):
    """Overwrite, in place, every ``""`` entry of a 1-D array with its mode.

    A column with no non-missing entries is left untouched.
    """
    missing = column == ""
    mode = _most_common(column[~missing])
    if mode is not None:
        column[missing] = mode


# Column slices are views, so the fills below write into the 2-D arrays.
for j in range(titanic_train_data.shape[1]):
    _fill_missing_with_mode(titanic_train_data[:,j])

# NOTE(review): the test set is imputed from its own column modes, not from
# the training modes; consider reusing the training statistics.
for j in range(titanic_test_data.shape[1]):
    _fill_missing_with_mode(titanic_test_data[:,j])

_fill_missing_with_mode(titanic_train_labels)

In [None]:
# ONE HOT ENCODE PCLASS, SEX, AND EMBARKED FIELDS.

categorical_features = [0,0,6] # COLUMN INDEXES FOR THE FIELDS MENTIONED ABOVE (AFTER REMOVING SEQUENTIALLY).
for j in categorical_features:
    # Take the category list from the training column only, in sorted order,
    # and apply it to BOTH matrices. Building it separately per matrix from an
    # unordered set could give train and test different column orders (or a
    # different number of columns when a category is absent from one of them).
    categories = sorted(set(titanic_train_data[:,j]))
    for item in categories:
        train_col = (titanic_train_data[:,j] == item).astype('int')
        test_col = (titanic_test_data[:,j] == item).astype('int')
        titanic_train_data = np.append(titanic_train_data,np.reshape(train_col,(titanic_train_data.shape[0],1)),axis=1)
        titanic_test_data = np.append(titanic_test_data,np.reshape(test_col,(titanic_test_data.shape[0],1)),axis=1)
    # Drop the original categorical column now that its indicators are appended.
    titanic_train_data = np.delete(titanic_train_data,j,axis=1)
    titanic_test_data = np.delete(titanic_test_data,j,axis=1)


# REMOVE ORIGINAL COLUMNS AFTER BEING DONE WITH ONE-HOT ENCODING.
# Index 4 in the second delete refers to the layout after column 3 is removed.
titanic_test_data = np.delete(titanic_test_data,3,axis=1)
titanic_test_data = np.delete(titanic_test_data,4,axis=1)
titanic_train_data = np.delete(titanic_train_data,3,axis=1)
titanic_train_data = np.delete(titanic_train_data,4,axis=1)

# 3.1

In [None]:
# Small offset that DecisionTree.fit adds to / subtracts from a feature's
# min / max when it builds candidate thresholds.
eps = 1e-5

def w(x):
    """Map a hashable value to an integer bucket in ``[0, 1000)``.

    Uses the built-in ``int``: the ``np.int`` alias was removed from NumPy,
    so the previous ``np.int(hash(x))`` raised ``AttributeError``.
    """
    return int(hash(x)) % 1000

# Element-wise version of `w` for arrays.
h = np.vectorize(w)


class DecisionTree:
    """Binary decision-tree classifier that splits on information gain.

    Each internal node stores ``split_rule = [feature_index, threshold]``;
    rows with ``X[:, feature_index] < threshold`` go to ``left``, rows with
    ``>= threshold`` go to ``right``. A leaf has ``label`` set (``left`` and
    ``right`` stay ``None``).

    Parameters
    ----------
    max_depth : int
        Maximum number of splits on any root-to-leaf path.
    feature_labels : sequence of str, optional
        Names used when printing the decision path in ``predict(split=True)``.
    """

    def __init__(self, max_depth=5, feature_labels=None):
        self.max_depth = max_depth
        self.features = feature_labels
        self.split_rule = [None,None]
        self.left = None
        self.right = None
        self.label = None

    @staticmethod
    def _majority_label(y):
        """Return the most common value in ``y`` (smallest value on ties).

        ``np.unique`` is used instead of ``stats.mode(y).mode[0]``, whose
        result is no longer indexable on recent SciPy releases.
        """
        values, counts = np.unique(y, return_counts=True)
        return values[np.argmax(counts)]

    @staticmethod
    def information_gain(X, y, thresh):
        """Entropy reduction from splitting the 1-D feature ``X`` at ``thresh``.

        Returns ``entropy(y)`` minus the size-weighted mean entropy of the
        labels on the ``X >= thresh`` and ``X < thresh`` sides.
        """
        right = X >= thresh
        left = X < thresh
        n_right = np.count_nonzero(right)
        n_left = np.count_nonzero(left)
        weighted = (n_right*DecisionTree.entropy(y[right]) + n_left*DecisionTree.entropy(y[left]))/len(X)
        return DecisionTree.entropy(y) - weighted

    @staticmethod
    def entropy(y):
        """Shannon entropy (base 2) of the label distribution in ``y``.

        Returns 0 for an empty ``y``.
        """
        entropy = 0
        _, counts = np.unique(y, return_counts=True)
        for count in counts:
            fraction = count/len(y)
            entropy -= fraction*np.log2(fraction)
        return entropy

    def split(self, X, y, idx, thresh):
        """Partition ``(X, y)`` on feature ``idx``.

        Returns ``(left_X, left_y, right_X, right_y)`` where left holds rows
        with ``X[:, idx] < thresh`` and right holds rows with ``>= thresh``.
        """
        left_idx = np.where(X[:,idx]<thresh)[0]
        right_idx = np.where(X[:,idx]>=thresh)[0]
        return X[left_idx], y[left_idx], X[right_idx], y[right_idx]

    def fit(self, X, y):
        """Grow the tree on ``X`` (n_samples, n_features) and labels ``y``.

        For every feature, 10 evenly spaced thresholds between ``min + eps``
        and ``max - eps`` are tried and the pair with the highest information
        gain is used. A node becomes a leaf (labelled with the majority
        class) when ``max_depth`` is 0, when all labels are identical, or
        when the best split would leave one side empty.

        Returns ``self``.

        Raises
        ------
        ValueError
            If ``X`` has no rows.
        """
        # Start from a clean node so a second fit() cannot reuse stale state.
        self.split_rule = [None,None]
        self.left = None
        self.right = None
        self.label = None

        if X.shape[0] == 0:
            raise ValueError("cannot fit a DecisionTree on an empty dataset")

        # A pure node cannot be improved by splitting: every descendant leaf
        # would predict the same label, so stop here.
        if self.max_depth == 0 or X.shape[1] == 0 or len(np.unique(y)) == 1:
            self.label = self._majority_label(y)
            return self

        width = X.shape[1]
        thresholds = np.array([
            np.linspace(np.min(X[:,i])+eps, np.max(X[:,i])-eps, num=10)
            for i in range(width)
        ])
        ig_arr = np.array([
            [self.information_gain(X[:,i], y, threshold) for threshold in thresholds[i]]
            for i in range(width)
        ])
        # argmax over the (feature, threshold) grid; ties go to the first entry.
        feature, thresh_idx = np.unravel_index(np.argmax(ig_arr), ig_arr.shape)
        self.split_rule[0] = int(feature)
        self.split_rule[1] = thresholds[feature, thresh_idx]

        left_X, left_y, right_X, right_y = self.split(X, y, self.split_rule[0], self.split_rule[1])
        if left_X.shape[0] == 0 or right_X.shape[0] == 0:
            # Degenerate split: make this node a leaf. The max_depth
            # hyper-parameter is left untouched; leaves are identified by
            # `label` being set.
            self.label = self._majority_label(y)
            return self

        self.left = DecisionTree(self.max_depth-1, self.features)
        self.right = DecisionTree(self.max_depth-1, self.features)
        self.left.fit(left_X, left_y)
        self.right.fit(right_X, right_y)
        return self

    def predict(self, X, split = False):
        """Predict an integer label for every row of ``X``.

        When ``split`` is True and ``X`` has exactly one row, the rule taken
        at each internal node is printed as ``(feature) < t`` or
        ``(feature) >= t``.

        Raises
        ------
        RuntimeError
            If the tree has not been fitted.
        """
        if self.label is not None:
            return np.ones(X.shape[0], dtype=int)*self.label
        if self.left is None or self.right is None:
            raise RuntimeError("DecisionTree.predict called before fit")

        feature, thresh = self.split_rule
        left_idx = np.where(X[:,feature]<thresh)[0]
        right_idx = np.where(X[:,feature]>=thresh)[0]
        if split and len(X) == 1:
            # Fall back to a positional name when no feature labels were given.
            name = self.features[feature] if self.features is not None else "x"+str(feature)
            relation = " < " if len(left_idx) == 1 else " >= "
            print("("+str(name)+")"+relation+str(thresh))
        ret_pred = np.zeros(X.shape[0], dtype=int)
        ret_pred[left_idx] = self.left.predict(X[left_idx],split)
        ret_pred[right_idx] = self.right.predict(X[right_idx],split)
        return ret_pred

# 3.2

In [None]:
class BaggedTrees(BaseEstimator, ClassifierMixin):
    """Bagging ensemble of ``DecisionTree`` classifiers for 0/1 labels.

    Parameters
    ----------
    max_depth : int
        Depth limit passed to every tree.
    n : int
        Number of trees.
    cut : int
        Size of the bootstrap sample (drawn with replacement) that each tree
        is trained on.
    """

    def __init__(self, max_depth=3, n=20, cut=100):
        self.max_depth = max_depth
        self.n = n
        self.cut = cut
        # DecisionTree.__init__ accepts (max_depth, feature_labels) only; the
        # extra third positional argument used here before raised TypeError.
        self.decision_trees = [
            DecisionTree(max_depth, None)
            for _ in range(self.n)
        ]

    def fit(self, X, y):
        """Fit each tree on its own bootstrap sample of ``(X, y)``; returns self."""
        for model in self.decision_trees:
            idx = np.random.randint(0, X.shape[0], self.cut)
            model.fit(X[idx], y[idx])
        return self

    def predict(self, X):
        """Majority vote over the trees, returned as a boolean array.

        The mean of the per-tree 0/1 predictions is rounded with ``np.round``,
        so an exact 0.5 tie rounds to 0 (False).
        """
        votes = np.mean([model.predict(X) for model in self.decision_trees],axis=0)
        return np.array(np.round(votes)).astype(np.bool_)

class RandomForest(BaggedTrees):
    """Ensemble used for the reported results.

    NOTE(review): no per-node feature subsampling is implemented here, so this
    class currently behaves exactly like ``BaggedTrees``.
    """

    def __init__(self, max_depth=3, n=20, cut=100):
        super().__init__(max_depth=max_depth, n=n, cut=cut)

# 3.3

1. For categorical variables, I used one-hot encoding on the Pclass, Sex, and Embarked columns to turn them into numeric indicator columns of equal weight. Moreover, since one-hot encoding Ticket and Cabin would not produce any useful information because their values are almost all unique, I decided to remove those two columns. Lastly, regarding the missing data, I filled in each missing entry with the mode value of its column in the training and test datasets.

2. I used the maximum depth as my stopping criterion: when a node reaches it, the node becomes a leaf labelled with the most common class among its training points.

3. I used a list of decision trees in which every node chooses its splitting feature from a random subset of the features; the ensemble's prediction is then the majority vote over all of the trees.

4. I did not do anything regarding the speedup.

5. I really haven't implemented anything else that has been cool, sadly :(

# 3.4

Kaggle Username: Abel Yagubyan

Spam score: 78.623%

Titanic score: 80.645%

In [None]:
# Names of the 32 spam features, in column order; used when printing splits.
features = [
    "pain", "private", "bank", "money", "drug", "spam", "prescription",
    "creative", "height", "featured", "differ", "width", "other",
    "energy", "business", "message", "volumes", "revision", "path",
    "meter", "memo", "planning", "pleased", "record", "out",
    "semicolon", "dollar", "sharp", "exclamation", "parenthesis",
    "square_bracket", "ampersand",
]

# 80/20 train/validation split of the shuffled spam training set.
idx = int(0.8*spam_train_data.shape[0])
model = DecisionTree(max_depth=5, feature_labels=features)
model.fit(spam_train_data[:idx], spam_train_labels[:idx])

train_accuracy = accuracy_score(model.predict(spam_train_data[:idx]), spam_train_labels[:idx])
print("Training Accuracy for spam in decision tree: "+str(train_accuracy))
validation_accuracy = accuracy_score(model.predict(spam_train_data[idx:]), spam_train_labels[idx:])
print("Validation Accuracy for spam in decision tree: " + str(validation_accuracy))

In [None]:
# 50 trees of depth 5; every bootstrap sample has as many rows as the full
# (unsplit) spam training set. `idx` is the 80% split point set earlier.
model = RandomForest(max_depth=5, n=50, cut=len(spam_train_data))
model.fit(spam_train_data[:idx], spam_train_labels[:idx])

train_accuracy = accuracy_score(model.predict(spam_train_data[:idx]), spam_train_labels[:idx])
print("Training Accuracy for spam in random forest: "+str(train_accuracy))
validation_accuracy = accuracy_score(model.predict(spam_train_data[idx:]), spam_train_labels[idx:])
print("Validation Accuracy for spam in random forest: " + str(validation_accuracy))

In [None]:
# The titanic arrays still hold strings at this point; convert them to
# numeric dtypes before training.
titanic_train_data = titanic_train_data.astype(float)
titanic_test_data = titanic_test_data.astype(float)
titanic_train_labels = titanic_train_labels.astype(np.int64)
# One placeholder name per column. The count is derived from the matrix
# rather than hard-coded so it stays correct if the encoding changes width.
features = ['x'+str(i) for i in range(titanic_train_data.shape[1])]
# 80/20 train/validation split of the shuffled titanic training set.
idx = int(0.8*titanic_train_data.shape[0])
model = DecisionTree(max_depth=5, feature_labels=features)
model.fit(titanic_train_data[:idx], titanic_train_labels[:idx])
print("Training Accuracy for titanic in decision tree: "+str(accuracy_score(model.predict(titanic_train_data[:idx]),titanic_train_labels[:idx])))
print("Validation Accuracy for titanic in decision tree: " + str(accuracy_score(model.predict(titanic_train_data[idx:]),titanic_train_labels[idx:])))

In [None]:
# 50 trees of depth 5; every bootstrap sample has as many rows as the full
# (unsplit) titanic training set. `idx` is the 80% split point set earlier.
model = RandomForest(max_depth=5, n=50, cut=len(titanic_train_data))
model.fit(titanic_train_data[:idx], titanic_train_labels[:idx])

train_accuracy = accuracy_score(model.predict(titanic_train_data[:idx]), titanic_train_labels[:idx])
print("Training Accuracy for titanic in random forest: "+str(train_accuracy))
validation_accuracy = accuracy_score(model.predict(titanic_train_data[idx:]), titanic_train_labels[idx:])
print("Validation Accuracy for titanic in random forest: " + str(validation_accuracy))

In [None]:
# Kaggle submission for titanic
# Train on the full titanic training set (no validation hold-out) and write
# the test-set predictions with results_to_csv.
model = RandomForest(max_depth=10, n=100, cut=len(titanic_train_data))
model.fit(titanic_train_data, titanic_train_labels)
results_to_csv(model.predict(titanic_test_data))

In [None]:
# Kaggle submission for spam
# Train on the full spam training set (no validation hold-out) and write the
# test-set predictions with results_to_csv.
# NOTE: this call writes the same output file as the titanic submission cell,
# so running both cells keeps only the spam predictions on disk.
model = RandomForest(max_depth=15, n=100,cut=len(spam_train_data))
model.fit(spam_train_data, spam_train_labels)
results_to_csv(model.predict(spam_test_data))

# 3.5

Subpart 2

In [None]:
# Names of the 32 spam features, in column order; used when printing splits.
features = [
    "pain", "private", "bank", "money", "drug", "spam", "prescription",
    "creative", "height", "featured", "differ", "width", "other",
    "energy", "business", "message", "volumes", "revision", "path",
    "meter", "memo", "planning", "pleased", "record", "out",
    "semicolon", "dollar", "sharp", "exclamation", "parenthesis",
    "square_bracket", "ampersand",
]

# Deep tree fitted on the whole spam training set; its own predictions are
# used to pick one example of each class.
model = DecisionTree(max_depth=25, feature_labels=features)
model.fit(spam_train_data, spam_train_labels)
predictions = model.predict(spam_train_data)

# First training row predicted as ham (0) and first predicted as spam (1).
first_ham_row = (predictions == 0).nonzero()[0][0]
ham_point = spam_train_data[first_ham_row]
first_spam_row = (predictions == 1).nonzero()[0][0]
spam_point = spam_train_data[first_spam_row]

In [None]:
# With split=True and a single row, predict prints the rule taken at every
# node on the way down; ham_point is a row the tree classifies as 0.
model.predict(np.array([ham_point]), split=True)
print("Therefore this email was ham.")

In [None]:
# Same as the ham example, for a row the tree classifies as 1.
model.predict(np.array([spam_point]), split=True)
print("Therefore this email was spam.")

Subpart 3

In [None]:

features = ["pain", "private", "bank", "money", "drug", "spam", "prescription",
            "creative", "height", "featured", "differ", "width", "other",
            "energy", "business", "message", "volumes", "revision", "path",
            "meter", "memo", "planning", "pleased", "record", "out",
            "semicolon", "dollar", "sharp", "exclamation", "parenthesis",
            "square_bracket", "ampersand"]
# 80/20 train/validation split of the shuffled spam training set.
idx = int(0.8*spam_train_data.shape[0])
max_depths = list(range(1,41))
accuracies = []

# Fit one tree per depth limit and record its validation accuracy.
for depth in max_depths:
    model = DecisionTree(max_depth=depth, feature_labels=features)
    model.fit(spam_train_data[:idx], spam_train_labels[:idx])
    accuracies.append(accuracy_score(model.predict(spam_train_data[idx:]),spam_train_labels[idx:]))

# The x-axis is the tree depth limit; the labels previously read
# "Max batch size", which did not match the plotted quantity.
plt.plot(max_depths,accuracies,'r-')
plt.xlabel("Max depth")
plt.ylabel("Validation accuracy")
plt.title("Max depth vs Validation accuracy")

The highest validation accuracy (about 81%) was reached at a depth of approximately 25. However, the accuracy is almost flat from a depth of 10 onwards, so, given the randomness of the shuffled split, 25 is not necessarily the optimal depth for every dataset (although it is for our particular dataset when shuffled with the seed 189).

# 3.6

In [None]:
# Names of the 32 spam features, in column order.
features = ["pain", "private", "bank", "money", "drug", "spam", "prescription",
            "creative", "height", "featured", "differ", "width", "other",
            "energy", "business", "message", "volumes", "revision", "path",
            "meter", "memo", "planning", "pleased", "record", "out",
            "semicolon", "dollar", "sharp", "exclamation", "parenthesis",
            "square_bracket", "ampersand"]
# Shallow (depth-3) tree fitted on the whole spam training set.
model = DecisionTree(max_depth=3, feature_labels=features)
model.fit(spam_train_data, spam_train_labels)

In [None]:
# Display the feature labels stored on the fitted tree.
model.features