In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from collections import Counter
import random
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score
from sklearn.preprocessing import LabelEncoder

In [None]:
def getData(dataframe):
    """Shuffle a labelled frame and split it 80/20 into train and test arrays.

    The last column of ``dataframe`` is treated as the label and every other
    column as a feature, whatever the width of the frame.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        One row per sample, with the label in the final column.

    Returns
    -------
    tuple of numpy.ndarray
        ``(X_train, y_train, X_test, y_test)``. Feature matrices are 2-D and
        label vectors are 1-D. The first 20% of the shuffled rows (rounded
        down) become the test set; the rest become the training set.
    """
    shuffled = dataframe.sample(frac=1)    # Randomize row order

    # First 20% of the shuffled rows -> test set, remaining 80% -> training set
    test_size = int(shuffled.shape[0] * .20)
    total_test, total_train = shuffled[:test_size], shuffled[test_size:]

    # Address the label from the end of the frame rather than by a fixed
    # column number, so frames of any width are split correctly.
    X_train = total_train.iloc[:, :-1].values
    y_train = total_train.iloc[:, -1].values

    X_test = total_test.iloc[:, :-1].values
    y_test = total_test.iloc[:, -1].values

    return X_train, y_train, X_test, y_test


In [None]:
# Dataset 1: whitespace-delimited text file without a header row.
# The raw string keeps `\s` a regex token instead of an invalid Python string escape.
data1 = pd.read_csv('project3_dataset1.txt', delimiter=r'\s+', header=None)
data1.head()

In [None]:
# Summary statistics for dataset 1
data1.describe()

In [None]:
# Count the missing values in each column of dataset 1
data1.isna().sum()

In [None]:
# Dataset 2: whitespace-delimited text file without a header row.
# The raw string keeps `\s` a regex token instead of an invalid Python string escape.
data2 = pd.read_csv('project3_dataset2.txt', delimiter=r'\s+', header=None)
data2.head()

In [None]:
# Summary statistics for dataset 2
data2.describe()

In [None]:
# Count the missing values in each column of dataset 2
data2.isna().sum()

## Nearest Neighbor

In [None]:
# Euclidean (L2) distance formula
def euclideanDistance(x1, x2):
    """Return the Euclidean distance between ``x1`` and ``x2``.

    Both arguments may be scalars or numpy arrays of matching shape; the
    squared element-wise differences are summed before the square root.
    """
    diff = x1 - x2
    squared_total = np.sum(diff ** 2)
    return np.sqrt(squared_total)


class KNN:
    """k-nearest-neighbours classifier: Euclidean distance plus majority vote.

    Parameters
    ----------
    k : int, default 4
        Number of closest training samples that vote on each prediction.
    """

    def __init__(self, k=4):
        self.k = k

    def fit(self, X, y):
        """Store the training samples; all the work happens at prediction time.

        Parameters
        ----------
        X : array-like, one row per sample
            Numeric feature matrix.
        y : array-like
            One label per row of ``X``.
        """
        self.X_train = np.asarray(X, dtype=float)
        self.y_train = np.asarray(y)

    def predict(self, X):
        """Return a numpy array with one predicted label per row of ``X``."""
        y_predict = [self.helper(x) for x in np.asarray(X, dtype=float)]

        return np.array(y_predict)

    # calc distance, sort, grab k neighbors, return the most frequent label
    def helper(self, x):
        """Classify the single sample ``x`` by majority vote of its k nearest neighbours."""
        # One broadcast subtraction yields the Euclidean distance from x to
        # every training row at once.
        all_distances = np.sqrt(np.sum((self.X_train - x) ** 2, axis=1))
        # Stable sort: equally distant neighbours keep their training order.
        k_index = np.argsort(all_distances, kind="stable")[:self.k]
        kn_labels = self.y_train[k_index].tolist()
        # most_common resolves vote ties in favour of the label seen first,
        # i.e. the one belonging to the nearer neighbour.
        frequent = Counter(kn_labels).most_common(1)

        return frequent[0][0]

## Naive Bayes

In [None]:
class NaiveBayes:
    """Gaussian Naive Bayes classifier.

    Each feature is modelled, per class, as an independent normal
    distribution. Prediction picks the class with the largest log-posterior
    (log prior + sum of per-feature log-likelihoods).

    Parameters
    ----------
    var_smoothing : float, default 1e-9
        Fraction of the largest feature variance added to every per-class
        variance. This keeps constant features (zero variance) from causing a
        division by zero.
    """

    def __init__(self, var_smoothing=1e-9):
        self.var_smoothing = var_smoothing

    def fit(self, X, y):
        """Estimate per-class feature means, variances and class priors.

        Parameters
        ----------
        X : array-like, one row per sample
            Numeric feature matrix.
        y : array-like
            One class label per row of ``X``.
        """
        X = np.asarray(X, dtype=np.float64)
        y = np.asarray(y)
        num_samples, num_feat = X.shape
        self.classes = np.unique(y)
        num_classes = len(self.classes)

        # get mean, variance and prior probabilities
        self.mean = np.zeros((num_classes, num_feat), dtype=np.float64)
        self.variance = np.zeros((num_classes, num_feat), dtype=np.float64)
        self.priors = np.zeros(num_classes, dtype=np.float64)

        # Variance floor; fall back to the raw constant when every feature is
        # constant, so the floor itself is never zero.
        epsilon = self.var_smoothing * X.var(axis=0).max()
        if epsilon <= 0:
            epsilon = self.var_smoothing

        # calc the above
        for index, cl in enumerate(self.classes):
            X_cl = X[y == cl]
            self.mean[index, :] = X_cl.mean(axis=0)
            self.variance[index, :] = X_cl.var(axis=0) + epsilon
            self.priors[index] = X_cl.shape[0] / float(num_samples)

    def predict(self, X):
        """Return a numpy array with one predicted class label per row of ``X``."""
        y_predict = [self.helper(x) for x in np.asarray(X, dtype=np.float64)]
        return np.array(y_predict)

    # calc posterior prob, class conditional prob, and prior prob
    def helper(self, x):
        """Return the class with the highest log-posterior for the sample ``x``."""
        all_post = []

        for index in range(len(self.classes)):
            prior = np.log(self.priors[index])
            # Sum log-densities directly: taking exp() first and log() after
            # underflows to log(0) = -inf for samples far from a class mean.
            posterior = prior + np.sum(self._log_density(index, x))
            all_post.append(posterior)

        return self.classes[np.argmax(all_post)]

    def _log_density(self, index, x):
        """Per-feature log of the Gaussian density of ``x`` under class ``index``."""
        mean = self.mean[index]
        variance = self.variance[index]
        return -0.5 * np.log(2 * np.pi * variance) - (x - mean) ** 2 / (2 * variance)

    # used formula in report
    def prob_dense(self, index, x):
        """Per-feature Gaussian density of ``x`` under class ``index``."""
        mean = self.mean[index]
        variance = self.variance[index]
        density = (np.exp(- (x-mean)**2 / (2 * variance)))/(np.sqrt(2 * np.pi * variance))

        return density

## Decision Tree

In [None]:
# Formula used in report
def entropy(y):
    """Return the Shannon entropy, in bits, of the label vector ``y``.

    Labels are counted with ``np.unique``, so any label dtype is accepted:
    non-negative integers as well as negative, float or string labels.

    Parameters
    ----------
    y : array-like
        One class label per sample.

    Returns
    -------
    numpy.float64
        ``-sum(p * log2(p))`` over the observed label frequencies ``p``.
    """
    y = np.asarray(y)
    # Only labels that actually occur are returned, so every p is > 0 and
    # log2 never sees zero.
    _, count = np.unique(y, return_counts=True)
    prob = count / y.size
    return -np.sum(prob * np.log2(prob))

# Tree node: an internal split or a leaf that carries a prediction
class Node:
    """One node of a decision tree.

    Internal nodes carry the split (``feat``, ``thresh``) and both children
    (``left``, ``right``); leaves carry only ``val``. ``val`` is keyword-only.
    """

    def __init__(self, feat=None, thresh=None, left=None, right=None, *, val=None):
        self.feat, self.thresh = feat, thresh
        self.left, self.right = left, right
        self.val = val

    def leaf(self):
        """Return True when the node stores a value, i.e. ``val`` is not None."""
        return not (self.val is None)


class DecisionTree:
    """Binary decision-tree classifier grown greedily on information gain.

    Parameters
    ----------
    min_split : int, default 2
        A node holding fewer samples than this becomes a leaf.
    maxdepth : int, default 100
        A node at this depth (root = 0) becomes a leaf.
    num_feats : int or None, default None
        Number of features drawn at random, without replacement, as split
        candidates at each node. ``None`` means all features.
    """

    def __init__(self, min_split=2, maxdepth=100, num_feats=None):
        self.min_split = min_split
        self.maxdepth = maxdepth
        self.num_feats = num_feats
        self.root = None

    def fit(self, X, y):
        """Grow the tree from feature matrix ``X`` and label vector ``y``."""
        X = np.asarray(X)
        y = np.asarray(y)
        # Never ask for more candidate features than the data provides.
        self.num_feats = X.shape[1] if not self.num_feats else min(self.num_feats, X.shape[1])
        self.root = self.grow(X, y)

    # make prediction
    def predict(self, X):
        """Return a numpy array with one predicted label per row of ``X``."""
        return np.array([self.traverse(x, self.root) for x in np.asarray(X)])

    # grows tree
    def grow(self, X, y, depth=0):
        """Recursively build and return the subtree for the samples ``X``, ``y``."""
        num_sample, num_feat = X.shape
        num_label = len(np.unique(y))

        # Stopping criteria: depth limit, pure node, or too few samples
        if (depth >= self.maxdepth
                or num_label == 1
                or num_sample < self.min_split):
            return Node(val=self.frequent(y))

        index_feats = np.random.choice(num_feat, self.num_feats, replace=False)

        greedy_feat, greedy_threshold = self.greediness(X, y, index_feats)
        if greedy_feat is None:
            # No candidate feature produced a split
            return Node(val=self.frequent(y))

        left_sibs, right_sibs = self.splitter(X[:, greedy_feat], greedy_threshold)
        if len(left_sibs) == 0 or len(right_sibs) == 0:
            # The best split separates nothing (e.g. all candidate features are
            # constant here); recursing would hand an empty label set to
            # frequent(), so stop with a leaf instead.
            return Node(val=self.frequent(y))

        left_node = self.grow(X[left_sibs, :], y[left_sibs], depth+1)
        right_node = self.grow(X[right_sibs, :], y[right_sibs], depth+1)
        return Node(greedy_feat, greedy_threshold, left_node, right_node)

    # gets best gain
    def greediness(self, X, y, index_feats):
        """Return the ``(feature index, threshold)`` pair with the highest gain.

        Every unique value of every candidate feature is tried as a threshold.
        Returns ``(None, None)`` when ``index_feats`` is empty.
        """
        current_best = -1
        index_splitter, threshold_splitter = None, None
        for index_feat in index_feats:
            # A single column: the feature currently being evaluated
            X_column = X[:, index_feat]
            for thresh in np.unique(X_column):
                gain = self.gain(y, X_column, thresh)

                if gain > current_best:
                    current_best = gain
                    index_splitter = index_feat
                    threshold_splitter = thresh

        return index_splitter, threshold_splitter

    # calculates gain -> entropy of parent - weighted avg * entropy of child
    def gain(self, y, X_column, threshold_splitter):
        """Information gain from splitting ``y`` on ``X_column <= threshold_splitter``.

        Returns 0 when the split leaves one side empty.
        """
        e_parent = entropy(y)
        left_child, right_child = self.splitter(X_column, threshold_splitter)
        if len(left_child) == 0 or len(right_child) == 0:
            return 0

        label_length = len(y)
        left_length, right_length = len(left_child), len(right_child)
        left_entropy, right_entropy = entropy(y[left_child]), entropy(y[right_child])
        e_children = (left_length / label_length) * left_entropy + (right_length / label_length) * right_entropy

        return e_parent - e_children

    # returns all left children and right children
    def splitter(self, X_column, threshold_splitter):
        """Return the row indices with value <= threshold (left) and > threshold (right)."""
        left_sibs = np.argwhere(X_column <= threshold_splitter).flatten()
        right_sibs = np.argwhere(X_column > threshold_splitter).flatten()
        return left_sibs, right_sibs

    # traverses through tree
    def traverse(self, x, node):
        """Walk from ``node`` down to a leaf for the sample ``x`` and return the leaf value."""
        if node.leaf():
            return node.val

        if x[node.feat] <= node.thresh:
            return self.traverse(x, node.left)
        return self.traverse(x, node.right)

    # return the most frequent label
    def frequent(self, y):
        """Return the most common label in ``y``."""
        count = Counter(y)
        return count.most_common(1)[0][0]

## Random Forest 

In [None]:
def bootstrap(X, y):
    """Return a bootstrap resample of ``(X, y)``.

    Row indices are drawn with replacement from NumPy's global random state;
    as many rows are drawn as ``X`` has, and the same indices select from
    both ``X`` and ``y``.
    """
    n_rows = X.shape[0]
    picks = np.random.choice(n_rows, size=n_rows, replace=True)
    X_boot, y_boot = X[picks], y[picks]
    return X_boot, y_boot

def frequent(y):
    """Return the most common element of ``y``.

    When several elements share the top count, ``Counter.most_common``
    decides which one is returned.
    """
    ranked = Counter(y).most_common(1)
    label, _votes = ranked[0]
    return label


class RandomForest:
    """Random-forest classifier: bagged ``DecisionTree`` models with majority vote.

    Parameters
    ----------
    num_trees : int, default 10
        Number of trees to grow.
    min_split : int, default 2
        Passed to each ``DecisionTree`` as ``min_split``.
    maxdepth : int, default 100
        Passed to each ``DecisionTree`` as ``maxdepth``.
    num_feats : int or None, default None
        Passed to each ``DecisionTree`` as ``num_feats``.
    """

    def __init__(self, num_trees=10, min_split=2,
                 maxdepth=100, num_feats=None):
        self.num_trees = num_trees
        self.min_split = min_split
        self.maxdepth = maxdepth
        self.num_feats = num_feats
        self.all_trees = []

    def fit(self, X, y):
        """Grow ``num_trees`` trees, each on its own bootstrap resample of ``(X, y)``."""
        self.all_trees = []
        for _ in range(self.num_trees):
            single_tree = DecisionTree(min_split=self.min_split,
                maxdepth=self.maxdepth, num_feats=self.num_feats)
            X_samples, y_samples = bootstrap(X, y)
            single_tree.fit(X_samples, y_samples)
            self.all_trees.append(single_tree)

    def predict(self, X):
        """Return the majority-vote label for each row of ``X`` as a numpy array.

        Raises
        ------
        ValueError
            If the forest holds no trees (``fit`` has not been called).
        """
        if not self.all_trees:
            raise ValueError("RandomForest has no trees; call fit() before predict().")

        # Shape (num_trees, num_samples): one row of predictions per tree
        tree_votes = np.array([tree.predict(X) for tree in self.all_trees])
        # Transpose to (num_samples, num_trees) so each row holds one sample's votes
        sample_votes = np.swapaxes(tree_votes, 0, 1)
        y_predict = [frequent(votes) for votes in sample_votes]
        return np.array(y_predict)

## Evaluation Metrics (K-Fold Cross-Validation)

In [None]:
from sklearn.model_selection import KFold

def KFold_CV(n, X, Y, model):
    """Evaluate ``model`` with ``n``-fold cross-validation (folds are not shuffled).

    Parameters
    ----------
    n : int
        Number of folds.
    X : array-like, one row per sample
        Feature matrix.
    Y : array-like
        One label per row of ``X``.
    model : object
        Any object with ``fit(X, y)`` and ``predict(X)`` methods; it is refit
        from scratch on every fold.

    Returns
    -------
    tuple
        ``(accuracy, precision, recall, f1score)``: the mean of each metric
        over the folds, expressed as a percentage.
    """
    X = np.asarray(X)
    Y = np.asarray(Y)
    kf = KFold(n_splits=n, shuffle=False)

    accuracy_model = []
    precision_model = []
    recall_model = []
    f1_model = []

    for train_index, test_index in kf.split(X):
        X_train, X_test = X[train_index], X[test_index]
        # Index the labels passed in by the caller, not a notebook-level global
        y_train, y_test = Y[train_index], Y[test_index]
        model.fit(X_train, y_train)
        predictions = model.predict(X_test)
        accuracy_model.append(accuracy_score(y_test, predictions, normalize=True)*100)
        precision_model.append(precision_score(y_test, predictions)*100)
        recall_model.append(recall_score(y_test, predictions)*100)
        f1_model.append(f1_score(y_test, predictions)*100)

    accuracy = np.mean(accuracy_model)
    precision = np.mean(precision_model)
    recall = np.mean(recall_model)
    f1score = np.mean(f1_model)

    return (accuracy, precision, recall, f1score)

### Dataset 1

In [None]:
# Full dataset for cross-validation: the last column is the label, the rest are features.
y = data1.iloc[:, -1].to_numpy()
X = data1.iloc[:, :-1].to_numpy()

# Hold-out split. getData shuffles with NumPy's global random state, so seed
# it first to make the split repeatable between runs.
np.random.seed(1234)
X_train, y_train, X_test, y_test = getData(data1)

In [None]:
#DECISION TREE

# Hold-out evaluation on the train/test split
clf = DecisionTree(maxdepth=10)
clf.fit(X_train, y_train)
predictions = clf.predict(X_test)
print("Decision Tree accuracy: ", accuracy_score(y_test, predictions))
precision = precision_score(y_test, predictions)
print('Decision Tree Precision: %f' % precision)
recall = recall_score(y_test, predictions)
print('Decision Tree Recall: %f' % recall)
f1 = f1_score(y_test, predictions)
print('Decision Tree F1 score: %f' % f1)

# 10-fold cross-validation on the full dataset; KFold_CV reports percentages
print('------------K-Fold Cross Validation Result---------------------')
clf = DecisionTree(maxdepth=10)
kfold_result = KFold_CV(10, X, y, clf)
print("Decision Tree accuracy: ", kfold_result[0])
print('Decision Tree Precision: %f' % kfold_result[1])
print('Decision Tree Recall: %f' % kfold_result[2])
print('Decision Tree F1 score: %f' % kfold_result[3])

In [None]:
#RANDOM FOREST

# Hold-out evaluation on the train/test split
clf = RandomForest(num_trees=5, maxdepth=10)
clf.fit(X_train, y_train)
predictions = clf.predict(X_test)
print('Random Forest accuracy: ', accuracy_score(y_test, predictions))
precision = precision_score(y_test, predictions)
print('Random Forest Precision: %f' % precision)
recall = recall_score(y_test, predictions)
print('Random Forest Recall: %f' % recall)
f1 = f1_score(y_test, predictions)
print('Random Forest F1 score: %f' % f1)

# 10-fold cross-validation on the full dataset; KFold_CV reports percentages
print('------------K-Fold Cross Validation Result---------------------')
clf = RandomForest(num_trees=5, maxdepth=10)
kfold_result = KFold_CV(10, X, y, clf)
print("Random Forest accuracy: ", kfold_result[0])
print('Random Forest Precision: %f' % kfold_result[1])
print('Random Forest Recall: %f' % kfold_result[2])
print('Random Forest F1 score: %f' % kfold_result[3])

In [None]:
#NAIVE BAYES
# Hold-out evaluation: fit on the training split, score on the test split
nb = NaiveBayes()
nb.fit(X_train, y_train)
predictions = nb.predict(X_test)
print("Naive Bayes accuracy: ", accuracy_score(y_test, predictions))
precision = precision_score(y_test, predictions)
print('Naive Bayes Precision: %f' % precision)
recall = recall_score(y_test, predictions)
print('Naive Bayes Recall: %f' % recall)
f1 = f1_score(y_test, predictions)
print('Naive Bayes F1 score: %f' % f1)

# 10-fold cross-validation on the full X, y with a fresh model
print('------------K-Fold Cross Validation Result---------------------')
nb = NaiveBayes()
kfold_result = KFold_CV(10, X, y, nb)
# kfold_result is (accuracy, precision, recall, f1score)
print("Naive Bayes accuracy: ", kfold_result[0])
print('Naive Bayes Precision: %f' % kfold_result[1])
print('Naive Bayes Recall: %f' % kfold_result[2])
print('Naive Bayes F1 score: %f' % kfold_result[3])

In [None]:
#KNN 

# Hold-out evaluation: fit on the training split, score on the test split
clf = KNN(k=5)
clf.fit(X_train, y_train)
predictions = clf.predict(X_test)
print("KNN accuracy: ", accuracy_score(y_test, predictions))
precision = precision_score(y_test, predictions)
print('KNN Precision: %f' % precision)
recall = recall_score(y_test, predictions)
print('KNN Recall: %f' % recall)
f1 = f1_score(y_test, predictions)
print('KNN F1 score: %f' % f1)


# 10-fold cross-validation on the full X, y with a fresh model
print('------------K-Fold Cross Validation Result---------------------')
clf = KNN(k=5)
kfold_result = KFold_CV(10, X, y, clf)
# kfold_result is (accuracy, precision, recall, f1score)
print("KNN accuracy: ", kfold_result[0])
print('KNN Precision: %f' % kfold_result[1])
print('KNN Recall: %f' % kfold_result[2])
print('KNN F1 score: %f' % kfold_result[3])

### Dataset 2

In [None]:
# Replace column 4 with integer codes (presumably it holds text categories - TODO confirm)
encoder = LabelEncoder()
data2[4] = encoder.fit_transform(data2[4])

# Labels come from the last column; the features are everything except
# column 9 (assumed to be that same last column - verify against the data).
y = data2.iloc[:, -1:].to_numpy().flatten()
X = data2.drop(columns=data2.columns[9]).to_numpy()

# Seeded 80/20 hold-out split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=1234
)

In [None]:
#DECISION TREE

# Hold-out evaluation on the train/test split
clf = DecisionTree(maxdepth=10)
clf.fit(X_train, y_train)
predictions = clf.predict(X_test)
print("Decision Tree accuracy: ", accuracy_score(y_test, predictions))
precision = precision_score(y_test, predictions)
print('Decision Tree Precision: %f' % precision)
recall = recall_score(y_test, predictions)
print('Decision Tree Recall: %f' % recall)
f1 = f1_score(y_test, predictions)
print('Decision Tree F1 score: %f' % f1)


# 10-fold cross-validation on the full dataset; KFold_CV reports percentages
print('------------K-Fold Cross Validation Result---------------------')
clf = DecisionTree(maxdepth=10)
kfold_result = KFold_CV(10, X, y, clf)
print("Decision Tree accuracy: ", kfold_result[0])
print('Decision Tree Precision: %f' % kfold_result[1])
print('Decision Tree Recall: %f' % kfold_result[2])
print('Decision Tree F1 score: %f' % kfold_result[3])

In [None]:
#RANDOM FOREST

# Hold-out evaluation on the train/test split
clf = RandomForest(num_trees=5, maxdepth=10)
clf.fit(X_train, y_train)
predictions = clf.predict(X_test)
print('Random Forest accuracy: ', accuracy_score(y_test, predictions))
precision = precision_score(y_test, predictions)
print('Random Forest Precision: %f' % precision)
recall = recall_score(y_test, predictions)
print('Random Forest Recall: %f' % recall)
f1 = f1_score(y_test, predictions)
print('Random Forest F1 score: %f' % f1)


# 10-fold cross-validation on the full dataset; KFold_CV reports percentages
print('------------K-Fold Cross Validation Result---------------------')
clf = RandomForest(num_trees=5, maxdepth=10)
kfold_result = KFold_CV(10, X, y, clf)
print("Random Forest accuracy: ", kfold_result[0])
print('Random Forest Precision: %f' % kfold_result[1])
print('Random Forest Recall: %f' % kfold_result[2])
print('Random Forest F1 score: %f' % kfold_result[3])

In [None]:
#NAIVE BAYES
# Hold-out evaluation on the train/test split
nb = NaiveBayes()
nb.fit(X_train, y_train)
predictions = nb.predict(X_test)
print("Naive Bayes accuracy: ", accuracy_score(y_test, predictions))
precision = precision_score(y_test, predictions)
print('Naive Bayes Precision: %f' % precision)
recall = recall_score(y_test, predictions)
print('Naive Bayes Recall: %f' % recall)
f1 = f1_score(y_test, predictions)
print('Naive Bayes F1 score: %f' % f1)

# 10-fold cross-validation on the full dataset; KFold_CV reports percentages
print('------------K-Fold Cross Validation Result---------------------')
nb = NaiveBayes()
kfold_result = KFold_CV(10, X, y, nb)
print("Naive Bayes accuracy: ", kfold_result[0])
print('Naive Bayes Precision: %f' % kfold_result[1])
print('Naive Bayes Recall: %f' % kfold_result[2])
print('Naive Bayes F1 score: %f' % kfold_result[3])

In [None]:
#KNN

# Hold-out evaluation on the train/test split
clf = KNN(k=5)
clf.fit(X_train, y_train)
predictions = clf.predict(X_test)
print("KNN accuracy: ", accuracy_score(y_test, predictions))
precision = precision_score(y_test, predictions)
print('KNN Precision: %f' % precision)
recall = recall_score(y_test, predictions)
print('KNN Recall: %f' % recall)
f1 = f1_score(y_test, predictions)
print('KNN F1 score: %f' % f1)



# 10-fold cross-validation on the full dataset; KFold_CV reports percentages
print('------------K-Fold Cross Validation Result---------------------')
clf = KNN(k=5)
kfold_result = KFold_CV(10, X, y, clf)
print("KNN accuracy: ", kfold_result[0])
print('KNN Precision: %f' % kfold_result[1])
print('KNN Recall: %f' % kfold_result[2])
print('KNN F1 score: %f' % kfold_result[3])

In [None]:
# Demo training file: whitespace-delimited, no header row
demo_train = pd.read_csv('project3_dataset3_train.txt', delimiter=r'\s+', header=None)

# The last column is the label, the remaining columns are features
y = demo_train.iloc[:, -1].to_numpy()
X = demo_train.iloc[:, :-1].to_numpy()

# The demo data comes as separate train and test files, so the whole training
# file is the training set and no further split is needed.
X_train, y_train = X, y

In [None]:
# Demo test file: whitespace-delimited, no header row
demo_test = pd.read_csv('project3_dataset3_test.txt', delimiter=r'\s+', header=None)

# The last column is the label, the remaining columns are features
y1 = demo_test.iloc[:, -1].to_numpy()
X1 = demo_test.iloc[:, :-1].to_numpy()

# Evaluate on the dedicated test file
X_test, y_test = X1, y1

In [None]:
# KNN (k=3): fit on X_train/y_train, then score the predictions for X_test against y_test
clf = KNN(k=3)
clf.fit(X_train, y_train)
predictions = clf.predict(X_test)
print("KNN accuracy: ", accuracy_score(y_test, predictions))
precision = precision_score(y_test, predictions)
print('KNN Precision: %f' % precision)
recall = recall_score(y_test, predictions)
print('KNN Recall: %f' % recall)
f1 = f1_score(y_test, predictions)
print('KNN F1 score: %f' % f1)