In [157]:
import math
from scipy.stats import norm
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import f1_score, recall_score, accuracy_score, precision_score, confusion_matrix
import torch
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from imblearn.under_sampling import RandomUnderSampler

In [158]:
# Load the raw table from CSV; ready_data() later treats the last column as the
# target and every other column as a feature.
# NOTE(review): read_csv consumes the first line as a header by default -- if
# magic04.data has no header row, one sample is lost and header=None is needed.
# TODO confirm against the file.
dataset = pd.read_csv("data/magic04.data")

In [159]:
def ready_data(dataset, label_encode=True, resample=True, split=True):
    """Turn a DataFrame into feature/target arrays, optionally balanced and split.

    Args:
        dataset: Object exposing a 2-D ``.values`` array whose last column is
            the target and whose remaining columns are the features.
        label_encode: When true, the target column is passed through
            ``LabelEncoder`` before being cast to float16.
        resample: When true, the classes are balanced with
            ``RandomUnderSampler(random_state=0, replacement=True)``.
        split: When true, a 70/30 train/test split (``random_state=0``) is
            returned; otherwise the full arrays are returned.

    Returns:
        ``(X_train, X_test, y_train, y_test)`` when ``split`` is true,
        otherwise ``(inputs, targets)``.
    """
    table = dataset.values
    features = np.array(table[:, 0:-1], dtype=np.float32)

    labels = table[:, -1]
    if label_encode:
        labels = LabelEncoder().fit_transform(labels)

    # Targets are kept as a float16 column vector of shape (n_samples, 1).
    labels = np.array(labels, dtype=np.float16)
    labels = labels.reshape((labels.shape[0], 1))

    if resample:
        sampler = RandomUnderSampler(random_state=0, replacement=True)
        features, labels = sampler.fit_resample(features, labels)

    if not split:
        return (features, labels)

    # A fixed random_state makes the split identical on every run.
    return train_test_split(features, labels, test_size=0.3, random_state=0)

In [160]:
def main(dataset, method="NB"):
    """Train and evaluate the requested classifier, then print its metrics.

    Only ``"NB"`` (Gaussian naive Bayes) is implemented. The other recognised
    method names used to fall through silently, after which the print loop
    either failed with ``NameError`` or printed whatever a previous run had
    left in the global ``result``. They now fail explicitly.

    Args:
        dataset: Table accepted by ``ready_data``.
        method: One of ``"NB"``, ``"DT"``, ``"RF"``, ``"AB"``, ``"KNN"``.

    Raises:
        NotImplementedError: ``method`` is recognised but not implemented yet.
        ValueError: ``method`` is not a recognised name.

    Side effects:
        Stores the metrics dictionary in the module-level name ``result``
        (kept for backward compatibility) and prints one line per metric.
    """
    global result

    if method == "NB":
        # Naive Bayes uses the class priors, so the data is not resampled.
        X_train, X_test, y_train, y_test = ready_data(dataset, resample=False)
        result = naive_bayes_gaussian(X_train, y_train, X_test, y_test)
    elif method in ("DT", "RF", "AB", "KNN"):
        raise NotImplementedError(f"method {method!r} is not implemented yet")
    else:
        raise ValueError(f"unknown method {method!r}")

    for title, value in result.items():
        print(f"{title} --> {value}")



In [161]:
class NaiveBayesGaussian:
    """Binary Gaussian naive Bayes classifier for targets encoded as 0 and 1.

    Each feature is modelled, per class, as an independent normal distribution
    whose mean and standard deviation are estimated in ``fit``. Prediction
    compares ``log(prior) + sum(log pdf)`` for the two classes.

    Attributes:
        min_std: Lower bound applied to every per-class standard deviation.
        ones_num / zero_num: Number of training rows with label 1 / 0.
        ones_mean / zero_mean: Per-feature means (float32 torch tensors).
        ones_std / zero_std: Per-feature standard deviations (float32 torch
            tensors), floored at ``min_std``.
    """

    def __init__(self, min_std=1e-9):
        """Create an unfitted model.

        Args:
            min_std: Floor for the per-feature standard deviations. Without a
                floor, a feature that is constant inside one class gives a
                zero scale, ``norm.logpdf`` returns NaN, and every prediction
                silently becomes 0.
        """
        self.min_std = min_std
        self.ones_num = 0
        self.zero_num = 0
        self.ones_mean = None
        self.zero_mean = None
        self.ones_std = None
        self.zero_std = None

    def fit(self, X, y):
        """Estimate class priors and per-feature Gaussian parameters.

        Args:
            X: 2-D array-like or tensor of shape (n_samples, n_features).
            y: Labels with one entry per row of ``X`` (flat or column vector);
                every value must be 0 or 1.

        Raises:
            ValueError: ``X`` is not 2-D, the lengths differ, a label is
                neither 0 nor 1, or one of the two classes has no samples.
        """
        X = torch.as_tensor(X, dtype=torch.float32)
        y = torch.as_tensor(y).reshape(-1).to(torch.float32)

        if X.dim() != 2:
            raise ValueError("X must be 2-D (n_samples, n_features)")
        if X.shape[0] != y.shape[0]:
            raise ValueError("X and y must contain the same number of samples")

        # One mask per class is used for both counting and row selection, so
        # the counts can never disagree with the rows that get selected.
        is_one = y == 1
        is_zero = y == 0
        if not bool(torch.all(is_one | is_zero)):
            raise ValueError("labels must be encoded as 0 and 1")

        ones_num = int(is_one.sum())
        zero_num = int(is_zero.sum())
        if ones_num == 0 or zero_num == 0:
            # A missing class would otherwise surface later as log(0).
            raise ValueError("both classes need at least one training sample")

        X_0 = X[is_zero]
        X_1 = X[is_one]

        self.ones_num = ones_num
        self.zero_num = zero_num
        self.zero_mean = torch.mean(X_0, dim=0)
        self.ones_mean = torch.mean(X_1, dim=0)
        self.zero_std = self._class_std(X_0)
        self.ones_std = self._class_std(X_1)

    def _class_std(self, rows):
        """Return the per-feature sample std of ``rows``, floored at ``min_std``."""
        if rows.shape[0] < 2:
            # The sample std of a single row is undefined; use the floor.
            std = torch.zeros(rows.shape[1], dtype=rows.dtype)
        else:
            std = torch.std(rows, dim=0)
        return torch.clamp(std, min=self.min_std)

    def _class_scores(self, X):
        """Return ``(score_0, score_1)``, one log-score per row of 2-D ``X``."""
        if torch.is_tensor(X):
            X = X.detach().cpu().numpy()
        X = np.asarray(X, dtype=np.float64)
        total = self.zero_num + self.ones_num

        def score(class_count, mean, std):
            loc = mean.detach().cpu().numpy().astype(np.float64)
            scale = std.detach().cpu().numpy().astype(np.float64)
            # One broadcast call per class replaces a scalar call per cell.
            log_likelihood = norm.logpdf(X, loc=loc, scale=scale).sum(axis=1)
            return math.log(class_count / total) + log_likelihood

        score_0 = score(self.zero_num, self.zero_mean, self.zero_std)
        score_1 = score(self.ones_num, self.ones_mean, self.ones_std)
        return score_0, score_1

    def predict(self, X):
        """Predict a label for every row of ``X``.

        Args:
            X: 2-D array-like or tensor of shape (n_samples, n_features).

        Returns:
            float32 NumPy array of shape (n_samples, 1) holding 0.0 or 1.0.
        """
        score_0, score_1 = self._class_scores(X)
        # Class 1 is chosen only on a strictly higher score; ties go to 0.
        return (score_1 > score_0).astype(np.float32).reshape(-1, 1)

    def predict_instance(self, x):
        """Predict the label of one sample.

        Args:
            x: 1-D sequence, array or tensor of feature values.

        Returns:
            ``1.0`` when class 1 scores strictly higher, otherwise ``0.0``.
        """
        if torch.is_tensor(x):
            x = x.detach().cpu().numpy()
        row = np.asarray(x, dtype=np.float64).reshape(1, -1)
        score_0, score_1 = self._class_scores(row)
        if score_1[0] > score_0[0]:
            return 1.0
        else:
            return 0.0


In [162]:
def get_result(true, predictions):
    """Collect binary-classification metrics into a dictionary.

    Args:
        true: Ground-truth labels.
        predictions: Predicted labels, one per ground-truth entry.

    Returns:
        Dictionary with the F1 score, recall, accuracy and precision (1.0 is
        the positive label) followed by the four cells of the 2x2 confusion
        matrix under the keys ``tn``, ``fp``, ``fn`` and ``tp``.
    """
    # Rows are the true classes, columns the predicted classes.
    negatives_row, positives_row = confusion_matrix(true, predictions)
    tn, fp = negatives_row
    fn, tp = positives_row

    f_measure = f1_score(true, predictions, pos_label=1.0)
    recall = recall_score(true, predictions, pos_label=1.0)
    accuracy = accuracy_score(true, predictions)
    precision = precision_score(true, predictions, pos_label=1.0)

    return dict(
        f=f_measure,
        recall=recall,
        accuracy=accuracy,
        precision=precision,
        tn=tn,
        fp=fp,
        fn=fn,
        tp=tp,
    )


In [163]:
def naive_bayes_gaussian(X, y, X_test, y_test):
    """Fit a ``NaiveBayesGaussian`` on the training data and score it.

    Args:
        X: Training features.
        y: Training labels.
        X_test: Features to predict on.
        y_test: True labels for ``X_test``.

    Returns:
        The metrics dictionary produced by ``get_result``.
    """
    classifier = NaiveBayesGaussian()
    classifier.fit(X, y)
    return get_result(y_test, classifier.predict(X_test))


In [9]:
# Run the Gaussian naive Bayes pipeline (no resampling) and print its metrics.
main(dataset, "NB")

f --> 0.5092180546726001
recall --> 0.3922624877571009
accuracy --> 0.7294076410795653
precision --> 0.7255434782608695
tn --> 3361
fp --> 303
fn --> 1241
tp --> 801


In [164]:
def weighted_impurity_temp(true_correct, true_wrong, false_correct, false_wrong):
    """Weighted Gini impurity of a two-leaf, two-class split.

    Args:
        true_correct: Correctly classified samples in the "true" leaf.
        true_wrong: Misclassified samples in the "true" leaf.
        false_correct: Correctly classified samples in the "false" leaf.
        false_wrong: Misclassified samples in the "false" leaf.

    Returns:
        The Gini impurity of each leaf weighted by its share of the samples.
        An empty leaf contributes nothing and an entirely empty split yields
        ``0.0``; both cases previously divided by zero.
    """

    def _leaf_gini(correct, wrong):
        """Gini impurity of one leaf; an empty leaf counts as pure."""
        size = correct + wrong
        if size == 0:
            return 0.0
        return 1 - (correct / size) ** 2 - (wrong / size) ** 2

    true_size = true_correct + true_wrong
    false_size = false_correct + false_wrong
    total = true_size + false_size
    if total == 0:
        return 0.0

    true_weight = true_size / total
    false_weight = false_size / total

    return (_leaf_gini(true_correct, true_wrong) * true_weight
            + _leaf_gini(false_correct, false_wrong) * false_weight)


def is_numeric(value):
    """Test if a value is numeric.

    Accepts Python ``int``/``float`` as before and also NumPy integer and
    floating scalars such as ``np.int64`` or ``np.float32``, which the earlier
    check rejected even though this notebook works on NumPy data.

    Args:
        value: Any object.

    Returns:
        ``True`` for real-valued numeric scalars, ``False`` otherwise.
    """
    return isinstance(value, (int, float, np.integer, np.floating))

In [165]:
# Hand-made fixtures for experimenting with the impurity/stump helpers.

# Mixed strings and numbers in one array; not referenced by any other cell
# visible in this notebook.
data = np.array(["ahmed", "kamal", "hepalla", 3,4.5,"howa","ahmed","ahmed","kamal",3])
# Small numeric table; a later cell uses all but the last column as inputs and
# the last column as the target.
training_data = np.array([
    [0, 3, 0],
    [1, 3, 0],
    [2, 1, 1],
    [2, 1, 1],
    [1, 3, 2.],
])
# m_1 .. m_3: each is a list of (per-class count array, leaf tag) pairs, i.e.
# the structure that weighted_impurity() iterates over.
m_1 = [
    (np.array([105,39]),0),
    (np.array([34,125]),1)
]
m_2 = [
    (np.array([37,127]),0),
    (np.array([100,33]),1)
]
m_3 = [
    (np.array([92,31]),0),
    (np.array([45,129]),1)
]
# Two-column toy set; a later cell uses column 0 as the input and column 1 as
# the target.
state_quest = np.array([
    [220,1],
    [180,1],
    [225,1],
    [190,0],
    [155,0]
])

In [None]:
# Full (label-encoded, under-sampled) feature/target arrays without a train/test split.
inputs ,targets = ready_data(dataset,split=False)

In [166]:
# Default pipeline: label-encode, under-sample, then a 70/30 train/test split.
X_train, X_test, y_train, y_test = ready_data(dataset)

In [176]:
# Rebind inputs/targets to the toy training_data table (last column = target).
# NOTE(review): this overwrites any earlier inputs/targets, so the values
# depend on which cell ran last.
inputs , targets = training_data[:,0:-1] , training_data[:,-1]

In [177]:
# Rebind inputs/targets to the toy state_quest table (last column = target).
# NOTE(review): this overwrites any earlier inputs/targets, so the values
# depend on which cell ran last.
inputs , targets = state_quest[:,0:-1] , state_quest[:,-1]

In [12]:
# First feature column of the training split and the training labels as tensors.
# NOTE(review): a later cell rebinds X to the full feature matrix, and this
# cell's execution count is out of order relative to its neighbours.
X = torch.tensor(X_train[:,0])
y = torch.tensor(y_train)

In [167]:
def weighted_impurity(counts):
    """Weighted Gini impurity of a split described by per-leaf class counts.

    Args:
        counts: Sequence of ``(leaf, tag)`` pairs where ``leaf`` is an array
            of per-class sample counts for that leaf; ``tag`` is ignored.

    Returns:
        The sum over leaves of ``gini(leaf) * leaf_size / total_size``.
        A leaf without samples contributes nothing, and a split without any
        samples yields ``0.0``. Previously an empty leaf evaluated 0/0 and
        turned the whole result into NaN, which made any later ordering of
        candidate splits by impurity unreliable.
    """
    leaf_totals = np.zeros(len(counts))
    for i, (leaf, _) in enumerate(counts):
        leaf_totals[i] = np.sum(leaf)

    grand_total = np.sum(leaf_totals)
    if grand_total == 0:
        return 0.0

    total_weighted_gini = 0
    for (leaf, _), leaf_total in zip(counts, leaf_totals):
        if leaf_total == 0:
            # An empty leaf has zero weight; skip it to avoid 0/0.
            continue
        gini = 1
        for class_count in leaf:
            gini -= (class_count / leaf_total) ** 2
        total_weighted_gini += gini * (leaf_total / grand_total)
    return total_weighted_gini

In [168]:
def count(x, y):
    """Search one feature for the threshold with the lowest weighted Gini impurity.

    Args:
        x: Tensor of values for a single feature.
        y: Tensor of class labels (``.numpy()`` is called on it).
            NOTE(review): x and y are presumably 1-D and of equal length,
            since they are stacked with ``torch.vstack`` -- confirm.

    Returns:
        ``((gini, threshold), (counts, classes))`` where ``gini`` is the
        lowest weighted impurity found, ``threshold`` the matching cut-off as
        a Python float, ``counts`` a two-element list of
        ``(per-class count array, bool)`` for the "< threshold" (True) and
        ">= threshold" (False) sides, and ``classes`` the sorted unique labels.
    """
    # if is_numeric(x):
    # Pair every feature value with its label (one row per sample), then
    # order the rows by feature value.
    merged_data = (torch.vstack((x, y))).t()
    sorted_data = merged_data[merged_data[:, 0].argsort()].detach().clone()

    # Sorted unique labels; a label's position here is its index in the
    # per-leaf count arrays.
    classes = np.array(list(sorted(set(y.numpy()))))
    leaf = np.zeros(classes.shape)
    possible_gini_counts = []

    # Candidate positions are spaced one hundredth of the data apart, running
    # from `partition` up to (but excluding) 99 * partition.
    partition = len(sorted_data) / 100
    for part in np.arange(partition, 99 * partition, partition):
        part = int(part)
        # Candidate threshold: midpoint between the value at this position
        # and the value int(partition) rows further on.
        avg = (sorted_data[part, 0] + sorted_data[part + int(partition), 0]) / 2
        # Fresh zeroed count arrays for the True (left) and False (right) sides.
        count_stump = [(leaf.copy(), answer) for answer in [True, False]]

        # Tally every sample into the side it falls on, by class.
        for instance in sorted_data:
            cls_index = np.where(classes == instance[1].item())[0][0]
            if instance[0].item() < avg:
                # true:: left branch
                count_stump[0][0][cls_index] += 1
            else:
                # false:: right branch
                count_stump[1][0][cls_index] += 1

        possible_gini_counts.append([weighted_impurity(count_stump), avg, count_stump])
        # print(f"\t\tstump building in progress: {(part / (100 * partition)):.2%}")

    # Keep the candidate with the smallest weighted impurity.
    gini_avg_cutoff, separation_point, counts = sorted(possible_gini_counts, key=lambda record: record[0])[0]
    # print(f"gini: {gini_avg_cutoff}, separation point: {separation_point} \n--> {counts}")
    return (gini_avg_cutoff, float(separation_point.item())), (counts, classes)


In [200]:

class Stump:
    """Decision stump: a single threshold test on one feature.

    Building a stump looks up the best threshold for the chosen feature via
    ``count``, derives the stump's say (``stump_weight``) from the weighted
    error on the training rows, and computes the re-normalised sample weights.
    """

    def __init__(self, X, y, weights, root_feature=0):
        """Fit the stump on column ``root_feature`` of ``X``.

        Args:
            X: 2-D tensor of features.
            y: Tensor of labels, one per row of ``X``.
            weights: Tensor of per-sample weights.
            root_feature: Index of the column the stump splits on.
        """
        print(f"\tstart creating trial stump {root_feature}#")
        self._root_feature = root_feature
        feature_column = X[:, root_feature]

        split_stats, leaf_stats = count(feature_column, y)
        self._total_gini, self._separation_point = split_stats
        self._counts, self._classes = leaf_stats

        self._stump_weight = self.calc_stump_weight(feature_column, y, weights)
        self._new_weights = self.get_new_weights(feature_column, y, weights)
        print(f"\tfinish creating trial stump {root_feature}#")

    # ---- read-only views of the fitted state --------------------------------

    @property
    def root_feature(self):
        """Index of the feature this stump splits on."""
        return self._root_feature

    @property
    def separation_point(self):
        """Threshold; values strictly below it take the first leaf."""
        return self._separation_point

    @property
    def total_gini(self):
        """Weighted Gini impurity of the chosen split."""
        return self._total_gini

    @property
    def counts(self):
        """Per-leaf ``(class count array, tag)`` pairs."""
        return self._counts

    @property
    def classes(self):
        """Class labels, indexed like the per-leaf count arrays."""
        return self._classes

    @property
    def stump_weight(self):
        """Say of this stump, derived from its weighted error."""
        return self._stump_weight

    @property
    def new_weights(self):
        """Normalised sample weights after applying this stump."""
        return self._new_weights

    # ---- prediction ---------------------------------------------------------

    def _predict(self, x):
        """Return the majority class of the leaf that scalar ``x`` falls into."""
        leaf_index = 0 if x < self.separation_point else 1
        leaf_counts = self.counts[leaf_index][0]
        majority = int(leaf_counts.argsort()[-1])
        return self.classes[majority]

    def predict(self, x):
        """Predict the class of one sample given its full feature vector."""
        feature_value = x[self.root_feature].item()
        return self._predict(feature_value)

    # ---- boosting quantities ------------------------------------------------

    def calc_stump_weight(self, x, y, weights, error_term=1e-15):
        """Compute ``0.5 * log((1 - err) / err)`` from the weighted error.

        ``err`` is the sum of the weights of the misclassified samples;
        ``error_term`` is added to numerator and denominator so that an error
        of exactly 0 or 1 stays finite.
        """
        total_error = sum(
            weights[i].item()
            for i in range(len(y))
            if self._predict(x[i].item()) != y[i].item()
        )
        odds = (1 - total_error + error_term) / (total_error + error_term)
        return 0.5 * math.log(odds)

    def get_new_weights(self, x, y, weights):
        """Scale weights down for hits and up for misses, then normalise to sum 1."""
        signs = []
        for i in range(len(weights)):
            is_hit = self._predict(x[i].item()) == y[i].item()
            signs.append(-1. if is_hit else 1.)
        exponents = torch.tensor(signs) * self.stump_weight
        rescaled = torch.clone(weights) * (math.e ** exponents)
        return rescaled / rescaled.sum()

In [201]:
# get the best stump using the available dat
def get_best_stump(X, y, data_weights, features_not_included):
    """Build one stump per remaining feature and return the lowest-Gini one.

    Candidates used to be stored in a dictionary keyed by their float Gini
    value, so equal impurities overwrote each other and a NaN impurity made
    the key ordering unreliable. The best candidate is now tracked directly.
    On a tie the stump built later (higher feature index) still wins, which
    matches the earlier overwrite behaviour.

    Args:
        X: 2-D tensor of features.
        y: Tensor of labels.
        data_weights: Tensor of per-sample weights passed to every stump.
        features_not_included: Iterable of feature indices to skip.

    Returns:
        The ``Stump`` with the smallest ``total_gini``.

    Raises:
        ValueError: Every feature is excluded, so no stump can be built
            (this used to fail with an opaque ``IndexError``).
    """
    print(f"create the {len(features_not_included)}# stump")

    candidate_features = sorted(set(range(X.shape[1])) - set(features_not_included))
    if not candidate_features:
        raise ValueError("no candidate features left to build a stump from")

    chosen = None
    for root_feature in candidate_features:
        stump = Stump(X, y, data_weights, root_feature=root_feature)
        # A NaN impurity never compares as smaller, so replace it explicitly.
        if (chosen is None
                or math.isnan(chosen.total_gini)
                or stump.total_gini <= chosen.total_gini):
            chosen = stump

    print(f"chosen stump with weight: {chosen.stump_weight}, and its gini: {chosen.total_gini}" )
    return chosen

In [188]:
# Fit a single stump on feature 0 (the default root_feature) with uniform
# sample weights of 1/n each.
# NOTE(review): the recorded output below this cell contains "sum_weight" and
# "total_error" lines that the Stump class as shown does not print, so it
# appears to come from an earlier version of the class.
X = torch.tensor(X_train)
y = torch.tensor(y_train)
weight = torch.ones(X.shape[0]) / float(X.shape[0])
stump = Stump(X, y,weight)

	start creating trial stump 0#
sum_weight:  1.0
total_error: 0.38513296796008945
stump_weight: 0.2339086951896266
	finish creating trial stump 0#


In [199]:
# print(weight)
# print(stump.new_weights) #didn't work
# stump.stump_weight
# stump.separation_point
# stump.classes
# stump.counts
# stump.total_gini
# stump.root_feature
# new_stump = Stump(X,y,weight)

0

In [202]:
# Try every feature (nothing excluded) and display the lowest-Gini stump.
get_best_stump(X,y,weight,[])

create the 0# stump
	start creating trial stump 0#
sum_weight:  1.0
total_error: 0.38513296796008945
stump_weight: 0.2339086951896266
	finish creating trial stump 0#
	start creating trial stump 1#
sum_weight:  1.0
total_error: 0.398376586381346
stump_weight: 0.2061169679659755
	finish creating trial stump 1#
	start creating trial stump 2#
sum_weight:  1.0
total_error: 0.451991880312562
stump_weight: 0.09631294401275386
	finish creating trial stump 2#
	start creating trial stump 3#
sum_weight:  1.0
total_error: 0.4680123864673078
stump_weight: 0.06406272192970792
	finish creating trial stump 3#
	start creating trial stump 4#
sum_weight:  1.0
total_error: 0.4687600100878626
stump_weight: 0.06256147281602917
	finish creating trial stump 4#
	start creating trial stump 5#
sum_weight:  1.0
total_error: 0.43842785176821053
stump_weight: 0.12377249688421173
	finish creating trial stump 5#
	start creating trial stump 6#
sum_weight:  1.0
total_error: 0.4029691314790398
stump_weight: 0.1965544279

<__main__.Stump at 0x21c4f191460>

In [206]:
# Print the type of each stump's summed new weights.
# NOTE(review): `stumps` is only a local variable inside get_best_stump in the
# cells shown here, so this cell presumably relies on leftover kernel state --
# confirm it survives Restart & Run All.
for key in stumps:
    print(type(stumps.get(key).new_weights.sum()))

<class 'torch.Tensor'>
<class 'torch.Tensor'>
<class 'torch.Tensor'>
<class 'torch.Tensor'>
<class 'torch.Tensor'>
<class 'torch.Tensor'>
<class 'torch.Tensor'>
<class 'torch.Tensor'>
<class 'torch.Tensor'>
<class 'torch.Tensor'>
