In [153]:
import numpy as np 
from numpy import array
import pandas as pd 
from sklearn import datasets
import statistics
from sklearn.model_selection import KFold 
from sklearn.metrics import accuracy_score
from matplotlib import pyplot
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
import seaborn as sns
import matplotlib.pyplot as plt 

# Setosa, Versicolour, and Virginica
# 0, 1 or 2
iris = datasets.load_iris()
# Sepal Length, Sepal Width, Petal Length and Petal Width
X = iris.data[:100, :]  # taking all 4 attributes within Setosa and Versicolour flower only
y = iris.target[ : 100] # taking only the Setosa and Versicolour flowers

posDistrList, negDistrList = fit(X, y)
testMatrix = np.array([[5.2, 4.1, 1.5, 0.1], 
[5.3, 4.2, 1.7, 0.2],
[5.6, 7.1, 1.4, 0.5],
[5.6, 4.1, 1.8, 0.3],
[5.2, 4.1, 1.5, 0.8],
[5.2, 4.1, 1.5, 0.9], [5.2, 5.1, 1.1, 0.1], [6.2, 4.1, 1.5, 0.2], [3.2, 5.1, 1.5, 0.1], [7.2, 2.1, 1.5, 0.1]])

predictMatrix(testMatrix, posDistrList, negDistrList)


a dataset row:  [5.2 4.1 1.5 0.1 0. ]
different classes: [1, 0]
[NormalDist(mu=5.936, sigma=0.5161711470638634), NormalDist(mu=2.77, sigma=0.3137983233784114), NormalDist(mu=4.26, sigma=0.46991097723995795), NormalDist(mu=1.3259999999999998, sigma=0.19775268000454405)]
[NormalDist(mu=5.006, sigma=0.3524896872134513), NormalDist(mu=3.428, sigma=0.37906436909628866), NormalDist(mu=1.462, sigma=0.17366399648018407), NormalDist(mu=0.24600000000000002, sigma=0.10538558938004566)]
prodasdasd::  [3.45837797e-01 1.63426384e-01 2.70287470e-22 3.43239641e-02
 9.01182285e-07 3.91676400e-09 1.15729027e-05 3.07959783e-03
 2.30171943e-10 1.61896423e-11]
probamatrix::  [[3.45837797e-01 5.56484982e-21]
 [1.63426384e-01 3.98991082e-19]
 [2.70287470e-22 4.52114234e-54]
 [3.43239641e-02 1.36734818e-16]
 [9.01182285e-07 3.59226704e-14]
 [3.91676400e-09 1.21336696e-13]
 [1.15729027e-05 2.21652932e-31]
 [3.07959783e-03 2.72993795e-19]
 [2.30171943e-10 1.03477550e-34]
 [1.61896423e-11 6.24731273e-19]]


In [125]:
# TODO have posDistrList as global variable
def fit(X, y):
    # Get the amount of different features e.g, 4
    nrOfFeatures = np.size(X, axis=1)
    # Make y shape == X shape
    yReshaped = np.reshape(y, (np.size(y), 1))
    # Append label to each featurevector
    dataset = np.append(X, yReshaped, axis=1)
    print("a dataset row: ", dataset[32])
    # Find the labels e.g [1, 0]
    classes = list(set(y))
    classes.sort(reverse=True)
    print("different classes:", classes)
    # Separate the classes into positives and negatives
    # Where 1 represent the positive class and vice versa
    filterPositiveFeatureVectors = dataset[:, -1] == classes[0]
    filterNegativeFeatureVectors = dataset[:, -1] == classes[1]
    positiveFeatureVectors = dataset[filterPositiveFeatureVectors]
    negativeFeatureVectors = dataset[filterNegativeFeatureVectors] 
    # Slicing away the labels
    posFeatureMatrix = positiveFeatureVectors[:, : -1]
    negFeatureMatrix = negativeFeatureVectors[:, : -1]
    # Transposing the matrix to sort the same kind of feature in the same vectors
    # [f0:[1,5,9,13,17,19,21,23,25], f1:[2,6,10,14,18,20,22,24], ...]
    posFeatureMatrixTransposed = np.transpose(posFeatureMatrix)
    negFeatureMatrixTransposed = np.transpose(negFeatureMatrix)
    # Calculating the normal distribution on all feature vectors
    # Placing them in two separate distribution lists
    # TODO why using gaussian naive bayes ? 
    posDistrList = list(map(statistics.NormalDist.from_samples, posFeatureMatrixTransposed))
    print(posDistrList)
    negDistrList = list(map(statistics.NormalDist.from_samples, negFeatureMatrixTransposed))
    print(negDistrList)

    return posDistrList, negDistrList



In [158]:
# Bayes theorem to calculate probabilty with the trained distributions. 
# Using pdf as refered to Probability density function from the normal distributions.
# As instructed in https://docs.python.org/3/library/statistics.html at the bottom.
# Skipping the evidence P(data) factor, because it is just a constant over all the factors which is redundant.
def calculatePosterior(featureVector, distrList, prior):
    posterior = prior
    for feature, distrObj in zip(featureVector, distrList):
        posterior *= distrObj.pdf(feature)
    return posterior

# returns [[negProb, posProb], [negProb, posProb], [negProb, posProb]...]
# newDataMatrix: [[5.2, 4.1, 1.5, 0.1], [5.2, 4.1, 1.5, 0.1] ...]
# posDistr: [obj, obj, obj, obj]
def predictMatrix(newDataMatrix, posDistrList, negDistrList):
    # posteriorPos = 0.5 * posDistrList[0].pdf(5.2) * posDistrList[1].pdf(4.1)...
    # posteriorNeg = 0.5 * negDistrList[0].pdf(5.2) * negDistrList[1].pdf(4.1)...
    # -> [negProb, posProb]
    
    posProbVector = np.array([])
    negProbVector = np.array([])
    for vector in newDataMatrix:
        posPosterior = calculatePosterior(vector, posDistrList, 0.5)
        negPosterior = calculatePosterior(vector, negDistrList, 0.5)
        posProbVector = np.append(posProbVector, posPosterior)
        negProbVector = np.append(negProbVector, negPosterior)
    probaMatrix = np.stack((negProbVector, posProbVector), axis=1)
    #print("probamatrix:: ", probaMatrix)
    return probaMatrix

In [98]:
def true_false_positive(threshold_vector, y_test):
    # predicted class 1, actual class 1
    true_positive = np.equal(threshold_vector, 1) & np.equal(y_test, 1)
    true_negative = np.equal(threshold_vector, 0) & np.equal(y_test, 0)
    false_positive = np.equal(threshold_vector, 1) & np.equal(y_test, 0)
    false_negative = np.equal(threshold_vector, 0) & np.equal(y_test, 1)

    tpr = true_positive.sum() / (true_positive.sum() + false_negative.sum())
    fpr = false_positive.sum() / (false_positive.sum() + true_negative.sum())

    return tpr, fpr

In [None]:
def calculateTprFpr(scoreMatrix):
    maxScore = max(scoreList)
    minScore = min(scoreList)
    thresholdLow = np.arange(minScore, 1.0, (1.0-minScore)/10000.0)
    thresholdHigh = np.arange(1.0, maxScore, (maxScore-1.0)/10000.0)
    thresholdVector = np.append(thresholdLow, thresholdHigh)
    # thresholdVector: [0.1, 0.01, 0.001...]
    # scoreMatrix: [[score, y], [score, y] ...100]
    scoreMatrix = [[1, 0], [2, 1], [3, 0]]
    threshold_vector = [0,1,2,4]
    score = [1,2,3,4]
    truths = [0,1,1,1]
    for threshold in thresholdVector:
        true_positive = np.greater_equal(score, threshold) & np.equal(truths, 1) # predicted class 1, actual class 1
        true_negative = np.greater_equal(score, threshold) & np.equal(truths, 0) # predicted class 0, actual class 0
        false_positive = np.greater_equal(score, threshold) & np.equal(truths, 0) # predicted class 1, actual class 0
        false_negative = np.greater_equal(score, threshold) & np.equal(truths, 1) # predicted class 0, actual class 1
        tpr = true_positive.sum() / (true_positive.sum() + false_negative.sum())
        fpr = false_positive.sum() / (false_positive.sum() + true_negative.sum())
        tprVector.append(tpr)
        fprVector.append(fpr)
        print(true_positive)
        print(tpr)

    

    return tprVector, fprVector

In [None]:
def printROCcurve():
    return 1

In [171]:
# Cross validation 
k = 10
kf = KFold(n_splits=k, random_state=None, shuffle=True)

partitions = 10
sns.set()

scoreMegaVector = np.array([])
for train_i, test_i in kf.split(X):
    X_train = X[train_i, :]
    y_train = y[train_i]
    X_test = X[test_i, : ]
    y_test = y[test_i]

    # Training the iris dataset, where normal distributions are calculated.
    posDistrList, negDistrList = fit( X_train, y_train)
    
    # TODO Make predict to return array of probablities
    # Predicts class: returns [[negProb, posProb], [negProb, posProb], [negProb, posProb] ...10]
    probaMatrix = predictMatrix(X_test, posDistrList, negDistrList)
    print("proba matrixe: ", probaMatrix)

    # Score = neg proba / pos proba
    # score: [score1, score2, score3...10]
    score = np.apply_along_axis(lambda probaVector: probaVector[0]/probaVector[1], 1, probaMatrix)
    print("SCORE vector::: ", score)
    # y_test: [1,0,0,1,0 ...10]
    # Append truthvalue to each score
    # scoreTruthMatrix: [[score, y], [score, y], ...10]
    scoreTruthMatrix = np.stack((score, y_test), axis=1)
    print("ScoretruthMatrix:: ", scoreTruthMatrix)

    # append to global scorelist (size 100)
    # scoreMegaMatrix: 
    scoreMegaVector = np.append(scoreMegaVector, scoreTruthMatrix)

# reshape megaVector to Matrix: [[score, y], [score, y] ...100]
scoreMegaMatrix = scoreMegaVector.reshape(-1, 2)
print("scoremegamatrix::: ", scoreMegaMatrix)

# threshhold list
# for threshold
    # for score, truthvalue in scorelist:
        # tp, fp
    # tpr, fpr
    # append to tprlist, fprlist
# tprlist, fprlist
# print roc curve


  
    



a dataset row:  [5.5 3.5 1.3 0.2 0. ]
different classes: [1, 0]
[NormalDist(mu=5.933333333333334, sigma=0.5040743090962969), NormalDist(mu=2.7644444444444445, sigma=0.30833538082858264), NormalDist(mu=4.262222222222222, sigma=0.4391566757531825), NormalDist(mu=1.3177777777777777, sigma=0.19103651351233747)]
[NormalDist(mu=4.988888888888889, sigma=0.3562571991120908), NormalDist(mu=3.413333333333333, sigma=0.36531431049788543), NormalDist(mu=1.4555555555555555, sigma=0.18034758807716983), NormalDist(mu=0.24222222222222223, sigma=0.10550503841671795)]
proba matrixe:  [[3.94917431e+000 9.77347118e-020]
 [5.76487611e-003 1.69144415e-020]
 [1.80521160e+000 5.03474173e-017]
 [1.21165113e+000 2.13032419e-015]
 [4.35933521e+000 2.79466101e-018]
 [1.69928958e-116 1.88815212e-002]
 [1.58269810e-123 6.98001109e-003]
 [1.10195969e-084 2.27255366e-001]
 [1.89241585e-069 5.42194693e-001]
 [1.12698379e-031 1.43915199e-003]]
SCORE vector:::  [4.04070800e+019 3.40825684e+017 3.58550983e+016 5.68763728e