In [257]:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
from pprint import pprint

In [258]:
def create_data():
    """Load the iris dataset as a binary classification problem.

    Returns:
        tuple: ``(X, y)`` where ``X`` is the array of the four iris feature
        columns and ``y`` is an array with ``1`` for iris class 0 and ``-1``
        for every other class.
    """
    iris = load_iris()

    # Turn names such as "sepal length (cm)" into "sepal_length".
    column_names = []
    for raw_name in iris.feature_names:
        base_name = raw_name.split(' (')[0]
        column_names.append(base_name.replace(' ', '_'))

    frame = pd.DataFrame(iris.data, columns=column_names)
    frame['label'] = iris.target

    # The label is the last column; everything before it is a feature.
    features = np.array(frame.iloc[:, :-1])
    targets = np.array(frame.iloc[:, -1])

    # Collapse the three iris classes into "class 0" (+1) versus "the rest" (-1).
    binary_targets = np.array([1 if target == 0 else -1 for target in targets])
    return features, binary_targets

In [259]:
X, y = create_data()
# Fixed seed: without random_state every kernel restart draws a different
# 80/20 split, so the sample and the scores shown in later cells could not be
# reproduced.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=42)

In [260]:
# Peek at the first training row and its label.
X_train[0],y_train[0]

(array([6.4, 2.8, 5.6, 2.2]), -1)

In [9]:
# Gaussian naive Bayes
import math
from functools import reduce
class NaiveBayes(object):
    """Gaussian naive Bayes classifier.

    Each feature is modelled, per class, as an independent normal
    distribution. A sample is assigned to the class that maximises
    ``log P(class) + sum_i log N(x_i | mean_i, stdev_i)``.

    Args:
        var_smoothing: Fraction of the largest per-feature variance of the
            training data that is added to every class variance. This keeps
            the density finite when a feature is constant within a class.

    Attributes set by ``fit``:
        data: dict mapping each label to the list of its training rows.
        priors: dict mapping each label to its relative frequency.
        means / stdevs: dicts mapping each label to per-feature arrays.
        rdim / cdim: number of training rows / feature columns.
    """

    def __init__(self, var_smoothing=1e-9):
        self.data = None
        self.var_smoothing = var_smoothing
        self.priors = None
        self.means = None
        self.stdevs = None

    def mean(self, X):
        """Return the per-feature mean of the rows in ``X``."""
        return np.mean(np.asarray(X, dtype=float), axis=0)

    def stdev(self, X):
        """Return the per-feature population standard deviation (ddof=0) of ``X``."""
        return np.std(np.asarray(X, dtype=float), axis=0)

    def separateByClass(self, X, y):
        """Group the rows of ``X`` by their label in ``y``.

        Labels are kept in order of first appearance so that tie-breaking in
        ``predict`` is deterministic (iterating a ``set`` is not, for strings).
        """
        separated = {}
        for label, value in zip(y, X):
            separated.setdefault(label, []).append(value)
        return separated

    def calculateProbability(self, x, mean, stdev):
        """Return the normal density of ``x`` for the given ``mean`` and ``stdev``."""
        return (1 / (np.sqrt(2 * np.pi) * stdev)) * np.exp(-(x - mean) ** 2 / (2 * stdev ** 2))

    def calculateClassProbabilities(self, x, mean, stdev):
        """Return the product of the per-feature densities of ``x`` (the class likelihood)."""
        return np.prod(self.calculateProbability(x, mean, stdev))

    def _log_likelihood(self, x, mean, stdev):
        """Return the per-feature log normal density of ``x``."""
        return -0.5 * np.log(2 * np.pi) - np.log(stdev) - (x - mean) ** 2 / (2 * stdev ** 2)

    def fit(self, X, y):
        """Estimate class priors and per-class feature means / deviations.

        Args:
            X: 2-D array-like of shape (rows, features).
            y: 1-D array-like of labels, one per row of ``X``.

        Raises:
            ValueError: if ``X`` and ``y`` have different lengths.
        """
        X = np.asarray(X, dtype=float)
        y = np.asarray(y)
        self.rdim, self.cdim = X.shape
        if len(y) != self.rdim:
            raise ValueError('X has %s rows but y has %s labels' % (self.rdim, len(y)))
        self.data = self.separateByClass(X, y)

        # Variance floor, scaled to the data; falls back to an absolute floor
        # when every feature is constant over the whole training set.
        total_var = float(np.var(X, axis=0).max()) if X.size else 0.0
        epsilon = self.var_smoothing * (total_var if total_var > 0 else 1.0)

        # Statistics are computed once here instead of on every predict() call.
        self.priors, self.means, self.stdevs = {}, {}, {}
        for label, rows in self.data.items():
            self.priors[label] = len(rows) / float(self.rdim)
            self.means[label] = self.mean(rows)
            self.stdevs[label] = np.sqrt(self.stdev(rows) ** 2 + epsilon)

    def predict(self, x):
        """Return the most probable label for a single sample ``x``.

        Raises:
            Exception: if the model has not been fitted.
        """
        if not self.data:
            raise Exception('the model need train data')
        x = np.asarray(x, dtype=float)
        best_label, best_score = None, None
        for label in self.data:
            # Log-space avoids underflow; the prior term is what makes this
            # a posterior rather than a bare likelihood comparison.
            score = np.log(self.priors[label]) + np.sum(
                self._log_likelihood(x, self.means[label], self.stdevs[label]))
            if best_label is None or score > best_score:
                best_label, best_score = label, score
        return best_label

    def score(self, X_test, y_test):
        """Return the fraction of rows in ``X_test`` predicted as in ``y_test``."""
        right_cnt = sum(1 for row, label in zip(X_test, y_test) if label == self.predict(row))
        return right_cnt / float(len(X_test))

In [10]:
model = NaiveBayes()
# Fit on the training split only: fitting on the full X, y would let the model
# see the test rows and inflate the held-out score computed in the next cell.
model.fit(X_train, y_train)
model.predict(X_test[2])

-1

In [11]:
# Fraction of test rows whose predicted label equals the true label.
model.score(X_test, y_test)

1.0

In [241]:
class MultinomialNaiveBayes(object):
    """Naive Bayes classifier for categorical (discrete-valued) features.

    A sample is assigned to the class that maximises
    ``P(class) * prod_i P(x_i | class)``, evaluated in log-space.

    Args:
        alpha: Additive (Laplace) smoothing strength. ``0.0`` (the default)
            gives plain maximum-likelihood estimates; ``1.0`` gives Laplace
            smoothing. With ``alpha == 0`` a feature value never observed
            with a class has probability zero for that class.

    Attributes set by ``fit``:
        priori: dict ``label -> P(label)``.
        cond: dict ``label -> feature index -> value -> P(value | label)``,
            with an entry for every value of that feature seen in training.
        result: list of the known labels, in order of first appearance.
        class_count: dict ``label -> number of training rows``.
        rdim / cdim: number of training rows / feature columns.
    """

    def __init__(self, alpha=0.0):
        if alpha < 0:
            raise ValueError('alpha must be non-negative, got %s' % (alpha,))
        self.alpha = alpha
        self.priori = None
        self.cond = None
        self.result = None
        self.class_count = None

    def separateByClass(self, X, y):
        """Group the rows of ``X`` by their label in ``y``.

        Labels are kept in order of first appearance so that tie-breaking in
        prediction is deterministic (iterating a ``set`` is not, for strings).
        """
        separated = {}
        for label, value in zip(y, X):
            separated.setdefault(label, []).append(value)
        return separated

    def priori_probability(self, separated):
        """Return ``{label: P(label)}``, smoothed by ``alpha``."""
        denom = float(self.rdim) + self.alpha * len(separated)
        return {k: (len(v) + self.alpha) / denom for k, v in separated.items()}

    def conditional_probability(self, separated):
        """Return ``{label: {feature index: {value: P(value | label)}}}``.

        Every value observed for a feature anywhere in the training data gets
        an entry under every class, so each table is a full distribution over
        that feature's observed domain.
        """
        rows_by_class = {k: np.array(v) for k, v in separated.items()}

        # Observed domain of each feature, across all classes.
        domains = []
        for i in range(self.cdim):
            seen = {}
            for rows in rows_by_class.values():
                for value in rows[:, i]:
                    seen[value] = True
            domains.append(list(seen))

        pro = {}
        for k, rows in rows_by_class.items():
            pro[k] = {}
            for i in range(self.cdim):
                # Single pass instead of list.count() once per distinct value.
                counts = {}
                for value in rows[:, i]:
                    counts[value] = counts.get(value, 0) + 1
                denom = len(rows) + self.alpha * len(domains[i])
                pro[k][i] = {value: (counts.get(value, 0) + self.alpha) / denom
                             for value in domains[i]}
        return pro

    def fit(self, X, y):
        """Estimate priors and conditional probability tables.

        Args:
            X: 2-D array-like of shape (rows, features) holding category values.
            y: 1-D array-like of labels, one per row of ``X``.

        Raises:
            ValueError: if ``X`` and ``y`` have different lengths.
        """
        X = np.asarray(X)
        y = np.asarray(y)
        self.rdim, self.cdim = X.shape
        if len(y) != self.rdim:
            raise ValueError('X has %s rows but y has %s labels' % (self.rdim, len(y)))
        separated = self.separateByClass(X, y)
        self.class_count = {k: len(v) for k, v in separated.items()}
        self.priori = self.priori_probability(separated)
        self.cond = self.conditional_probability(separated)
        self.result = list(self.cond.keys())

    def _predict_one(self, row):
        """Return the label with the highest posterior for one sample.

        Ties, including the case where every class has zero probability, go
        to the label that appeared first in the training data.
        """
        best_label, best_score = None, None
        # log(0) is a legitimate -inf score here, not an error.
        with np.errstate(divide='ignore'):
            for label in self.result:
                score = np.log(self.priori[label])
                for i, value in enumerate(row):
                    table = self.cond[label][i]
                    prob = table.get(value)
                    if prob is None:
                        # Value never seen in training for this feature.
                        prob = self.alpha / (self.class_count[label] + self.alpha * len(table))
                    score = score + np.log(prob)
                if best_label is None or score > best_score:
                    best_label, best_score = label, score
        return best_label

    def predict(self, X_test):
        """Return an array with one predicted label per row of ``X_test``.

        A 1-D input is treated as a single sample.

        Raises:
            Exception: if the model is not fitted or the column count differs
                from the training data.
        """
        if not self.priori or not self.cond or not self.result:
            raise Exception('the model need train data')
        X_test = np.asarray(X_test)
        if X_test.ndim == 1:
            X_test = X_test.reshape(1, -1)
        test_cdim = X_test.shape[1]
        if test_cdim != self.cdim:
            raise Exception('predict cols number[%s] not match to the model cols number[%s]'%(test_cdim, self.cdim))
        # Iterate over rows (not columns) and build the array in one step so
        # the label dtype is preserved.
        return np.array([self._predict_one(row) for row in X_test])

In [242]:
# Toy categorical dataset laid out row-wise: rows 0 and 1 are the two features,
# row 2 holds the class labels.
# NOTE(review): mixing ints and strings in one array presumably coerces every
# entry to a string (the prediction output below has a string dtype) - confirm.
data = np.array([
    [1,1,1,1,1,2,2,2,2,2,3,3,3,3,3],
    ['S','M','M','S','S','S','M','M','L','L','L','M','M','L','L'],
    [-1,-1,1,1,-1,-1,-1,1,1,1,1,1,1,1,-1]
])
# Transpose so each sample is a row. This rebinds X and y, replacing the iris
# arrays created earlier (X_train / X_test are not affected).
X, y = data[:2].T, data[-1].T

In [243]:
# Fit the hand-written categorical model on the toy data and classify two samples.
# This rebinds `model`, which previously held the Gaussian classifier.
model = MultinomialNaiveBayes()
model.fit(X, y)
test = np.array([[2,'S'], [1,'L']])
model.predict(test)

array(['-1', '1'], dtype='<U32')

In [246]:
# The same toy dataset with the second feature encoded as integers
# (4, 5, 6 appear where 'S', 'M', 'L' appeared above), so the array is fully numeric.
data = np.array([
    [1,1,1,1,1,2,2,2,2,2,3,3,3,3,3],
    [4,5,5,4,4,4,5,5,6,6,6,5,5,6,6],
    [-1,-1,1,1,-1,-1,-1,1,1,1,1,1,1,1,-1]
])
# Transpose so each sample is a row; rebinds X and y again.
X, y = data[:2].T, data[-1].T

In [255]:
# scikit-learn comparison on the integer-encoded toy data; [2, 4] corresponds to
# the sample (2, 'S') classified by the hand-written model above.
# NOTE(review): this predicts 1 whereas the hand-written model predicts -1 for
# that sample. MultinomialNB presumably treats the feature values as counts
# rather than category codes, so the two models are not equivalent;
# CategoricalNB may be the closer counterpart - confirm against the sklearn docs.
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB()
clf.fit(X, y)
clf.predict(np.array([[2,4]]))

array([1])

In [261]:
# scikit-learn's GaussianNB fitted on the iris training split, for comparison
# with the hand-written Gaussian classifier. This rebinds `clf`.
from sklearn.naive_bayes import GaussianNB
clf = GaussianNB()
clf.fit(X_train, y_train)

GaussianNB(priors=None, var_smoothing=1e-09)

In [262]:
# Score the scikit-learn model on the held-out iris test split.
clf.score(X_test, y_test)

1.0

In [265]:
# Classify one hand-entered sample with four feature values.
clf.predict(np.array([[4.4,  3.2,  1.3,  0.2]]))

array([1])