In [19]:
import numpy as np

#多项式贝叶斯模型，离散特征
class My_MultinomialNB(object):      
    """
    alpha:平滑系数，为0是极大似然估计，为1是拉普拉斯平滑
    fit_prior:是否训练先验概率
    class_prior:类别的先验概率
    """
    def __init__(self, alpha=1.0, fit_prior=True, class_prior=None):  
        self.alpha = alpha
        self.fit_prior = fit_prior
        self.class_prior = class_prior
        self.classes = None
        self.conditional_prob = None
    
    def _calculate_feature_prob(self, feature):
        values = np.unique(feature) #np.unique()返回一个新的无元素重复的增序数组或列表
        total_num = float(len(feature))
        value_prob = {}
        for v in values:
            #利用了features 和 v 二者array_like, 广播机制
            value_prob[v] = ((np.sum(np.equal(feature,v))+self.alpha)/(total_num + self.alpha*len(values)))
        return value_prob
    
    def fit(self, X, y):
        self.classes = np.unique(y)
        #计算各个类的先验概率
        if self.class_prior == None:
            class_num = len(self.classes)
            if not self.fit_prior:
                self.class_prior = [1.0/class_num for _ in range(class_num)] #默认的先验概率为均分
            else:
                self.class_prior = []
                sample_num = float(len(y))
                for c in self.classes:
                    c_num = np.sum(np.equal(y,c)) #y 中类 c 的个数
                    self.class_prior.append((c_num + self.alpha)/(sample_num + c)) #alpha=1，拉普拉斯平滑
        self.conditional_prob = {}
        for c in self.classes:
            self.conditional_prob[c] = {} #已知类别为 c 
            for i in range(len(X[0])): #for each feature
                feature = X[np.equal(y,c)][:,i] #将训练数据中属于第c类的数据中的第i个特征拿出来整合 nX1
                self.conditional_prob[c][i] = self._calculate_feature_prob(feature) #计算先验条件概率
        return self
    
    def _get_xj_prob(self, values_prob, target_value):
        return values_prob[target_value]
    
    def _predict_single_sample(self,x):
        label = -1
        max_posterior_prob = 0
        
        for c_index in range(len(self.classes)):
            current_class_prior = self.class_prior[c_index]
            current_conditional_prob = 1.0
            feature_prob = self.conditional_prob[self.classes[c_index]]
            j = 0
            for feature_i in feature_prob.keys():
                current_conditional_prob *= self._get_xj_prob(feature_prob[feature_i], x[j])
                j += 1
                
            if current_class_prior * current_conditional_prob > max_posterior_prob:
                max_posterior_prob = current_class_prior * current_conditional_prob
                label = self.classes[c_index]
        return label
    
    def predict(self, X):
        if X.ndim == 1:
            return self._predict_single_sample(X)
        else:
            labels = []
            for i in range(X.shape[0]):
                label = self._predict_single_sample(X[i])
                labels.append(label)
            print("Predict result by My_NB:",np.array(labels))
            return labels
        
    def score(self,X,y):
        y_hat = self.predict(X)
        return np.sum(np.equal(y_hat,y))/len(y)

class My_GaussianNB(My_MultinomialNB):
    
    def _calculate_feature_prob(self, feature):
        mu = np.mean(feature)
        sigma = np.std(feature)
        return (mu, sigma)
    
    def _prob_gaussian(self, mu, sigma, x):
        return (1.0/(sigma*np.sqrt(2*np.pi))*np.exp(-(x-mu)**2/(2*sigma**2)))
    
    def _get_xj_prob(self,mu_sigma,target_value):
        return self._prob_gaussian(mu_sigma[0], mu_sigma[1], target_value)

In [20]:
#莺尾花的特征是连续变量，采用高斯模型
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
import pandas as pd

iris = load_iris()
df = pd.DataFrame(iris.data, columns = iris.feature_names)
df['label'] = iris.target
df.columns = ['sepal length','sepal width','petal length','petal width','label']
df.head()

Unnamed: 0,sepal length,sepal width,petal length,petal width,label
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0


In [21]:
X, Y = iris.data, iris.target
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2)
model = My_GaussianNB()
model.fit(X_train, y_train)
print("score by My_NB:",model.score(X_test,y_test))

Predict result by My_NB: [0 0 0 0 0 2 0 2 2 0 1 2 2 0 0 2 2 2 0 0 1 2 0 2 0 2 2 0 2 0]
score by My_NB: 0.9666666666666667


In [22]:
from sklearn.naive_bayes import GaussianNB
clf = GaussianNB()
clf.fit(X_train, y_train)
print("Predict result by predict:", clf.predict(X_test))
print("Score by sklearn_GaussianNB:", clf.score(X_test,y_test))

Predict result by predict: [0 0 0 0 0 2 0 2 2 0 1 2 2 0 0 2 2 2 0 0 1 2 0 2 0 2 2 0 2 0]
Score by sklearn_GaussianNB: 0.9666666666666667
