In [232]:
import pandas as pd
import numpy as np

# 1.数据预处理

In [259]:
# Load the raw mushroom table from disk and preview its first rows.
csv_path = r'D:\data\mushroom-classification\mushrooms.csv'
data = pd.read_csv(csv_path)
data.head()

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g


In [239]:
# Encode every nominal column as integer codes. pd.factorize numbers the
# distinct values in order of first appearance; element [0] of its result
# is the array of codes (element [1] would be the list of original values).
n, m = data.shape
for column in data.columns:
    data[column] = pd.factorize(data[column])[0]
data.head()

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,0,0,1,0,1,0,0,1,0,...,0,0,0,0,0,0,0,1,1,1
2,1,1,0,2,0,2,0,0,1,1,...,0,0,0,0,0,0,0,1,1,2
3,0,0,1,2,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
4,1,0,0,3,1,3,0,1,1,0,...,0,0,0,0,0,0,1,1,2,1


In [240]:
# Train/test split: shuffle the row positions with a fixed seed so the split
# is reproducible, then take the first 70% for training and the rest for testing.
np.random.seed(1)
random_indexs = np.random.permutation(n)
split_at = int(0.7 * n)
train_index, test_index = random_indexs[:split_at], random_indexs[split_at:]

train_rows = data.iloc[train_index]
test_rows = data.iloc[test_index]

# The first column ('class') is the label; every remaining column is a feature.
train_labels, train_data = train_rows['class'], train_rows.iloc[:, 1:]
test_labels, test_data = test_rows['class'], test_rows.iloc[:, 1:]

# 2.模型定义

In [255]:
class NaiveBayes():
    def __init__(self, data, labels):
        '''
        lamba:对条件概率的极大似然估计会产生偏差，加上正数lambd，lambd=1时称为拉普拉斯平滑
        data,labels: data:(num_samples,features)——（样本个数，特征个数）;labels:标签
        class_num:类的取值个数
        prior_probs:所有类别的先验概率
        features_values:每个特征的取值个数(features,1)
        classes_data:所有类别对应的数据集（即通过类别划分数据集）
        '''
        self.lambd = 1
        self.data = data
        self.labels = labels
        self.num_samples = labels.shape[0]
        self.features = data.shape[1]
        self.class_num = len(np.unique(self.labels))
        self.prior_prob()
        self.features_vals()
        self.computer_classes_data()
        
    def computer_classes_data(self):
        #通过类别划分数据集
        self.classes_data  = []
        for i in range(self.class_num):
            self.classes_data.append(self.data[np.where(self.labels == i)[0], :])

    def features_vals(self):
        #计算所有特征可取值数
        self.features_values = np.zeros(self.features)
        for i in range(self.features):
            t = np.unique(self.data[:,i])
            self.features_values[i] = len(np.unique(self.data[:,i]))

    def prior_prob(self):
        #计算所有类别的先验概率
        self.prior_probs = np.zeros(self.class_num)
        for i in range(self.class_num):
            self.prior_probs[i] = ((self.labels == i).sum() + self.lambd) / (self.num_samples + self.class_num * self.lambd)

    def cond_prob(self, x, y):
        #计算类别y的条件概率
        data = self.classes_data[y]
        y_num = data.shape[0]
        p = 1
        for i in range(self.features):
            p *= (((data[:, i] == x[i]).sum() + self.lambd) / (y_num + self.features_values[i] * self.lambd))
        return p

    def pred(self, x):
        #对新样本进行预测
        cond_p = np.zeros(self.class_num)
        p = np.array(self.class_num)
        for i in range(self.class_num):
            cond_p[i] = self.cond_prob(x, i)
        p = self.prior_probs * cond_p
        return np.argmax(p)

# 3.模型检验

In [262]:
# Fit the classifier on the training split, converted to plain NumPy arrays.
train_data = np.array(train_data)
train_labels = np.array(train_labels)
model = NaiveBayes(train_data, train_labels)

# Score the held-out split: predict one sample at a time and count the
# predictions that equal the true label.
test_data = np.array(test_data)
test_labels = np.array(test_labels)
num = len(test_labels)
correct = sum(
    1
    for sample, truth in zip(test_data, test_labels)
    if model.pred(sample) == truth
)
print('测试集正确率:%f %%' %(correct*100 / num))

测试集正确率:94.667760 %
