### Titanic的贝叶斯分类器版本

In [1]:
import numpy as np
import csv
import math

### 从CSV中读入数据集并进行简单映射处理

In [2]:
raw_data_list = []
with open('train.csv', 'r') as f:
    reader = csv.reader(f)
    for row in reader:
        raw_data_list.append(row)

raw_data_list.pop(0)

# 存储所有数据的矩阵
raw_data_matrix = np.zeros((len(raw_data_list[0]), len(raw_data_list)))
raw_data_list = np.array(raw_data_list).T
raw_data_matrix[0] = raw_data_list[0].astype(np.float)
for i in range(len(raw_data_matrix[0])):
    # male为1，female为0
    raw_data_matrix[1][i] = 1 if raw_data_list[1][i] == 'male' else 2
for i in range(2, 6):
    raw_data_matrix[i] = raw_data_list[i].astype(np.float)
for i in range(len(raw_data_matrix[0])):
    if raw_data_list[6][i] == 'U':
        raw_data_matrix[6][i] = 0
    elif raw_data_list[6][i] == 'C':
        raw_data_matrix[6][i] = 1
    elif raw_data_list[6][i] == 'Q':
        raw_data_matrix[6][i] = 2
    elif raw_data_list[6][i] == 'S':
        raw_data_matrix[6][i] = 3
raw_data_matrix[7] = raw_data_list[7].astype(np.float)

### 分割训练集和测试集

In [3]:
m_train = int(len(raw_data_matrix[0]) * 0.7)
m_test = len(raw_data_matrix[0]) - m_train

X_train = raw_data_matrix[0:7, 0:m_train]
Y_train = raw_data_matrix[7:8, 0:m_train]

X_test = raw_data_matrix[0:7, m_train:]
Y_test = raw_data_matrix[7:8, m_train:]

### 预测
一边预测一边计算先验

In [30]:
def get_prior(Y):
    res = np.array([0., 0.])
    for i in Y[0]:
        if i == 0:
            res[0] = res[0] + 1
        else:
            res[1] = res[1] + 1
    res = res / Y_train.shape[1]
    return res

# 计算P(f_i|c)高斯朴素贝叶斯
def gnb(total_x, total_y, f, c):
    total_x = np.mat(total_x)
    assert total_x.shape[0] == 1
    assert total_y.shape[0] == 1
    assert total_x.shape[1] == total_y.shape[1]
    avg = 0.
    for i in range(total_y.shape[1]):
        if total_y[0,i] == c:
            avg = avg + total_x[0,i]
    avg = avg / total_y.shape[1]
    mu = 0.
    for i in range(total_y.shape[1]):
        if total_y[0,i] == c:
            mu = mu + (total_x[0,i] - avg)**2
    mu = mu / total_y.shape[1]
    return (1 / math.sqrt(2 * math.pi * mu))*np.exp(-(f - avg)**2 / (2*mu))

# 计算P(f_i|c)
def nb(total_x, total_y, f, c):
    total_x = np.mat(total_x)
    assert total_x.shape[0] == 1
    assert total_y.shape[0] == 1
    assert total_x.shape[1] == total_y.shape[1]
    sum = 0.
    target = 0.
    for i in range(total_y.shape[1]):
        if total_y[0,i] == c:
            sum = sum + 1
            if total_x[0,i] == f:
                target = target + 1
    return target / sum

def predict_single(x_train, y_train, x, prior):
    assert x.shape[0] == 7
    assert x.shape[1] == 1
    # 计算c为0时的后验
    pcf = np.zeros((2, 7))
    pcf[0][0] = nb(x_train[0], y_train, x[0][0], 0)
    pcf[0][1] = nb(x_train[1], y_train, x[1][0], 0)
    pcf[0][2] = gnb(x_train[2], y_train, x[2][0], 0)
    pcf[0][3] = nb(x_train[3], y_train, x[3][0], 0)
    pcf[0][4] = nb(x_train[4], y_train, x[4][0], 0)
    pcf[0][5] = gnb(x_train[5], y_train, x[5][0], 0)
    pcf[0][6] = nb(x_train[6], y_train, x[6][0], 0)
    pc0f = 1.
    for i in range(7):
        pc0f = pc0f * pcf[0][i]
    pc0f = pc0f * prior[0]
    
    # 计算c为1时的后验
    pcf[1][0] = nb(x_train[0], y_train, x[0][0], 1)
    pcf[1][1] = nb(x_train[1], y_train, x[1][0], 1)
    pcf[1][2] = gnb(x_train[2], y_train, x[2][0], 1)
    pcf[1][3] = nb(x_train[3], y_train, x[3][0], 1)
    pcf[1][4] = nb(x_train[4], y_train, x[4][0], 1)
    pcf[1][5] = gnb(x_train[5], y_train, x[5][0], 1)
    pcf[1][6] = nb(x_train[6], y_train, x[6][0], 1)
    pc1f = 1.
    for i in range(7):
        pc1f = pc1f * pcf[1][i]
    pc1f = pc1f * prior[1]
    return 0 if pc0f > pc1f else 1

def predict(x_train, y_train, x_test, y_test):
    success_sum = 0.
    prior = get_prior(y_train)
    for i in range(x_test.shape[1]):
        p = predict_single(x_train, y_train, x_test[:,i:i+1], prior)
        if p == y_test[0][i]:
            success_sum = success_sum + 1
    print('准确率：' + str(success_sum / x_test.shape[1]))

### 带入数据计算

In [31]:
predict(X_train, Y_train, X_test, Y_test)

准确率：0.7985074626865671
