## 实验十一：实现朴素贝叶斯算法对adult数据集进行分类

要求：完成probability函数

In [1]:
import numpy as np
import pandas as pd
from scipy.stats import norm

### 数据集准备

In [2]:
# Column names assigned to both files (they are read with header=None, i.e.
# without a header row); '50K' is the income label column.
col_names = [
    'age', 'workclass', 'fnlwgt', 'education', 'education-num',
    'marital-status', 'occupation', 'relationship', 'race', 'sex',
    'capital-gain', 'capital-loss', 'hours-per-week', 'native-country', '50K',
]

# Fields are separated by ', ' (comma + space); a multi-character separator
# requires the python parsing engine.
adult_data_df = pd.read_csv('adult.data', index_col=False, header=None, names=col_names, sep=', ', engine='python')

# The first line of adult.test is skipped (skiprows=[0]).
adult_test_df = pd.read_csv('adult.test', skiprows=[0], index_col=False, header=None, names=col_names, sep=', ', engine='python')
# Strip the trailing period from the test labels. rstrip('.') only removes a
# period, so labels that have no trailing period are left intact instead of
# losing their last character.
adult_test_df['50K'] = adult_test_df['50K'].str.rstrip('.')

In [None]:
# Preview the first five rows of the test set
adult_test_df.head(n=5)

### 用众数填补缺失值，或者可以直接删除含有缺失值的行

In [None]:
# Fill missing values in the training set.
# Missing entries are marked with '?'. The placeholder is turned into NaN before
# computing the per-column mode so that '?' itself can never be chosen as the
# fill value (DataFrame.mode ignores NaN by default).
mode_df = adult_data_df.replace('?', np.nan).mode()
for col in adult_data_df:
    missing_mask = adult_data_df[col].isin(['?'])
    missing_count = int(missing_mask.sum())  # number of missing entries in this column
    if missing_count == 0:
        continue
    adult_data_df[col] = adult_data_df[col].replace('?', mode_df[col][0])
    print('{} : {} 个缺失值被替换为 "{}"'.format(col, missing_count, mode_df[col][0]))

In [None]:
# Fill missing values in the test set.
# The fill values are the per-column modes of the TRAINING set, so that no
# statistic is estimated from the test data. '?' is turned into NaN first so the
# placeholder itself can never be chosen as the mode.
mode_df = adult_data_df.replace('?', np.nan).mode()
for col in adult_test_df:
    missing_mask = adult_test_df[col].isin(['?'])
    missing_count = int(missing_mask.sum())  # number of missing entries in this column
    if missing_count == 0:
        continue
    adult_test_df[col] = adult_test_df[col].replace('?', mode_df[col][0])
    print('{} : {} 个缺失值被替换为 "{}"'.format(col, missing_count, mode_df[col][0]))

### 基于sklearn 的 GaussianNB

In [None]:
from sklearn.preprocessing import LabelEncoder
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report, accuracy_score

# Encode categorical features as integer codes
def encode_features(df, columns, encoders=None):
    """Label-encode the given columns of ``df`` in place and return ``df``.

    Args:
        df: DataFrame whose columns are overwritten with integer codes.
        columns: Iterable of column names to encode.
        encoders: Optional dict mapping column name -> fitted LabelEncoder.
            When a column already has an entry, that encoder is reused
            (``transform`` only), so several DataFrames share one
            category -> code mapping. Columns without an entry get a newly
            fitted encoder, which is stored in the dict. When ``None``, every
            column gets its own freshly fitted encoder (original behaviour).

    Returns:
        The same DataFrame object, with the columns encoded.

    Raises:
        ValueError: raised by ``LabelEncoder.transform`` when a reused encoder
            meets a label it was not fitted on.
    """
    for col in columns:
        if encoders is not None and col in encoders:
            # Reuse the mapping learned earlier for this column
            df[col] = encoders[col].transform(df[col])
            continue
        encoder = LabelEncoder()
        df[col] = encoder.fit_transform(df[col])
        if encoders is not None:
            encoders[col] = encoder
    return df
categorical_features = adult_data_df.select_dtypes(include=['object']).columns
# Fit the encoders on the training set and reuse them for the test set.
# Fitting a separate encoder per split can assign different integers to the
# same category whenever the two splits do not contain the same set of values.
label_encoders = {}
adult_data_df = encode_features(adult_data_df, categorical_features, label_encoders)
adult_test_df = encode_features(adult_test_df, categorical_features, label_encoders)

# Separate the '50K' label column from the feature columns for both splits
y_train = adult_data_df['50K']
X_train = adult_data_df.drop(columns=['50K'])
y_test = adult_test_df['50K']
X_test = adult_test_df.drop(columns=['50K'])

# Fit a Gaussian Naive Bayes model on the training split
model = GaussianNB().fit(X_train, y_train)

# Predict labels for the test split
y_pred = model.predict(X_test)

# Report the fraction of test samples that were classified correctly
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f'准确率: {accuracy:.2f}')

### 自己构建高斯分布的朴素贝叶斯分类器

高斯朴素贝叶斯 (Gaussian Naive Bayes):
- 适用：连续型特征
- 假设：在给定类别的条件下，各特征相互独立，且每个连续型特征服从正态分布

In [6]:
class NaiveBayes:
    """Naive Bayes classifier for a mix of continuous and discrete attributes.

    Attributes listed in ``continuous_attrs`` are modelled with a per-class
    Gaussian distribution; every other attribute is modelled with its
    per-class relative frequency.
    """

    # Lower bound used when a class-conditional variance is zero or NaN
    # (constant attribute or a class with a single sample); without it the
    # Gaussian density would be NaN.
    _MIN_VARIANCE = 1e-9

    def __init__(self, continuous_attrs):
        """Store the list of continuous attribute names.

        Args:
            continuous_attrs: Names of the columns to treat as continuous.
        """
        self.continuous_attrs = continuous_attrs
        self.stats_cache = {}  # per-class statistics; this is the trained model

    def fit(self, data, target_col):
        """Train the model by caching per-class statistics.

        For every distinct value of ``data[target_col]`` the cache stores:
            'prior': fraction of rows belonging to the class,
            'mean' / 'var': mean and variance of each continuous attribute,
            'freq': {attr: {value: relative frequency}} for each discrete
                attribute.

        Args:
            data: Training DataFrame containing the attributes and the target.
            target_col: Name of the target (label) column.
        """
        self.target_col = target_col
        # Start from an empty cache so that refitting does not keep classes
        # from a previous call.
        self.stats_cache = {}
        # Discrete attributes are all columns that are neither continuous nor
        # the target column.
        discrete_attrs = [
            col for col in data.columns
            if col not in self.continuous_attrs and col != target_col
        ]
        for class_ in data[target_col].unique():
            class_df = data[data[target_col] == class_]
            self.stats_cache[class_] = {
                'prior': len(class_df) / len(data),
                'mean': class_df[self.continuous_attrs].mean(),
                'var': class_df[self.continuous_attrs].var(),
                'freq': {
                    col: class_df[col].value_counts(normalize=True).to_dict()
                    for col in discrete_attrs
                },
            }

    def probability(self, value, attr, stats):
        """Return the class-conditional likelihood of one attribute value.

        Args:
            value: The attribute value.
            attr: The attribute name.
            stats: Statistics of one class, i.e. one entry of
                ``self.stats_cache``.

        Returns:
            A non-negative float. For a continuous attribute this is the
            Gaussian probability density (``scipy.stats.norm.pdf``), which can
            exceed 1 when the variance is small. For a discrete attribute it
            is the relative frequency of ``value`` within the class, or 0.0
            when the value was never observed for that class.
        """
        if attr in self.continuous_attrs:
            mean = stats['mean'][attr]
            var = stats['var'][attr]
            if not var > 0:  # also true for NaN
                var = self._MIN_VARIANCE
            # norm.pdf expects the standard deviation, not the variance
            return float(norm.pdf(value, loc=mean, scale=np.sqrt(var)))
        return stats['freq'][attr].get(value, 0.0)

    def predict(self, sample):
        """Predict the class of a single sample.

        Scores are accumulated as log-probabilities, so the product of many
        small likelihoods cannot underflow to 0 and make classes
        indistinguishable.

        Args:
            sample: Mapping of attribute name -> value. An entry for the
                target column, if present, is ignored.

        Returns:
            The class label with the highest posterior score. If every class
            has zero likelihood, the first class seen during training is
            returned. ``None`` is returned only when the model has no classes.
        """
        best_class = None
        best_score = -np.inf
        for class_, stats in self.stats_cache.items():
            prior = stats['prior']
            score = np.log(prior) if prior > 0 else -np.inf
            for attr, value in sample.items():
                if attr == self.target_col:
                    continue
                likelihood = self.probability(value, attr, stats)
                if not likelihood > 0:  # zero or NaN likelihood rules the class out
                    score = -np.inf
                    break
                score += np.log(likelihood)
            # Strict '>' keeps the earliest class on ties
            if best_class is None or score > best_score:
                best_class = class_
                best_score = score
        return best_class

### 模型训练

In [7]:
# Columns treated as continuous (modelled with a Gaussian); all remaining
# non-target columns are handled as discrete attributes.
continuous_attrs = [
    'age',
    'fnlwgt',
    'education-num',
    'capital-gain',
    'capital-loss',
    'hours-per-week',
]
# Build the classifier and train it on the training set with '50K' as target
nb = NaiveBayes(continuous_attrs=continuous_attrs)
nb.fit(adult_data_df, target_col='50K')

### 模型预测

设置测试集的长度，调试代码时可以设置小一些，例如test_len = 100，交作业时用原长度

In [8]:
# Number of test rows to evaluate (defaults to the whole test set)
test_len = adult_test_df.shape[0]

In [None]:
# Evaluate the hand-written classifier on the first `test_len` test rows
test_data = adult_test_df.iloc[:test_len]
# Convert every row to a dict in one call instead of .iloc[i].to_dict() per row
test_samples = test_data.to_dict('records')
correct_count = 0
for test_sample in test_samples:
    prediction = nb.predict(test_sample)
    if prediction == test_sample['50K']:
        correct_count += 1
# Avoid a ZeroDivisionError when the evaluation slice is empty (test_len == 0)
nb_accuracy = correct_count / len(test_samples) if test_samples else float('nan')
print('准确率：{:.3%}'.format(nb_accuracy))