# 实验1 贝叶斯分类和knn

本次实验需要的所有包：

In [95]:
import numpy as np
import pandas as pd 
import jieba, os
import random
import matplotlib
from IPython.display import display

## 数据预处理

*自然语言处理？*

1.输入语料库并处理

  读取本地语料库。具体方法为从主文件开始读取子文件夹并读取子文件夹中的每一个txt的路径并输入到处理函数中

In [116]:
#读取所有子文件夹
def getdir(folder):
    subdir = []
    for root, dirs, files in os.walk(folder):
        for dir in dirs:
            subdir.append(os.path.join(root, dir))
    return subdir
#读入本文件夹所有的txt文件名
def getdata(path):
    txt_file = []
    for root, dirs, files in os.walk(path):
        for file in files:
            if file.endswith(".txt"):
                txt_file.append(os.path.join(root, file))
    return txt_file
folder = 'D:/机器学习实验/1/data'
txt_file = [getdata(path) for path in getdir(folder)]
label = np.array(['art']*742+['literature']* 34+ ['education']*61+['philosophy']*45 +['history']*468)

### 1.处理文本内容

1.读入文件，用jieba划分词，读取blacklist进行停用词筛选。

*并不用统计出现次数。。只需要统计有没有*

In [26]:
#处理文本内容
#预测结果不准可能是噪声过多
from collections import defaultdict
def process(filename, orig):
    global blacklist, result
    op = []
    
    # 打开文件并读取每一行
    with open(filename, 'r', encoding='ansi', errors='ignore') as file:
        lines = file.readlines()
    
    # 处理每一行
    for line in lines:
        seg_list = jieba.cut(line.strip(), cut_all=False)
        for seg in seg_list:
            if seg not in blacklist:
                if seg not in result:
                    # 创建词的唯一索引（这里用 defaultdict 会自动处理）
                    result[seg] = len(result)
                # 把原文输入
                op.append(seg)
    
    orig.append(op)
       

2.读入停用词库(blacklist)

In [27]:
#停用词库，参考哈工大、百度等停用词库 针对本实验进行调整
def load_blacklist(blacklist_file):
    blacklist = []
    with open(blacklist_file, 'r', encoding='utf-8') as f:
        lines = f.readlines()
        for line in lines:
            blacklist.append(line.rstrip('\n'))
    return blacklist
blacklist = load_blacklist('D:/机器学习实验/1/blacklist.txt')
result = defaultdict(lambda: len(result))
# txt_file = getdata('D:/机器学习实验/1/test')
orig = []
for i in txt_file:
    for j in i:
        process(j, orig)

### 2.构建onehot文本向量

这种方法便于计算欧式距离。

onehot表示的二进制向量：这个词是否出现

**处理后的原文和特征词比较，出现：1**

**使用np.array存数组，pd.dataframe对应向量与标签**

In [75]:
#使用onehot进行编码。每个文件生成一个向量数组，
#根据字典总结出特征词（关键字），再用向量标识
# 输入到数据集中再进行划分？？
def encode(orig): 
    sum = []
    for i in orig:
        vector = [0] * len(result)
        for j in i:
            if j in result:
                vector[result[j]] = 1
        sum.append(vector)
    return sum
dataset = np.array(encode(orig))
print(dataset)
# df = pd.DataFrame({'vector': list(dataset), 'label': label})
# print(df)

[[1 1 1 ... 0 0 0]
 [0 0 1 ... 0 0 0]
 [0 0 1 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 1 1 1]]


### 3.划分测试集与训练集

*不建议使用scikit-learn*

In [129]:
#设置测试集比例
test_size = 0.3
data_size = len(dataset)
test_size = int(data_size * test_size)
#生成随机索引
indices = list(range(data_size))
random.shuffle(indices)

test_indices = indices[:test_size]
train_indices = indices[test_size:]

x_train = dataset[train_indices]
y_train = label[train_indices]

x_test = dataset[test_indices]
y_test = label[test_indices]

## 学习模型构建

### knn:

选出前k个最近的向量与标签，比较标签的数量判断分类


In [124]:
class KNNWithProbabilities:
    def __init__(self, k):
        self.k = k
    
    def fit(self, x_train, y_train):
        self.x_train = x_train
        self.y_train = y_train
    
    def dist(self, x1, x2):
        return np.sqrt(np.sum(np.square(x1 - x2), axis=1))
    
    def predict(self, x_test):
        predictions = []
        probabilities = []
        
        for x in x_test:
            # 计算测试样本与所有训练样本之间的距离
            distances = self.dist(self.x_train, x)
            # 找到距离最近的k个点的索引
            nearest_ids = np.argsort(distances)[:self.k]
            # 找到这些点的标签
            nearest_labels = self.y_train[nearest_ids]
            # 统计最近k个点中每个标签出现的次数
            unique, counts = np.unique(nearest_labels, return_counts=True)
            # 计算概率
            class_probabilities = counts / self.k
            probabilities.append(class_probabilities)
            # 找到出现次数最多的标签
            prediction = unique[np.argmax(counts)]
            predictions.append(prediction)
        
        return predictions, probabilities
    
knn_with_probabilities = KNNWithProbabilities(k=20)
knn_with_probabilities.fit(x_train, y_train)
knn_prediction, knn_probabilities = knn_with_probabilities.predict(x_test)


### 朴素贝叶斯算法

1.计算先验概率P(c): 训练集中每个类别的比例

2.计算条件概率P(x|C) C中x出现的频率，使用拉普拉斯平滑处理

3.利用条件概率分类，p(a1|b1b2...) > p(a2|b1b2...) a1 

In [125]:
class NaiveBayes:
    def __init__(self):
        self.classes = None
        self.parameters = None
        self.prior_probabilities = None

    def fit(self, x_train, y_train):
        self.classes = np.unique(y_train)
        self.parameters = []
        self.prior_probabilities = {}

        for c in self.classes:
            x_c = x_train[y_train == c]
            self.parameters.append({
                "mean": x_c.mean(axis=0),
                "var": x_c.var(axis=0) + 1e-9  # Add epsilon to avoid division by zero
            })
            self.prior_probabilities[c] = len(x_c) / len(x_train)

    def log_pdf(self, x, mean, var):
        log_coefficient = -0.5 * np.log(2 * np.pi * var)
        log_exponent = -((x - mean) ** 2 / (2 * var))
        return log_coefficient + log_exponent

    def predict(self, x_test):
        log_prior = np.log(np.array([self.prior_probabilities[c] for c in self.classes]))
        probabilities = []
        predictions = []

        for x in x_test:
            log_posterior_probs = []
            class_probabilities = []
            for i, c in enumerate(self.classes):
                mean = self.parameters[i]["mean"]
                var = self.parameters[i]["var"]
                log_likelihood = np.sum(self.log_pdf(x, mean, var))
                log_posterior = log_likelihood + log_prior[i]
                log_posterior_probs.append(log_posterior)
                class_probabilities.append(log_posterior)
            posterior_probs = np.exp(log_posterior_probs - np.max(log_posterior_probs))  # 避免数值溢出
            probabilities.append(posterior_probs / np.sum(posterior_probs))  # 归一化得到概率
            predictions.append(self.classes[np.argmax(class_probabilities)])

        return predictions, np.array(probabilities)

naive_bayes = NaiveBayes()
naive_bayes.fit(x_train, y_train)
predictions, probabilities = naive_bayes.predict(x_test)

## 评价模型的效果（进行3次实验）

1.计算出两个模型的P R F1

In [126]:
from sklearn.metrics import precision_score, recall_score, f1_score

# 计算 Naive Bayes 模型的精确率、召回率和 F1 分数
nb_precision = precision_score(y_test, predictions, average='macro', zero_division=0)
nb_recall = recall_score(y_test, predictions, average='macro', zero_division=0)
nb_f1 = f1_score(y_test, predictions, average='macro', zero_division=0)

# 计算 KNN 模型的精确率、召回率和 F1 分数
knn_precision = precision_score(y_test, knn_prediction, average='macro', zero_division=0)
knn_recall = recall_score(y_test, knn_prediction, average='macro', zero_division=0)
knn_f1 = f1_score(y_test, knn_prediction, average='macro', zero_division=0)

# 打印结果
print(f"Naive Bayes - Precision: {nb_precision:.4f}, Recall: {nb_recall:.4f}, F1: {nb_f1:.4f}")
print(f"KNN - Precision: {knn_precision:.4f}, Recall: {knn_recall:.4f}, F1: {knn_f1:.4f}")

Naive Bayes - Precision: 0.5127, Recall: 0.3511, F1: 0.3483
KNN - Precision: 0.1941, Recall: 0.2875, F1: 0.2294


### 2.绘制ROC曲线

In [127]:
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc
from sklearn.preprocessing import label_binarize

# 将字符串标签转换为二进制形式
class_mapping = {"art": 0, "literature": 1, "education": 2, "philosophy": 3, "history": 4}
y_true_binary = np.zeros((len(y_test), len(class_mapping)))
for i, l in enumerate(y_test):
    for key, value in class_mapping.items():
        if key in l:
            y_true_binary[i, value] = 1
            
knn_probabilities = np.array(knn_probabilities, dtype='object')
probabilities = np.array(probabilities, dtype='object')
print(knn_probabilities.shape)
print(probabilities.shape)

(405,)
(405, 5)


In [128]:

# 计算 KNN 模型的 ROC 曲线
fpr_model1, tpr_model1, _ = roc_curve(y_true_binary.ravel(), knn_probabilities.ravel())
roc_auc_model1 = auc(fpr_model1, tpr_model1)

# 计算 Naive Bayes 模型的 ROC 曲线
fpr_model2, tpr_model2, _ = roc_curve(y_true_binary.ravel(), probabilities.ravel())
roc_auc_model2 = auc(fpr_model2, tpr_model2)

# 绘制 ROC 曲线
plt.figure()
plt.plot(fpr_model1, tpr_model1, color='darkorange', lw=2, label='Model 1 (AUC = %0.2f)' % roc_auc_model1)
plt.plot(fpr_model2, tpr_model2, color='green', lw=2, label='Model 2 (AUC = %0.2f)' % roc_auc_model2)
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.legend(loc="lower right")
plt.show()

ValueError: Found input variables with inconsistent numbers of samples: [2025, 405]