In [1]:
import os
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report


# Directories holding the training split, one per newsgroup.
comp_graphics_train_path, sci_med_train_path = 'comp.graphics_train', 'sci.med_train'
# Directories holding the test split, one per newsgroup.
comp_graphics_test_path, sci_med_test_path = 'comp.graphics_test', 'sci.med_test'


def _read_texts(directory):
    """Return the text of every regular file directly inside *directory*.

    Entries are visited in sorted filename order. ``os.listdir`` makes no
    ordering guarantee, and the order of the training rows can influence
    nearest-neighbour results when several neighbours are equidistant, so
    sorting keeps runs reproducible across machines.

    Entries that are not regular files (e.g. a checkpoint sub-directory) are
    skipped instead of making ``open`` fail.

    Args:
        directory: Path of the folder to read.

    Returns:
        A list with one string per file, in sorted filename order.

    Raises:
        OSError: If *directory* cannot be listed or a file cannot be read.
    """
    texts = []
    for filename in sorted(os.listdir(directory)):
        path = os.path.join(directory, filename)
        if not os.path.isfile(path):
            continue
        # latin-1 maps every byte to a character, so decoding cannot fail.
        with open(path, 'r', encoding='latin-1') as file:
            texts.append(file.read())
    return texts


# 1. Read the comp.graphics training set
comp_graphics_train_texts = _read_texts(comp_graphics_train_path)

# 2. Read the sci.med training set
sci_med_train_texts = _read_texts(sci_med_train_path)

# 3. Read the comp.graphics test set
comp_graphics_test_texts = _read_texts(comp_graphics_test_path)

# 4. Read the sci.med test set
sci_med_test_texts = _read_texts(sci_med_test_path)


# Class ids used throughout: 0 -> 'comp.graphics', 1 -> 'sci.med'.
# One label per document, in the same order as the corresponding text list.
comp_graphics_train_labels = [0 for _ in comp_graphics_train_texts]
sci_med_train_labels = [1 for _ in sci_med_train_texts]
comp_graphics_test_labels = [0 for _ in comp_graphics_test_texts]
sci_med_test_labels = [1 for _ in sci_med_test_texts]

# Combined training set: comp.graphics documents first, then sci.med.
train_texts = [*comp_graphics_train_texts, *sci_med_train_texts]
train_labels = [*comp_graphics_train_labels, *sci_med_train_labels]

# Combined test set, concatenated in the same class order.
test_texts = [*comp_graphics_test_texts, *sci_med_test_texts]
test_labels = [*comp_graphics_test_labels, *sci_med_test_labels]

# Feature extraction, stage 1: raw term counts.
# The vectorizer is fitted on the training texts only; the test texts are
# transformed with that already-fitted vectorizer.
count_vectorizer = CountVectorizer()
train_counts = count_vectorizer.fit_transform(train_texts)
test_counts = count_vectorizer.transform(test_texts)

# Feature extraction, stage 2: re-weight the counts as TF-IDF.
# Again fitted on the training counts only, then applied to the test counts.
tfidf_transformer = TfidfTransformer()
train_tfidf = tfidf_transformer.fit_transform(train_counts)
test_tfidf = tfidf_transformer.transform(test_counts)

# Build the k-nearest-neighbours classifier and fit it on the training
# TF-IDF features in one step (fit() returns the estimator itself).
# NOTE(review): k=6 is even, so with two classes a 3-3 vote split is
# possible - confirm the choice of k is intentional.
knn_classifier = KNeighborsClassifier(n_neighbors=6).fit(train_tfidf, train_labels)

# Predict a class id for every test document.
y_pred = knn_classifier.predict(test_tfidf)

# Per-class precision / recall / F1 against the true test labels.
report = classification_report(test_labels, y_pred)
print(report)



              precision    recall  f1-score   support

           0       0.91      0.91      0.91       389
           1       0.91      0.91      0.91       396

    accuracy                           0.91       785
   macro avg       0.91      0.91      0.91       785
weighted avg       0.91      0.91      0.91       785

