# 如何求 TF-IDF

In [1]:
# Term frequency (TF) counts how many times a word occurs in a document; a word
# is considered more important the more often it appears in that document.
# Inverse document frequency (IDF) measures how well a word discriminates
# between documents: the fewer documents contain the word, the better it
# separates those documents from the rest, so a larger IDF means a more
# discriminating word.
# TF-IDF = TF * IDF
# Note: per the scikit-learn documentation, TfidfVectorizer by default uses a
# smoothed IDF and L2-normalizes every row, so the printed values are not the
# raw TF * IDF products.
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_vec = TfidfVectorizer()

documents = [
    'this is the bayes document',
    'this is the second second document',
    'and the third one',
    'is this the document'
]

tfidf_matrix = tfidf_vec.fit_transform(documents)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer get_feature_names_out() when it exists and fall back to the old method
# on older installations; .tolist() yields plain Python strings so the printed
# list keeps the same form as before.
if hasattr(tfidf_vec, 'get_feature_names_out'):
    feature_names = tfidf_vec.get_feature_names_out().tolist()
else:
    feature_names = tfidf_vec.get_feature_names()

print('不重复的词：', feature_names)
print('每个单词的ID:', tfidf_vec.vocabulary_)
print('每个单词的TF-IDF值：', tfidf_matrix.toarray())

不重复的词： ['and', 'bayes', 'document', 'is', 'one', 'second', 'the', 'third', 'this']
每个单词的ID: {'this': 8, 'is': 3, 'the': 6, 'bayes': 1, 'document': 2, 'second': 5, 'and': 0, 'third': 7, 'one': 4}
每个单词的TF-IDF值： [[0.         0.63314609 0.40412895 0.40412895 0.         0.
  0.33040189 0.         0.40412895]
 [0.         0.         0.27230147 0.27230147 0.         0.85322574
  0.22262429 0.         0.27230147]
 [0.55280532 0.         0.         0.         0.55280532 0.
  0.28847675 0.55280532 0.        ]
 [0.         0.         0.52210862 0.52210862 0.         0.
  0.42685801 0.         0.52210862]]


# 如何对文档进行分类

## 1. 准备阶段

In [2]:
# 加载文档 -> 对文档分词 -> 加载停用词 -> 计算单词权重

In [3]:
# 加载文档
import os
def load_file(file_dir, label):
    """Segment every file in a directory and tag each one with the same label.

    Args:
        file_dir: Directory whose files are the documents to load.
        label: Label recorded once for every document loaded from ``file_dir``.

    Returns:
        A ``(words_list, labels_list)`` tuple of equal-length lists.
        ``words_list`` holds the ``cut_words`` result for each file, in sorted
        file-name order; ``labels_list`` holds ``label`` once per file.
    """
    words_list = []
    labels_list = []
    # os.listdir returns entries in arbitrary order; sorting makes the document
    # order reproducible across runs and platforms.
    for file_name in sorted(os.listdir(file_dir)):
        file_path = os.path.join(file_dir, file_name)
        # Skip sub-directories and other non-regular entries: they cannot be
        # opened as documents and would otherwise raise inside cut_words.
        if not os.path.isfile(file_path):
            continue
        words_list.append(cut_words(file_path))
        labels_list.append(label)
    return words_list, labels_list

In [4]:
# 对文档分词
import jieba

def cut_words(file_path):
    """Read a GB18030-encoded text file and return its jieba segmentation.

    Args:
        file_path: Path of the text file to read.

    Returns:
        A single string in which every token yielded by ``jieba.cut`` is
        followed by one space (so a non-empty result ends with a space).
    """
    # The context manager closes the file even if reading or decoding fails;
    # previously the handle was left open for the garbage collector.
    with open(file_path, 'r', encoding='gb18030') as text_file:
        text = text_file.read()
    # One join instead of repeated string concatenation; the output is the same.
    return ''.join(word + ' ' for word in jieba.cut(text))

In [5]:
# Load the stop-word list, one entry per line.
# 'utf-8-sig' decodes UTF-8 and drops a leading BOM when present, replacing the
# earlier encode/decode round-trip. Passing the encoding explicitly also stops
# the result from depending on the platform's default encoding.
# NOTE(review): assumes the file is UTF-8 (the original BOM handling implies
# so) - confirm against the actual file.
with open('./text_classification-master/text classification/stop/stopword.txt',
          encoding='utf-8-sig') as stop_file:
    # Strip surrounding whitespace and skip blank lines so that no empty
    # string ends up in the stop-word list.
    stop_words = [line.strip() for line in stop_file if line.strip()]

In [6]:
# Configure the TF-IDF vectorizer used to weight words.
# max_df=0.5 ignores terms that occur in more than half of the documents, and
# the loaded stop words are removed from the vocabulary.
tf = TfidfVectorizer(
    max_df=0.5,
    stop_words=stop_words,
)

In [7]:
# Load the training and test corpora.
# Every split directory contains one sub-directory per category; the category
# name doubles as the label. Listing the categories once replaces eight
# copy-pasted load_file calls that each repeated the root path.
DATA_ROOT = './text_classification-master/text classification'
CATEGORIES = ['女性', '体育', '文学', '校园']


def load_split(split):
    """Load all categories of one split and concatenate them in CATEGORIES order.

    Args:
        split: Name of the split sub-directory under DATA_ROOT
            (this notebook uses 'train' and 'test').

    Returns:
        A ``(words_list, labels_list)`` tuple holding the combined
        ``load_file`` results of every category.
    """
    words_list = []
    labels_list = []
    for category in CATEGORIES:
        # Joined with '/' so the paths are the same strings as the literals
        # this loop replaces.
        category_dir = '/'.join((DATA_ROOT, split, category))
        words, labels = load_file(category_dir, category)
        words_list += words
        labels_list += labels
    return words_list, labels_list


# Import the training data
train_words_list, train_labels = load_split('train')

# Import the test data
test_words_list, test_labels = load_split('test')

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\gke1sgh\AppData\Local\Temp\jieba.cache
Loading model cost 0.648 seconds.
Prefix dict has been built successfully.


## 2. 分类阶段

In [8]:
# 生成分类器 -> 分类器做预测 -> 计算准确率

In [9]:
# Build the multinomial naive Bayes classifier
from sklearn.naive_bayes import MultinomialNB

# Fit the vectorizer on the training documents and convert them into a
# TF-IDF feature matrix in one step.
train_features = tf.fit_transform(train_words_list)

# alpha is the additive smoothing parameter of MultinomialNB.
clf = MultinomialNB(alpha=0.001)
clf.fit(train_features, train_labels)



In [10]:
# Predict with the classifier.
# The test documents go through transform() only (no refit), so they are
# vectorized with the already-fitted vectorizer before being classified.
test_features = tf.transform(test_words_list)
predicted_labels = clf.predict(test_features)

In [11]:
# Compute the accuracy: compare the predicted labels with the true test labels.
from sklearn import metrics

accuracy = metrics.accuracy_score(test_labels, predicted_labels)
print('准确率为：', accuracy)

准确率为： 0.91
