# TF-IDF
词在整个语料库中的普遍重要性。TF-IDF越高，表示词对于该文档越重要。

In [1]:
import math

In [2]:
# 文档集合
documents = [
    "This is the first document.",
    "This document is the second document.",
    "And this is the third one.",
    "Is this the first document?",
]

In [3]:
# 分词并构建词汇表
word_set = set()
for doc in documents:
    words = doc.lower().split()
    word_set.update(words)

word_list = list(word_set)
word_list.sort()

In [4]:
# 构建文档-词频矩阵
word_doc_freq = {word: 0 for word in word_list}
for doc in documents:
    words = doc.lower().split()
    for word in set(words):
        word_doc_freq[word] += 1

In [5]:
# 计算TF和IDF
tfidf_matrix = []
num_docs = len(documents)
for doc in documents:
    words = doc.lower().split()
    word_freq = {word: 0 for word in word_list}
    for word in words:
        word_freq[word] += 1
    
    tfidf_vector = []
    for word in word_list:
        # 计算TF
        tf = word_freq[word] / len(words)
        # 计算IDF
        idf = math.log(num_docs / (1 + word_doc_freq[word]))
        # 计算TF-IDF
        tfidf = tf * idf
        tfidf_vector.append(tfidf)
    
    tfidf_matrix.append(tfidf_vector)

In [6]:
# 输出TF-IDF矩阵
for i, doc_vector in enumerate(tfidf_matrix):
    print(f"Document {i+1} TF-IDF Vector: {doc_vector}")


Document 1 TF-IDF Vector: [0.0, 0.0, 0.05753641449035617, 0.0, 0.05753641449035617, -0.044628710262841945, 0.0, 0.0, -0.044628710262841945, 0.0, -0.044628710262841945]
Document 2 TF-IDF Vector: [0.0, 0.11552453009332421, 0.04794701207529681, 0.0, 0.0, -0.03719059188570162, 0.0, 0.11552453009332421, -0.03719059188570162, 0.0, -0.03719059188570162]
Document 3 TF-IDF Vector: [0.11552453009332421, 0.0, 0.0, 0.0, 0.0, -0.03719059188570162, 0.11552453009332421, 0.0, -0.03719059188570162, 0.11552453009332421, -0.03719059188570162]
Document 4 TF-IDF Vector: [0.0, 0.0, 0.0, 0.13862943611198905, 0.05753641449035617, -0.044628710262841945, 0.0, 0.0, -0.044628710262841945, 0.0, -0.044628710262841945]
