# 使用 gensim 算法包实现TF-IDF

In [13]:
from gensim import corpora

In [14]:
# 输入语料库
corpus = ['I graduated from BUPT BUPT is beijing university of posts and telecommunications',
        'I work in ByteDance',
        'I am living in guangdong province',
        'my favorite sports is playing basketball']



### 获取每个词语的 id 和词频

In [15]:
words_list = list()
for i in range(len(corpus)):
    words_list.append(corpus[i].split(' '))
print(words_list)

[['I', 'graduated', 'from', 'BUPT', 'BUPT', 'is', 'beijing', 'university', 'of', 'posts', 'and', 'telecommunications'], ['I', 'work', 'in', 'ByteDance'], ['I', 'am', 'living', 'in', 'guangdong', 'province'], ['my', 'favorite', 'sports', 'is', 'playing', 'basketball']]


In [16]:
dic = corpora.Dictionary(words_list)
# 赋给语料库中每个词(不重复的词)一个整数id
new_corpus = [dic.doc2bow(words) for words in words_list]
print(new_corpus)

[[(0, 2), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1), (9, 1), (10, 1)], [(1, 1), (11, 1), (12, 1), (13, 1)], [(1, 1), (12, 1), (14, 1), (15, 1), (16, 1), (17, 1)], [(6, 1), (18, 1), (19, 1), (20, 1), (21, 1), (22, 1)]]


### 查看每个词语对应的 id

In [17]:
print(dic.token2id)

{'BUPT': 0, 'I': 1, 'and': 2, 'beijing': 3, 'from': 4, 'graduated': 5, 'is': 6, 'of': 7, 'posts': 8, 'telecommunications': 9, 'university': 10, 'ByteDance': 11, 'in': 12, 'work': 13, 'am': 14, 'guangdong': 15, 'living': 16, 'province': 17, 'basketball': 18, 'favorite': 19, 'my': 20, 'playing': 21, 'sports': 22}


In [18]:
# 训练模型并保存
from gensim import models
tfidf = models.TfidfModel(new_corpus)
tfidf.save("tfidf.model")
# 载入模型
tfidf = models.TfidfModel.load("tfidf.model")
# 使用这个训练好的模型得到单词的tfidf值
tfidf_vec = []
for i in range(len(corpus)):
    string = corpus[i]
    string_bow = dic.doc2bow(string.lower().split())
    string_tfidf = tfidf[string_bow]
    tfidf_vec.append(string_tfidf)
# 输出 词语id与词语tfidf值
print(tfidf_vec)

[[(2, 0.3481553119113957), (3, 0.3481553119113957), (4, 0.3481553119113957), (5, 0.3481553119113957), (6, 0.17407765595569785), (7, 0.3481553119113957), (8, 0.3481553119113957), (9, 0.3481553119113957), (10, 0.3481553119113957)], [(12, 0.4472135954999579), (13, 0.8944271909999159)], [(12, 0.24253562503633297), (14, 0.48507125007266594), (15, 0.48507125007266594), (16, 0.48507125007266594), (17, 0.48507125007266594)], [(6, 0.2182178902359924), (18, 0.4364357804719848), (19, 0.4364357804719848), (20, 0.4364357804719848), (21, 0.4364357804719848), (22, 0.4364357804719848)]]


In [20]:
# 测试一个句子
test_words = "i love basketball"
string_bow = dic.doc2bow(string.lower().split())
string_tfidf = tfidf[string_bow]
print(string_tfidf)

[(6, 0.2182178902359924), (18, 0.4364357804719848), (19, 0.4364357804719848), (20, 0.4364357804719848), (21, 0.4364357804719848), (22, 0.4364357804719848)]


这里需要注意的是，在打印 tf-idf 值的时候会发现只会显示部分词语，这是因为 gensim 会自动的去除停用词。