In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
def get_tf_idf_vectors(corpus):
    """Fit a TF-IDF model on ``corpus`` and return the dense weight matrix.

    Args:
        corpus: Iterable of raw text documents, handed unchanged to
            ``TfidfVectorizer.fit_transform``.

    Returns:
        The densified (``.todense()``) result of ``fit_transform``.
    """
    vectorizer = TfidfVectorizer()
    sparse_weights = vectorizer.fit_transform(corpus)
    # Densify so the whole matrix can be printed and inspected directly.
    dense_weights = sparse_weights.todense()
    return dense_weights

In [3]:
# Small English sample corpus: four sentences, one document per string.
corpus = [
    "Data Science is an overlap between Arts and Science",
    "Generally, Arts graduates are right-brained and Science graduates are left-brained",
    "Excelling in both Arts and Science at a time becomes difficult",
    "Natural Language Processing is a part of Data Science"
]

In [4]:
# Vectorize the English corpus and print the resulting dense TF-IDF matrix.
vector_list = get_tf_idf_vectors(corpus)
print(vector_list)

[[0.40332811 0.25743911 0.         0.25743911 0.         0.
  0.40332811 0.         0.         0.31798852 0.         0.
  0.         0.         0.         0.31798852 0.         0.
  0.         0.         0.40332811 0.         0.         0.
  0.42094668 0.        ]
 [0.         0.159139   0.49864399 0.159139   0.         0.
  0.         0.         0.49864399 0.         0.         0.
  0.24932199 0.49864399 0.         0.         0.         0.24932199
  0.         0.         0.         0.         0.         0.24932199
  0.13010656 0.        ]
 [0.         0.22444946 0.         0.22444946 0.35164346 0.35164346
  0.         0.35164346 0.         0.         0.35164346 0.35164346
  0.         0.         0.35164346 0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.18350214 0.35164346]
 [0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.30887228 0.         0.
  0.         0.         0.         0.30887228 0.39176

In [5]:
# Inspect the dimensions of the TF-IDF matrix.
vector_list.shape

(4, 26)

In [6]:
# Has shortcomings.
# Combine bag-of-words with TF-IDF: first use the bag-of-words model to pick out
# high-frequency words, then compute their weights with TF-IDF; the higher the
# value, the better the word discriminates between documents.

# Chinese sample corpus; the words in each sentence are separated by spaces.
corpus_cn = [
    "帮我 查下 明天 北京 天气 怎么样",
    "帮我 查询 去 北京 的 机票",
    "帮我 查看 到 广州 的 机票",
    "帮我 搜索 广州 长隆 在哪"
]

In [8]:
# Convert the corpus into bag-of-words vectors.
from sklearn.feature_extraction.text import CountVectorizer

# Declare the vectorizer. The custom token_pattern lets CountVectorizer keep
# single-character words (the default pattern only matches tokens of two or
# more characters).
# NOTE: the misspelled name `vectoerizer` is kept because later cells refer to it.
vectoerizer = CountVectorizer(min_df=1, max_df=1.0, token_pattern='\\b\\w+\\b')

# Learn (fit) the vocabulary from the corpus.
vectoerizer.fit(corpus_cn)

# Print the learned vocabulary.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on older releases.
# .tolist() turns the returned ndarray into a plain list of str so the printed
# output is the same as before.
if hasattr(vectoerizer, "get_feature_names_out"):
    bag_of_words = vectoerizer.get_feature_names_out().tolist()
else:
    bag_of_words = vectoerizer.get_feature_names()
print("词袋:", bag_of_words)

# Convert (transform) the corpus into bag-of-words count vectors.
X = vectoerizer.transform(corpus_cn)
print("\n")
print("语料库向量:")
print(X.toarray())

# Look up the column index of individual words in the vocabulary.
print("\n")
print("'广州' 索引号: {}".format(vectoerizer.vocabulary_.get('广州')))
print("'北京' 索引号: {}".format(vectoerizer.vocabulary_.get('北京')))

词袋: ['到', '北京', '去', '在哪', '天气', '帮我', '广州', '怎么样', '搜索', '明天', '机票', '查下', '查看', '查询', '的', '长隆']


语料库向量:
[[0 1 0 0 1 1 0 1 0 1 0 1 0 0 0 0]
 [0 1 1 0 0 1 0 0 0 0 1 0 0 1 1 0]
 [1 0 0 0 0 1 1 0 0 0 1 0 1 0 1 0]
 [0 0 0 1 0 1 1 0 1 0 0 0 0 0 0 1]]


'广州' 索引号: 6
'北京' 索引号: 1


In [9]:
# Compute TF-IDF from the bag-of-words vectors.
from sklearn.feature_extraction.text import TfidfTransformer

# Declare a TF-IDF transformer.
tfidf_transformer = TfidfTransformer()

# Learn (fit) the IDF weights from the bag-of-words counts. The sparse count
# matrix is accepted directly, so it does not need to be densified first.
tfidf_transformer.fit(X)

# Print each vocabulary word together with its IDF weight.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on older releases.
if hasattr(vectoerizer, "get_feature_names_out"):
    feature_names = vectoerizer.get_feature_names_out()
else:
    feature_names = vectoerizer.get_feature_names()
for idx, word in enumerate(feature_names):
    print("{}\t{}".format(word, tfidf_transformer.idf_[idx]))


# Convert the bag-of-words vectors of the corpus into TF-IDF vectors.
tfidf = tfidf_transformer.transform(X)
print("\n")
print("语料库TF-IDF矩阵:")
print(tfidf.toarray())

到	1.916290731874155
北京	1.5108256237659907
去	1.916290731874155
在哪	1.916290731874155
天气	1.916290731874155
帮我	1.0
广州	1.5108256237659907
怎么样	1.916290731874155
搜索	1.916290731874155
明天	1.916290731874155
机票	1.5108256237659907
查下	1.916290731874155
查看	1.916290731874155
查询	1.916290731874155
的	1.5108256237659907
长隆	1.916290731874155


语料库TF-IDF矩阵:
[[0.         0.3563895  0.         0.         0.45203489 0.23589056
  0.         0.45203489 0.         0.45203489 0.         0.45203489
  0.         0.         0.         0.        ]
 [0.         0.38761905 0.49164562 0.         0.         0.25656108
  0.         0.         0.         0.         0.38761905 0.
  0.         0.49164562 0.38761905 0.        ]
 [0.49164562 0.         0.         0.         0.         0.25656108
  0.38761905 0.         0.         0.         0.38761905 0.
  0.49164562 0.         0.38761905 0.        ]
 [0.         0.         0.         0.50676543 0.         0.26445122
  0.39953968 0.         0.50676543 0.         0.         0.
