# 1. 实例化TfidfVectorizer

In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Build the vectorizer without arguments so that get_params() reports
# the constructor's default settings.
tv = TfidfVectorizer()
default_params = tv.get_params()
default_params

{'analyzer': 'word',
 'binary': False,
 'decode_error': 'strict',
 'dtype': numpy.float64,
 'encoding': 'utf-8',
 'input': 'content',
 'lowercase': True,
 'max_df': 1.0,
 'max_features': None,
 'min_df': 1,
 'ngram_range': (1, 1),
 'norm': 'l2',
 'preprocessor': None,
 'smooth_idf': True,
 'stop_words': None,
 'strip_accents': None,
 'sublinear_tf': False,
 'token_pattern': '(?u)\\b\\w\\w+\\b',
 'tokenizer': None,
 'use_idf': True,
 'vocabulary': None}

# 2. 参数

- 不计算idf，也不做归一化：结果即为原始词频（tf）

In [2]:
# Toy corpus of four short documents; the later cells reuse `texts`.
texts = [
    "Chinese Beijing Chinese",
    "Chinese Chinese Shanghai",
    "Chinese Macao",
    "Tokyo Japan Chinese",
]

# With use_idf=False and norm=None the matrix holds raw term counts.
# smooth_idf is a boolean flag: recent scikit-learn releases validate
# constructor arguments and reject None, so pass False explicitly.
tv = TfidfVectorizer(use_idf=False, smooth_idf=False, norm=None)
tv_fit = tv.fit_transform(texts)

# get_feature_names() was removed in scikit-learn 1.2; its replacement
# returns an ndarray, so convert to a list to keep the printed form.
print(tv.get_feature_names_out().tolist())
tv_fit.toarray()

['beijing', 'chinese', 'japan', 'macao', 'shanghai', 'tokyo']


array([[1., 2., 0., 0., 0., 0.],
       [0., 2., 0., 0., 1., 0.],
       [0., 1., 0., 1., 0., 0.],
       [0., 1., 1., 0., 0., 1.]])

- 向量归一化（仍不计算idf，只对每篇文档的词频向量做L2归一化）

In [3]:
# Still no idf weighting, but each row of term counts is rescaled to
# unit Euclidean (L2) length.
tv = TfidfVectorizer(use_idf=False, smooth_idf=False, norm='l2')
tv_fit = tv.fit_transform(texts)

# get_feature_names() was removed in scikit-learn 1.2; its replacement
# returns an ndarray, so convert to a list to keep the printed form.
print(tv.get_feature_names_out().tolist())
tv_fit.toarray()

['beijing', 'chinese', 'japan', 'macao', 'shanghai', 'tokyo']


array([[0.4472136 , 0.89442719, 0.        , 0.        , 0.        ,
        0.        ],
       [0.        , 0.89442719, 0.        , 0.        , 0.4472136 ,
        0.        ],
       [0.        , 0.70710678, 0.        , 0.70710678, 0.        ,
        0.        ],
       [0.        , 0.57735027, 0.57735027, 0.        , 0.        ,
        0.57735027]])

In [4]:
from math import log, pow

import numpy as np

# Hand-check of the L2 normalisation: divide the raw counts
# [1, 2, 0, 0, 0, 0] by their Euclidean length sqrt(1^2 + 2^2).
raw_counts = np.array([1., 2., 0., 0., 0., 0.])
l2_length = np.sqrt(pow(1.0, 2) + pow(2.0, 2))
(1.0 / l2_length) * raw_counts

array([0.4472136 , 0.89442719, 0.        , 0.        , 0.        ,
       0.        ])

- 计算idf，不加平滑：$\mathrm{idf}(t) = \ln\frac{N}{\mathrm{df}(t)} + 1$（$N$ 为文档总数，$\mathrm{df}(t)$ 为包含词 $t$ 的文档数）

In [5]:
# Enable idf weighting without smoothing and without normalisation, so
# each entry is the raw term count multiplied by the term's idf.
tv = TfidfVectorizer(use_idf=True, smooth_idf=False, norm=None)
tv_fit = tv.fit_transform(texts)

# get_feature_names() was removed in scikit-learn 1.2; its replacement
# returns an ndarray, so convert to a list to keep the printed form.
print(tv.get_feature_names_out().tolist())
tv_fit.toarray()

['beijing', 'chinese', 'japan', 'macao', 'shanghai', 'tokyo']


array([[2.38629436, 2.        , 0.        , 0.        , 0.        ,
        0.        ],
       [0.        , 2.        , 0.        , 0.        , 2.38629436,
        0.        ],
       [0.        , 1.        , 0.        , 2.38629436, 0.        ,
        0.        ],
       [0.        , 1.        , 2.38629436, 0.        , 0.        ,
        2.38629436]])

In [6]:
# Hand-check for 'beijing' in the first document: term count 1, and the
# word occurs in 1 of the 4 documents. Unsmoothed weight is
# tf * (1 + ln(n_docs / doc_freq)).
tf, n_docs, doc_freq = 1.0, 4, 1
tf * (1 + log(n_docs / doc_freq))

2.386294361119891

In [7]:
# Hand-check for 'chinese' in the first document: term count 2, and the
# word occurs in all 4 documents, so ln(4 / 4) = 0 and the weight
# equals the raw count.
tf, n_docs, doc_freq = 2.0, 4, 4
tf * (1 + log(n_docs / doc_freq))

2.0

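- 计算idf，加入平滑：$\mathrm{idf}(t) = \ln\frac{1 + N}{1 + \mathrm{df}(t)} + 1$（$N$ 为文档总数，$\mathrm{df}(t)$ 为包含词 $t$ 的文档数；分子分母各加1，相当于额外增加一篇包含所有词的文档）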

In [8]:
# Enable idf weighting with smoothing, still without normalisation, so
# each entry is the raw term count multiplied by the smoothed idf.
tv = TfidfVectorizer(use_idf=True, smooth_idf=True, norm=None)
tv_fit = tv.fit_transform(texts)

# get_feature_names() was removed in scikit-learn 1.2; its replacement
# returns an ndarray, so convert to a list to keep the printed form.
print(tv.get_feature_names_out().tolist())
tv_fit.toarray()

['beijing', 'chinese', 'japan', 'macao', 'shanghai', 'tokyo']


array([[1.91629073, 2.        , 0.        , 0.        , 0.        ,
        0.        ],
       [0.        , 2.        , 0.        , 0.        , 1.91629073,
        0.        ],
       [0.        , 1.        , 0.        , 1.91629073, 0.        ,
        0.        ],
       [0.        , 1.        , 1.91629073, 0.        , 0.        ,
        1.91629073]])

In [9]:
# Hand-check for 'beijing' with smoothing: term count 1, the word
# occurs in 1 of the 4 documents, and both counts are incremented by
# one before taking the ratio.
tf, n_docs, doc_freq = 1.0, 4, 1
tf * (1 + log((n_docs + 1) / (doc_freq + 1)))

1.916290731874155

In [10]:
 2.0*(1+log((4+1)/(4+1)))

2.0

# 3. 方法

- inverse_transform(X)：对文档-词项矩阵X的每一行（即每篇文档），返回其中非0特征值所对应的特征词，每篇文档对应一个数组

In [11]:
# Refit with default settings, then map the first document's row back
# to the terms whose entries are non-zero.
tv = TfidfVectorizer()
tv_fit = tv.fit_transform(texts)
first_doc_row = tv_fit[0]
tv.inverse_transform(first_doc_row)

[array(['chinese', 'beijing'], dtype='<U8')]