In [1]:
import nltk.tokenize as tk

In [2]:
# Sample text: three sentences ending in '?', '!' and '.', with one contraction.
doc = (
    "Are you curious about tokenization? "
    "Let's see how it works! "
    "We need to analyze a couple of sentences with punctuations "
    "to see it in action."
)

In [3]:
# Word-level split: word_list holds every token of the sample, with
# punctuation marks kept as tokens of their own.
word_list = tk.word_tokenize(doc)

# Sentence-level split: sent_list holds one string per sentence.
sent_list = tk.sent_tokenize(doc)

In [4]:
# Show the sentence-level split.
sent_list

['Are you curious about tokenization?',
 "Let's see how it works!",
 'We need to analyze a couple of sentences with punctuations to see it in action.']

In [5]:
# Show the word-level split (punctuation appears as separate tokens).
word_list

['Are',
 'you',
 'curious',
 'about',
 'tokenization',
 '?',
 'Let',
 "'s",
 'see',
 'how',
 'it',
 'works',
 '!',
 'We',
 'need',
 'to',
 'analyze',
 'a',
 'couple',
 'of',
 'sentences',
 'with',
 'punctuations',
 'to',
 'see',
 'it',
 'in',
 'action',
 '.']

In [6]:
# WordPunctTokenizer also splits inside contractions: "Let's" comes out as
# 'Let', "'", 's' here, whereas word_tokenize above produced 'Let', "'s".
tokenizer = tk.WordPunctTokenizer()
words = tokenizer.tokenize(doc)
words  # display the token list

['Are',
 'you',
 'curious',
 'about',
 'tokenization',
 '?',
 'Let',
 "'",
 's',
 'see',
 'how',
 'it',
 'works',
 '!',
 'We',
 'need',
 'to',
 'analyze',
 'a',
 'couple',
 'of',
 'sentences',
 'with',
 'punctuations',
 'to',
 'see',
 'it',
 'in',
 'action',
 '.']

In [7]:
# Bag-of-words model

In [8]:
import nltk.tokenize as tk  # tokenization
import sklearn.feature_extraction.text as ft  # text feature extraction

In [9]:
# Same sample text as in the tokenization section at the top of the notebook.
doc = (
    "Are you curious about tokenization? "
    "Let's see how it works! "
    "We need to analyze a couple of sentences with punctuations "
    "to see it in action."
)

In [10]:
# Split the sample into sentences; each sentence is later passed to the
# vectorizer as one document.
sents = tk.sent_tokenize(doc)

In [11]:
# Bag-of-words vectorizer, constructed with its default settings.
cv = ft.CountVectorizer()

In [12]:
# Learn the vocabulary from the sentences and count word occurrences, then
# convert the result to a dense array: one row per sentence, one column per
# vocabulary word.
res = cv.fit_transform(sents).toarray()

In [13]:
# Print the dense count matrix (rows = sentences, columns = vocabulary words).
print(res)

[[1 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1]
 [0 0 0 0 0 0 1 0 1 1 0 0 0 1 0 0 0 0 0 1 0]
 [0 1 1 0 1 0 0 1 1 0 1 1 1 1 1 2 0 1 1 0 0]]


In [14]:
# Show the vocabulary learned by the vectorizer; entry i names column i of the
# count matrix.
# CountVectorizer.get_feature_names() was deprecated in scikit-learn 1.0 and
# removed in 1.2, so prefer get_feature_names_out() and only fall back to the
# old method on installs that predate it. list() keeps the displayed value a
# plain Python list in both cases.
(
    list(cv.get_feature_names_out())
    if hasattr(cv, "get_feature_names_out")
    else cv.get_feature_names()
)



['about',
 'action',
 'analyze',
 'are',
 'couple',
 'curious',
 'how',
 'in',
 'it',
 'let',
 'need',
 'of',
 'punctuations',
 'see',
 'sentences',
 'to',
 'tokenization',
 'we',
 'with',
 'works',
 'you']

In [16]:
# Term frequency: normalize the bag-of-words counts
import sklearn.preprocessing as sp

In [17]:
# L1-normalize each row so the raw counts become term frequencies
# (every row sums to 1).
# NOTE: this relies on res still being the dense count array produced by
# cv.fit_transform(sents).toarray(); the TF-IDF cell further down rebinds res,
# so run this cell before that one.
sp.normalize(res, norm='l1')

array([[0.2       , 0.        , 0.        , 0.2       , 0.        ,
        0.2       , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.2       , 0.        , 0.        , 0.        ,
        0.2       ],
       [0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.2       , 0.        , 0.2       , 0.2       ,
        0.        , 0.        , 0.        , 0.2       , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.2       ,
        0.        ],
       [0.        , 0.07142857, 0.07142857, 0.        , 0.07142857,
        0.        , 0.        , 0.07142857, 0.07142857, 0.        ,
        0.07142857, 0.07142857, 0.07142857, 0.07142857, 0.07142857,
        0.14285714, 0.        , 0.07142857, 0.07142857, 0.        ,
        0.        ]])

In [14]:
# TF-IDF (term frequency - inverse document frequency)


In [18]:
# Rebuild the bag-of-words counts for the TF-IDF step: learn the vocabulary
# from the sentences first, then encode those same sentences with it.
# bow keeps the matrix exactly as the vectorizer returns it (no toarray()).
cv = ft.CountVectorizer().fit(sents)
bow = cv.transform(sents)

In [20]:
# Fit the TF-IDF transformer on the count matrix, then re-weight those same
# counts with it.
# NOTE: this rebinds res, which earlier held the dense count array.
tt = ft.TfidfTransformer().fit(bow)
res = tt.transform(bow)

In [24]:
# Convert the TF-IDF result to a dense array and print it.
print(res.toarray())

[[0.4472136  0.         0.         0.4472136  0.         0.4472136
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.4472136  0.
  0.         0.         0.4472136 ]
 [0.         0.         0.         0.         0.         0.
  0.49047908 0.         0.37302199 0.49047908 0.         0.
  0.         0.37302199 0.         0.         0.         0.
  0.         0.49047908 0.        ]
 [0.         0.25685987 0.25685987 0.         0.25685987 0.
  0.         0.25685987 0.19534855 0.         0.25685987 0.25685987
  0.25685987 0.19534855 0.25685987 0.51371974 0.         0.25685987
  0.25685987 0.         0.        ]]


In [26]:
# Show the vocabulary behind the columns of the matrices built from cv.
# CountVectorizer.get_feature_names() was deprecated in scikit-learn 1.0 and
# removed in 1.2, so prefer get_feature_names_out() and only fall back to the
# old method on installs that predate it. list() keeps the displayed value a
# plain Python list in both cases.
(
    list(cv.get_feature_names_out())
    if hasattr(cv, "get_feature_names_out")
    else cv.get_feature_names()
)



['about',
 'action',
 'analyze',
 'are',
 'couple',
 'curious',
 'how',
 'in',
 'it',
 'let',
 'need',
 'of',
 'punctuations',
 'see',
 'sentences',
 'to',
 'tokenization',
 'we',
 'with',
 'works',
 'you']