In [7]:
from sklearn.feature_extraction.text import CountVectorizer
import matplotlib.pyplot as plt
import numpy as np

# Toy corpus: three verses, each one treated as a separate document.
corpus = [
    "you better start swimming or you’ll sink like a stone for the times they are a-changing",
    "the loser now will be later to win cause the times they are a-changing",
    "it’ll soon shake your windows and rattle your walls for the times they are a-changing",
]

# Learn the vocabulary; `vocabulary_` maps each word to its column index.
cv = CountVectorizer().fit(corpus)
Vocab = cv.vocabulary_

# Feature names as an array, positioned by column index.
wordSort = cv.get_feature_names_out()

# Vocabulary sorted by column index (NOT by order of appearance in the text).
voc = dict(sorted(Vocab.items(), key=lambda item: item[1]))

# Inverse mapping: column index -> word.
voc2 = {index: word for word, index in voc.items()}

# Document-term count matrix (one row per document, one column per word).
X = cv.transform(corpus)
# Densifying is fine for this tiny corpus (3 documents x 31 words); it is not
# advisable for large datasets such as the IMDB data.
X = X.toarray()

# How many times each word appears across all documents.
count = np.sum(X, axis=0)
print(count)

# Walk over every column actually present instead of a hard-coded vocabulary
# size, so the cell keeps working when the corpus changes.
for i, total in enumerate(count):
    print(i, voc2[i], total)


[1 3 1 1 1 3 2 1 1 1 2 1 1 1 1 1 1 1 1 1 1 4 3 3 1 1 1 1 1 2 2]
0 and 1
1 are 3
2 be 1
3 better 1
4 cause 1
5 changing 3
6 for 2
7 it 1
8 later 1
9 like 1
10 ll 2
11 loser 1
12 now 1
13 or 1
14 rattle 1
15 shake 1
16 sink 1
17 soon 1
18 start 1
19 stone 1
20 swimming 1
21 the 4
22 they 3
23 times 3
24 to 1
25 walls 1
26 will 1
27 win 1
28 windows 1
29 you 2
30 your 2


### TfidfVectorizer 
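- O TF-IDF parte do resultado do CountVectorizer (matriz de contagens) e repondera-o, de forma a que as palavras que aparecem em poucos documentos tenham um peso maior do que as palavras que aparecem em muitos documentos; no fim, cada vetor (documento) é normalizado para ter norma 1 (norma L2). O `TfidfTransformer` aplica esta transformação à matriz de contagens; o `TfidfVectorizer` faz os dois passos (contagem + TF-IDF) diretamente a partir do texto.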

In [3]:
## TF-IDF can be applied on top of the CountVectorizer output

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

# Step 1: learn the vocabulary and build the raw count matrix in one call.
cv = CountVectorizer()
X = cv.fit_transform(corpus)

# Step 2: learn the IDF weights from the counts and re-weight/normalise them.
tfidf = TfidfTransformer()
Y = tfidf.fit_transform(X)

print(Y.toarray())


[[0.         0.15841025 0.         0.26821186 0.         0.15841025
  0.20398203 0.         0.         0.26821186 0.20398203 0.
  0.         0.26821186 0.         0.         0.26821186 0.
  0.26821186 0.26821186 0.26821186 0.15841025 0.15841025 0.15841025
  0.         0.         0.         0.         0.         0.53642372
  0.        ]
 [0.         0.17979686 0.30442255 0.         0.30442255 0.17979686
  0.         0.         0.30442255 0.         0.         0.30442255
  0.30442255 0.         0.         0.         0.         0.
  0.         0.         0.         0.35959372 0.17979686 0.17979686
  0.30442255 0.         0.30442255 0.30442255 0.         0.
  0.        ]
 [0.26821186 0.15841025 0.         0.         0.         0.15841025
  0.20398203 0.26821186 0.         0.         0.20398203 0.
  0.         0.         0.26821186 0.26821186 0.         0.26821186
  0.         0.         0.         0.15841025 0.15841025 0.15841025
  0.         0.26821186 0.         0.         0.26821186 0.


In [9]:
## Alternatively, run TfidfVectorizer directly on the raw corpus

from sklearn.feature_extraction.text import TfidfVectorizer

print(corpus)

# Counting and TF-IDF weighting happen in a single estimator here.
tfidf = TfidfVectorizer()
Y = tfidf.fit_transform(corpus)

print(Y.toarray())

['you better start swimming or you’ll sink like a stone for the times they are a-changing', 'the loser now will be later to win cause the times they are a-changing', 'it’ll soon shake your windows and rattle your walls for the times they are a-changing']
[[0.         0.15841025 0.         0.26821186 0.         0.15841025
  0.20398203 0.         0.         0.26821186 0.20398203 0.
  0.         0.26821186 0.         0.         0.26821186 0.
  0.26821186 0.26821186 0.26821186 0.15841025 0.15841025 0.15841025
  0.         0.         0.         0.         0.         0.53642372
  0.        ]
 [0.         0.17979686 0.30442255 0.         0.30442255 0.17979686
  0.         0.         0.30442255 0.         0.         0.30442255
  0.30442255 0.         0.         0.         0.         0.
  0.         0.         0.         0.35959372 0.17979686 0.17979686
  0.30442255 0.         0.30442255 0.30442255 0.         0.
  0.        ]
 [0.26821186 0.15841025 0.         0.         0.         0.15841025
 