## Binary Term Frequencies

In [1]:
from sklearn.feature_extraction.text import CountVectorizer

# Three short documents; the vocabulary is learned from all of them together.
corpus = [
    'the hotel has one bad room',
    'the room of the hotel is bad',
    'one bathroom is bad, the other bathroom is good',
]

# binary=True clips every non-zero count to 1, so each cell of the matrix only
# records whether a term occurs in a document (note 'bathroom' in document 3:
# it appears twice but is reported as 1).
vectorizer = CountVectorizer(binary=True)
X = vectorizer.fit_transform(corpus)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement. It returns a NumPy array, so
# .tolist() is used to print a plain Python list of the column labels.
print(vectorizer.get_feature_names_out().tolist())
# Densify only for display; X itself is a SciPy sparse matrix.
print(X.toarray())

['bad', 'bathroom', 'good', 'has', 'hotel', 'is', 'of', 'one', 'other', 'room', 'the']
[[1 0 0 1 1 0 0 1 0 1 1]
 [1 0 0 0 1 1 1 0 0 1 1]
 [1 1 1 0 0 1 0 1 1 0 1]]


## Absolute Term Frequencies

In [2]:
from sklearn.feature_extraction.text import CountVectorizer

corpus = [
    'jazz music has a swing rhythm',
    'swing is hard to explain',
    'swing rhythm is a natural rhythm',
]

# binary=False (the default) keeps the raw occurrence counts, so 'rhythm'
# is counted twice in the third document.
vectorizer = CountVectorizer(binary=False)
X = vectorizer.fit_transform(corpus)

# The default token pattern only keeps tokens of two or more characters,
# which is why the word 'a' never shows up in the vocabulary.
# get_feature_names() was removed in scikit-learn 1.2; use
# get_feature_names_out() and convert to a list for a plain-Python printout.
print(vectorizer.get_feature_names_out().tolist())
# Densify only for display; X itself is a SciPy sparse matrix.
print(X.toarray())


['explain', 'hard', 'has', 'is', 'jazz', 'music', 'natural', 'rhythm', 'swing', 'to']
[[0 0 1 0 1 1 0 1 1 0]
 [1 1 0 1 0 0 0 0 1 1]
 [0 0 0 1 0 0 1 2 1 0]]


## Absolute TF-IDF

In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer

corpus = [
    'jazz music has a swing rhythm',
    'swing is hard to explain',
    'swing rhythm is a natural rhythm',
]

# norm=None switches off row normalisation, so every stored value is the raw
# product tf * idf. With the default smooth_idf=True the weight is
#     idf(t) = ln((1 + n_docs) / (1 + df(t))) + 1
# e.g. 'swing' occurs in all 3 documents -> idf = 1.0, while a term that
# occurs in a single document gets ln(4 / 2) + 1 ~= 1.693.
vectorizer = TfidfVectorizer(norm=None)
X = vectorizer.fit_transform(corpus)

# get_feature_names() was removed in scikit-learn 1.2; use
# get_feature_names_out() and convert to a list for a plain-Python printout.
print(vectorizer.get_feature_names_out().tolist())
# Printing the sparse matrix lists only the non-zero entries as
# (document index, term index) -> weight.
print(X)

['explain', 'hard', 'has', 'is', 'jazz', 'music', 'natural', 'rhythm', 'swing', 'to']
  (0, 7)	1.2876820724517808
  (0, 8)	1.0
  (0, 2)	1.6931471805599454
  (0, 5)	1.6931471805599454
  (0, 4)	1.6931471805599454
  (1, 0)	1.6931471805599454
  (1, 9)	1.6931471805599454
  (1, 1)	1.6931471805599454
  (1, 3)	1.2876820724517808
  (1, 8)	1.0
  (2, 6)	1.6931471805599454
  (2, 3)	1.2876820724517808
  (2, 7)	2.5753641449035616
  (2, 8)	1.0


## Normalized TF-IDF

In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer

corpus = [
    'jazz music has a swing rhythm',
    'swing is hard to explain',
    'swing rhythm is a natural rhythm',
]

# norm='l1' divides every document row by the sum of its absolute values, so
# the TF-IDF weights of each document add up to 1. This makes documents of
# different lengths comparable.
vectorizer = TfidfVectorizer(norm='l1')
X = vectorizer.fit_transform(corpus)

# get_feature_names() was removed in scikit-learn 1.2; use
# get_feature_names_out() and convert to a list for a plain-Python printout.
print(vectorizer.get_feature_names_out().tolist())
# Printing the sparse matrix lists only the non-zero entries as
# (document index, term index) -> weight.
print(X)

['explain', 'hard', 'has', 'is', 'jazz', 'music', 'natural', 'rhythm', 'swing', 'to']
  (0, 7)	0.17478762945985446
  (0, 8)	0.1357381866216823
  (0, 2)	0.22982472797282108
  (0, 5)	0.22982472797282108
  (0, 4)	0.22982472797282108
  (1, 0)	0.22982472797282108
  (1, 9)	0.22982472797282108
  (1, 1)	0.22982472797282108
  (1, 3)	0.17478762945985446
  (1, 8)	0.1357381866216823
  (2, 6)	0.2582515612029269
  (2, 3)	0.19640696884585998
  (2, 7)	0.39281393769171996
  (2, 8)	0.15252753225949314


#### Print the full matrix

In [4]:
import pandas as pd

# Relies on `vectorizer`, `X` and `corpus` from the previous (L1-normalised
# TF-IDF) cell being present in the kernel.

# Column labels: get_feature_names() was removed in scikit-learn 1.2, so use
# its replacement get_feature_names_out().
feature_names = vectorizer.get_feature_names_out()
# Row labels: the documents themselves (copied so the frame does not share
# the original list object).
corpus_index = list(corpus)

# toarray() returns a regular ndarray; todense() would return the legacy
# np.matrix type, which NumPy discourages for new code.
df = pd.DataFrame(X.toarray(), index=corpus_index, columns=feature_names)
print(df)

                                   explain      hard       has        is  \
jazz music has a swing rhythm     0.000000  0.000000  0.229825  0.000000   
swing is hard to explain          0.229825  0.229825  0.000000  0.174788   
swing rhythm is a natural rhythm  0.000000  0.000000  0.000000  0.196407   

                                      jazz     music   natural    rhythm  \
jazz music has a swing rhythm     0.229825  0.229825  0.000000  0.174788   
swing is hard to explain          0.000000  0.000000  0.000000  0.000000   
swing rhythm is a natural rhythm  0.000000  0.000000  0.258252  0.392814   

                                     swing        to  
jazz music has a swing rhythm     0.135738  0.000000  
swing is hard to explain          0.135738  0.229825  
swing rhythm is a natural rhythm  0.152528  0.000000  
