In [1]:
# 1. Bag of words - counting words give 0 or 1 if it occurs in a text 
# can run a for loop
docs = ["SUPERB, I AM IN LOVE IN THIS PHONE", "I hate this phone"]
words = list(set([word for doc in docs for word in doc.lower().split()]))
vectors = []
for doc in docs:
    vectors.append([1 if word in doc.lower().split() else 0 for word in words])
print("vocabulary: ", words)   
print("vectors: ", vectors)

vocabulary:  ['hate', 'this', 'i', 'in', 'love', 'phone', 'superb,', 'am']
vectors:  [[0, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 0, 0, 1, 0, 0]]


In [2]:
# 2. using countVectorizer from sklearn
# auto removes punctuation and lowers the text
from sklearn.feature_extraction.text import CountVectorizer
# list of documents
docs = ['SUPERB, I AM IN LOVE IN THIS PHONE', 'I hate this phone']
# create the transform
vectorizer = CountVectorizer()
# tokenize and build vocab
vectorizer.fit(docs)
print('vocabulary: ', vectorizer.vocabulary_)
# encode document
vector = vectorizer.transform(docs)
# summarize encoded vector
print('shape: ', vector.shape)
print('vectors: ', vector.toarray())

vocabulary:  {'superb': 5, 'am': 0, 'in': 2, 'love': 3, 'this': 6, 'phone': 4, 'hate': 1}
shape:  (2, 7)
vectors:  [[1 0 2 1 1 1 1]
 [0 1 0 0 1 0 1]]


In [4]:
# vocabulary is index in the vector, and values are the counts after the vectorizer
# each row is from each doc in docs

In [1]:
# 3. Word Frequencies with TFID-Vectorizer - highlights words that are most interesting 
# frequent in a document but not accross documents
# tf * (idf + 1) = tf + tf * idf
# can have different weighting schemes
from sklearn.feature_extraction.text import TfidfVectorizer
# list of documents
docs = ["SUPERB, I AM IN LOVE IN THIS PHONE", 
        "I hate this phone"]
# create the transform
vectorizer = TfidfVectorizer()
# tokenize and build vocab
vectorizer.fit(docs)
# summarize
print('vocabulary: ', vectorizer.vocabulary_)
print('idfs: ', vectorizer.idf_)
# encode document
vector = vectorizer.transform([docs[0]])
# summarize encoded vector
print('vectors: ', vector.toarray())


vocabulary:  {'superb': 5, 'am': 0, 'in': 2, 'love': 3, 'this': 6, 'phone': 4, 'hate': 1}
idfs:  [1.40546511 1.40546511 1.40546511 1.40546511 1.         1.40546511
 1.        ]
vectors:  [[0.35327777 0.         0.70655553 0.35327777 0.25136004 0.35327777
  0.25136004]]


In [6]:
# again 'in' has a high TF-IDF - occurs twice in first sentence but not in the second sentence
# see formular for idf per term to see how it increases as a value is more rare across documents
# good for eliminating stopwords - because those usually appear in multiple documents

In [None]:
# vector normalization
# scikit using l2 normalization