In [None]:
from google.colab import drive
drive.mount('/content/drive')

# One-hot Encoding

In [1]:
sentences = [
    "It was the best of times",
    "it was the worst of times",
    "it was the age of wisdom",
    "it was the age of foolishness"
]

Tokenize all sentences.

In [2]:
tokenized_sentences = [[t for t in sentence.split()] for sentence in sentences]
tokenized_sentences

[['It', 'was', 'the', 'best', 'of', 'times'],
 ['it', 'was', 'the', 'worst', 'of', 'times'],
 ['it', 'was', 'the', 'age', 'of', 'wisdom'],
 ['it', 'was', 'the', 'age', 'of', 'foolishness']]

Create a vocabulary containing unique words from all sentences.

In [3]:
vocabulary = set([w for s in tokenized_sentences for w in s])
vocabulary

{'It',
 'age',
 'best',
 'foolishness',
 'it',
 'of',
 'the',
 'times',
 'was',
 'wisdom',
 'worst'}

Encode each token in a sentence by assigning 1 if the token is present in a sentence, else assigning 0.

In [4]:
def onehot_encoder(tokenized_sentence):
    return [1 if w in tokenized_sentence else 0 for w in vocabulary]

In [5]:
onehot = [onehot_encoder(tokenized_sentence)
          for tokenized_sentence in tokenized_sentences]
for (sentence, oh) in zip(sentences, onehot):
    print("%s: %s" % (oh, sentence))

[1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0]: It was the best of times
[1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0]: it was the worst of times
[1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1]: it was the age of wisdom
[1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1]: it was the age of foolishness


### Out-of-vocabulary documents

#### All tokens are known

In [6]:
onehot_encoder("the age of wisdom is the best of times".split())

[0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1]

#### Some tokens are not known

This could be a problem...

In [7]:
onehot_encoder("John likes to watch movies".split())

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

# Bag-of-Words Models

In [12]:
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer()

In [None]:
more_sentences = sentences + \
                 ["John likes to watch movies. Mary likes movies too.",
                  "Mary also like to watch football games."]
more_sentences

In [None]:
cv.fit(more_sentences)
print(cv.get_feature_names_out())

In [None]:
dt = cv.transform(more_sentences)
dt

In [None]:
import pandas as pd
pd.DataFrame(dt.toarray(), columns=cv.get_feature_names_out())

Another example:

“Oh, honey, I would walk through fire for you”

“Just let me adore you”

“Like it is the only thing I will ever do”


In [None]:
another_example = ["Oh, honey, I would walk through fire for you",
                   "Just let me adore you",
                   "Like it is the only thing I will ever do"]

Creating a vocabulary.

In [None]:
tokenized2 = [[t for t in sentence.split()] for sentence in another_example]

vocabulary2 = set([w.lower() for s in tokenized2 for w in s])
vocabulary2

In [None]:
cv = CountVectorizer(stop_words=[], vocabulary=vocabulary2).fit(another_example)
print(cv.get_feature_names_out())

In [None]:
len(cv.get_feature_names_out())

In [None]:
dt_full = cv.transform(another_example)
pd.DataFrame(dt_full.toarray(), columns=cv.get_feature_names_out())

Let's also apply stopwords removal.

In [None]:
cv = CountVectorizer(stop_words='english').fit(another_example)
print(cv.get_feature_names_out())

In [None]:
len(cv.get_feature_names_out())

In [None]:
dt = cv.transform(another_example)
pd.DataFrame(dt.toarray(), columns=cv.get_feature_names_out())

# TF-IDF

In [None]:
from sklearn.feature_extraction.text import TfidfTransformer
tfidf = TfidfTransformer()
tfidf_dt = tfidf.fit_transform(dt)
pd.DataFrame(tfidf_dt.toarray(), columns=cv.get_feature_names_out())

# Cosine Similarity

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
pd.DataFrame(cosine_similarity(dt))