In [1]:
# BLUEPRINT: BUILDING YOUR OWN VECTORISER

In [2]:
# Enumerate vocabulary
# Only the presence or absence of a word in a document matters here, so each
# distinct word is simply given a number.
sentences = [
    "It was the best of times",
    "It was the worst of times",
    "It was the age of wisdom",
    "It was the age of foolishness",
]
# Whitespace tokenisation: one list of tokens per sentence.
tokenised_sentences = [sentence.split() for sentence in sentences]
# A set keeps each distinct token exactly once. Its iteration order is arbitrary
# and not guaranteed to be stable across kernel sessions, so the numbering shown
# below may differ from run to run.
vocabulary = {word for tokens in tokenised_sentences for word in tokens}
[[word, index] for index, word in enumerate(vocabulary)]

[['worst', 0],
 ['wisdom', 1],
 ['age', 2],
 ['the', 3],
 ['of', 4],
 ['times', 5],
 ['best', 6],
 ['It', 7],
 ['was', 8],
 ['foolishness', 9]]

In [3]:
# One-hot encoding
import pandas as pd
import numpy as np
def onehot_encode(tokenised_sentence, vocabulary):
    """Encode a tokenised sentence as a 0/1 presence vector over a vocabulary.

    Args:
        tokenised_sentence: Iterable of hashable tokens (e.g. the result of
            ``str.split()``). It is consumed exactly once, so a one-shot
            iterator or generator is handled correctly.
        vocabulary: Iterable of words. The output has one entry per word, in
            the order in which ``vocabulary`` is iterated.

    Returns:
        A list of ints: 1 where the corresponding vocabulary word occurs in
        ``tokenised_sentence``, 0 otherwise. Tokens that are not in the
        vocabulary are ignored.
    """
    # Materialise the tokens once: membership tests become O(1) instead of a
    # linear scan per vocabulary word, and an iterator argument is not
    # exhausted by the first lookup.
    present = frozenset(tokenised_sentence)
    return [1 if word in present else 0 for word in vocabulary]
# One presence vector per tokenised sentence, all over the same vocabulary.
onehot = [onehot_encode(tokens, vocabulary) for tokens in tokenised_sentences]
for sentence, oh in zip(sentences, onehot):
    print(f"{oh}: {sentence}")
# list(vocabulary) iterates the set in the same order onehot_encode did (the set
# is not modified in between), so the column labels line up with the vectors.
onehot_df = pd.DataFrame(onehot, columns=list(vocabulary))
print(f"First 10 records of onehot_df:\n{onehot_df.head(10)}")
# Determine similarity between first 2 tokenised sentences
# Element-wise bitwise AND: an entry is 1 only when both sentences contain the word.
sim = [first & second for first, second in zip(onehot[0], onehot[1])]
print(sim)
print(f"Sum of sim: {sum(sim)}")
# The dot product of two 0/1 vectors counts the shared words, i.e. equals sum(sim).
print(np.dot(onehot[0], onehot[1]))
# Determine similarity between all tokenised sentences with second tokenised_sentence
print(np.dot(onehot, onehot[1]))

[0, 0, 0, 1, 1, 1, 1, 1, 1, 0]: It was the best of times
[1, 0, 0, 1, 1, 1, 0, 1, 1, 0]: It was the worst of times
[0, 1, 1, 1, 1, 0, 0, 1, 1, 0]: It was the age of wisdom
[0, 0, 1, 1, 1, 0, 0, 1, 1, 1]: It was the age of foolishness
First 10 records of onehot_df:
   worst  wisdom  age  the  of  times  best  It  was  foolishness
0      0       0    0    1   1      1     1   1    1            0
1      1       0    0    1   1      1     0   1    1            0
2      0       1    1    1   1      0     0   1    1            0
3      0       0    1    1   1      0     0   1    1            1
[0, 0, 0, 1, 1, 1, 0, 1, 1, 0]
Sum of sim: 5
5
[5 6 4 4]


In [4]:
# Out of vocabulary
# Words that are missing from the vocabulary have no position in the vector, so
# they are silently dropped from the encoding.
oov_examples = (
    "the age of wisdom is the best of times",
    "John likes to watch movies. Mary likes to watch movies too",
)
for text in oov_examples:
    print(onehot_encode(text.split(), vocabulary))

[0, 1, 1, 1, 1, 1, 1, 0, 0, 0]
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


In [6]:
# Document term matrix
# onehot holds one list per sentence; each list has one 0/1 entry per vocabulary word.
print(onehot)

[[0, 0, 0, 1, 1, 1, 1, 1, 1, 0], [1, 0, 0, 1, 1, 1, 0, 1, 1, 0], [0, 1, 1, 1, 1, 0, 0, 1, 1, 0], [0, 0, 1, 1, 1, 0, 0, 1, 1, 1]]


In [27]:
# Similarities
# Multiplying the document-term matrix by its own transpose yields the dot
# product of every pair of sentence vectors in a single step.
sim_df = np.asarray(onehot) @ np.asarray(onehot).T
print(sim_df)

[[6 5 4 4]
 [5 6 4 4]
 [4 4 6 5]
 [4 4 5 6]]


In [20]:
# Scikit-learn one-hot vectorisation
from sklearn.preprocessing import MultiLabelBinarizer

# fit() returns the estimator itself, so construction and fitting are chained.
# The vocabulary is wrapped in a list because fit() expects a collection of label sets.
lb = MultiLabelBinarizer().fit([vocabulary])
# Show classes
print(lb.classes_)
# The tokens are arranged differently in lb.classes_ and in vocabulary
print(vocabulary)
lb.transform(tokenised_sentences)

['It' 'age' 'best' 'foolishness' 'of' 'the' 'times' 'was' 'wisdom' 'worst']
{'worst', 'wisdom', 'age', 'the', 'of', 'times', 'best', 'It', 'was', 'foolishness'}


array([[1, 0, 1, 0, 1, 1, 1, 1, 0, 0],
       [1, 0, 0, 0, 1, 1, 1, 1, 0, 1],
       [1, 1, 0, 0, 1, 1, 0, 1, 1, 0],
       [1, 1, 0, 1, 1, 1, 0, 1, 0, 0]])

In [28]:
# CountVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

cv = CountVectorizer()
# Add more sentences
more_sentences = sentences + ["John likes to watch movies. Mary likes movies too.",
                              "Mary also likes to watch football games."]
# Convert to Pandas dataframe
more_sentences_df = pd.DataFrame(more_sentences)
print(f"First 5 records of df:\n{more_sentences_df.head(5)}")
# Fit
cv.fit(more_sentences)
# Show features
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer get_feature_names_out() and fall back only on older releases. The
# array is converted to a plain list so the printed form stays the same.
if hasattr(cv, "get_feature_names_out"):
    feature_names = cv.get_feature_names_out().tolist()
else:
    feature_names = cv.get_feature_names()
print(feature_names)
# Transform
dt = cv.transform(more_sentences)
print(dt)
# Convert to Pandas dataframe
dt_df = pd.DataFrame(dt.toarray(), columns=feature_names)
# Show dt_df
print(f"dt_df:\n{dt_df}")
# Get similarity matrix
# Similarity between first and second documents. The value is bound to a name
# because a bare expression in the middle of a cell is computed and then thrown
# away; it is deliberately not printed so the cell's output is unchanged.
first_second_similarity = cosine_similarity(dt[0], dt[1])
# Store similarity matrix as Pandas dataframe
similarity_df = pd.DataFrame(cosine_similarity(dt, dt))
# Show similarity
print(f"similarity_df:\n{similarity_df}")

First 5 records of df:
                                                   0
0                           It was the best of times
1                          It was the worst of times
2                           It was the age of wisdom
3                      It was the age of foolishness
4  John likes to watch movies. Mary likes movies ...
['age', 'also', 'best', 'foolishness', 'football', 'games', 'it', 'john', 'likes', 'mary', 'movies', 'of', 'the', 'times', 'to', 'too', 'was', 'watch', 'wisdom', 'worst']
  (0, 2)	1
  (0, 6)	1
  (0, 11)	1
  (0, 12)	1
  (0, 13)	1
  (0, 16)	1
  (1, 6)	1
  (1, 11)	1
  (1, 12)	1
  (1, 13)	1
  (1, 16)	1
  (1, 19)	1
  (2, 0)	1
  (2, 6)	1
  (2, 11)	1
  (2, 12)	1
  (2, 16)	1
  (2, 18)	1
  (3, 0)	1
  (3, 3)	1
  (3, 6)	1
  (3, 11)	1
  (3, 12)	1
  (3, 16)	1
  (4, 7)	1
  (4, 8)	2
  (4, 9)	1
  (4, 10)	2
  (4, 14)	1
  (4, 15)	1
  (4, 17)	1
  (5, 1)	1
  (5, 4)	1
  (5, 5)	1
  (5, 8)	1
  (5, 9)	1
  (5, 14)	1
  (5, 17)	1
dt_df:
   age  also  best  foolishness  football