In [1]:
import spacy
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# Define the corpus: a small collection of documents, one sentence each.

corpus = [
    "Inflation surges around the world.",
    "The Omicron coronavirus variant spreads.",
    "World population exceeds 8 billion.",
    "AI predicts protein structures."
]

In [3]:
# Create a bag-of-words vectorizer with scikit-learn's default settings
# (no custom tokenizer is passed here).
vectorizer = CountVectorizer()

In [5]:
# fit_transform learns the vocabulary from the corpus and returns the
# bag-of-words document-term matrix (a sparse matrix of token counts).
bow = vectorizer.fit_transform(corpus)

In [7]:
# Learned vocabulary: maps each token to its column index in the bag-of-words matrix.
vectorizer.vocabulary_

{'inflation': 5,
 'surges': 12,
 'around': 1,
 'the': 13,
 'world': 15,
 'omicron': 6,
 'coronavirus': 3,
 'variant': 14,
 'spreads': 10,
 'population': 7,
 'exceeds': 4,
 'billion': 2,
 'ai': 0,
 'predicts': 8,
 'protein': 9,
 'structures': 11}

In [8]:
# Show the sparse-matrix summary: shape (documents x vocabulary terms), dtype and stored entries.
bow

<4x16 sparse matrix of type '<class 'numpy.int64'>'
	with 18 stored elements in Compressed Sparse Row format>

In [9]:
# Printing a sparse matrix lists only the stored entries, as "(row, column)  count".
print(bow)

  (0, 5)	1
  (0, 12)	1
  (0, 1)	1
  (0, 13)	1
  (0, 15)	1
  (1, 13)	1
  (1, 6)	1
  (1, 3)	1
  (1, 14)	1
  (1, 10)	1
  (2, 15)	1
  (2, 7)	1
  (2, 4)	1
  (2, 2)	1
  (3, 0)	1
  (3, 8)	1
  (3, 9)	1
  (3, 11)	1


In [10]:
# Dense view of the same matrix: one row per document, one column per vocabulary term.
bow.toarray()

array([[0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1],
       [0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0],
       [0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1],
       [1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0]], dtype=int64)

In [18]:
# One-time setup: download the "en_core_web_sm" model that spacy.load() is called with in this notebook.
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     --------------------------------------- 0.0/12.8 MB 682.7 kB/s eta 0:00:19
     --------------------------------------- 0.0/12.8 MB 330.3 kB/s eta 0:00:39
     --------------------------------------- 0.1/12.8 MB 409.6 kB/s eta 0:00:32
     --------------------------------------- 0.1/12.8 MB 656.4 kB/s eta 0:00:20
      -------------------------------------- 0.2/12.8 MB 908.0 kB/s eta 0:00:14
      --------------------------------------- 0.3/12.8 MB 1.0 MB/s eta 0:00:12
     - -------------------------------------- 0.5/12.8 MB 1.5 MB/s eta 0:00:08
     - -------------------------------------- 0.6/12.8 MB 1.7 MB/s eta 0:00:08
     --- ------------------------------------ 1.1/12.8 MB 2.6 MB/s eta 0:00:05
     --- ---------------------------

In [16]:
## Now we will make our own custom tokenizer

In [20]:
# Load the small English spaCy pipeline
nlp = spacy.load("en_core_web_sm")
# Define custom tokenizer (remove stop words, punctuation and whitespace, then lemmatize)
# NOTE(review): in the cells shown, `vectorizer` is built as CountVectorizer() with no
# `tokenizer=` argument, so this function is not applied to `bow` yet.
def custom_tokenizer(doc):
    """Tokenize ``doc`` with spaCy and return the lemmas of its content tokens.

    Parameters
    ----------
    doc : str
        Raw text of a single document.

    Returns
    -------
    list of str
        The lemma of every token that is not punctuation, not a stop word and
        not pure whitespace, in document order.
    """
    # Whitespace-only tokens (spaCy emits them for newlines and repeated spaces)
    # are neither punctuation nor stop words, so they are excluded explicitly.
    return [
        t.lemma_
        for t in nlp(doc)
        if not (t.is_punct or t.is_stop or t.is_space)
    ]

In [23]:
# Dense matrix representation.
# NOTE(review): `bow` is not refit in the cells shown, so this is still the matrix
# built with the default tokenizer; the output is identical to the earlier dense view.
bow.toarray()

array([[0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1],
       [0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0],
       [0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1],
       [1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0]], dtype=int64)

In [25]:
# Sparse slice: all documents (rows), vocabulary columns 0-3 only.
# The result is printed in the same "(row, column)  count" sparse format.
print(bow[:,0:4])


  (0, 1)	1
  (1, 3)	1
  (2, 2)	1
  (3, 0)	1


In [26]:
# Cosine similarity using numpy
def cosine_sim(a,b):
    """Return the cosine similarity between two 1-D vectors.

    Parameters
    ----------
    a, b : array-like
        Vectors of equal length (e.g. dense bag-of-words rows).

    Returns
    -------
    float
        ``dot(a, b) / (||a|| * ||b||)``. When either vector has zero norm the
        cosine is undefined, so 0.0 is returned instead of ``nan``.
    """
    denom = np.linalg.norm(a) * np.linalg.norm(b)
    # An all-zero vector (e.g. a document with no in-vocabulary tokens) would
    # otherwise cause a 0/0 division, yielding nan and a RuntimeWarning.
    if denom == 0:
        return 0.0
    return np.dot(a, b) / denom
# Similarity between documents 1 and 3: they share no vocabulary terms,
# so their count vectors are orthogonal and the score is 0.
print(corpus[1])
print(corpus[3])
print(f'Similarity score: {cosine_sim(bow[1].toarray().squeeze(),bow[3].toarray().squeeze()):.3f}')

The Omicron coronavirus variant spreads.
AI predicts protein structures.
Similarity score: 0.000


In [27]:
# Similarity between documents 0 and 2: the only shared token is "world" (column 15),
# giving 1 / (sqrt(5) * sqrt(4)), roughly 0.224.
print(corpus[0])
print(corpus[2])
print(f'Similarity score: {cosine_sim(bow[0].toarray().squeeze(),bow[2].toarray().squeeze()):.3f}')

Inflation surges around the world.
World population exceeds 8 billion.
Similarity score: 0.224
