[Reference](https://betterprogramming.pub/the-beginners-guide-to-similarity-matching-using-spacy-782fc2922f7c)

In [1]:
# Download and install the large English pipeline so that
# spacy.load('en_core_web_lg') can find it (the download is roughly 590 MB).
!python -m spacy download en_core_web_lg

2023-04-30 11:55:06.033237: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting en-core-web-lg==3.5.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.5.0/en_core_web_lg-3.5.0-py3-none-any.whl (587.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m587.7/587.7 MB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: en-core-web-lg
Successfully installed en-core-web-lg-3.5.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_lg')


In [2]:
# !python -m spacy download en_core_web_sm-2.1.0 --direct

In [3]:
import spacy

# Load the installed large English pipeline by its package name, then run it
# once on a sample sentence to confirm it works.
nlp = spacy.load("en_core_web_lg")
doc = nlp("This is a sentence.")

In [5]:
import spacy
import en_core_web_lg

# Alternative to spacy.load("en_core_web_lg"): import the installed model
# package itself and call its load() helper. This rebinds `nlp` and `doc`.
nlp = en_core_web_lg.load()
doc = nlp("This is a sentence.")

In [6]:
# Bind spaCy's default English stop-word set to STOP_WORDS.
# Import it explicitly: reaching it through the attribute chain
# `spacy.lang.en.stop_words` only works if that submodule already happens to
# have been imported as a side effect of something else (e.g. loading a model).
from spacy.lang.en.stop_words import STOP_WORDS

In [7]:
# `nlp` is the pipeline object created above, either with
# en_core_web_lg.load() or with spacy.load("en_core_web_lg").
# If your pipeline variable has a different name, use that name here.
print(nlp.Defaults.stop_words)

{'from', 'its', 'nine', 'part', 'may', 'now', 'herein', 'moreover', 'sixty', 'again', 'used', 'mine', 'noone', 're', 'without', 'his', 'whose', 'too', 'whenever', 'will', 'or', 'formerly', 'give', 'did', '’re', 'neither', 'quite', 'amount', 'beforehand', 'first', 'must', 'side', 'n‘t', 'to', 'nothing', 'of', 'get', 'ever', 'please', 'beyond', 'enough', 'before', 'four', 'mostly', 'such', "'d", 'something', 'fifteen', 'hers', 'both', 'during', 'been', 'go', 'they', 'be', 'who', 'their', 'anyhow', 'thus', 'since', 'move', 'toward', 'well', 'have', 'ourselves', 'using', 'for', 'also', 'do', 'about', 'out', 'below', 'further', 'off', 'whole', 'anyone', 'meanwhile', 'doing', 'ours', 'back', 'these', 'throughout', 'latterly', 'was', 'no', 'does', 'cannot', 'whereafter', 'along', '’m', 'thence', "'ll", 'behind', 'has', "n't", 'each', 'always', "'re", '’d', 'twenty', 'where', 'while', 'itself', 'than', 'down', 'due', 'hereafter', 'beside', '’s', 'seeming', 'six', 'whereas', 'myself', 'your', '

In [8]:
# Add a single word to the pipeline's default stop-word set (in-place mutation).
# NOTE(review): no later cell removes "add" again, so it remains a stop word
# for the filtering functions defined further down — confirm this is intended.
nlp.Defaults.stop_words.add("add")

In [9]:
# Add several stop words at once by merging another set into the defaults.
nlp.Defaults.stop_words.update({"stop", "word"})

In [12]:
# nlp.Defaults.stop_words.remove("remove")

In [13]:
# Remove several stop words at once; words that are not present are ignored.
nlp.Defaults.stop_words.difference_update({"stop", "word"})

# Similarity matching

In [14]:
# Compare two full questions about unrelated topics. The recorded score is
# about 0.71 even though the subjects differ.
# NOTE(review): presumably the shared "How do I ..." wording inflates the
# score — the next cell repeats the comparison without it.
doc1 = nlp("How do I turn sound on/off?")
doc2 = nlp("How do I obtain a pet?")
doc1.similarity(doc2)

0.7107966170017302

In [15]:
# Same comparison with the common question wording and punctuation stripped
# by hand; the recorded score drops to about 0.19.
# Note that this rebinds doc1 and doc2 from the previous cell.
doc1 = nlp("turn sound on/off")
doc2 = nlp("obtain a pet")
doc1.similarity(doc2)

0.18548565684496565

In [16]:
def remove_stopwords(text):
    """Return ``text`` lowercased with default stop words removed.

    The input is lowercased and tokenized by the notebook-level ``nlp``
    pipeline. Tokens whose text appears in ``nlp.Defaults.stop_words`` are
    skipped; the remaining token texts are joined with single spaces.
    This is the explicit-loop variant used in the timing comparison.
    """
    stop_words = nlp.Defaults.stop_words
    kept = []
    for tok in nlp(text.lower()):
        if tok.text not in stop_words:
            kept.append(tok.text)
    return " ".join(kept)

In [18]:
def remove_stopwords_fast(text):
    """Return ``text`` lowercased with default stop words removed.

    Same contract as the loop-based stop-word filter, but the filtering is
    expressed as a single list comprehension over the tokens produced by the
    notebook-level ``nlp`` pipeline.
    """
    stop_words = nlp.Defaults.stop_words
    return " ".join(
        [tok.text for tok in nlp(text.lower()) if tok.text not in stop_words]
    )

In [19]:
# Time the loop-based and comprehension-based stop-word filters on one sentence.
# In the recorded run the "fast" variant was not actually faster
# (9.72 ms vs 8.17 ms) and the error intervals overlap, so the difference is
# within noise.
# NOTE(review): presumably the nlp() call dominates both timings — confirm.
sample = "Thanks for the cool story bro!"
%timeit remove_stopwords(sample)
%timeit remove_stopwords_fast(sample)

8.17 ms ± 937 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
9.72 ms ± 1.43 ms per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [20]:
def remove_pronoun(text):
    """Return ``text`` lowercased with pronoun tokens removed.

    The remaining tokens are emitted as their surface text (not lemmas) and
    joined with single spaces.

    NOTE: a later cell defines ``remove_pronoun`` again (returning lemmas);
    when the notebook runs top to bottom that definition shadows this one.
    """
    doc = nlp(text.lower())
    # spaCy v2 lemmatized pronouns to the placeholder '-PRON-'. spaCy v3 no
    # longer does, so the lemma check alone never matches there; also test the
    # coarse part-of-speech tag so pronouns are filtered on both versions.
    # str.join() requires strings, so collect token.text rather than Token
    # objects (joining Tokens raises TypeError).
    result = [
        token.text
        for token in doc
        if token.lemma_ != '-PRON-' and token.pos_ != 'PRON'
    ]
    return " ".join(result)

In [21]:
def remove_pronoun(text):
    """Return the lemmas of ``text`` (lowercased) with pronouns removed.

    The input is lowercased and processed by the notebook-level ``nlp``
    pipeline; every non-pronoun token contributes its lemma, and the lemmas
    are joined with single spaces.
    """
    doc = nlp(text.lower())
    # spaCy v2 lemmatized pronouns to the placeholder '-PRON-'. spaCy v3 no
    # longer does, so the lemma check alone never matches there; also test the
    # coarse part-of-speech tag so pronouns are filtered on both versions.
    result = [
        token.lemma_
        for token in doc
        if token.lemma_ != '-PRON-' and token.pos_ != 'PRON'
    ]
    return " ".join(result)

In [23]:
def process_text(text):
    """Normalize ``text`` before a similarity comparison.

    The input is lowercased and processed by the notebook-level ``nlp``
    pipeline. Tokens are dropped when they are default stop words
    (``nlp.Defaults.stop_words``), punctuation, or pronouns. Each remaining
    token contributes its lemma; the lemmas are joined with single spaces.
    Returns an empty string when every token is filtered out.
    """
    stop_words = nlp.Defaults.stop_words
    result = []
    for token in nlp(text.lower()):
        if token.text in stop_words or token.is_punct:
            continue
        # spaCy v2 marked pronouns with the lemma '-PRON-'; spaCy v3 does not,
        # so also check the coarse part-of-speech tag to keep the pronoun
        # filter effective on both versions.
        if token.lemma_ == '-PRON-' or token.pos_ == 'PRON':
            continue
        result.append(token.lemma_)
    return " ".join(result)

In [24]:
def calculate_similarity(text1, text2):
    """Return the spaCy similarity score between two texts after cleaning.

    Each text is first normalized with ``process_text`` and the cleaned string
    is parsed again with the notebook-level ``nlp`` pipeline; the result is
    ``Doc.similarity`` of the first cleaned document against the second.
    """
    base_doc = nlp(process_text(text1))
    return base_doc.similarity(nlp(process_text(text2)))