In [2]:
import spacy
# Load the `en_core_web_lg` pipeline once; the helper functions defined in later
# cells call this module-level `nlp` object.
nlp = spacy.load("en_core_web_lg")
# Run the pipeline on a short sentence. This module-level `doc` is not read by
# any of the later visible cells.
doc = nlp(u"This is a sentence.")

In [3]:
# Bind spaCy's default English stop-word collection to a module-level name.
# NOTE(review): only `import spacy` is visible, so this attribute chain presumably
# works because loading the English model has already imported `spacy.lang.en`
# as a side effect -- confirm on a fresh kernel.
# NOTE(review): `STOP_WORDS` is not referenced by the other visible cells, which
# read `nlp.Defaults.stop_words` instead.
STOP_WORDS = spacy.lang.en.stop_words.STOP_WORDS

In [4]:
# `nlp` is the name the loaded model was bound to; substitute your own name if it differs.
# It can come from either `nlp = en_core_web_lg.load()` or `nlp = spacy.load("en_core_web_lg")`.
# Print the stop-word collection attached to the pipeline's language defaults.
print(nlp.Defaults.stop_words)

{'behind', 'do', 'perhaps', 'nobody', 'though', 'enough', 'please', 'both', 'most', "'re", "'d", 'whence', 'can', 'call', 'whose', 'into', 'nevertheless', 'himself', 'forty', 'even', 'its', 'with', 'must', 'much', 'yourself', 'own', '’ll', 'others', 'too', 'move', 'fifteen', 'hereby', 'former', 'hereafter', 'beside', 'through', 'ca', 'twenty', 'which', 'everyone', 'back', 'several', 'this', 'other', 'neither', 'bottom', 'see', 'toward', 'besides', 'side', '‘m', 'every', 'became', 'afterwards', 'was', "'ll", 'there', 'except', 'various', 'an', 'two', 'i', 'how', 'under', 'anyone', 'not', 'namely', 'very', 'during', 'anyhow', 'same', 'one', 'at', 'latterly', 'wherever', 'when', 'nor', 'around', 'without', 'made', 'towards', 'off', 'yet', 'ours', 'go', 'thence', 'amount', 'between', 'in', 'formerly', 'myself', 'herein', 'n’t', 'only', '’ve', 'just', 'take', 'within', 'from', 'someone', 'thru', 'ourselves', 'seems', 'all', 'thereby', 'to', 'least', 'eleven', 'and', 'whereas', 'his', 'using

In [5]:
# Baseline: compare two complete questions that share the "How do I" opener.
doc1 = nlp("How do I turn sound on/off?")
doc2 = nlp("How do I obtain a pet?")
# Last expression of the cell, so the notebook displays the similarity score.
doc1.similarity(doc2)

0.8680366536690709

In [6]:
# Same pair with the shared "How do I" opener and the trailing "?" removed by hand.
# Note that this rebinds `doc1` / `doc2` from the previous cell.
doc1 = nlp("turn sound on/off")
doc2 = nlp("obtain a pet")
doc1.similarity(doc2)

0.49538299705127853

In [10]:
def remove_stopwords(text):
    """Return `text` lower-cased with default stop words dropped.

    Explicit-loop variant: the lower-cased text is run through `nlp`, each
    token whose text is in `nlp.Defaults.stop_words` is discarded, and the
    remaining token texts are joined with single spaces.
    """
    kept = []
    for tok in nlp(text.lower()):
        word = tok.text
        # Keep only tokens that are not listed as stop words.
        if word not in nlp.Defaults.stop_words:
            kept.append(word)
    return " ".join(kept)

def remove_stopwords_fast(text):
    """Return `text` lower-cased with default stop words dropped.

    List-comprehension variant: keeps the text of every token produced by
    `nlp(text.lower())` that is not in `nlp.Defaults.stop_words`, then joins
    the survivors with single spaces.
    """
    kept = [
        tok.text
        for tok in nlp(text.lower())
        if tok.text not in nlp.Defaults.stop_words
    ]
    return " ".join(kept)

In [11]:
# Time both stop-word removers on the same short input.
# Each call also runs the full `nlp` pipeline on the text, so the measurement
# covers parsing as well as the filtering loop / comprehension.
sample = "Thanks for the cool story bro!"
%timeit remove_stopwords(sample)
%timeit remove_stopwords_fast(sample)

11.3 ms ± 1.37 ms per loop (mean ± std. dev. of 7 runs, 100 loops each)
11.7 ms ± 1.82 ms per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [12]:
def remove_pronoun(text):
    """Return `text` lower-cased with pronoun tokens removed.

    The lower-cased text is run through `nlp`; every token whose lemma is the
    placeholder '-PRON-' is dropped and the texts of the remaining tokens are
    joined with single spaces.

    NOTE(review): '-PRON-' is presumably the spaCy v2 pronoun lemma; newer
    spaCy releases may not emit it, in which case nothing is filtered --
    confirm against the installed version.
    """
    doc = nlp(text.lower())
    # Collect token *strings*: str.join() raises TypeError when handed Token objects,
    # which is what the previous version passed to it.
    result = [token.text for token in doc if token.lemma_ != '-PRON-']
    return " ".join(result)

In [15]:
def process_text(text):
    """Normalise `text` into a space-separated string of lemmas.

    The lower-cased text is run through `nlp`. A token is skipped when its
    text is in `nlp.Defaults.stop_words`, when it is punctuation, or when its
    lemma is the '-PRON-' placeholder; the lemma of every other token is kept.
    """
    lemmas = []
    for tok in nlp(text.lower()):
        # Checks are evaluated in the same order as before: stop word,
        # punctuation, then pronoun placeholder.
        is_noise = (
            tok.text in nlp.Defaults.stop_words
            or tok.is_punct
            or tok.lemma_ == '-PRON-'
        )
        if not is_noise:
            lemmas.append(tok.lemma_)
    return " ".join(lemmas)

In [16]:
def calculate_similarity(text1, text2):
    """Return the similarity of two texts after preprocessing.

    Each text is cleaned with `process_text`, re-parsed with `nlp`, and the
    result of `similarity` between the two parsed documents is returned.
    """
    return nlp(process_text(text1)).similarity(
        nlp(process_text(text2))
    )

In [56]:
# Two questions that open with the same words ("What can I do if") but
# describe different problems.
t1 = "What can I do if there is emergent game issue?"
t2 = "What can I do if the game stuck on loading page?"
calculate_similarity(t1, t2)

0.608717908176107

In [55]:
# Word-order check: t2 reuses t1's words "emergent", "game" and "issue" in a
# scrambled sentence with a different final word ("corndog" vs "hotdog").
t1 = "What can I do if there is emergent game issue hotdog?"
t2 = "We emergent can if do game I the it issue corndog?"
print(calculate_similarity(t1, t2))

# Show the preprocessed strings that calculate_similarity actually compares.
print(process_text(t1))
print(process_text(t2))

0.9146971544025219
emergent game issue hotdog
emergent game issue corndog
