# Stopwords

In [None]:
"""

Words like "a" and "the" appear so frequently that they don't require
tagging as thoroughly as nouns, verbs and modifiers.

These are called stop words, and they can be filtered out of the text
before it is processed.

Stop words are just common words that don't provide any additional
information, and sometimes they can actually damage NLP analysis.

spaCy holds a built-in list of some 326 English stop words.



"""

In [1]:
import spacy

In [3]:
nlp=spacy.load('en_core_web_sm')

# 'en_core_web_sm' reads as: English, core, web, small (the small English language model)

In [6]:
# Grab the default stop-word set from the loaded pipeline's language defaults.
# Leaving the bare name as the last expression makes the notebook display it.
stop_words = nlp.Defaults.stop_words
stop_words

{"'d",
 "'ll",
 "'m",
 "'re",
 "'s",
 "'ve",
 'a',
 'about',
 'above',
 'across',
 'after',
 'afterwards',
 'again',
 'against',
 'all',
 'almost',
 'alone',
 'along',
 'already',
 'also',
 'although',
 'always',
 'am',
 'among',
 'amongst',
 'amount',
 'an',
 'and',
 'another',
 'any',
 'anyhow',
 'anyone',
 'anything',
 'anyway',
 'anywhere',
 'are',
 'around',
 'as',
 'at',
 'back',
 'be',
 'became',
 'because',
 'become',
 'becomes',
 'becoming',
 'been',
 'before',
 'beforehand',
 'behind',
 'being',
 'below',
 'beside',
 'besides',
 'between',
 'beyond',
 'both',
 'bottom',
 'but',
 'by',
 'ca',
 'call',
 'can',
 'cannot',
 'could',
 'did',
 'do',
 'does',
 'doing',
 'done',
 'down',
 'due',
 'during',
 'each',
 'eight',
 'either',
 'eleven',
 'else',
 'elsewhere',
 'empty',
 'enough',
 'even',
 'ever',
 'every',
 'everyone',
 'everything',
 'everywhere',
 'except',
 'few',
 'fifteen',
 'fifty',
 'first',
 'five',
 'for',
 'former',
 'formerly',
 'forty',
 'four',
 'from',
 'fron

In [7]:
len(stop_words)

326

In [8]:
# Check whether a word is a stop word

nlp.vocab['is'].is_stop

True

In [9]:

nlp.vocab['are'].is_stop

True

In [11]:

nlp.vocab['malav'].is_stop

False

In [12]:
nlp.vocab['joshi'].is_stop

False

In [None]:
# Add a stop word to the stop-word list

# When working with text data such as text messages, a lot of us
# use "BTW" for "by the way".
# So we may want to treat it as a stop word for a particular dataset.



In [13]:
# Add 'btw' to the default stop-word set (the set is modified in place).
nlp.Defaults.stop_words.add('btw')



In [16]:
nlp.vocab['btw'].is_stop

True

In [17]:
# Also flag the vocabulary entry itself as a stop word.
# NOTE(review): the check just before this cell already reports True, so this
# assignment may be redundant here -- presumably it matters when the 'btw'
# vocabulary entry was created before the word was added to the defaults;
# confirm against the installed spaCy version.
nlp.vocab['btw'].is_stop=True

In [18]:
len(nlp.Defaults.stop_words)

# Earlier it was 326; now it's 327

327

In [19]:
# Remove a stop word.
# discard() is used instead of remove() so that re-running this cell after
# 'beyond' has already been removed does not raise KeyError.
nlp.Defaults.stop_words.discard('beyond')


In [20]:
nlp.vocab['beyond'].is_stop

False

In [21]:
# Also clear the stop-word flag on the vocabulary entry itself.
# NOTE(review): the check just before this cell already reports False, so this
# assignment may be redundant here -- confirm against the installed spaCy
# version.
nlp.vocab['beyond'].is_stop=False

In [22]:
len(nlp.Defaults.stop_words)

326