In [2]:
import spacy

# Name of the spaCy pipeline package to load; every later cell works through
# the resulting `nlp` object.
MODEL_NAME = 'en_core_web_lg'
nlp = spacy.load(MODEL_NAME)

In [3]:
# Display the stop-word set defined on the pipeline's language defaults.
nlp.Defaults.stop_words

{"'d",
 "'ll",
 "'m",
 "'re",
 "'s",
 "'ve",
 'a',
 'about',
 'above',
 'across',
 'after',
 'afterwards',
 'again',
 'against',
 'all',
 'almost',
 'alone',
 'along',
 'already',
 'also',
 'although',
 'always',
 'am',
 'among',
 'amongst',
 'amount',
 'an',
 'and',
 'another',
 'any',
 'anyhow',
 'anyone',
 'anything',
 'anyway',
 'anywhere',
 'are',
 'around',
 'as',
 'at',
 'back',
 'be',
 'became',
 'because',
 'become',
 'becomes',
 'becoming',
 'been',
 'before',
 'beforehand',
 'behind',
 'being',
 'below',
 'beside',
 'besides',
 'between',
 'beyond',
 'both',
 'bottom',
 'but',
 'by',
 'ca',
 'call',
 'can',
 'cannot',
 'could',
 'did',
 'do',
 'does',
 'doing',
 'done',
 'down',
 'due',
 'during',
 'each',
 'eight',
 'either',
 'eleven',
 'else',
 'elsewhere',
 'empty',
 'enough',
 'even',
 'ever',
 'every',
 'everyone',
 'everything',
 'everywhere',
 'except',
 'few',
 'fifteen',
 'fifty',
 'first',
 'five',
 'for',
 'former',
 'formerly',
 'forty',
 'four',
 'from',
 'fron

In [4]:
# Size of the default stop-word set before any customisation.
len(nlp.Defaults.stop_words)

326

In [5]:
# Check the stop-word flag on the vocabulary entry for 'is'.
nlp.vocab['is'].is_stop

True

In [6]:
# Check the stop-word flag on the vocabulary entry for 'the'.
nlp.vocab['the'].is_stop

True

In [7]:
# Check the stop-word flag on the vocabulary entry for 'Data' (not flagged).
nlp.vocab['Data'].is_stop

False

## Defining Custom Stop Words

In [9]:
# Register 'i.e' in the default stop-word set (the set grows from 326 to 327 entries).
nlp.Defaults.stop_words.add('i.e')

In [10]:
# Set the flag on the vocabulary entry itself so that lookups through
# nlp.vocab report 'i.e' as a stop word.
# NOTE(review): presumably needed because adding to the set alone does not
# update an already-created vocabulary entry — confirm against the spaCy docs.
nlp.vocab['i.e'].is_stop = True

In [11]:
# The set should now be one entry larger than the original count.
len(nlp.Defaults.stop_words)

327

## Removing Custom Words from the List of Stop Words

In [12]:
# Confirm 'i.e' is currently flagged as a stop word before undoing the customisation.
nlp.vocab['i.e'].is_stop

True

In [13]:
# Drop 'i.e' from the default stop-word set. discard() is used instead of
# remove() so that re-running this cell after the entry is already gone is a
# no-op rather than a KeyError.
nlp.Defaults.stop_words.discard('i.e')

In [15]:
# Clear the flag on the vocabulary entry as well, so lookups through
# nlp.vocab no longer report 'i.e' as a stop word.
nlp.vocab['i.e'].is_stop = False

In [16]:
# Verify that the stop-word flag for 'i.e' was cleared.
nlp.vocab['i.e'].is_stop

False

In [17]:
# The set size should be back to its original value.
len(nlp.Defaults.stop_words)

326

## Removing Stop Words from the Corpus

In [21]:
# Sample passage that the following cells run through the pipeline. The
# triple-quoted literal spans several lines, so it contains newlines and
# blank lines that must be flattened before processing.
s = '''
Data science is the study of data. Like biological sciences is a study of biology, physical sciences, it’s the study of physical reactions. Data is real, data has real properties, and we need to study them if we’re going to work on them. Data Science involves data and some signs. 

It is a process, not an event. It is the process of using data to understand too many different things, to understand the world. Let Suppose when you have a model or proposed explanation of a problem, and you try to validate that proposed explanation or model with your data. 

It is the skill of unfolding the insights and trends that are hiding (or abstract) behind data. It’s when you translate data into a story. So use storytelling to generate insight. And with these insights, you can make strategic choices for a company or an institution. 

We can also define data science as a field that is about processes and systems to extract data of various forms and from various resources whether the data is unstructured or structured. 
The definition and the name came up in the 1980s and 1990s when some professors, IT Professionals, scientists were looking into the statistics curriculum, and they thought it would be better to call it data science and then later on data analytics derived. 
'''

# Collapse every run of whitespace (newlines, blank lines, trailing spaces)
# into a single space and trim both ends. Simply deleting '\n' would glue the
# last word of a line to the first word of the next one whenever the line has
# no trailing space.
txt = ' '.join(s.split())


In [22]:
# Run the pipeline over the flattened text; later cells iterate over the
# result token by token.
corp = nlp(txt)
# Bare last expression so the notebook displays the processed text.
corp

Data science is the study of data. Like biological sciences is a study of biology, physical sciences, it’s the study of physical reactions. Data is real, data has real properties, and we need to study them if we’re going to work on them. Data Science involves data and some signs. It is a process, not an event. It is the process of using data to understand too many different things, to understand the world. Let Suppose when you have a model or proposed explanation of a problem, and you try to validate that proposed explanation or model with your data. It is the skill of unfolding the insights and trends that are hiding (or abstract) behind data. It’s when you translate data into a story. So use storytelling to generate insight. And with these insights, you can make strategic choices for a company or an institution. We can also define data science as a field that is about processes and systems to extract data of various forms and from various resources whether the data is unstructured or

## Finding Stop Words in the Corpus

In [26]:
# Gather the distinct surface forms of every token flagged as a stop word.
# Entries are keyed on token text, so differently-cased forms such as
# 'The' and 'the' are kept as separate members.
stop_words = {tok.text for tok in corp if tok.is_stop}

print(stop_words)
print(len(stop_words))

{'were', 'or', 'also', 'We', 'a', 'them', '’re', 'then', '’s', 'some', 'would', 'call', 'the', 'The', 'they', 'It', 'is', 'various', 'has', 'when', 'that', 'on', 'be', 'can', 'So', 'using', 'into', 'not', 'have', 'many', 'these', 'too', 'with', 'from', 'we', 'for', 'are', 'if', 'IT', 'about', 'an', 'as', 'whether', 'and', 'to', 'name', 'your', 'you', 'behind', 'up', 'in', 'make', 'of', 'And', 'it'}
55


## Words in the Corpus That Are Not Stop Words

In [28]:
# Keep, in document order and with repeats, the text of every token that is
# not flagged as a stop word. Punctuation tokens are not stop words, so they
# end up in the list too.
not_stop_words = [tok.text for tok in corp if not tok.is_stop]

print(not_stop_words)

['Data', 'science', 'study', 'data', '.', 'Like', 'biological', 'sciences', 'study', 'biology', ',', 'physical', 'sciences', ',', 'study', 'physical', 'reactions', '.', 'Data', 'real', ',', 'data', 'real', 'properties', ',', 'need', 'study', 'going', 'work', '.', 'Data', 'Science', 'involves', 'data', 'signs', '.', 'process', ',', 'event', '.', 'process', 'data', 'understand', 'different', 'things', ',', 'understand', 'world', '.', 'Let', 'Suppose', 'model', 'proposed', 'explanation', 'problem', ',', 'try', 'validate', 'proposed', 'explanation', 'model', 'data', '.', 'skill', 'unfolding', 'insights', 'trends', 'hiding', '(', 'abstract', ')', 'data', '.', 'translate', 'data', 'story', '.', 'use', 'storytelling', 'generate', 'insight', '.', 'insights', ',', 'strategic', 'choices', 'company', 'institution', '.', 'define', 'data', 'science', 'field', 'processes', 'systems', 'extract', 'data', 'forms', 'resources', 'data', 'unstructured', 'structured', '.', 'definition', 'came', '1980s', '1

In [33]:
# Rebuild the passage from the non-stop-word tokens. Every token is separated
# by a single space, so punctuation ends up detached from the preceding word.
' '.join(tok.text for tok in corp if not tok.is_stop)

'Data science study data . Like biological sciences study biology , physical sciences , study physical reactions . Data real , data real properties , need study going work . Data Science involves data signs . process , event . process data understand different things , understand world . Let Suppose model proposed explanation problem , try validate proposed explanation model data . skill unfolding insights trends hiding ( abstract ) data . translate data story . use storytelling generate insight . insights , strategic choices company institution . define data science field processes systems extract data forms resources data unstructured structured . definition came 1980s 1990s professors , Professionals , scientists looking statistics curriculum , thought better data science later data analytics derived .'