## When should I not remove stop words?

- Sentiment Analysis
- Language Translation
- Chatbot
- Q&A system



## Practice

In [None]:
import spacy

from spacy.lang.en.stop_words import STOP_WORDS

In [None]:
# Display spaCy's built-in English stop-word collection (a set of strings).
STOP_WORDS

{"'d",
 "'ll",
 "'m",
 "'re",
 "'s",
 "'ve",
 'a',
 'about',
 'above',
 'across',
 'after',
 'afterwards',
 'again',
 'against',
 'all',
 'almost',
 'alone',
 'along',
 'already',
 'also',
 'although',
 'always',
 'am',
 'among',
 'amongst',
 'amount',
 'an',
 'and',
 'another',
 'any',
 'anyhow',
 'anyone',
 'anything',
 'anyway',
 'anywhere',
 'are',
 'around',
 'as',
 'at',
 'back',
 'be',
 'became',
 'because',
 'become',
 'becomes',
 'becoming',
 'been',
 'before',
 'beforehand',
 'behind',
 'being',
 'below',
 'beside',
 'besides',
 'between',
 'beyond',
 'both',
 'bottom',
 'but',
 'by',
 'ca',
 'call',
 'can',
 'cannot',
 'could',
 'did',
 'do',
 'does',
 'doing',
 'done',
 'down',
 'due',
 'during',
 'each',
 'eight',
 'either',
 'eleven',
 'else',
 'elsewhere',
 'empty',
 'enough',
 'even',
 'ever',
 'every',
 'everyone',
 'everything',
 'everywhere',
 'except',
 'few',
 'fifteen',
 'fifty',
 'first',
 'five',
 'for',
 'former',
 'formerly',
 'forty',
 'four',
 'from',
 'fron

In [None]:
# Load the small English spaCy pipeline; it is reused by later cells.
nlp = spacy.load("en_core_web_sm")

# Calling the pipeline on a string tokenizes it and returns a Doc of tokens.
doc = nlp("We just opened one wings, the flying part is coming soon")

# `is_stop` is a boolean attribute (not a method) that is True for stop words.
for token in doc:
    if not token.is_stop:
        continue
    print(token)

We
just
one
the
part
is


In [None]:
def preprocess(text):
    """Strip stop words, punctuation and whitespace-only tokens from `text`.

    The text is tokenized with the notebook-level spaCy pipeline `nlp`, and
    the text of every surviving token is joined back with single spaces.

    Args:
        text: Raw input string.

    Returns:
        A single string containing the kept tokens separated by one space.
    """
    doc = nlp(text)
    kept_tokens = [
        token.text
        for token in doc
        # is_stop / is_punct / is_space are boolean token attributes, not
        # methods. Whitespace-only tokens (extra spaces, non-breaking spaces,
        # newlines) are neither stop words nor punctuation, so they have to be
        # filtered explicitly or they leave stray whitespace in the result.
        if not (token.is_stop or token.is_punct or token.is_space)
    ]

    return " ".join(kept_tokens)


In [None]:
preprocess('We just opened one wings, the flying part is coming soon')

'opened wings flying coming soon'

In [None]:
# Note: "not" is removed along with the other stop words, so the negation is
# lost in the result - the reason stop-word removal can hurt tasks such as
# sentiment analysis.
preprocess("The other is not other but your divine brother")

'divine brother'

In [None]:
preprocess("Musk wants time to prepare for a trial over his")

'Musk wants time prepare trial'

In [None]:
import pandas as pd

# lines=True makes pandas parse the file as one JSON object per line.
df = pd.read_json('doj_press.json', lines=True)

# (rows, columns) of the loaded frame.
df.shape

(13087, 6)

In [None]:
df.head()

Unnamed: 0,id,title,contents,date,topics,components
0,,Convicted Bomb Plotter Sentenced to 30 Years,"PORTLAND, Oregon. – Mohamed Osman Mohamud, 23,...",2014-10-01T00:00:00-04:00,[],[National Security Division (NSD)]
1,12-919,$1 Million in Restitution Payments Announced t...,WASHINGTON – North Carolina’s Waccamaw River...,2012-07-25T00:00:00-04:00,[],[Environment and Natural Resources Division]
2,11-1002,$1 Million Settlement Reached for Natural Reso...,BOSTON– A $1-million settlement has been...,2011-08-03T00:00:00-04:00,[],[Environment and Natural Resources Division]
3,10-015,10 Las Vegas Men Indicted \r\nfor Falsifying V...,WASHINGTON—A federal grand jury in Las Vegas...,2010-01-08T00:00:00-05:00,[],[Environment and Natural Resources Division]
4,18-898,$100 Million Settlement Will Speed Cleanup Wor...,"The U.S. Department of Justice, the U.S. Envir...",2018-07-09T00:00:00-04:00,[Environment],[Environment and Natural Resources Division]


### Removing all rows that have no topics

In [None]:
# Keep only the rows whose `topics` entry has a length other than zero.
has_topics = df["topics"].str.len().ne(0)
df = df.loc[has_topics]

In [None]:
df.head()

Unnamed: 0,id,title,contents,date,topics,components
4,18-898,$100 Million Settlement Will Speed Cleanup Wor...,"The U.S. Department of Justice, the U.S. Envir...",2018-07-09T00:00:00-04:00,[Environment],[Environment and Natural Resources Division]
7,14-1412,14 Indicted in Connection with New England Com...,A 131-count criminal indictment was unsealed t...,2014-12-17T00:00:00-05:00,[Consumer Protection],[Civil Division]
19,17-1419,2017 Southeast Regional Animal Cruelty Prosecu...,The United States Attorney’s Office for the Mi...,2017-12-14T00:00:00-05:00,[Environment],"[Environment and Natural Resources Division, U..."
22,15-1562,21st Century Oncology to Pay $19.75 Million to...,"21st Century Oncology LLC, has agreed to pay $...",2015-12-18T00:00:00-05:00,"[False Claims Act, Health Care Fraud]",[Civil Division]
23,17-1404,21st Century Oncology to Pay $26 Million to Se...,21st Century Oncology Inc. and certain of its ...,2017-12-12T00:00:00-05:00,"[Health Care Fraud, False Claims Act]","[Civil Division, USAO - Florida, Middle]"


In [None]:
df.shape

(4688, 6)

In [None]:
# Work on the first 100 rows only (presumably to keep the spaCy preprocessing
# below quick). .copy() detaches the subset from the frame it was sliced from,
# so adding a column to it later does not trigger SettingWithCopyWarning.
df = df.head(100).copy()

In [None]:
df['contents']

Unnamed: 0,contents
4,"The U.S. Department of Justice, the U.S. Envir..."
7,A 131-count criminal indictment was unsealed t...
19,The United States Attorney’s Office for the Mi...
22,"21st Century Oncology LLC, has agreed to pay $..."
23,21st Century Oncology Inc. and certain of its ...
...,...
316,Doctor Hid Millions in Secret Accounts in Pana...
318,Defendant Concealed Bank Accounts in Panama an...
321,An Alaskan couple was charged in federal court...
322,A husband and wife pleaded guilty yesterday to...


In [None]:
df['contents'].iloc[4]

'21st Century Oncology Inc. and certain of its subsidiaries and affiliates have agreed to pay $26 million to the government to resolve a self-disclosure relating to the submission of false attestations regarding the company’s use of electronic health records software and separate allegations that they violated the False Claims Act by submitting, or causing the submission of, claims for certain services provided pursuant to referrals from physicians with whom they had improper financial relationships. \xa0 “The Justice Department is committed to zealously investigating improper financial relationships that have the potential to compromise physicians’ medical judgment,” said Acting Assistant Attorney General Chad A. Readler of the Justice Department’s Civil Division.\xa0 “However, we will work with companies that accept responsibility for their past compliance failures and promptly take corrective action.”  \xa0 21st Century Oncology, which is headquartered in Fort Myers, Florida, owns a

In [None]:
len(df['contents'].iloc[4])

5504

In [None]:
# Run `preprocess` on every press release and store the cleaned text in a new
# column. `assign` builds a new DataFrame instead of writing into `df` in
# place; item assignment on a frame derived from a filter/head slice would
# raise pandas' SettingWithCopyWarning.
df = df.assign(contains_new=df['contents'].apply(preprocess))

In [None]:
df.head()

Unnamed: 0,id,title,contents,date,topics,components,contains_new
4,18-898,$100 Million Settlement Will Speed Cleanup Wor...,"The U.S. Department of Justice, the U.S. Envir...",2018-07-09T00:00:00-04:00,[Environment],[Environment and Natural Resources Division],U.S. Department Justice U.S. Environmental Pro...
7,14-1412,14 Indicted in Connection with New England Com...,A 131-count criminal indictment was unsealed t...,2014-12-17T00:00:00-05:00,[Consumer Protection],[Civil Division],131 count criminal indictment unsealed today B...
19,17-1419,2017 Southeast Regional Animal Cruelty Prosecu...,The United States Attorney’s Office for the Mi...,2017-12-14T00:00:00-05:00,[Environment],"[Environment and Natural Resources Division, U...",United States Attorney Office Middle District ...
22,15-1562,21st Century Oncology to Pay $19.75 Million to...,"21st Century Oncology LLC, has agreed to pay $...",2015-12-18T00:00:00-05:00,"[False Claims Act, Health Care Fraud]",[Civil Division],21st Century Oncology LLC agreed pay $ 19.75 m...
23,17-1404,21st Century Oncology to Pay $26 Million to Se...,21st Century Oncology Inc. and certain of its ...,2017-12-12T00:00:00-05:00,"[Health Care Fraud, False Claims Act]","[Civil Division, USAO - Florida, Middle]",21st Century Oncology Inc. certain subsidiarie...


In [None]:
df['contains_new'].iloc[4][:300]

'21st Century Oncology Inc. certain subsidiaries affiliates agreed pay $ 26 million government resolve self disclosure relating submission false attestations company use electronic health records software separate allegations violated False Claims Act submitting causing submission claims certain serv'