# Extracting Information from Text Data Assignment

In [2]:
%reload_ext nb_black

<IPython.core.display.Javascript object>

In [6]:
#!pip install gensim

<IPython.core.display.Javascript object>

In [7]:
import spacy
import string
import textacy
import itertools
from nltk import pos_tag
from rake_nltk import Rake
from nltk import sent_tokenize
from nltk import word_tokenize
from nltk import tree2conlltags
from gensim.summarization import keywords
from nltk.chunk.regexp import RegexpParser
from nltk.corpus.reader.plaintext import PlaintextCorpusReader

<IPython.core.display.Javascript object>

In [8]:
#!pip install textacy --quiet
#!pip install rake_nltk --quiet

<IPython.core.display.Javascript object>

### Read the CNN Lite plain-text article files into a corpus using NLTK's PlaintextCorpusReader.

In [9]:
# Corpus directory; a relative path, so it is resolved against the kernel's
# current working directory.
PATH = "cnn_lite_corpus/"
# Regular expression selecting which files under PATH belong to the corpus
# (names with a ".txt" extension).
DOC_PATTERN = r".*\.txt"
# NLTK reader over the selected files; later cells pull file ids and raw text
# from it.
cnn_corpus = PlaintextCorpusReader(PATH, DOC_PATTERN)



<IPython.core.display.Javascript object>

In [26]:
import pandas as pd

<IPython.core.display.Javascript object>

### Iterate through the fileids in the corpus, extract the raw text of each document, and store them in a list.

In [28]:
# Raw text of each corpus file, one string per file id, in fileids() order.
docs_raw = [cnn_corpus.raw(file_id) for file_id in cnn_corpus.fileids()]
# One row per document, with the raw text in the "doc" column.
docs_df = pd.DataFrame({"doc": docs_raw})
# Entire corpus as a single string (documents separated by one space); used by
# the cells that process the corpus as a whole.
docs = " ".join(docs_raw)

<IPython.core.display.Javascript object>

In [29]:
docs_df

Unnamed: 0,doc
0,(CNN) - Pink has been working pretty hard and ...
1,(CNN) - Former Massachusetts Gov. Deval Patric...
2,(CNN) - There's a 10-week-old puppy in Missour...
3,(CNN) - Three Democratic heavyweights this wee...
4,(CNN) - The House Intelligence Committee opene...
5,Editor's Note: Nadine Jolie Courtney is the au...
6,New York (CNN Business) - A version of this a...
7,(CNN) - The manhunt for a Marine deserter want...
8,(CNN) - The Trump Organization is considering ...
9,(CNN) - Alicia Keys is hosting the Grammy Awar...


<IPython.core.display.Javascript object>

### Extract the top 5 keywords from every document in the corpus. Print them and compare the differences in keywords among the documents.

In [31]:
# RAKE extractor with default settings; the same instance `r` is reused by the
# per-document key-phrase cell further down.
r = Rake()
# NOTE(review): `docs` is the whole corpus joined into one string, so the
# ranking produced here is corpus-wide rather than per document as the heading
# above asks — confirm whether a per-document pass was intended.
r.extract_keywords_from_text(docs)
# Ranked phrases with their scores, as (score, phrase) tuples.
key_phrases = r.get_ranked_phrases_with_scores()

<IPython.core.display.Javascript object>

In [34]:
key_phrases[:5]

[(59.37208055636047,
  'others made jokes — christian whiton said witnesses bill taylor'),
 (58.89682539682539,
  'former new york city mayor michael bloomberg stepped forward last week'),
 (54.44242424242424, 'posted threats please notify us asap ," murakami wrote'),
 (54.17657342657343,
  'white house sends condolences vice president mike pence gave'),
 (52.570512820512825,
  'former white house homeland security adviser tom bossert summed')]

<IPython.core.display.Javascript object>

### Extract the top 3 keyphrases from each document, print them, and compare the differences.

In [60]:
def rake_implement(x, r, top_n=None):
    """Extract ranked key phrases from one text with a RAKE extractor.

    Parameters
    ----------
    x : str
        Text to extract key phrases from.
    r : object
        Extractor exposing ``extract_keywords_from_text(text)`` and
        ``get_ranked_phrases_with_scores()`` (in this notebook, a
        ``rake_nltk.Rake`` instance).
    top_n : int, optional
        When given, only the first ``top_n`` ranked phrases are returned.
        The default ``None`` returns the full ranked list.

    Returns
    -------
    list
        The extractor's ranked ``(score, phrase)`` pairs, optionally
        truncated to ``top_n`` entries.

    Raises
    ------
    ValueError
        If ``top_n`` is negative.
    """
    # Fail before doing any extraction work; a negative slice bound would
    # silently drop phrases from the end instead of limiting the count.
    if top_n is not None and top_n < 0:
        raise ValueError("top_n must be non-negative")

    r.extract_keywords_from_text(x)
    ranked = r.get_ranked_phrases_with_scores()
    return ranked if top_n is None else ranked[:top_n]

<IPython.core.display.Javascript object>

In [61]:
# Rank key phrases for every document; the shared extractor `r` is forwarded
# to rake_implement as its second positional argument.
docs_df["key phrases"] = docs_df["doc"].apply(rake_implement, args=(r,))

<IPython.core.display.Javascript object>

In [64]:
# Print the first three ranked (score, phrase) pairs of each document, one
# document per output line, so they can be compared side by side.
for phrases in docs_df["key phrases"]:
    print(phrases[:3])

[(32.5, 'country music association awards red carpet'), (17.5, 'school soon ," pink said'), (16.5, 'country star chris stapleton')]
[(94.03333333333333, 'former new york city mayor michael bloomberg stepped forward last week'), (30.541666666666664, 'patrick could seize upon potential advantages'), (30.0, 'elections process would ultimately splash back')]
[(28.166666666666668, 'little magical furry unicorn ," according'), (18.5, 'dog rescue nonprofit organization mac'), (14.5, 'rescue workers speculate may')]
[(41.03333333333333, 'current 2020 democrats -- clinton told bbc radio'), (37.58333333333333, 'former new york mayor michael bloomberg made'), (29.0, '2016 democratic nominee playfully tweeted back')]
[(69.33333333333334, 'money ," tweeted white house press secretary stephanie grisham'), (67.66666666666667, 'former white house homeland security adviser tom bossert summed'), (63.416666666666664, 'former national security council russia expert fiona hill said')]
[(20.77777777777778, 

<IPython.core.display.Javascript object>

### Identify and extract the named entities in each document, filtering out the numeric types. Print them and compare the differences between documents.

In [18]:
#!python -m spacy download en_core_web_sm
# Entity labels treated as numeric types. This bare list is not bound to a
# name; the same labels are listed again inside get_non_num_entity.
["DATE", "TIME", "PERCENT", "MONEY", "QUANTITY", "ORDINAL", "CARDINAL"]


<IPython.core.display.Javascript object>

In [47]:
# Load spaCy's small English pipeline (the model must already be installed).
nlp = spacy.load("en_core_web_sm")
# Whole corpus parsed as one Doc; used by the displaCy rendering cell.
spacy_doc = nlp(docs)
# Parse the individual documents through nlp.pipe, which streams the texts in
# batches instead of invoking the full pipeline once per document. The Docs
# come back in the same order as docs_raw.
spacy_docs = list(nlp.pipe(docs_raw))
# One row per document, with the parsed Doc in the "doc" column.
spacy_docs_df = pd.DataFrame({"doc": spacy_docs})

<IPython.core.display.Javascript object>

In [48]:
spacy_docs_df.head()

Unnamed: 0,doc
0,"((, CNN, ), -, Pink, has, been, working, prett..."
1,"((, CNN, ), -, Former, Massachusetts, Gov., De..."
2,"((, CNN, ), -, There, 's, a, 10-week, -, old, ..."
3,"((, CNN, ), -, Three, Democratic, heavyweights..."
4,"((, CNN, ), -, The, House, Intelligence, Commi..."


<IPython.core.display.Javascript object>

In [90]:
def get_non_num_entity(x):
    """Return the non-numeric named entities of a parsed document.

    Parameters
    ----------
    x : object
        Parsed document exposing an iterable ``ents``; every entity must
        provide ``text`` and ``label_`` (in this notebook, a spaCy ``Doc``).

    Returns
    -------
    list of list
        ``[text, label]`` pairs, in ``x.ents`` order, for each entity whose
        label is not one of the numeric types excluded below.
    """
    # Labels considered numeric and therefore dropped from the result.
    numeric_labels = (
        "DATE",
        "TIME",
        "PERCENT",
        "MONEY",
        "QUANTITY",
        "ORDINAL",
        "CARDINAL",
    )
    return [
        [ent.text, ent.label_]
        for ent in x.ents
        if ent.label_ not in numeric_labels
    ]

<IPython.core.display.Javascript object>

In [91]:
# Attach each document's non-numeric entities as a new "ents" column; the
# function is passed directly since it already takes the Doc as its only argument.
spacy_docs_df["ents"] = spacy_docs_df["doc"].apply(get_non_num_entity)

<IPython.core.display.Javascript object>

In [93]:
spacy_docs_df["ents"]

0     [[CNN, ORG], [Entertainment Tonight, WORK_OF_A...
1     [[CNN, ORG], [Massachusetts, GPE], [Deval Patr...
2     [[CNN, ORG], [Missouri, GPE], [Narwhal, PERSON...
3     [[CNN, ORG], [Democratic, NORP], [White House,...
4     [[CNN, ORG], [The House Intelligence Committee...
5     [[Jolie Courtney, PERSON], [All-American Musli...
6     [[New York, GPE], [CNN, ORG], [Trump, PERSON],...
7     [[CNN, ORG], [Marine, NORP], [Roanoke, NORP], ...
8     [[CNN, ORG], [The Trump Organization, ORG], [W...
9     [[CNN, ORG], [Alicia Keys, PERSON], [Grammy Aw...
10    [[Paul Callan, PERSON], [CNN, ORG], [New York,...
11    [[Los Angeles, GPE], [CNN, ORG], [Southern Cal...
12    [[CNN, ORG], [Jim Crow, PERSON], [Caucasian, N...
13    [[Hong Kong, GPE], [CNN, ORG], [Hong Kong, GPE...
14    [[Los Angeles, GPE], [CNN, ORG], [Southern Cal...
Name: ents, dtype: object

<IPython.core.display.Javascript object>

In [24]:
# NOTE(review): this import sits mid-notebook, while most other imports are in
# the import cell near the top — consider moving it there.
from spacy import displacy

# Render the entity annotations of `spacy_doc`, the Doc built from the whole
# joined corpus (not the per-document Docs).
displacy.render(spacy_doc, style="ent")

<IPython.core.display.Javascript object>

### For every document in the corpus, iterate over every sentence, extract any SVO triples, print them, and compare.