In [None]:
import pandas as pd

In [None]:
# Load the articles dataset into a DataFrame.
# NOTE(review): hard-coded absolute Colab path — presumably requires Google
# Drive to be mounted at /content/drive; adjust when running elsewhere.
df = pd.read_csv(
    '/content/drive/MyDrive/articles1.csv')

In [None]:
df.shape

(50000, 10)

In [None]:
df.head()

Unnamed: 0.1,Unnamed: 0,id,title,publication,author,date,year,month,url,content
0,0,17283,House Republicans Fret About Winning Their Hea...,New York Times,Carl Hulse,2016-12-31,2016.0,12.0,,WASHINGTON — Congressional Republicans have...
1,1,17284,Rift Between Officers and Residents as Killing...,New York Times,Benjamin Mueller and Al Baker,2017-06-19,2017.0,6.0,,"After the bullet shells get counted, the blood..."
2,2,17285,"Tyrus Wong, ‘Bambi’ Artist Thwarted by Racial ...",New York Times,Margalit Fox,2017-01-06,2017.0,1.0,,"When Walt Disney’s “Bambi” opened in 1942, cri..."
3,3,17286,"Among Deaths in 2016, a Heavy Toll in Pop Musi...",New York Times,William McDonald,2017-04-10,2017.0,4.0,,"Death may be the great equalizer, but it isn’t..."
4,4,17287,Kim Jong-un Says North Korea Is Preparing to T...,New York Times,Choe Sang-Hun,2017-01-02,2017.0,1.0,,"SEOUL, South Korea — North Korea’s leader, ..."


### News Sources:

In [None]:
sources = df['publication'].unique()
sources

array(['New York Times', 'Breitbart', 'CNN', 'Business Insider',
       'Atlantic'], dtype=object)

### Selecting the first 1000 New York Times articles:

In [None]:
# Boolean mask marking the rows published by the New York Times.
req = df['publication'] == 'New York Times'
# Pull the article text for the masked rows in a single .loc lookup
# (no chained indexing), then keep the first 1000 positionally.
content = df.loc[req, 'content'].iloc[:1000]
content.shape

(1000,)

In [None]:
content.head()

0    WASHINGTON  —   Congressional Republicans have...
1    After the bullet shells get counted, the blood...
2    When Walt Disney’s “Bambi” opened in 1942, cri...
3    Death may be the great equalizer, but it isn’t...
4    SEOUL, South Korea  —   North Korea’s leader, ...
Name: content, dtype: object

In [None]:
for article in content[:2]:
    print(article)

WASHINGTON  —   Congressional Republicans have a new fear when it comes to their    health care lawsuit against the Obama administration: They might win. The incoming Trump administration could choose to no longer defend the executive branch against the suit, which challenges the administration’s authority to spend billions of dollars on health insurance subsidies for   and   Americans, handing House Republicans a big victory on    issues. But a sudden loss of the disputed subsidies could conceivably cause the health care program to implode, leaving millions of people without access to health insurance before Republicans have prepared a replacement. That could lead to chaos in the insurance market and spur a political backlash just as Republicans gain full control of the government. To stave off that outcome, Republicans could find themselves in the awkward position of appropriating huge sums to temporarily prop up the Obama health care law, angering conservative voters who have been d

## Information Extraction with SpaCy:

In [None]:
# Download the medium English spaCy model that is loaded with spacy.load
# in the following cell.
!python -m spacy download en_core_web_md

Collecting en_core_web_md==2.2.5
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_md-2.2.5/en_core_web_md-2.2.5.tar.gz (96.4 MB)
[K     |████████████████████████████████| 96.4 MB 1.2 MB/s 
[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_md')


In [None]:
import spacy
# Load the pipeline once; `nlp` is then used as a module-level object by
# get_entities and by the demo loop further down.
nlp = spacy.load('en_core_web_md')

In [None]:
def get_entities(data_frame):
    """Parse every text with the module-level ``nlp`` and tally its entities.

    Args:
        data_frame: iterable of raw text strings (the notebook passes a
            pandas Series of article bodies).

    Returns:
        A ``(named_entities, processed_docs)`` tuple. ``named_entities`` maps
        each entity label to a ``{entity text: occurrence count}`` dict;
        ``processed_docs`` holds the parsed documents in input order.
    """
    # First pass: run the pipeline and keep every parsed document.
    processed_docs = [nlp(text) for text in data_frame]

    # Second pass: count each entity text under its label.
    named_entities = {}
    for parsed in processed_docs:
        for ent in parsed.ents:
            text = ent.text
            label = str(ent.label_)
            per_label = named_entities.setdefault(label, {})
            per_label[text] = per_label.get(text, 0) + 1

    return named_entities, processed_docs


In [None]:
# Parse the selected articles once; the parsed docs are kept so that the
# later sentence-detection and visualisation cells can reuse them.
named_entities, processed_docs = get_entities(content)

In [None]:
def print_results(named_entities):
    """Print each entity label followed by its most frequent entities.

    For every label, the entities are ranked by count (highest first) and
    only the top ten are considered; of those, only entities seen more than
    once are printed, indented, as ``  <text>: <count>``.

    Args:
        named_entities: mapping of entity label to a
            ``{entity text: count}`` dict.
    """
    for label, counts in named_entities.items():
        print(label)

        # Stable descending sort: ties keep their insertion order.
        ranked = sorted(counts.items(), key=lambda pair: pair[1], reverse=True)
        for name, count in ranked[:10]:
            if count > 1:
                print('  ' + name + ': ' + str(count))


In [None]:
print_results(named_entities)

GPE
  the United States: 1141
  Russia: 526
  China: 514
  Washington: 503
  New York: 385
  America: 356
  Iran: 294
  Mexico: 266
  Britain: 237
  California: 206
NORP
  American: 980
  Republicans: 523
  Republican: 473
  Democrats: 398
  Russian: 337
  Chinese: 288
  Americans: 267
  British: 180
  Democrat: 166
  Muslim: 164
PERSON
  Trump: 3634
  Obama: 839
  Clinton: 186
  Spicer: 134
  Donald J. Trump: 128
  Hillary Clinton: 123
  Sessions: 123
  Gorsuch: 116
  Barack Obama: 115
  Kushner: 110
ORG
  Trump: 768
  Senate: 373
  Congress: 344
  Twitter: 310
  White House: 235
  The New York Times: 230
  the White House: 223
  Times: 211
  House: 207
  Google: 134
MONEY
  1: 66
  2: 23
  10: 19
  millions of dollars: 19
  100: 18
  3: 18
  billions of dollars: 17
  5: 16
  4: 15
  $1 billion: 14
CARDINAL
  one: 1382
  two: 910
  000: 591
  three: 349
  One: 338
  four: 172
  seven: 170
  1: 155
  five: 131
  2: 118
DATE
  Friday: 428
  Wednesday: 350
  Tuesday: 324
  2015: 274
  Th

In [None]:
# Summary table: one row per entity label showing how many distinct entity
# texts it has ("Entries") and the sum of their counts ("Total").
rows = [['Type:', 'Entries:', 'Total:']]
for ent_type, counts in named_entities.items():
    rows.append([ent_type, str(len(counts)), str(sum(counts.values()))])

# Every column is padded to the width of its longest cell so rows line up.
columns = zip(*rows)
column_widths = [max(map(len, col)) for col in columns]

for row in rows:
    print(''.join(
        f' {cell:{width}} ' for cell, width in zip(row, column_widths)))

 Type:        Entries:  Total: 
 GPE          1760      15100  
 NORP         541       7525   
 PERSON       10000     30268  
 ORG          4893      15215  
 MONEY        681       1239   
 CARDINAL     1216      9097   
 DATE         3107      15117  
 LAW          129       412    
 LOC          455       1462   
 ORDINAL      69        1736   
 TIME         587       1614   
 FAC          548       1060   
 QUANTITY     308       358    
 PERCENT      268       658    
 EVENT        230       562    
 PRODUCT      294       537    
 WORK_OF_ART  1322      1951   
 LANGUAGE     17        94     


In [None]:
# Demo inputs: one hand-written sentence used to try out the extraction
# helpers defined in this cell.
entity = 'The New York Times'
sentences = ['The New York Times wrote about apple']

def extract_span(sent, entity):
    """Collect the token positions covered by mentions of ``entity``.

    Args:
        sent: object exposing ``.ents``; each entity must provide ``text``,
            ``start`` and ``end`` (the notebook passes spaCy docs/sentences).
        entity: exact entity text to look for (compared with ``==``).

    Returns:
        A list with every integer in ``[start, end)`` for each entity whose
        text equals ``entity``, in the order the entities are encountered.
    """
    return [
        position
        for ent in sent.ents
        if ent.text == entity
        for position in range(int(ent.start), int(ent.end))
    ]

def extract_information(sent, entity, indexes):
    """Print and return simple "subject verb [prep] object" facts about an entity.

    Every token of ``sent`` that is a VERB with dependency label ROOT is
    inspected. Its direct children are scanned in order:

    * an ``nsubj`` child becomes the current subject;
    * a ``prep`` child contributes its last NOUN/PROPN child as the object,
      giving ``subject verb prep object``;
    * a ``dobj`` child that is a NOUN/PROPN gives ``subject verb object``.

    A fact is recorded only when the subject token or the object token lies
    inside ``indexes``; the matched side is then written out as the full
    ``entity`` text. The subject side is checked first.

    The subject is reset for every ROOT verb, so a verb without its own
    ``nsubj`` child never borrows the subject of an earlier clause. In that
    case the subject part of an object-side fact is the empty string.

    Args:
        sent: iterable of tokens exposing ``pos_``, ``dep_``, ``text``, ``i``
            and ``children`` (the notebook passes spaCy docs/sentences).
        entity: text printed in place of the matched participant.
        indexes: token positions belonging to the entity, compared against
            ``token.i`` (the notebook obtains them from ``extract_span``).

    Returns:
        The list of extracted fact strings (empty when nothing matched).
        When non-empty, the sentence and each fact are also printed.
    """
    noun_tags = ("NOUN", "PROPN")
    actions = []

    for token in sent:
        if not (token.pos_ == "VERB" and token.dep_ == "ROOT"):
            continue

        # Per-verb state. The subject in particular must not leak from a
        # previous ROOT verb into this one.
        action = token.text
        participant1 = ""
        subj_ind = -1
        obj_ind = -1

        for child1 in token.children:
            if child1.dep_ == "nsubj":
                participant1 = child1.text
                subj_ind = int(child1.i)

            elif child1.dep_ == "prep":
                # The last noun-like child of the preposition wins.
                participant2 = ""
                for child2 in child1.children:
                    if child2.pos_ in noun_tags:
                        participant2 = child2.text
                        obj_ind = int(child2.i)
                if participant2 != "":
                    if subj_ind in indexes:
                        actions.append(
                            entity + " " + action + " " + child1.text + " " + participant2)
                    elif obj_ind in indexes:
                        actions.append(
                            participant1 + " " + action + " " + child1.text + " " + entity)

            elif child1.dep_ == "dobj" and child1.pos_ in noun_tags:
                participant2 = child1.text
                obj_ind = int(child1.i)
                if subj_ind in indexes:
                    actions.append(entity + " " + action + " " + participant2)
                elif obj_ind in indexes:
                    actions.append(participant1 + " " + action + " " + entity)

    if actions:
        print(f"\nSentence = {sent}")
        for item in actions:
            print(item)
    return actions
# Try the helpers on the demo sentence(s): parse the text, find the token
# positions of the entity, show them, then print any extracted facts.
for sent in sentences:
    doc = nlp(sent)
    indexes = extract_span(doc, entity)
    print(indexes)
    extract_information(doc, entity, indexes)

[0, 1, 2, 3]

Sentence = The New York Times wrote about apple
The New York Times wrote about apple


### Detecting sentences with specified entity

In [None]:
def entity_detector(processed_docs, entity, ent_type):
    """Return every sentence that mentions ``entity`` with label ``ent_type``.

    Args:
        processed_docs: iterable of parsed documents exposing ``.sents``;
            each sentence exposes ``.ents`` with ``text`` and ``label_``.
        entity: exact entity text to look for.
        ent_type: entity label the mention must carry (e.g. ``'ORG'``).

    Returns:
        A list of the matching sentences, in document and sentence order.
        A sentence appears once even if it mentions the entity repeatedly.
    """
    def mentions_entity(sent):
        return any(
            ent.label_ == ent_type and ent.text == entity
            for ent in sent.ents
        )

    return [
        sent
        for doc in processed_docs
        for sent in doc.sents
        if mentions_entity(sent)
    ]

# Collect every sentence of the processed articles in which 'Apple' is
# tagged as an ORG entity, and report how many were found.
entity = 'Apple'
ent_sentences = entity_detector(
    processed_docs, entity, 'ORG'
)
print(len(ent_sentences))

61


### Extract information from the sentences with the specified entity:

In [None]:
# For each matching sentence: find the token positions of the entity, then
# print the facts extracted around it (sentences with no fact print nothing).
for sent in ent_sentences:
    indexes = extract_span(sent, entity)
    extract_information(sent, entity, indexes)


Sentence = Apple, complying with what it said was a request from Chinese authorities, removed news apps created by The New York Times from its app store in China late last month.
Apple removed apps

Sentence = Apple removed both the   and   apps from the app store in China on Dec. 23.
Apple removed apps
Apple removed from store
Apple removed on Dec.

Sentence = Apple has previously removed other, less prominent media apps from its China store.
Apple removed apps

Sentence = It puts Apple and Google in a difficult position.
It puts Apple

Sentence = Russia required Apple and Google to remove the LinkedIn app from their local stores.
Russia required Apple

Sentence = On Friday, Apple, its longtime partner, sued Qualcomm over what it said was $1 billion in withheld rebates.
Apple sued Qualcomm

Sentence = Apple sued three days after the  Federal Trade Commission accused Qualcomm of using anticompetitive practices to guarantee its high royalty payments for advanced wireless technology.
Ap

In [None]:
# Same detect-then-extract steps as for 'Apple', now for 'The New York Times'
# tagged as ORG. Note that `entity` and `ent_sentences` are rebound here.
entity = 'The New York Times'
ent_sentences = entity_detector(
    processed_docs, entity, 'ORG'
)
print(len(ent_sentences))
for sent in ent_sentences:
    indexes = extract_span(sent, entity)
    extract_information(sent, entity, indexes)

230

Sentence = [The New York Times] • Pan Pan, a    panda who fathered nearly a quarter of the world’s captive pandas, died last week at a conservation center in China’s Sichuan Province.
The New York Times died at center

Sentence = The New York Times reported at the time.
The New York Times reported at time

Sentence = The New York Times spoke to five people in the   to    age group, a small sample of millennial savers.
The New York Times spoke to people
The New York Times spoke to group

Sentence = Times Insider delivers    insights into life at The New York Times.
Insider delivers at The New York Times

Sentence = The New York Times called Armstrong one of the “great inventive geniuses in electrical engineering” after his death in 1954.
The New York Times called Armstrong
The New York Times called after death

Sentence = “It is now heavy rain and melting snow, which is causing flooding in the camp,” Mr. Kempson wrote to The New York Times, via Facebook, describing the conditions i

### Visualising with displaCy:

In [None]:
from spacy import displacy

In [None]:
def visualise(processed_docs, entity, ent_type):
    """Render, with displaCy, every sentence mentioning the given entity.

    A sentence is rendered (entity style, inline in the notebook) when at
    least one of its entities has label ``ent_type`` and text ``entity``.

    Args:
        processed_docs: iterable of parsed documents exposing ``.sents``.
        entity: exact entity text to look for.
        ent_type: entity label the mention must carry (e.g. ``'GPE'``).
    """
    for doc in processed_docs:
        for sent in doc.sents:
            is_match = any(
                ent.label_ == ent_type and ent.text == entity
                for ent in sent.ents
            )
            if not is_match:
                continue
            displacy.render(sent, style="ent", jupyter=True)

In [None]:
visualise(processed_docs,'India','GPE')

In [None]:
visualise(processed_docs, "Narendra Modi", 'PERSON')

In [None]:
visualise(processed_docs, 'Apple','ORG')

In [None]:
visualise(processed_docs, 'Google','ORG')

In [None]:
visualise(processed_docs,'Hindu', 'NORP')