## Assignment 4
by Charlie Mei cm3947

In [1]:
from urllib import request
from bs4 import BeautifulSoup
from bs4.element import Comment

import spacy
from spacy.gold import docs_to_json
from spacy.util import minibatch, compounding
from spacy.pipeline import SentenceSegmenter
import random

from pyspark.conf import SparkConf
from pyspark import SparkContext
from pyspark.sql import SQLContext

In [2]:
# Extract all body text from the url, using code provided by lecturer in class 3 exercise
def tag_visible(element):
    if element.parent.name in ['style', 'script', 'head', 'title', 'meta', '[document]']:
        return False
    if isinstance(element, Comment):
        return False
    return True


def text_from_html(body):
    soup = BeautifulSoup(body, 'html.parser')
    texts = soup.findAll(text=True)
    visible_texts = filter(tag_visible, texts)  
    return u" ".join(t.strip() for t in visible_texts)

### 1. Pick a random news article (preferably with many entity mentions) from your Webhose dataset 

In [3]:
# Take the first article from the Netflix Webhose dataset provided in Assignment 2
url = 'https://www.stuff.co.nz/entertainment/tv-radio/300026661/13-reasons-why-the-popular-netflix-shows-creator-teases-chance-of-a-hopeful-ending'

In [4]:
# Get all body text from the webpage
html = request.urlopen(url).read()
soup = BeautifulSoup(html, 'html.parser')
data = soup.findAll(text=True)

In [5]:
text = text_from_html(html)
print(text[:1000])

National World Business Climate Change Sport Entertainment Life & Style Homed Travel Motoring Stuff Nation Play Stuff Quizzes Politics Premium Well & Good Food & Wine Parenting Rugby Farming Technology Opinion Auckland Wellington Canterbury Waikato Bay of Plenty Taranaki Manawatu Nelson Marlborough Timaru Otago Southland Careers Advertising Contact Privacy © 2020 Stuff Limited Entertainment TV & Radio 13 Reasons Why: The popular Netflix show's creator teases chance of a hopeful ending 14:49, Jun 03 2020 Facebook Twitter Whats App Reddit Email NETFLIX The final season of 13 Reasons Why is out. The controversial 13  Reasons Why is returning for its fourth and final season on Netflix from Friday and creator Brian Yorkey has indicated there will be a hopeful ending. Adapted from Jay Asher's 2007 novel, the show was released on Netflix in 2017 and began with the first season focused on the death of Hannah Baker, a 17-year-old American high school student who


### 2. Follow directions to set up one of the Information Extraction services below, and write a Python program implementing API calls to extract Company/Organization and Geo entities from  the article chosen in Step 1:

I have chosen to use ```SpaCy```.

In [6]:
nlp = spacy.load("en_core_web_sm")

# Parse through text from webpage into a spacy nlp
page = nlp(text)

In [7]:
# Entity label in spacy for company/organization and geo entities
entity_labels = ['ORG', 'GPE']

# Extract companies and geo entities from the article
orgs = []
geos = []
for entity in page.ents:
    if entity.label_ == 'ORG':
        orgs.append(entity.text)
    elif entity.label_ == 'GPE':
        geos.append(entity.text)
    else:
        continue

print("Here are a list of companies/organizations referenced in the article: \n {}".format(set(orgs)))
print("Here are a list of geographies referenced in the article: \n {}".format(set(geos)))

Here are a list of companies/organizations referenced in the article: 
 {'Premium Well & Good Food & Wine', 'Mental Health Foundation', 'Entertainment Weekly', 'Urban', 'Stuff Limited Entertainment TV & Radio', 'Super Rugby Aotearoa', 'Nelson Marlborough Timaru', 'Netflix', 'Yorkey'}
Here are a list of geographies referenced in the article: 
 {'Victoria', 'Auckland', 'Australia', 'Netflix', "New Zealand's", 'North Star', 'Wellington', 'Manawatu', 'Yorkey'}


### 3. Download Crunchbase Open Data Map CSV file and store it in a directory on your computer

### 4. Use the Class Exercise Jupyter Notebook as a reference to:
- !pip install pyspark
- load Crunchbase Open Data Map into notebook by modifying the path .csv(".../...") to the file on your computer where you stored the downloaded CSV file from Step 4.
- find matches of Company or Organization entities identified in Step 3 using rlike function and print results

In [8]:
# Initializing spark to load the Couch database
sc = SparkContext()
config = sc.getConf()
sqlContext = SQLContext(sc)

In [9]:
# Load in the Couch DB
df = sqlContext.read.option('header', 'true').option('delimiter', ',').option('inferSchema', 'true').csv('cb_odm_092419.csv')
df.count()

687755

In [10]:
for org in set(orgs):
    print('Matches in the Couch DB for {}:'.format(org))
    match_df = df[df['name'].rlike(org)]
    match_df['crunchbase_uuid', 'name', 'homepage_domain'].show()
    print()

Matches in the Couch DB for Premium Well & Good Food & Wine:
+---------------+----+---------------+
|crunchbase_uuid|name|homepage_domain|
+---------------+----+---------------+
+---------------+----+---------------+


Matches in the Couch DB for Mental Health Foundation:
+--------------------+--------------------+-------------------+
|     crunchbase_uuid|                name|    homepage_domain|
+--------------------+--------------------+-------------------+
|b1bf4f5d-e5e0-b71...|Mental Health Fou...|mentalhealth.org.uk|
+--------------------+--------------------+-------------------+


Matches in the Couch DB for Entertainment Weekly:
+--------------------+--------------------+---------------+
|     crunchbase_uuid|                name|homepage_domain|
+--------------------+--------------------+---------------+
|f523d9c9-f482-d9b...|Entertainment Weekly|         ew.com|
+--------------------+--------------------+---------------+


Matches in the Couch DB for Urban:
+-----------------

### BONUS

Use the Class Exercise Jupyter Notebook as a reference to:
- !pip install spacy 
- update TRAIN_DATA with annotations of entities (PERSON, LOCATION, or ORGANIZATION) from each sentence in the article selected in step 1
- run spaCy_NER function to generate trained_nlp model
- use trained_nlp to test entity recognition on another random news article from Webhose and print results to output

#### Establish the training dataset

In [11]:
# Break up into body text into sentences
sentences = [sentence for sentence in page.sents]
sentences[1]

Homed Travel Motoring Stuff Nation Play Stuff Quizzes Politics Premium Well & Good Food & Wine

In [12]:
# Function to extract entities from each sentence
relevant_entities = ['GPE', 'ORG', 'PERSON']

def extractEntityInfo(sent, entities):
    ents = []
    for ent in sent.ents:
        if ent.label_ in entities:
            ents.append((ent.start_char, ent.end_char, ent.label_))
        else:
            continue
    return (str(sent), {'entities': ents})

In [13]:
extractEntityInfo(sentences[1], relevant_entities)

('Homed Travel Motoring Stuff Nation Play Stuff Quizzes Politics Premium Well & Good Food & Wine',
 {'entities': [(168, 199, 'ORG')]})

In [14]:
# Build the training dataset
TRAIN_DATA = []
for sentence in sentences:
    TRAIN_DATA.append(extractEntityInfo(sentence, relevant_entities))
TRAIN_DATA[:5]



[('                                 National World Business Climate Change Sport Entertainment Life & Style',
  {'entities': []}),
 ('Homed Travel Motoring Stuff Nation Play Stuff Quizzes Politics Premium Well & Good Food & Wine',
  {'entities': [(168, 199, 'ORG')]}),
 ('Parenting Rugby Farming Technology Opinion Auckland Wellington Canterbury Waikato Bay of Plenty',
  {'entities': [(243, 251, 'GPE')]}),
 ('Taranaki Manawatu Nelson Marlborough Timaru',
  {'entities': [(305, 313, 'GPE'), (314, 339, 'ORG')]}),
 ('Otago Southland Careers', {'entities': []})]

#### Build NER training model

In [15]:
# Define a NER training model, adapted from lecture slides
def train_spacy(data, iterations):
    TRAIN_DATA = data
    nlp = spacy.blank('en')

    # Add NER trainer to pipe
    if 'ner' not in nlp.pipe_names:
        ner = nlp.create_pipe('ner')
        nlp.add_pipe(ner, last=True)
    
    # Add labels
    for _, annotations in TRAIN_DATA:
         for ent in annotations.get('entities'):
            ner.add_label(ent[2])

    # Disable all other pipes and just train NER
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
    with nlp.disable_pipes(*other_pipes):
        optimizer = nlp.begin_training()
        for itn in range(iterations):
            print("Starting iteration " + str(itn))
            random.shuffle(TRAIN_DATA)
            losses = {}
            for text, annotations in TRAIN_DATA:
                nlp.update(
                    [text],  # batch of texts
                    [annotations],  # batch of annotations
                    drop=0.2,  # dropout - make it harder to memorise data
                    sgd=optimizer,  # callable to update weights
                    losses=losses)
            print(losses)
    return nlp

In [16]:
prdnlp = train_spacy(TRAIN_DATA, 10)

Starting iteration 0
{'ner': 114.39401927567909}
Starting iteration 1
{'ner': 1.1293316454665398e-16}
Starting iteration 2
{'ner': 6.377145992719412e-18}
Starting iteration 3
{'ner': 1.4138139394281897e-16}
Starting iteration 4
{'ner': 8.626094947986659e-17}
Starting iteration 5
{'ner': 4.309852801873665e-16}
Starting iteration 6
{'ner': 1.0347297006906193e-16}
Starting iteration 7
{'ner': 1.2138217501899623e-16}
Starting iteration 8
{'ner': 2.0081938945163972e-17}
Starting iteration 9
{'ner': 2.900061819191236e-17}


#### Test trained model on new text

In [18]:
# Get all body text from another webpage from webhose dataset from above
url2 = 'https://www.stuff.co.nz/entertainment/entertainment-top-stories/300026292/judge-gives-control-of-tiger-king-joe-exotics-zoo-to-carole-baskin'

html = request.urlopen(url2).read()
soup = BeautifulSoup(html, 'html.parser')
data = soup.findAll(text=True)

text2 = text_from_html(html)

In [19]:
# Run the trained model
test_doc = prdnlp(text2)

In [20]:
for ent in test_doc.ents:
    print(ent.text, ent.start_char, ent.end_char, ent.label_)