In [64]:
import re
import string
from collections import Counter
from nltk.corpus import stopwords
import spacy
from spacy import displacy


def clean(text):
    doc = ''.join(text).lower()
    doc = re.sub(r'[<>\{}/|\[\]-]', ' ', doc)
    doc = re.sub(r'[0-9]', ' ', doc)
    doc = re.sub(r'\'', ' ', doc)
    doc = re.sub(r'=', ' ', doc)
    doc = re.sub(r':', ' ', doc)
    doc = re.sub(r'"', ' ', doc)
    doc = re.sub(r'\s+', ' ', doc)
    
    return doc



wiki_doc = None

with open('../data/events/clinton_lewinsky_affr.wiki') as fl:
    wiki_doc = fl.readlines()

#Remove URLs First?
wiki_doc = clean(wiki_doc)


nlp = spacy.load('en_core_web_lg')
doc_nlped = nlp(doc)


items = []
for x in doc_nlped.ents:
    if x.label_ == 'PERSON':
        items.append(x.text)
potential_entities = Counter(items).most_common(10)

In [65]:
potential_entities

[('lewinsky', 20),
 ('clinton', 6),
 ('jones', 3),
 ('robert livingston', 3),
 ('richard', 2),
 ('susan webber wright', 2),
 ('kenneth starr', 2),
 ('kalb', 2),
 ('michael', 2),
 ('starr', 2)]

### Knowledge Graph

In [76]:
 # Authorize server-to-server interactions from Google Compute Engine.

from googleapiclient.discovery import build
service = build('kgsearch', 'v1', developerKey='AIzaSyDpC2xZhGIpB5DXwKiFfnzE4GjGY8Jxsmk')


In [80]:
res = service.entities


<bound method Resource._add_nested_resources.<locals>.createResourceMethod.<locals>.methodResource of <googleapiclient.discovery.Resource object at 0x1acb9b7e48>>

#### Misc Spacy

In [39]:
doc

' billclintonseries the clinton–lewinsky scandal was an american political scandal political sex scandal that involved year old president of the united states of america president bill clinton and year old white house intern monica lewinsky . the sexual relationship took place between and and came to light in . clinton ended a televised speech in late january with the statement that he did not have sexual relations with that woman, miss lewinsky . further investigation led to charges of perjury and to the impeachment of bill clinton impeachment of president clinton in by the united states house of representatives u.s. house of representatives . he was subsequently acquitted on all impeachment charges of perjury and obstruction of justice in a day united states senate senate trial. ref cite book last posner first richard a, title an affair of state the investigation, impeachment, and trial of president clinton url https www.nytimes.com books first p posner affair.html accessdate march ,

In [44]:
#sentence_spans = list(doc_nlped.sents)
#displacy.serve(sentence_spans, style='dep')
displacy.render(doc_nlped, style='ent', jupyter=True)

spacy.tokens.doc.Doc

In [None]:
labels = [x.label_ for x in doc_nlped.ents]
Counter(labels)

In [None]:
for ent in doc_nlped.ents:
    if ent.label_ == 'PERSON':
        print(ent.text, ent.start_char, ent.end_char, ent.label_)

### Moving to scraping for names

In [158]:
from bs4 import BeautifulSoup
import requests

In [166]:
clinton_affair = requests.get('https://en.wikipedia.org/wiki/Clinton%E2%80%93Lewinsky_scandal')
soup = BeautifulSoup(clinton_affair.text, 'html.parser')

In [167]:
soup.find_all('a')

[<a id="top"></a>,
 <a class="mw-jump-link" href="#mw-head">Jump to navigation</a>,
 <a class="mw-jump-link" href="#p-search">Jump to search</a>,
 <a class="image" href="/wiki/File:Bill_Clinton.jpg"><img alt="Bill Clinton.jpg" class="thumbborder" data-file-height="3000" data-file-width="2299" decoding="async" height="98" src="//upload.wikimedia.org/wikipedia/commons/thumb/d/d3/Bill_Clinton.jpg/75px-Bill_Clinton.jpg" srcset="//upload.wikimedia.org/wikipedia/commons/thumb/d/d3/Bill_Clinton.jpg/113px-Bill_Clinton.jpg 1.5x, //upload.wikimedia.org/wikipedia/commons/thumb/d/d3/Bill_Clinton.jpg/150px-Bill_Clinton.jpg 2x" width="75"/></a>,
 <a href="/wiki/Bill_Clinton" title="Bill Clinton"><span style="color: #FFF">Bill Clinton</span></a>,
 <a href="/wiki/Clintonism" title="Clintonism">Political positions</a>,
 <a href="/wiki/Electoral_history_of_Bill_Clinton" title="Electoral history of Bill Clinton">Electoral history</a>,
 <a href="/wiki/Early_life_and_career_of_Bill_Clinton" title="Early li

### Reset, Crawling for Casting Pool

In [4]:
from bs4 import BeautifulSoup
import requests

actors = requests.get('https://en.wikipedia.org/wiki/Academy_Award_for_Best_Actor')
actresses = requests.get('https://en.wikipedia.org/wiki/Academy_Award_for_Best_Actress')
sup_actors = requests.get('https://en.wikipedia.org/wiki/Academy_Award_for_Best_Supporting_Actor')
sup_actresses = requests.get('https://en.wikipedia.org/wiki/Academy_Award_for_Best_Supporting_Actress')


In [5]:
soup_act = BeautifulSoup(actors.text, 'html.parser')
soup_actress = BeautifulSoup(actresses.text, 'html.parser')
soup_sup_act = BeautifulSoup(sup_actors.text, 'html.parser')
soup_sup_actress = BeautifulSoup(sup_actresses.text, 'html.parser')


In [6]:
table_act = soup_act.find_all('table')[2]
table_actress = soup_actress.find_all('table')[2]
table_sup_act = soup_sup_act.find_all('table')[2]
table_sup_actress = soup_sup_actress.find_all('table')[2]


In [153]:
table

<table border="2" cellpadding="4" class="wikitable sortable">
<tbody><tr>
<th scope="col" style="width:8%;">Year
</th>
<th scope="col" style="width:25%;">Actress
</th>
<th scope="col" style="width:30%;">Role(s)
</th>
<th scope="col" style="width:70%;">Film
</th>
<th class="unsortable" scope="col" style="width:2%;"><abbr title="Reference">Ref.</abbr>
</th></tr>
<tr>
<th rowspan="3" scope="row" style="text-align:center"><a href="/wiki/1927_in_film" title="1927 in film">1927</a>/<a href="/wiki/1928_in_film" title="1928 in film">28</a> <br/><small><a href="/wiki/1st_Academy_Awards" title="1st Academy Awards">(1st)</a> </small>
</th>
<td style="background:#FAEB86;"><b><span data-sort-value="Gaynor !"><a href="/wiki/Janet_Gaynor" title="Janet Gaynor">Janet Gaynor</a></span> <img alt="Award winner" data-file-height="14" data-file-width="9" decoding="async" height="14" src="//upload.wikimedia.org/wikipedia/commons/f/f9/Double-dagger-14-plain.png" width="9"/></b>
</td>
<td style="background:#FA

In [13]:
rows_act = table_act.find_all('tr')[300:]
rows_actress = table_actress.find_all('tr')[290:]
rows_sup_act = table_sup_act.find_all('tr')[290:]
rows_sup_actress = table_sup_actress.find_all('tr')[290:]

In [20]:
len(rows_sup_act)

135

In [27]:
rows_sup_actress[4].td.a['href']

'/wiki/Holly_Hunter'

In [31]:
#best_actresses = []
#best_actor_noms = []
#supporting_actors = []
supporting_actresses = []



for actr in rows_sup_actress:
    try:
       supporting_actresses.append('https://en.wikipedia.org' + actr.td.a['href'])
    except:
        continue

In [136]:
best_actor_noms

['https://en.wikipedia.org/wiki/Gregory_Peck',
 'https://en.wikipedia.org/wiki/Richard_Todd',
 'https://en.wikipedia.org/wiki/John_Wayne',
 'https://en.wikipedia.org/wiki/Jos%C3%A9_Ferrer',
 'https://en.wikipedia.org/wiki/Louis_Calhern',
 'https://en.wikipedia.org/wiki/William_Holden',
 'https://en.wikipedia.org/wiki/James_Stewart',
 'https://en.wikipedia.org/wiki/Spencer_Tracy',
 'https://en.wikipedia.org/wiki/Humphrey_Bogart',
 'https://en.wikipedia.org/wiki/Marlon_Brando',
 'https://en.wikipedia.org/wiki/Montgomery_Clift',
 'https://en.wikipedia.org/wiki/Arthur_Kennedy',
 'https://en.wikipedia.org/wiki/Fredric_March',
 'https://en.wikipedia.org/wiki/Gary_Cooper',
 'https://en.wikipedia.org/wiki/Marlon_Brando',
 'https://en.wikipedia.org/wiki/Kirk_Douglas',
 'https://en.wikipedia.org/wiki/Jos%C3%A9_Ferrer',
 'https://en.wikipedia.org/wiki/Alec_Guinness',
 'https://en.wikipedia.org/wiki/William_Holden',
 'https://en.wikipedia.org/wiki/Marlon_Brando',
 'https://en.wikipedia.org/wiki/Ri

In [32]:
supporting_actors[0]

'https://en.wikipedia.org/wiki/Marlon_Brando'

In [165]:
best_actresses

['https://en.wikipedia.org/wiki/Rachel_Roberts_(actress)',
 'https://en.wikipedia.org/wiki/Natalie_Wood',
 'https://en.wikipedia.org/wiki/Julie_Andrews',
 'https://en.wikipedia.org/wiki/Anne_Bancroft',
 'https://en.wikipedia.org/wiki/Sophia_Loren',
 'https://en.wikipedia.org/wiki/Debbie_Reynolds',
 'https://en.wikipedia.org/wiki/Kim_Stanley',
 'https://en.wikipedia.org/wiki/Julie_Christie',
 'https://en.wikipedia.org/wiki/Julie_Andrews',
 'https://en.wikipedia.org/wiki/Samantha_Eggar',
 'https://en.wikipedia.org/wiki/Elizabeth_Hartman',
 'https://en.wikipedia.org/wiki/Simone_Signoret',
 'https://en.wikipedia.org/wiki/Elizabeth_Taylor',
 'https://en.wikipedia.org/wiki/Anouk_Aim%C3%A9e',
 'https://en.wikipedia.org/wiki/Ida_Kami%C5%84ska',
 'https://en.wikipedia.org/wiki/Lynn_Redgrave',
 'https://en.wikipedia.org/wiki/Vanessa_Redgrave',
 'https://en.wikipedia.org/wiki/Katharine_Hepburn',
 'https://en.wikipedia.org/wiki/Anne_Bancroft',
 'https://en.wikipedia.org/wiki/Faye_Dunaway',
 'https

In [33]:
academy_roles = [best_actor_noms, supporting_actors, best_actresses, supporting_actresses]

In [34]:
for r in academy_roles:
    print (len(r))

148
133
162
133


In [36]:
#import pickle
#with open ('a_awrd_actor_urls_v2.txt', 'wb') as fl:
#    pickle.dump(academy_roles, fl)

In [3]:
academy_roles = None
import pickle
with open ('a_awrd_actor_urls_v2.txt', 'rb') as fl:
    academy_roles = pickle.load(fl)

EOFError: Ran out of input

In [104]:
test_actr = requests.get(best_actor_noms[55])
soup = BeautifulSoup(test_actr.text, 'html.parser')

In [105]:
soup = BeautifulSoup(test_actr.text, 'html.parser')

In [81]:
body = soup.find(id='bodyContent')
type(body)

bs4.element.Tag

In [111]:
body = soup.find('div', id='mw-content-text').find('div')
body
#mw-content-text
#body = soup.find('div', id='bodyContent')
#mw-content-text > div

<div class="mw-parser-output"><div class="shortdescription nomobile noexcerpt noprint searchaux" style="display:none">American actor and producer</div>
<div class="hatnote navigation-not-searchable" role="note">This article is about the American actor. For the seismologist, see <a href="/wiki/Thomas_C._Hanks" title="Thomas C. Hanks">Thomas C. Hanks</a>.</div>
<p class="mw-empty-elt">
</p>
<table class="infobox biography vcard" style="width:22em"><tbody><tr><th colspan="2" style="text-align:center;font-size:125%;font-weight:bold"><div class="fn" style="display:inline">Tom Hanks</div></th></tr><tr><td colspan="2" style="text-align:center"><a class="image" href="/wiki/File:Tom_Hanks_2016.jpg"><img alt="Tom Hanks 2016.jpg" data-file-height="3115" data-file-width="2503" decoding="async" height="274" src="//upload.wikimedia.org/wikipedia/commons/thumb/f/fb/Tom_Hanks_2016.jpg/220px-Tom_Hanks_2016.jpg" srcset="//upload.wikimedia.org/wikipedia/commons/thumb/f/fb/Tom_Hanks_2016.jpg/330px-Tom_Han

In [77]:
def is_reference_heading(tag):
    
    span = None
    
    if tag.name == 'h2':
    
        try:
            span = tag.contents[0]

        except:
            #Nothing found in tag, just return False
            return False
       
        if span.name == 'span':
       
            if span['id'] == 'References':
                
                return True
    
    return False
    
    

In [59]:
ref_tag = body.find_all('h2')[9]
ref_tag

<h2><span class="mw-headline" id="References">References</span></h2>

In [80]:
is_reference_heading(ref_tag)

True

In [74]:
ref_tag.contents[0]

<span class="mw-headline" id="References">References</span>

In [113]:
import bs4
found_reference_tag = False
for child in body.children:   
    if not is_reference_heading(child):
        if type(child) is bs4.element.Tag:
            
            print (' ' + child.get_text() + ' ' )
    else:
        print ("STOPP")
        break

 American actor and producer 
 This article is about the American actor. For the seismologist, see Thomas C. Hanks. 
 
 
 Tom HanksHanks in September 2016BornThomas Jeffrey Hanks (1956-07-09) July 9, 1956 (age 62)Concord, California, U.S.ResidenceLos Angeles, California, U.S.Alma materChabot CollegeCalifornia State University, Sacramento (BFA)OccupationActor, filmmakerYears active1977–presentWorksPerformancesNet worth$390 million (May 2014)[1]Political partyDemocraticSpouse(s)Samantha Lewes(m. 1978; div. 1987)Rita Wilson (m. 1988)Children4, including Colin HanksRelativesJim Hanks (brother)Larry Hanks (brother)AwardsFull list 
 Thomas Jeffrey Hanks (born July 9, 1956) is an American actor and filmmaker. Hanks is known for his comedic and dramatic roles in such films as Splash (1984), Big (1988), Turner & Hooch (1989), A League of Their Own (1992), Sleepless in Seattle (1993), Apollo 13 (1995), You've Got Mail (1998), The Green Mile (1999), Cast Away (2000), Road to Perdition (2002), Clo

In [125]:
from bs4 import BeautifulSoup
import requests

def extract_wikipedia_body(response):
    
    page_txts = []
    
    soup = BeautifulSoup(response.text, 'html.parser')
    
    body = soup.find('div', id='mw-content-text').find('div')
    
    found_reference_tag = False
    
    for child in body.children:   
        if not is_reference_heading(child):
            if type(child) is bs4.element.Tag:

                page_txts.append(child.get_text())
        else:
            break
    
    return ' '.join(page_txts)

def is_reference_heading(tag):
    
    span = None
    
    if tag.name == 'h2':
    
        try:
            span = tag.contents[0]

        except:
            #Nothing found in tag, just return False
            return False
       
        if span.name == 'span':
       
            if span['id'] == 'References':
                
                return True
    
    return False
    
    

In [126]:
extract_wikipedia_body(requests.get(best_actor_noms[55]))

'American actor and producer This article is about the American actor. For the seismologist, see Thomas C. Hanks. \n Tom HanksHanks in September 2016BornThomas Jeffrey Hanks (1956-07-09) July 9, 1956 (age\xa062)Concord, California, U.S.ResidenceLos Angeles, California, U.S.Alma\xa0materChabot CollegeCalifornia State University, Sacramento (BFA)OccupationActor, filmmakerYears\xa0active1977–presentWorksPerformancesNet\xa0worth$390\xa0million (May 2014)[1]Political partyDemocraticSpouse(s)Samantha Lewes(m.\xa01978; div.\xa01987)Rita Wilson (m.\xa01988)Children4, including Colin HanksRelativesJim Hanks (brother)Larry Hanks (brother)AwardsFull list Thomas Jeffrey Hanks (born July 9, 1956) is an American actor and filmmaker. Hanks is known for his comedic and dramatic roles in such films as Splash (1984), Big (1988), Turner & Hooch (1989), A League of Their Own (1992), Sleepless in Seattle (1993), Apollo 13 (1995), You\'ve Got Mail (1998), The Green Mile (1999), Cast Away (2000), Road to Per

In [218]:
doc = ''
for tag in soup.select('#bodyContent'):
    doc = doc + tag.get_text() + ' '

In [219]:
doc

'\nFrom Wikipedia, the free encyclopedia \nThis is the latest accepted revision, reviewed on 10 February 2019.\n\n\n\n Jump to navigation\nJump to search\n\n\nAl PacinoPacino in 2004BornAlfredo James Pacino (1940-04-25) April 25, 1940 (age\xa078)New York City, U.S.Alma\xa0materActors StudioHB StudioOccupation\nActor\nfilmmaker\nYears\xa0active1967–presentPartner(s)Jan Tarrant (1988–1989)Beverly D\'Angelo (1997–2003)Children3\nAlfredo James Pacino (/pəˈtʃiːnoʊ/; born April 25, 1940) is an American actor and filmmaker.[1] Pacino has had a career spanning more than five decades, during which time he has received numerous accolades and honors both competitive and honorary, among them an Academy Award, two Tony Awards, two Primetime Emmy Awards, a British Academy Film Award, four Golden Globe Awards, the Lifetime Achievement Award from the American Film Institute, the Golden Globe Cecil B. DeMille Award, and the National Medal of Arts. He is one of few performers to have won a competitive O

In [217]:
clean(doc)

' from wikipedia, the free encyclopedia this is the latest accepted revision, reviewed on february . jump to navigation jump to search al pacinopacino in bornalfredo james pacino ( ) april , (age )new york city, u.s.alma materactors studiohb studiooccupation actor filmmaker years active –presentpartner(s)jan tarrant ( – )beverly d angelo ( – )children alfredo james pacino ( pəˈtʃiːnoʊ ; born april , ) is an american actor and filmmaker. pacino has had a career spanning more than five decades, during which time he has received numerous accolades and honors both competitive and honorary, among them an academy award, two tony awards, two primetime emmy awards, a british academy film award, four golden globe awards, the lifetime achievement award from the american film institute, the golden globe cecil b. demille award, and the national medal of arts. he is one of few performers to have won a competitive oscar, an emmy, and a tony award for acting, dubbed the triple crown of acting . a met

### Testing Cleaning Methods

In [206]:
test_text = str(soup.select('#bodyContent')[0])

In [207]:
re.sub(r'<.*>', ' ', test_text)

' \n \n \n \n \n \n \n \n \n \n \n \n \n \n ".\n  (1990). Pacino\'s performance as Michael Corleone in these films is regarded as one of the greatest screen performances in film history.\n  for each role.\n .\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n  after gaining admission by audition.\nHis mother disagreed with his decision and, after an argument, he left home. To finance his acting studies, Pacino took low-paying jobs as messenger,  \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n , but the company was unable to do it at the time due to the small cast.\n \n \n , a play written for him by Mamet. This was in a limited run of 87 performances following the acclaimed reviews of four performances in October 2015.\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n  The film received mixed reviews, and had problems in pre-production due to script rewrites and the withdrawal of actors shortly before production.\n \n \n \n \n .\n \n \n \n \n \n

In [205]:
type(test_text)

NoneType

In [197]:
len(test_text)

1

#### Scrape to Mongo

In [None]:
for each url

doc = ''
for tag in soup.select('#bodyContent'):
    doc = doc + tag.get_text() + ' '
    
clean(doc)

set a few props

actor_name = soup.select('#firstHeading')[0].get_text()


store in mongo
actor_name
url
doc

In [127]:
import pymongo
from pymongo import MongoClient

client = MongoClient()

db = client.conan
casting_pool = db.casting_pool

In [131]:
import re
def clean(text):
    doc = ''.join(text).lower()
    doc = re.sub(r'[<>\{}/|\[\]-]', ' ', doc)
    doc = re.sub(r'[0-9]', ' ', doc)
    doc = re.sub(r'\'', ' ', doc)
    doc = re.sub(r'=', ' ', doc)
    doc = re.sub(r':', ' ', doc)
    doc = re.sub(r'"', ' ', doc)
    doc = re.sub(r'\s+', ' ', doc)
    
    return doc

In [128]:
test_actr = requests.get(best_actor_noms[55])
soup = BeautifulSoup(test_actr.text, 'html.parser')

In [132]:
test_actr2 = {"name": soup.select('#firstHeading')[0].get_text()
              ,"url": best_actor_noms[55]
              ,"wiki_doc": extract_wikipedia_body(test_actr)

In [226]:
first_actor = {"name": soup.select('#firstHeading')[0].get_text()
              ,"url": "FAKE"
              ,"wiki_doc": clean(doc)}

In [227]:
first_actor

{'name': 'Al Pacino',
 'url': 'FAKE',
 'wiki_doc': ' from wikipedia, the free encyclopedia this is the latest accepted revision, reviewed on february . jump to navigation jump to search al pacinopacino in bornalfredo james pacino ( ) april , (age )new york city, u.s.alma materactors studiohb studiooccupation actor filmmaker years active –presentpartner(s)jan tarrant ( – )beverly d angelo ( – )children alfredo james pacino ( pəˈtʃiːnoʊ ; born april , ) is an american actor and filmmaker. pacino has had a career spanning more than five decades, during which time he has received numerous accolades and honors both competitive and honorary, among them an academy award, two tony awards, two primetime emmy awards, a british academy film award, four golden globe awards, the lifetime achievement award from the american film institute, the golden globe cecil b. demille award, and the national medal of arts. he is one of few performers to have won a competitive oscar, an emmy, and a tony award fo

In [133]:
test_actr2

{'name': 'Tom Hanks',
 'url': 'https://en.wikipedia.org/wiki/Tom_Hanks',
 'wiki_doc': 'American actor and producer This article is about the American actor. For the seismologist, see Thomas C. Hanks. \n Tom HanksHanks in September 2016BornThomas Jeffrey Hanks (1956-07-09) July 9, 1956 (age\xa062)Concord, California, U.S.ResidenceLos Angeles, California, U.S.Alma\xa0materChabot CollegeCalifornia State University, Sacramento (BFA)OccupationActor, filmmakerYears\xa0active1977–presentWorksPerformancesNet\xa0worth$390\xa0million (May 2014)[1]Political partyDemocraticSpouse(s)Samantha Lewes(m.\xa01978; div.\xa01987)Rita Wilson (m.\xa01988)Children4, including Colin HanksRelativesJim Hanks (brother)Larry Hanks (brother)AwardsFull list Thomas Jeffrey Hanks (born July 9, 1956) is an American actor and filmmaker. Hanks is known for his comedic and dramatic roles in such films as Splash (1984), Big (1988), Turner & Hooch (1989), A League of Their Own (1992), Sleepless in Seattle (1993), Apollo 13

In [231]:
casting_pool.insert_one(first_actor)

<pymongo.results.InsertOneResult at 0x132050bc8>

In [235]:
import time

for actr in academy:
    
    clean_doc = ''
    actor_name = ''
    
    try:
        actr_html = requests.get(actr)
    
    except:
        print(f'Request failed for {actr}')
        continue
        
    try:
        
        soup = BeautifulSoup(actr_html.text, 'html.parser')
        
        doc = ''
        
        for tag in soup.select('#bodyContent'):
            doc = doc + tag.get_text() + ' '
            
        clean_doc = clean(doc)
        
    except:
        
        print(f'Doc Cleaning failed for {actr}')
        continue
        
    try:
        actor_name = soup.select('#firstHeading')[0].get_text()
        
    except:
        print(f'Falied to find name in Heading for {actr}')
        continue
    
    
    try:
        actor_ = {"name": actor_name, "url": actr, "wiki_doc": clean_doc}
        
        casting_pool.insert_one(actor_)
        print(f'inserted {actor_name} into MongoDB.')
    except:
        continue
    
    
    time.sleep(2)
    
        
    

inserted Peter O'Toole into MongoDB.
inserted Anthony Quinn into MongoDB.
inserted Peter Sellers into MongoDB.
inserted Lee Marvin into MongoDB.
inserted Richard Burton into MongoDB.
inserted Laurence Olivier into MongoDB.
inserted Rod Steiger into MongoDB.
inserted Oskar Werner into MongoDB.
inserted Paul Scofield into MongoDB.
inserted Alan Arkin into MongoDB.
inserted Richard Burton into MongoDB.
inserted Michael Caine into MongoDB.
inserted Steve McQueen into MongoDB.
inserted Rod Steiger into MongoDB.
inserted Warren Beatty into MongoDB.
inserted Dustin Hoffman into MongoDB.
inserted Paul Newman into MongoDB.
inserted Spencer Tracy into MongoDB.
inserted Cliff Robertson into MongoDB.
inserted Alan Arkin into MongoDB.
inserted Alan Bates into MongoDB.
inserted Ron Moody into MongoDB.
inserted Peter O'Toole into MongoDB.
inserted John Wayne into MongoDB.
inserted Richard Burton into MongoDB.
inserted Dustin Hoffman into MongoDB.
inserted Peter O'Toole into MongoDB.
inserted Jon Voig

inserted Mickey Rourke into MongoDB.
inserted Jeff Bridges into MongoDB.
inserted George Clooney into MongoDB.
inserted Colin Firth into MongoDB.
inserted Morgan Freeman into MongoDB.
inserted Jeremy Renner into MongoDB.
inserted Colin Firth into MongoDB.
inserted Javier Bardem into MongoDB.
inserted Jeff Bridges into MongoDB.
inserted Jesse Eisenberg into MongoDB.
inserted James Franco into MongoDB.
inserted Jean Dujardin into MongoDB.
inserted Demián Bichir into MongoDB.
inserted George Clooney into MongoDB.
inserted Gary Oldman into MongoDB.
inserted Brad Pitt into MongoDB.
inserted Daniel Day-Lewis into MongoDB.
inserted Bradley Cooper into MongoDB.
inserted Hugh Jackman into MongoDB.
inserted Joaquin Phoenix into MongoDB.
inserted Denzel Washington into MongoDB.
inserted Matthew McConaughey into MongoDB.
inserted Christian Bale into MongoDB.
inserted Bruce Dern into MongoDB.
inserted Leonardo DiCaprio into MongoDB.
inserted Chiwetel Ejiofor into MongoDB.
inserted Eddie Redmayne in

inserted Benicio del Toro into MongoDB.
inserted Djimon Hounsou into MongoDB.
inserted Ken Watanabe into MongoDB.
inserted Morgan Freeman into MongoDB.
inserted Alan Alda into MongoDB.
inserted Thomas Haden Church into MongoDB.
inserted Jamie Foxx into MongoDB.
inserted Clive Owen into MongoDB.
inserted George Clooney into MongoDB.
inserted Matt Dillon into MongoDB.
inserted Paul Giamatti into MongoDB.
inserted Jake Gyllenhaal into MongoDB.
inserted William Hurt into MongoDB.
inserted Alan Arkin into MongoDB.
inserted Jackie Earle Haley into MongoDB.
inserted Djimon Hounsou into MongoDB.
inserted Eddie Murphy into MongoDB.
inserted Mark Wahlberg into MongoDB.
inserted Javier Bardem into MongoDB.
inserted Casey Affleck into MongoDB.
inserted Philip Seymour Hoffman into MongoDB.
inserted Hal Holbrook into MongoDB.
inserted Tom Wilkinson into MongoDB.
inserted Heath Ledger into MongoDB.
inserted Josh Brolin into MongoDB.
inserted Robert Downey Jr. into MongoDB.
inserted Philip Seymour Hof

inserted Emma Thompson into MongoDB.
inserted Catherine Deneuve into MongoDB.
inserted Mary McDonnell into MongoDB.
inserted Michelle Pfeiffer into MongoDB.
inserted Susan Sarandon into MongoDB.
inserted Holly Hunter into MongoDB.
inserted Angela Bassett into MongoDB.
inserted Stockard Channing into MongoDB.
inserted Emma Thompson into MongoDB.
inserted Debra Winger into MongoDB.
inserted Jessica Lange into MongoDB.
inserted Jodie Foster into MongoDB.
inserted Miranda Richardson into MongoDB.
inserted Winona Ryder into MongoDB.
inserted Susan Sarandon into MongoDB.
inserted Susan Sarandon into MongoDB.
inserted Elisabeth Shue into MongoDB.
inserted Sharon Stone into MongoDB.
inserted Meryl Streep into MongoDB.
inserted Emma Thompson into MongoDB.
inserted Frances McDormand into MongoDB.
inserted Brenda Blethyn into MongoDB.
inserted Diane Keaton into MongoDB.
inserted Kristin Scott Thomas into MongoDB.
inserted Emily Watson into MongoDB.
inserted Helen Hunt into MongoDB.
inserted Helen

inserted Maggie Smith into MongoDB.
inserted Olympia Dukakis into MongoDB.
inserted Norma Aleandro into MongoDB.
inserted Anne Archer into MongoDB.
inserted Anne Ramsey into MongoDB.
inserted Ann Sothern into MongoDB.
inserted Geena Davis into MongoDB.
inserted Joan Cusack into MongoDB.
inserted Frances McDormand into MongoDB.
inserted Michelle Pfeiffer into MongoDB.
inserted Sigourney Weaver into MongoDB.
inserted Brenda Fricker into MongoDB.
inserted Anjelica Huston into MongoDB.
inserted Lena Olin into MongoDB.
inserted Julia Roberts into MongoDB.
inserted Dianne Wiest into MongoDB.
inserted Whoopi Goldberg into MongoDB.
inserted Annette Bening into MongoDB.
inserted Lorraine Bracco into MongoDB.
inserted Diane Ladd into MongoDB.
inserted Mary McDonnell into MongoDB.
inserted Mercedes Ruehl into MongoDB.
inserted Diane Ladd into MongoDB.
inserted Juliette Lewis into MongoDB.
inserted Kate Nelligan into MongoDB.
inserted Jessica Tandy into MongoDB.
inserted Marisa Tomei into MongoDB.

['https://en.wikipedia.org/wiki/Peter_O%27Toole',
 'https://en.wikipedia.org/wiki/Anthony_Quinn',
 'https://en.wikipedia.org/wiki/Peter_Sellers',
 'https://en.wikipedia.org/wiki/Lee_Marvin',
 'https://en.wikipedia.org/wiki/Richard_Burton',
 'https://en.wikipedia.org/wiki/Laurence_Olivier',
 'https://en.wikipedia.org/wiki/Rod_Steiger',
 'https://en.wikipedia.org/wiki/Oskar_Werner',
 'https://en.wikipedia.org/wiki/Paul_Scofield',
 'https://en.wikipedia.org/wiki/Alan_Arkin',
 'https://en.wikipedia.org/wiki/Richard_Burton',
 'https://en.wikipedia.org/wiki/Michael_Caine',
 'https://en.wikipedia.org/wiki/Steve_McQueen',
 'https://en.wikipedia.org/wiki/Rod_Steiger',
 'https://en.wikipedia.org/wiki/Warren_Beatty',
 'https://en.wikipedia.org/wiki/Dustin_Hoffman',
 'https://en.wikipedia.org/wiki/Paul_Newman',
 'https://en.wikipedia.org/wiki/Spencer_Tracy',
 'https://en.wikipedia.org/wiki/Cliff_Robertson',
 'https://en.wikipedia.org/wiki/Alan_Arkin',
 'https://en.wikipedia.org/wiki/Alan_Bates',
 

In [233]:
academy = [item for sublist in academy_roles for item in sublist]

In [234]:
len(academy)

1040

In [None]:
title, roles[], event_doc, summary

In [239]:
event_art = requests.get('https://en.wikipedia.org/wiki/Cuban_Missile_Crisis')

In [240]:
clean_doc = ''
soup = BeautifulSoup(event_art.text, 'html.parser')
        
doc = ''
        
for tag in soup.select('#bodyContent'):
    doc = doc + tag.get_text() + ' '
            
clean_doc = clean(doc)
clean_doc




In [241]:
cmc_roles = ['https://en.wikipedia.org/wiki/John_F._Kennedy', 'https://en.wikipedia.org/wiki/Robert_McNamara', 'https://en.wikipedia.org/wiki/Jacqueline_Kennedy_Onassis'\
            , 'https://en.wikipedia.org/wiki/Robert_F._Kennedy', 'https://en.wikipedia.org/wiki/Nikita_Khrushchev']

In [243]:
import time
def get_person_obj(person_url_list):

    people = []

    for actr in person_url_list:

        clean_doc = ''
        actor_name = ''

        try:
            actr_html = requests.get(actr)

        except:
            print(f'Request failed for {actr}')
            continue

        try:

            soup = BeautifulSoup(actr_html.text, 'html.parser')

            doc = ''

            for tag in soup.select('#bodyContent'):
                doc = doc + tag.get_text() + ' '

            clean_doc = clean(doc)

        except:

            print(f'Doc Cleaning failed for {actr}')
            continue

        try:
            actor_name = soup.select('#firstHeading')[0].get_text()

        except:
            print(f'Falied to find name in Heading for {actr}')
            continue


        try:
            people.append({"name": actor_name, "url": actr, "wiki_doc": clean_doc})

            #casting_pool.insert_one(actor_)
            #print(f'inserted {actor_name} into MongoDB.')
            
        except:
            continue
    


        time.sleep(2)
        
    return people

In [244]:
cmc_people = get_person_obj(cmc_roles)

In [246]:
len(cmc_people)

5

In [247]:
cmc = {'title': 'Cuban Missile Crisis', 'roles': cmc_people, 'event_doc': clean_doc, 'summary': 'JFK Gets Nukes Out of Cuba.'}

In [249]:
# Roles need to be mongo ids, not actual entries

In [250]:
mongo_ids = []
for person in cmc_people:
    mongo_ids.append(casting_pool.insert_one(person).inserted_id)

In [251]:
mongo_ids

[ObjectId('5c6efd6dfc0c4b94ecceeef8'),
 ObjectId('5c6efd6dfc0c4b94ecceeef9'),
 ObjectId('5c6efd6dfc0c4b94ecceeefa'),
 ObjectId('5c6efd6dfc0c4b94ecceeefb'),
 ObjectId('5c6efd6dfc0c4b94ecceeefc')]

In [253]:
cmc = {'title': 'Cuban Missile Crisis', 'roles': mongo_ids, 'event_doc': clean_doc, 'summary': 'JFK Gets Nukes Out of Cuba.'}

In [254]:
cmc

{'title': 'Cuban Missile Crisis',
 'roles': [ObjectId('5c6efd6dfc0c4b94ecceeef8'),
  ObjectId('5c6efd6dfc0c4b94ecceeef9'),
  ObjectId('5c6efd6dfc0c4b94ecceeefa'),
  ObjectId('5c6efd6dfc0c4b94ecceeefb'),
  ObjectId('5c6efd6dfc0c4b94ecceeefc')],
 'summary': 'JFK Gets Nukes Out of Cuba.'}

In [255]:
events = db.event

In [256]:
events.insert_one(cmc)

<pymongo.results.InsertOneResult at 0x13c052a48>

In [260]:
def create_event(url, role_list, summary, title):

    roles = []
    mongo_ids = []
    clean_doc = ''
    
    event_art = requests.get(url)

    
    soup = BeautifulSoup(event_art.text, 'html.parser')

    doc = ''

    for tag in soup.select('#bodyContent'):
        doc = doc + tag.get_text() + ' '

    clean_doc = clean(doc)
    clean_doc
    
    roles = get_person_obj(role_list)

    for person in roles:
        try:
            mongo_ids.append(casting_pool.insert_one(person).inserted_id)
        except:
            print(f'Could not insert {person} into database')
            pass
    
    event = {'title': title
             , 'roles': mongo_ids
             , 'event_doc': clean_doc
             , 'summary': summary}
    
    try:
        events.insert_one(event)
    except:
        print(f'Failed to insert {event} into MongoDB')
              
    
    
        
    
    
        

In [277]:
url = 'https://en.wikipedia.org/wiki/Super_Bowl_XXXVIII_halftime_show_controversy'

In [278]:
cast_list = ['https://en.wikipedia.org/wiki/Janet_Jackson'
             ,'https://en.wikipedia.org/wiki/Justin_Timberlake'
            ,'https://en.wikipedia.org/wiki/Kid_Rock'
            ,'https://en.wikipedia.org/wiki/Greg_Gumbel'
            ,'https://en.wikipedia.org/wiki/Tom_Brady']

In [279]:
summary = 'America Panics at Sight of Nipple'

In [280]:
title = 'Nipplegate'

In [281]:
create_event(url,cast_list,summary,title)