# Live Code Demo of Web Scraping w/ Beautiful Soup
- By: Monica Puerto 
- (slight edits throughout and some commentary by Paul Jeffries)

In [1]:
# import the necessary packages

import requests
from bs4 import BeautifulSoup
import json
import time

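# Python 2 only: uncomment the two lines below if you hit encoding errors
# (on Python 3 this is unnecessary, and sys.setdefaultencoding no longer exists)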
# import sys
# sys.setdefaultencoding('utf8')

In [2]:
# fetch the movie's IMDb page over HTTP
# (always pass a timeout: without one, requests waits indefinitely if the server never answers)
r = requests.get('https://www.imdb.com/title/tt0071853/', timeout=10)
# print the response received -- <Response [200]> means the request succeeded
print(r)

<Response [200]>


In [3]:
# r.text is the body of the response as one string: the raw, still-unparsed HTML
# we won't print it because dumping the whole page source would be slow and flood the output
r_unparsed = r.text

In [4]:
# we'll use Beautiful Soup to parse the raw HTML into a tree we can search,
# and the time module to measure how long that parsing step takes
# (only the parse is timed here -- the HTTP request already finished in an earlier cell)

# start the clock; perf_counter() is the clock meant for measuring short durations
start = time.perf_counter()
b = BeautifulSoup(r_unparsed,'lxml')
# stop the clock
end = time.perf_counter()
# as we can see, it took less than a second
print(end - start)
# had we used the built-in 'html.parser' instead of 'lxml' it would typically have taken longer

0.10545992851257324


In [5]:
# extract the title and save it into a variable
# Beautiful Soup gives you two ways to reach an element:
# follow the tree with attribute access (b.title), or search it with find() / find_all()
title = b.title.text
print(title)
# the <h1> heading carries the on-page title, without the " - IMDb" suffix
print(b.find('h1').text)

# b.title only returns the FIRST <title> tag; if there is more than one,
# find_all() returns all of them as a list
# (stored under its own name so `title` keeps the string extracted above)
all_titles = b.find_all('title')
print(all_titles)

Monty Python and the Holy Grail (1975) - IMDb
Monty Python and the Holy Grail (1975) 
[<title>Monty Python and the Holy Grail (1975) - IMDb</title>, <title>TryIMDbProFree</title>]


In [6]:
# NB: the way that you would find most of these tags--i.e. "summary_text"--is via inspect element
# this can be done in Chrome, Firefox, etc.

# extract the description of the movie and save it into a variable
desc = b.find('div','summary_text').text.strip()
print(desc)

# extract the content rating (e.g. R, PG, PG-13) and print it
# assumes the rating is the first whitespace-delimited token of the "subtext" block,
# as it is for this page; taking the whole token instead of a fixed two-character
# slice keeps "PG-13" intact and does not drag along whitespace after "R"
subtext_tokens = b.find('div','subtext').text.split()
print(subtext_tokens[0] if subtext_tokens else '')

King Arthur and his Knights of the Round Table embark on a surreal, low-budget search for the Holy Grail, encountering many, very silly obstacles.
PG


In [7]:
# here's another method for getting at the various sub-components like the rating, the list of actors, etc.:
# the page embeds its metadata as JSON inside a <script type="application/ld+json"> tag
# parse that block once and reuse the resulting dict
movie_json = json.loads(b.find('script', type='application/ld+json').text)

rating = movie_json['contentRating']

## extract the actors
# NB: deliberately not called `actors` -- the next cell defines a function with that name,
# and re-running this cell afterwards would otherwise replace the function with a list
actor_entries = movie_json['actor']

# each entry is a dict describing one actor; we only want the name
actors_list = [entry['name'] for entry in actor_entries]

# and now that we've iterated through the actors, let's print the list of actors we returned
print(actors_list)

['Graham Chapman', 'John Cleese', 'Eric Idle', 'Terry Gilliam']


In [8]:
# now we can create some functions that take in a parsed Beautiful Soup object and return lists of actors and directors
def actors(x):
	"""Return the names of the actors listed in a parsed movie page.

	Parameters
	----------
	x : parsed page (e.g. a BeautifulSoup object)
		Must support ``x.find('script', type='application/ld+json')`` and return
		an object whose ``.text`` is the page's JSON-LD metadata.

	Returns
	-------
	list of str
		One name per actor, in the order the page lists them; an empty list
		when the metadata has no ``'actor'`` entry.

	Raises
	------
	AttributeError
		If ``find`` returns ``None`` (no JSON-LD script tag found).
	"""
	entries = json.loads(x.find('script', type='application/ld+json').text).get('actor', [])
	# a sole actor may be given as one JSON object instead of a list of them;
	# wrap it so we iterate over actor dicts rather than over the dict's keys
	if isinstance(entries, dict):
		entries = [entries]
	return [str(entry['name']) for entry in entries]

def directors(x):
	"""Return the names of the directors listed in a parsed movie page.

	Parameters
	----------
	x : parsed page (e.g. a BeautifulSoup object)
		Must support ``x.find('script', type='application/ld+json')`` and return
		an object whose ``.text`` is the page's JSON-LD metadata.

	Returns
	-------
	list of str
		One name per director, in the order the page lists them; an empty list
		when the metadata has no ``'director'`` entry.

	Raises
	------
	AttributeError
		If ``find`` returns ``None`` (no JSON-LD script tag found).
	"""
	entries = json.loads(x.find('script', type='application/ld+json').text).get('director', [])
	# a sole director may be given as one JSON object instead of a list of them;
	# wrap it so we iterate over director dicts rather than over the dict's keys
	if isinstance(entries, dict):
		entries = [entries]
	return [str(entry['name']) for entry in entries]

# try both helpers on the Monty Python page parsed earlier (the soup object `b`)
print(actors(b))
print(directors(b))

['Graham Chapman', 'John Cleese', 'Eric Idle', 'Terry Gilliam']
['Terry Gilliam', 'Terry Jones']


In [10]:
# create a function that extracts this information for any IMDb movie of your choosing and returns it as a dictionary

def movie_info(id, timeout=10):
	"""Scrape one IMDb title page and return its key facts as a nested dict.

	Parameters
	----------
	id : str
		IMDb title identifier as it appears in the URL, e.g. 'tt6306064'.
		(The name shadows the built-in ``id``; it is kept so that existing
		keyword calls keep working.)
	timeout : float, optional
		Seconds to wait for the server before giving up (default 10).

	Returns
	-------
	dict
		``{id: {'title': str, 'desc': str, 'rating': str or None, 'actors': list}}``.
		``'rating'`` is ``None`` when the page metadata has no ``'contentRating'``.

	Raises
	------
	requests.HTTPError
		If the server answers with a 4xx/5xx status.
	AttributeError
		If the page lacks the elements being scraped (``find`` returned ``None``).
	"""
	r = requests.get('https://www.imdb.com/title/{0}/'.format(id), timeout=timeout)
	# fail loudly on an error response instead of trying to scrape an error page
	r.raise_for_status()
	b = BeautifulSoup(r.text,'lxml')
	# the page's metadata, embedded as JSON in a <script type="application/ld+json"> tag
	metadata = json.loads(b.find('script', type='application/ld+json').text)
	movie_dict = {}
	movie_dict[id] = {
		'title': b.title.text,
		'desc': b.find('div','summary_text').text.strip(),
		# .get(): a title without a content rating should not abort the whole scrape
		'rating': metadata.get('contentRating'),
		'actors': actors(b),
	}
	return movie_dict

# try it on a different movie: tt6306064 is the IMDb id of "Adrift (2018)"
Adrift = movie_info('tt6306064')
print(Adrift)

{'tt6306064': {'title': 'Adrift (2018) - IMDb', 'desc': "A true story of survival, as a young couple's chance encounter leads them first to love, and then on the adventure of a lifetime as they face one of the most catastrophic hurricanes in recorded history.", 'rating': 'PG-13', 'actors': ['Shailene Woodley', 'Sam Claflin', 'Jeffrey Thomas', 'Elizabeth Hawthorne']}}
