# Web Scraping
- Ethics
- Sending requests
- Parsing HTML with Beautiful Soup
- Crash course in HTML/CSS
- Using the browser inspector to get a CSS Selector for items

In [1]:
from requests import get
from bs4 import BeautifulSoup

In [2]:
# Fetch the Codeup blog post that the next cells parse.
url = 'https://codeup.com/codeups-data-science-career-accelerator-is-here/'
headers = {'User-Agent': 'Codeup Data Science'} # Some websites don't accept the python-requests default user-agent
# Without a timeout, requests will wait indefinitely on an unresponsive server.
response = get(url, headers=headers, timeout=10)
# Fail loudly on 4xx/5xx instead of silently parsing an error page as if it were the article.
response.raise_for_status()

In [3]:
# response.text is the page's HTML as a single string: the same markup you get by
# going to that url in your browser then clicking "View Source".
# Slice it so only the first 100 characters are displayed.
response.text[:100]

'<!DOCTYPE html><html lang="en-US"><head ><meta charset="UTF-8" /><meta name="viewport" content="widt'

In [4]:
# response.content is the same payload as raw bytes (note the b'...' prefix in the output)
response.content[:100]

b'<!DOCTYPE html><html lang="en-US"><head ><meta charset="UTF-8" /><meta name="viewport" content="widt'

In [5]:
# Turning the raw HTML into soup is the critical step: everything below works on this object.
# Here the response body is passed as bytes (response.content) and parsed with 'html.parser'.
soup = BeautifulSoup(response.content, 'html.parser')

# The type is a BeautifulSoup object
# BeautifulSoup object provides methods and properties we can use
type(soup)

In [6]:
# soup.<element>.text gives the text inside the first matching element; here, the page <title>
soup.title.text

'Codeup’s Data Science Career Accelerator is Here! - Codeup'

In [7]:
# prettify() returns the document as an indented string; print just its first 100 characters
print(soup.prettify()[:100])

<!DOCTYPE html>
<html lang="en-US">
 <head>
  <meta charset="utf-8"/>
  <meta content="width=device-


`<h1 class="jupiterx-post-title" itemprop="headline">Codeup’s Data Science Career Accelerator is Here!</h1>`
- `h1` is the element's name — short for "heading 1". HTML has six heading levels, `h1` through `h6`.
- `class` is an attribute (so is `itemprop`).
- Attributes describe an element; each is written as `name="value"` inside the opening tag.

In [8]:
# soup.<element_name> gives us the first element with that tag name (here the <h1>)
soup.h1

<h1 class="jupiterx-post-title" itemprop="headline">Codeup’s Data Science Career Accelerator is Here!</h1>

In [9]:
# soup.<element_name>.text gets just the text inside that element, without the tag markup
soup.h1.text

'Codeup’s Data Science Career Accelerator is Here!'

In [10]:
# soup.<element> gives back only the FIRST matching element; here, the first <a> on the page
soup.a

<a class="jupiterx-a11y jupiterx-a11y-skip-navigation-link" href="#jupiterx-primary">Skip to content</a>

In [11]:
# find_all returns every matching element as a list; slice it to preview the first four links
soup.find_all("a")[:4]

[<a class="jupiterx-a11y jupiterx-a11y-skip-navigation-link" href="#jupiterx-primary">Skip to content</a>,
 <a class="raven-site-logo-link" href="https://codeup.com"> <noscript><img alt="Codeup" class="raven-site-logo-desktop raven-site-logo-tablet raven-site-logo-mobile" data-no-lazy="1" sizes="(max-width: 226px) 100vw, 226px" src="https://750092.smushcdn.com/1449913/wp-content/uploads/2018/08/logo.png?lossy=1&amp;strip=1&amp;webp=1" srcset="https://750092.smushcdn.com/1449913/wp-content/uploads/2018/08/logo.png

In [12]:
# Brings back the footer element.
# soup.footer

In [13]:
# Our new friend is .select
# .select allows us to pass in a CSS selector as a string
# and gives back a list of the matching elements (a one-item list in the output here).
# NOTE(review): the name `datetime` would shadow the standard-library module of the
# same name if that module were ever imported in this notebook.

datetime = soup.select("header > ul > li.jupiterx-post-meta-date.list-inline-item > time")
datetime

[<time datetime="2018-09-30T05:26:22+00:00" itemprop="datePublished">September 30, 2018</time>]

In [14]:
# .select returns a list, so index [0] to keep only the first matching <a> tag
link = soup.select("div > div.jupiterx-post-content.clearfix > p:nth-child(1) > strong > a")[0]

In [15]:
# .text is the text shown inside the <a> tag
link.text

'Glassdoor’s #1 Best Job in America'

In [16]:
# Treat the tag like a dictionary to read one of its attributes;
# the "href" attribute holds the link URL
link["href"]

'https://www.glassdoor.com/List/Best-Jobs-in-America-LST_KQ0,20.htm'

In [17]:
# A small hand-written HTML document used to practise CSS selectors offline.
# It contains one element with id="posts" and three elements with class="blog_post".
# The top-of-page wrapper uses <header>, the standard HTML element for introductory
# content (<heading> is not an HTML element).
page = """
<html>
    <head>
        <title>This is the title of the page</title>
    </head>
    <body>
        <header>
            <h1>Welcome to the blog!</h1>
            <p>Blog is short for "back-log"</p>
        </header>
        <main>
            <h2>Read your way to insight!</h2>
            <section id="posts">
                <article class="blog_post">
                    <h3>Hello World</h3>
                    <p>This is the first post!</p>
                </article>
                <article class="blog_post">
                    <h3>HTML Is Awesome</h3>
                    <p>It's the language and structure for the web!</p>
                </article>
                <article class="blog_post">
                    <h3>CSS Is Totally Rad</h3>
                    <p>CSS Selectors are super powerful</p>
                </article>
            </section>
        </main>
        <footer>
            <p>All rights reserved.</p>
        </footer>
    </body>
</html>
"""

In [18]:
# Parse the sample page string into its own soup; BeautifulSoup accepts a plain str as well
soup2 = BeautifulSoup(page, 'html.parser')

In [19]:
# Element access by name works the same way on this soup: the sample page's <title> element
soup2.title

<title>This is the title of the page</title>

In [20]:
# In CSS, a leading . selects by class,
# so ".blog_post" matches every element with class="blog_post"
soup2.select(".blog_post")

[<article class="blog_post">
 <h3>Hello World</h3>
 <p>This is the first post!</p>
 </article>,
 <article class="blog_post">
 <h3>HTML Is Awesome</h3>
 <p>It's the language and structure for the web!</p>
 </article>,
 <article class="blog_post">
 <h3>CSS Is Totally Rad</h3>
 <p>CSS Selectors are super powerful</p>
 </article>]

In [21]:
# In CSS, a leading # selects by id, so "#posts" matches the element with id="posts"
soup2.select("#posts")

[<section id="posts">
 <article class="blog_post">
 <h3>Hello World</h3>
 <p>This is the first post!</p>
 </article>
 <article class="blog_post">
 <h3>HTML Is Awesome</h3>
 <p>It's the language and structure for the web!</p>
 </article>
 <article class="blog_post">
 <h3>CSS Is Totally Rad</h3>
 <p>CSS Selectors are super powerful</p>
 </article>
 </section>]

In [22]:
# Accessing an element by name gives back a bs4 Tag object
type(soup2.main)

bs4.element.Tag

In [23]:
# Element access can be chained: the first <h2> inside <main>
soup2.main.h2

<h2>Read your way to insight!</h2>

In [24]:
# A Tag has .select too; called on <main>, it searches only inside that element
soup2.main.select("article")

[<article class="blog_post">
 <h3>Hello World</h3>
 <p>This is the first post!</p>
 </article>,
 <article class="blog_post">
 <h3>HTML Is Awesome</h3>
 <p>It's the language and structure for the web!</p>
 </article>,
 <article class="blog_post">
 <h3>CSS Is Totally Rad</h3>
 <p>CSS Selectors are super powerful</p>
 </article>]

In [25]:
# Inshorts example

In [26]:
# Fetch and parse the Inshorts business page.
# NOTE: this rebinds url, headers, response and soup, so from here on `soup`
# is the Inshorts page, not the Codeup blog post parsed earlier.
url = 'https://inshorts.com/en/read/business'
headers = {'User-Agent': 'Codeup Data Science'} # Some websites don't accept the python-requests default user-agent
# Without a timeout, requests will wait indefinitely on an unresponsive server.
response = get(url, headers=headers, timeout=10)
# Fail loudly on 4xx/5xx instead of silently parsing an error page.
response.raise_for_status()
soup = BeautifulSoup(response.content, 'html.parser')

In [27]:
# Select every element with class "news-card" (presumably one card per article; verify in the inspector)
articles = soup.select(".news-card")

In [28]:
# How many news cards the selector matched
len(articles)

25

In [29]:
# Take the first card and explore its structure in the cells below
article = articles[0]

In [30]:
# article.span is the first <span> in this div with class news-card
# With element tags (in soup), we can treat them like dictionaries:
# ["itemid"] reads the span's itemid attribute, which holds the article URL in the output here
article.span["itemid"]

'https://inshorts.com/en/news/uber-reduces-its-india-workforce-by-25-fires-600-employees-1590464141274'

In [31]:
# Attribute selectors have the form tag[attr='value'], e.g. a[target="_blank"].
# Take the first <span> whose itemprop is 'headline' and keep only its text.
title = article.select("span[itemprop='headline']")[0].text
title

'Uber India fires 600 employees reducing 25% of its workforce'

In [32]:
# Attribute selector: take the first <div> whose itemprop is 'articleBody' and keep only its text
body = article.select("div[itemprop='articleBody']")[0].text
body

"Uber is firing 600 employees in India, or 25% of its workforce in the country, amid the coronavirus pandemic. The layoffs, which are a part of Uber's global restructuring plan that eliminated 6,700 jobs, affect teams across customer and driver support, business development, legal, finance, policy and marketing verticals. Affected staff will be paid 10-12 weeks of salary, Uber said."