# Web Scraping
- Ethics
- Sending requests
- Parsing HTML with Beautiful Soup
- Crash course in HTML/CSS
- Using the browser inspector to get a CSS Selector for items

In [1]:
from requests import get
from bs4 import BeautifulSoup

In [2]:
url = 'https://codeup.com/codeups-data-science-career-accelerator-is-here/'
# Some websites don't accept the python-requests default user-agent,
# so identify ourselves with a custom one.
headers = {'User-Agent': 'Codeup Data Science'}
# Without a timeout, requests can wait indefinitely on an unresponsive server.
response = get(url, headers=headers, timeout=10)
# Fail here on a 4xx/5xx reply rather than parsing an error page further down.
response.raise_for_status()

In [3]:
# response.text is a single string of HTML: the same markup you would see by
# going to that url in your browser then clicking "View Source"
response.text[:100]

'<!DOCTYPE html><html lang="en-US"><head ><meta charset="UTF-8" /><meta name="viewport" content="widt'

In [4]:
# response.content is the same response body as raw bytes (note the b'...' prefix in the output)
response.content[:100]

b'<!DOCTYPE html><html lang="en-US"><head ><meta charset="UTF-8" /><meta name="viewport" content="widt'

In [5]:
# Parse the raw response body into a BeautifulSoup object ("soup");
# the element lookups in the following cells all go through it.
soup = BeautifulSoup(markup=response.content, features="html.parser")

In [6]:
# Evaluating soup on its own displays the parsed document's markup (very long output)
soup

<!DOCTYPE html>
var wp_konami_code_js_local = {"wp_konami_code_page":"https:\/\/codeup.com\/konami-easter-egg\/"};
/*  */</script> <script type="text/javascript">if(navigator.userAgent.match(/x11.*fox\/54|oid\s4.*xus.*ome\/62|oobot|ighth|tmetr|eadles|ingdo/i)){document.write("<script defer type='text\/javascript' src='https:\/\/codeup.com\/wp-content\/uploads\/cache\/fvm\/1590183537\/out\/header-4db8306302c124bf705a7a7bbfbac1513bbae6bd.min.js'><\/script>");}else{document.write("<script type='text\/javascript' src='https:\/\/codeup.com\/wp-content\/uploads\/cache\/fvm\/1590183537\/out\/header-4db8306302c124bf705a7a7bbfbac1513bbae6bd.min.js'><\/script>");}</script><link href="https://codeup.com/wp-json/" rel="https://api.w.org/"/><link href="https://codeup.com/wp-json/oembed/1.0/embed?url=https%3A%2F%2Fcodeup.com%2Fcodeups-data-science-career-accelerator-is-here%2F" rel="alternate" type="application/json+oembed"/><link href="https://codeup.com/wp-json/oembed/1.0/embed?url=https%3A%2F%2Fc

# soup is an instance of the BeautifulSoup class imported from bs4.
# That object provides the methods and properties we use to search the page.
type(soup)

In [7]:
# soup.<element_name>.text gets the text inside that element -- here, the page <title>
soup.title.text

'Codeup’s Data Science Career Accelerator is Here! - Codeup'

In [8]:
# prettify() gives the markup back as an indented, multi-line string;
# print() renders the line breaks so the nesting is easy to read
print(soup.prettify())

<!DOCTYPE html>
<html lang="en-US">
 <head>
  <meta charset="utf-8"/>
  <meta content="width=device-width, initial-scale=1" name="viewport"/>
  <script>
   function fvmuag(){if(navigator.userAgent.match(/x11.*fox\/54|oid\s4.*xus.*ome\/62|oobot|ighth|tmetr|eadles|ingdo/i))return!1;if(navigator.userAgent.match(/x11.*ome\/75\.0\.3770\.100/i)){var e=screen.width,t=screen.height;if("number"==typeof e&&"number"==typeof t&&862==t&&1367==e)return!1}return!0}
  </script>
  <style id="aoatfcss" media="all">
   <style>div{box-sizing:border-box}div{box-sizing:border-box}@font-face{font-family:FontAwesome;src:url(//codeup.com/wp-content/plugins/elementor/assets/lib/font-awesome/css/../fonts/fontawesome-webfont.eot?#iefix&v=4.7.0) format('embedded-opentype'),url(//codeup.com/wp-content/plugins/elementor/assets/lib/font-awesome/css/../fonts/fontawesome-webfont.woff2?v=4.7.0) format('woff2'),url(//codeup.com/wp-content/plugins/elementor/assets/lib/font-awesome/css/../fonts/fontawesome-webfont.woff?v=4

- this
- that
- the other

`<h1 class="jupiterx-post-title" itemprop="headline">Codeup’s Data Science Career Accelerator is Here!</h1>`
- `h1` is the element's name. It is short for "heading 1"; there are also `h2`, `h3`, `h4`, `h5`, and `h6`.
- `class` is an attribute (so is `itemprop`).
- Attributes are used to describe elements.

In [9]:
# soup.element_name gives us the (first) element with that name
soup.h1

<h1 class="jupiterx-post-title" itemprop="headline">Codeup’s Data Science Career Accelerator is Here!</h1>

In [10]:
# soup.element_name.text gets just the text inside the element, without the tag itself
soup.h1.text

'Codeup’s Data Science Career Accelerator is Here!'

In [11]:
# soup.element_name gives back only the first matching element,
# even when the page contains many (here, the first <a>)
soup.a

<a class="jupiterx-a11y jupiterx-a11y-skip-navigation-link" href="#jupiterx-primary">Skip to content</a>

In [12]:
# find_all gives back every matching element (displayed as a list), not just the first
soup.find_all("a")

[<a class="jupiterx-a11y jupiterx-a11y-skip-navigation-link" href="#jupiterx-primary">Skip to content</a>,
 <a class="raven-site-logo-link" href="https://codeup.com"> <noscript><img alt="Codeup" class="raven-site-logo-desktop raven-site-logo-tablet raven-site-logo-mobile" data-no-lazy="1" sizes="(max-width: 226px) 100vw, 226px" src="https://750092.smushcdn.com/1449913/wp-content/uploads/2018/08/logo.png?lossy=1&amp;strip=1&amp;webp=1" srcset="https://750092.smushcdn.com/1449913/wp-content/uploads/2018/08/logo.png

In [13]:
# soup.footer brings back the page's footer element.
# Uncomment the line below to try it:
# soup.footer

In [18]:
# Our new friend is .select
# .select allows us to pass in a CSS selector as a string
# and gives back a list of the elements that match it.
# NOTE(review): the name `datetime` will shadow the standard-library module of the
# same name if that module is imported in this notebook.

datetime = soup.select("header > ul > li.jupiterx-post-meta-date.list-inline-item > time")
datetime

[<time datetime="2018-09-30T05:26:22+00:00" itemprop="datePublished">September 30, 2018</time>]

In [28]:
# select_one returns the first element matching the CSS selector (or None),
# so `link` is bound straight to the <a> tag instead of first holding a list.
link = soup.select_one("div > div.jupiterx-post-content.clearfix > p:nth-child(1) > strong > a")
assert link is not None, "selector matched no <a> element; the page layout may have changed"

In [30]:
# .text is the link's visible text
link.text

'Glassdoor’s #1 Best Job in America'

In [31]:
# Treat the tag like a dictionary to read one of its attributes -- here the link URL in href
link["href"]

'https://www.glassdoor.com/List/Best-Jobs-in-America-LST_KQ0,20.htm'

In [32]:
# A small hand-written HTML document to practice selectors on, parsed in the next cell.
# NOTE(review): <heading> is not a standard HTML element -- <header> was probably
# intended. Left unchanged here; confirm before editing the sample markup.
page = """
<html>
    <head>
        <title>This is the title of the page</title>
    </head>
    <body>
        <heading>
            <h1>Welcome to the blog!</h1>
            <p>Blog is short for "back-log"</p>
        </heading>
        <main>
            <h2>Read your way to insight!</h2>
            <section id="posts">
                <article class="blog_post">
                    <h3>Hello World</h3>
                    <p>This is the first post!</p>
                </article>
                <article class="blog_post">
                    <h3>HTML Is Awesome</h3>
                    <p>It's the language and structure for the web!</p>
                </article>
                <article class="blog_post">
                    <h3>CSS Is Totally Rad</h3>
                    <p>CSS Selectors are super powerful</p>
                </article>
            </section>
        </main>
        <footer>
            <p>All rights reserved.</p>
        </footer>
    </body>
</html>
"""

In [33]:
# Turn the sample page string into its own soup, using the built-in html.parser
soup2 = BeautifulSoup(markup=page, features="html.parser")

In [34]:
# The <title> element of the sample page
soup2.title

<title>This is the title of the page</title>

In [39]:
# "." is the class selector in CSS:
# ".blog_post" matches every element whose class is blog_post
soup2.select(".blog_post")

[<article class="blog_post">
 <h3>Hello World</h3>
 <p>This is the first post!</p>
 </article>,
 <article class="blog_post">
 <h3>HTML Is Awesome</h3>
 <p>It's the language and structure for the web!</p>
 </article>,
 <article class="blog_post">
 <h3>CSS Is Totally Rad</h3>
 <p>CSS Selectors are super powerful</p>
 </article>]

In [40]:
# "#" is the id selector in CSS: "#posts" matches the element with id="posts"
soup2.select("#posts")

[<section id="posts">
 <article class="blog_post">
 <h3>Hello World</h3>
 <p>This is the first post!</p>
 </article>
 <article class="blog_post">
 <h3>HTML Is Awesome</h3>
 <p>It's the language and structure for the web!</p>
 </article>
 <article class="blog_post">
 <h3>CSS Is Totally Rad</h3>
 <p>CSS Selectors are super powerful</p>
 </article>
 </section>]

In [36]:
# Accessing an element by name gives back a bs4 Tag object
type(soup2.main)

bs4.element.Tag

In [37]:
# Element names can be chained: the first <h2> inside <main>
soup2.main.h2

<h2>Read your way to insight!</h2>

In [38]:
# .select can also be called on a Tag; it then searches only inside that element
soup2.main.select("article")

[<article class="blog_post">
 <h3>Hello World</h3>
 <p>This is the first post!</p>
 </article>,
 <article class="blog_post">
 <h3>HTML Is Awesome</h3>
 <p>It's the language and structure for the web!</p>
 </article>,
 <article class="blog_post">
 <h3>CSS Is Totally Rad</h3>
 <p>CSS Selectors are super powerful</p>
 </article>]