# Intro to Web Scraping
- Use `requests` to download the HTML
- Use `BeautifulSoup` to parse that HTML to get the thing(s) you need

## Process
- Step 1: use the `requests` library to make an HTTP request across the web
- Step 2: use the `response.text` property on the `response` object to get the text of the HTML (`response.content` gives the same page as raw bytes)

In [1]:
from requests import get
from bs4 import BeautifulSoup

In [2]:
url = "https://site-to-scrape.glitch.me"

In [3]:
# Some websites don't accept the default python-requests User-Agent, so send a custom one.
headers = {'User-Agent': 'Codeup Data Science'}
# Without a timeout, requests can wait indefinitely on an unresponsive server.
response = get(url, headers=headers, timeout=10)
# Fail fast on a 4xx/5xx status instead of parsing an error page as if it were the real content.
response.raise_for_status()

response

<Response [200]>

In [4]:
response.content

b'<!DOCTYPE html>\n<html lang="en">\n  <head>\n    <title>Site to Scrape!</title>\n    <meta charset="utf-8">\n    <meta http-equiv="X-UA-Compatible" content="IE=edge">\n    <meta name="viewport" content="width=device-width, initial-scale=1">\n    \n    <!-- import the webpage\'s stylesheet -->\n    <link rel="stylesheet" href="/style.css">\n    \n    <!-- import the webpage\'s javascript file -->\n    <script src="/script.js" defer></script>\n  </head>  \n  <body>\n    <header>\n      <h1>This is the header!</h1>\n      <hr>\n    </header>\n    \n    <main>\n      <div>\n        <h1 class="first">\n        This is the main\n        </h1>\n        <h2>\n          This is an h2 of main\n        </h2>\n        <h3>\n          H3 inside of first div inside of main\n        </h3>\n      </div>\n      <div>\n        <h3 class="first">\n          H3 inside of second div inside of main.\n        </h3>\n        <p>\n          Here\'s some text content for us to scrape! \xf0\x9f\x91\xbd\n      

In [5]:
# Make a soup variable holding the response content
# response.content is the raw bytes of the page; 'html.parser' names the parser
# BeautifulSoup uses to turn those bytes into a searchable tree of tags.
soup = BeautifulSoup(response.content, 'html.parser')

In [6]:
type(soup)

bs4.BeautifulSoup

In [7]:
type(soup.title)

bs4.element.Tag

In [8]:
soup.title

<title>Site to Scrape!</title>

In [9]:
soup.title.text

'Site to Scrape!'

In [10]:
soup.h1

<h1>This is the header!</h1>

In [11]:
# Returns the first match only with this dot syntax
soup.h2

<h2>
          This is an h2 of main
        </h2>

In [12]:
soup.text

"\n\n\nSite to Scrape!\n\n\n\n\n\n\n\n\n\n\nThis is the header!\n\n\n\n\n\n        This is the main\n        \n\n          This is an h2 of main\n        \n\n          H3 inside of first div inside of main\n        \n\n\n\n          H3 inside of second div inside of main.\n        \n\n          Here's some text content for us to scrape! 👽\n        \n\n          Here's another paragraph of content! ☠️\n        \nClick here to visit my portfolio\n\n\n\nThis is the footer\n\n\n\n\n"

In [13]:
# soup.element returns a beautifulsoup Tag object
soup.h2.text

'\n          This is an h2 of main\n        '

In [14]:
soup.h2.text.strip()

'This is an h2 of main'

In [15]:
soup.h2.text.strip()[-5:]

' main'

In [16]:
# find_all returns a result set, which is like a list, but has more BeautifulSoup functionality
soup.find_all("h3")

[<h3>
           H3 inside of first div inside of main
         </h3>,
 <h3 class="first">
           H3 inside of second div inside of main.
         </h3>]

In [17]:
type(soup.find_all("h3")[0])

bs4.element.Tag

In [18]:
soup.find_all("h3")[0]

<h3>
          H3 inside of first div inside of main
        </h3>

In [19]:
# BeautifulSoup Tag element
soup.find_all("h3")[0].text

'\n          H3 inside of first div inside of main\n        '

In [20]:
soup

<!DOCTYPE html>

<html lang="en">
<head>
<title>Site to Scrape!</title>
<meta charset="utf-8"/>
<meta content="IE=edge" http-equiv="X-UA-Compatible"/>
<meta content="width=device-width, initial-scale=1" name="viewport"/>
<!-- import the webpage's stylesheet -->
<link href="/style.css" rel="stylesheet"/>
<!-- import the webpage's javascript file -->
<script defer="" src="/script.js"></script>
</head>
<body>
<header>
<h1>This is the header!</h1>
<hr/>
</header>
<main>
<div>
<h1 class="first">
        This is the main
        </h1>
<h2>
          This is an h2 of main
        </h2>
<h3>
          H3 inside of first div inside of main
        </h3>
</div>
<div>
<h3 class="first">
          H3 inside of second div inside of main.
        </h3>
<p>
          Here's some text content for us to scrape! 👽
        </p>
<p>
          Here's another paragraph of content! ☠️
        </p>
<a href="https://github.com/ryanorsinger">Click here to visit my portfolio</a>
</div>
</main>
<footer>
<h1>This 

In [21]:
type(soup.select("p"))

bs4.element.ResultSet

In [22]:
soup.select("p")

[<p>
           Here's some text content for us to scrape! 👽
         </p>,
 <p>
           Here's another paragraph of content! ☠️
         </p>]

In [23]:
soup.select_one("a")

<a href="https://github.com/ryanorsinger">Click here to visit my portfolio</a>

In [24]:
# .select will return a ResultSet even if there's only one of something
soup.select("a")

[<a href="https://github.com/ryanorsinger">Click here to visit my portfolio</a>]

In [25]:
soup.select("img")

[<img alt="" aria-hidden="true" src="https://traffic-analytics.glitch.me/counter.png?fallback=MY_WEBSITE&amp;color=black" style="vertical-align: bottom;"/>]

In [26]:
soup.select("h5")

[]

In [27]:
type(soup.select("body"))

bs4.element.ResultSet

In [28]:
type(soup.select("body")[0])

bs4.element.Tag

In [29]:
soup.select("body")[0].text

"\n\nThis is the header!\n\n\n\n\n\n        This is the main\n        \n\n          This is an h2 of main\n        \n\n          H3 inside of first div inside of main\n        \n\n\n\n          H3 inside of second div inside of main.\n        \n\n          Here's some text content for us to scrape! 👽\n        \n\n          Here's another paragraph of content! ☠️\n        \nClick here to visit my portfolio\n\n\n\nThis is the footer\n\n\n"

In [30]:
soup.select_one("footer")

<footer>
<h1>This is the footer</h1>
<img alt="" aria-hidden="true" src="https://traffic-analytics.glitch.me/counter.png?fallback=MY_WEBSITE&amp;color=black" style="vertical-align: bottom;"/>
</footer>

In [31]:
soup.select_one("footer").text

'\nThis is the footer\n\n'

In [32]:
soup.select_one("footer").img

<img alt="" aria-hidden="true" src="https://traffic-analytics.glitch.me/counter.png?fallback=MY_WEBSITE&amp;color=black" style="vertical-align: bottom;"/>

In [33]:
# Use dictionary syntax to access the attribute values
soup.select_one("footer").img["src"]

'https://traffic-analytics.glitch.me/counter.png?fallback=MY_WEBSITE&color=black'

In [34]:
soup.select_one("a")

<a href="https://github.com/ryanorsinger">Click here to visit my portfolio</a>

In [35]:
# Grab the first <a> tag on the page, then read its link target with dictionary syntax.
# NOTE: this rebinds `url`, which In [2] set to the site being scraped; re-running
# In [3] after this cell would request this link instead of the original site.
first_anchor = soup.select_one("a")
url = first_anchor["href"]
url

'https://github.com/ryanorsinger'

In [36]:
# Request the link stored in `url` by In [35], reusing the custom User-Agent headers.
# The timeout keeps an unresponsive server from hanging the notebook, and
# raise_for_status() surfaces a 4xx/5xx response before it gets parsed.
response2 = get(url, headers=headers, timeout=10)
response2.raise_for_status()

In [37]:
github_soup = BeautifulSoup(response2.content, 'html.parser')
github_soup


<!DOCTYPE html>

<html data-a11y-animated-images="system" data-color-mode="auto" data-dark-theme="dark" data-light-theme="light" lang="en">
<head>
<meta charset="utf-8"/>
<link href="https://github.githubassets.com" rel="dns-prefetch"/>
<link href="https://avatars.githubusercontent.com" rel="dns-prefetch"/>
<link href="https://github-cloud.s3.amazonaws.com" rel="dns-prefetch"/>
<link href="https://user-images.githubusercontent.com/" rel="dns-prefetch"/>
<link crossorigin="" href="https://github.githubassets.com" rel="preconnect"/>
<link href="https://avatars.githubusercontent.com" rel="preconnect"/>
<link crossorigin="anonymous" href="https://github.githubassets.com/assets/light-92c7d381038e.css" integrity="sha512-ksfTgQOOnE+FFXf+yNfVjKSlEckJAdufFIYGK7ZjRhWcZgzAGcmZqqArTgMLpu90FwthqcCX4ldDgKXbmVMeuQ==" media="all" rel="stylesheet"><link crossorigin="anonymous" href="https://github.githubassets.com/assets/dark-d4a90c367f0c.css" integrity="sha512-1KkMNn8M/al/dtzBLupRwkIOgnA9MWkm8oxS+sol

In [38]:
len(github_soup.select("a"))

145

In [39]:
# How to get all of the URLs from each link
# The attribute selector "a[href]" matches only anchors that actually carry an href,
# so the dictionary lookup below cannot raise KeyError on a bare <a> tag.
anchors = github_soup.select("a[href]")
# To access an HTML tag's attribute, use dictionary syntax
urls = [a["href"] for a in anchors]

urls

['#start-of-content',
 'https://github.com/',
 '/signup?ref_cta=Sign+up&ref_loc=header+logged+out&ref_page=%2F%3Cuser-name%3E&source=header',
 '/features',
 '/mobile',
 '/features/actions',
 '/features/codespaces',
 '/features/copilot',
 '/features/packages',
 '/features/security',
 '/features/code-review',
 '/features/issues',
 '/features/discussions',
 '/features/integrations',
 '/sponsors',
 '/customer-stories',
 '/team',
 '/enterprise',
 '/explore',
 '/topics',
 '/collections',
 '/trending',
 'https://skills.github.com/',
 '/sponsors/explore',
 'https://opensource.guide',
 '/readme',
 '/events',
 'https://github.community',
 'https://education.github.com',
 'https://stars.github.com',
 '/marketplace',
 '/pricing',
 '/pricing#compare-features',
 'https://github.com/enterprise/contact',
 'https://education.github.com',
 '',
 '',
 '',
 '',
 '/login?return_to=https%3A%2F%2Fgithub.com%2Fryanorsinger',
 '/signup?ref_cta=Sign+up&ref_loc=header+logged+out&ref_page=%2F%3Cuser-name%3E&source

In [40]:
response2.url

'https://github.com/ryanorsinger'

In [41]:
# .select and .select_one take CSS selectors
# A bare tag name such as "p" is itself a selector: it matches every <p> element
soup.select("p")

[<p>
           Here's some text content for us to scrape! 👽
         </p>,
 <p>
           Here's another paragraph of content! ☠️
         </p>]

In [42]:
soup.select("a")

[<a href="https://github.com/ryanorsinger">Click here to visit my portfolio</a>]

In [43]:
# .class_name the . means "hey, I'm looking for a class"
soup.select(".first")

[<h1 class="first">
         This is the main
         </h1>,
 <h3 class="first">
           H3 inside of second div inside of main.
         </h3>]

In [44]:
github_soup.select("a.Link--primary")[0]["href"]

'/features'

In [45]:
# Build the selector one path step at a time; " > " is the CSS child combinator,
# so each entry must be a direct child of the entry before it.
baby_shark_selector = " > ".join([
    "#js-pjax-container",
    "div.container-xl.px-3.px-md-4.px-lg-5",
    "div",
    "div.Layout-sidebar",
    "div",
    "div.js-profile-editable-replace",
    "div:nth-child(4)",
    "div.d-flex.flex-wrap",
    "a:nth-child(2)",
])
# .select returns a list-like ResultSet; take the first match and read its href.
shark_matches = github_soup.select(baby_shark_selector)
shark_matches[0]["href"]

'/ryanorsinger?achievement=pull-shark&tab=achievements'

In [46]:
github_soup.select("#user-repositories-list > ul > li:nth-child(1) > div.col-2.d-flex.flex-column.flex-justify-around.flex-items-end.ml-3 > div.text-right.hide-lg.hide-md.hide-sm.hide-xs.flex-self-end")

[]

In [47]:
# Selector path to the sidebar <div> that contains the "Organizations" heading and links.
organizations_selector = " > ".join([
    "#js-pjax-container",
    "div.container-xl.px-3.px-md-4.px-lg-5",
    "div",
    "div.Layout-sidebar",
    "div",
    "div.js-profile-editable-replace",
    "div.border-top.color-border-muted.pt-3.mt-3.clearfix.hide-sm.hide-md",
])
# Despite the name, this is a ResultSet (list-like), not a soup: index it with [0] to get the Tag.
organizations_soup = github_soup.select(organizations_selector)

In [48]:
type(organizations_soup[0])

bs4.element.Tag

In [49]:
organizations_soup[0]

<div class="border-top color-border-muted pt-3 mt-3 clearfix hide-sm hide-md">
<h2 class="mb-2 h4">Organizations</h2>
<a aria-label="gocodeup" class="avatar-group-item" data-hovercard-type="organization" data-hovercard-url="/orgs/gocodeup/hovercard" data-hydro-click='{"event_type":"user_profile.click","payload":{"profile_user_id":5657581,"target":"MEMBER_ORGANIZATION_AVATAR","user_id":null,"originating_url":"https://github.com/ryanorsinger"}}' data-hydro-click-hmac="c16b9c4bfd966ebf5b186126d488690654514e759552a70932125d0816d1601a" data-octo-click="hovercard-link-click" data-octo-dimensions="link_type:self" href="/gocodeup" itemprop="follows">
<img alt="@gocodeup" class="avatar" data-view-component="true" height="32" size="32" src="https://avatars.githubusercontent.com/u/6238347?s=64&amp;v=4" width="32"/>
</a> <a aria-label="codeup-ad-lister" class="avatar-group-item" data-hovercard-type="organization" data-hovercard-url="/orgs/codeup-ad-lister/hovercard" data-hydro-click='{"event_type"

In [50]:
organizations_soup[0].h2

<h2 class="mb-2 h4">Organizations</h2>

In [51]:
organizations_soup[0].h2.text

'Organizations'

In [52]:
organizations_soup[0].select("a")

[<a aria-label="gocodeup" class="avatar-group-item" data-hovercard-type="organization" data-hovercard-url="/orgs/gocodeup/hovercard" data-hydro-click='{"event_type":"user_profile.click","payload":{"profile_user_id":5657581,"target":"MEMBER_ORGANIZATION_AVATAR","user_id":null,"originating_url":"https://github.com/ryanorsinger"}}' data-hydro-click-hmac="c16b9c4bfd966ebf5b186126d488690654514e759552a70932125d0816d1601a" data-octo-click="hovercard-link-click" data-octo-dimensions="link_type:self" href="/gocodeup" itemprop="follows">
 <img alt="@gocodeup" class="avatar" data-view-component="true" height="32" size="32" src="https://avatars.githubusercontent.com/u/6238347?s=64&amp;v=4" width="32"/>
 </a>,
 <a aria-label="codeup-ad-lister" class="avatar-group-item" data-hovercard-type="organization" data-hovercard-url="/orgs/codeup-ad-lister/hovercard" data-hydro-click='{"event_type":"user_profile.click","payload":{"profile_user_id":5657581,"target":"MEMBER_ORGANIZATION_AVATAR","user_id":null,"

In [53]:
# Every <a> inside the first matched organizations <div>
org_anchors = organizations_soup[0].select("a")
# Attribute values are read with dictionary syntax; collect each link's href
urls = [anchor["href"] for anchor in org_anchors]
urls

['/gocodeup', '/codeup-ad-lister']