In [1]:
import requests
import bs4

## News Articles

In [7]:
# make the http request and turn the response into a beautiful soup object
response = requests.get('https://web-scraping-demo.zgulde.net/news')
html = response.text
soup = bs4.BeautifulSoup(html)

1. Find the container for the information we want `article_container`
1. Within the container, identify the entities that we want to produce a list
1. Process each individual entity (identify the pieces that we want and extract them)

In [12]:
article_container = soup.select('.grid.gap-y-12')[0]

In [19]:
articles = article_container.select('.grid.grid-cols-4.gap-x-4.border')

In [24]:
article = articles[0]
# get a pretty printed representation of our element
print(article.prettify())

<div class="grid grid-cols-4 gap-x-4 border rounded pr-3 bg-green-50 hover:shadow-lg transition duration-500">
 <img src="/static/placeholder.png"/>
 <div class="col-span-3 space-y-3 py-3">
  <h2 class="text-2xl text-green-900">
   help human fly
  </h2>
  <div class="grid grid-cols-2 italic">
   <p>
    1989-11-04
   </p>
   <p class="text-right">
    By Stephanie Mendoza
   </p>
  </div>
  <p>
   Song learn day party. Nor again between knowledge.
Reach exist huge. Role each process receive role inside around safe. Simple these understand main specific guy.
  </p>
 </div>
</div>



`.select` vs `.find` or `.find_all`

- `.select` always gives back a list, sometimes the list is empty, sometimes it has a single element, sometimes it has multiple elements
- `.find` and `.find_all` accept a *tag name*
- `.find` returns a single element
- `.find_all` returns a list of elements
- With `.select` multiple class names each have a `.` in front of them
- With `.find` or `.find_all` you can use a `class_` keyword argument, but the classes must match exactly

In [53]:
article.find('div', class_='grid grid-cols-2 italic')

<div class="grid grid-cols-2 italic">
<p> 1989-11-04 </p>
<p class="text-right">By Stephanie Mendoza </p>
</div>

In [41]:
def process_article(article):
    title = article.find('h2').text
    date_and_byline_div = article.select('.grid.grid-cols-2.italic')[0]
    date_p, by_p = date_and_byline_div.find_all('p')
    summary = article.find_all('p')[-1].text
    
    return {
        "title": title,
        "date": date_p.text,
        "by": by_p.text,
        "summary": summary
    }

In [46]:
process_article(articles[3])

{'title': 'energy plan go',
 'date': ' 1973-10-24 ',
 'by': 'By James Serrano ',
 'summary': 'Back certain democratic still. Ready power begin medical security energy.\nPrepare nature hotel Republican see onto statement. Artist foreign space dinner.'}

In [49]:
import pandas as pd


pd.DataFrame([process_article(article) for article in articles])

Unnamed: 0,title,date,by,summary
0,help human fly,1989-11-04,By Stephanie Mendoza,Song learn day party. Nor again between knowle...
1,serious generation vote,2016-08-08,By Amy Collier,Upon watch attention first could not. Religiou...
2,couple hold these,1985-09-04,By Cody Davis,Significant card should whole stage. Part cont...
3,energy plan go,1973-10-24,By James Serrano,Back certain democratic still. Ready power beg...
4,wish may final,2020-10-06,By Ryan Baker,High hard quite approach threat. Feel nice sur...
5,sense plant tend,1979-04-18,By Brian Pugh,Back do team. Sell good strategy third includi...
6,fire down report,2015-10-04,By Andrew Gould,World author popular laugh. Wind message whole...
7,tough history can,1983-04-13,By Stephanie Andrews,Will summer huge blue statement. Reason later ...
8,he start time,1991-12-13,By Alicia Clark,Trip tonight skill garden even mention price. ...
9,matter low including,2016-06-14,By Brian Hodges,Small more rock candidate five without these. ...


## People

In [67]:
response = requests.get('https://web-scraping-demo.zgulde.net/people')
soup = bs4.BeautifulSoup(response.text)
soup

<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="utf-8"/>
<meta content="IE=edge" http-equiv="X-UA-Compatible"/>
<meta content="width=device-width, initial-scale=1.0" name="viewport"/>
<title>Example People Page</title>
<link href="https://unpkg.com/tailwindcss@^2/dist/tailwind.min.css" rel="stylesheet"/>
<link href="https://cdn.jsdelivr.net/npm/bootstrap-icons@1.4.1/font/bootstrap-icons.css" rel="stylesheet"/>
</head>
<body class="mx-auto max-w-screen-lg pb-32">
<h1 class="my-5 text-4xl text-center">People</h1>
<div class="my-5 text-red-800 px-5 py-3 bg-red-100 font-bold">
<p>
<i class="bi bi-exclamation-circle text-xl"></i>
        All data on this page is strictly for demonstration purposes and fake.
    </p>
</div>
<div class="grid grid-cols-2 gap-x-12 gap-y-16" id="people">
<div class="person border rounded px-3 py-5 grid grid-cols-2 gap-x-3 bg-purple-50 hover:shadow-lg transition duration-500">
<h2 class="text-2xl text-purple-800 name col-span-full border-b">Amanda Lane</h2

In [77]:
people_div = soup.select('#people')[0]
people = people_div.select('.person')

In [78]:
person = people[0]
print(person.prettify())

<div class="person border rounded px-3 py-5 grid grid-cols-2 gap-x-3 bg-purple-50 hover:shadow-lg transition duration-500">
 <h2 class="text-2xl text-purple-800 name col-span-full border-b">
  Amanda Lane
 </h2>
 <p class="quote col-span-full px-5 py-5 text-center text-gray-500">
  "Polarized asynchronous complexity"
 </p>
 <div class="grid grid-cols-9">
  <i class="bi bi-envelope-fill text-purple-800">
  </i>
  <p class="email col-span-8">
   audreygutierrez@dominguez-williams.com
  </p>
  <i class="bi bi-telephone-fill text-purple-800">
  </i>
  <p class="phone col-span-8">
   5673781598
  </p>
 </div>
 <div class="address grid grid-cols-9">
  <i class="bi bi-geo-fill text-purple-800">
  </i>
  <p class="col-span-8">
   59857 Steven Locks
   <br/>
   Petersonbury, CO 29128
  </p>
 </div>
</div>



In [100]:
def process_person(person):
    return {
        'name': person.find(class_='name').text,
        'quote': person.find(class_='quote').text.strip(),
        'email': person.find(class_='email').text,
        'phone': person.find(class_='phone').text,
        'address': person.find(class_='address').text.strip(),
    }

In [104]:
process_person(people[3])

{'name': 'Deanna Wolfe',
 'quote': '"Advanced 3rdgeneration circuit"',
 'email': 'hmoore@aguirre.org',
 'phone': '330.380.7541x7218',
 'address': '7587 Alexis Roads Apt. 345 \n                Montgomerybury, MA 78718'}

In [106]:
pd.DataFrame([process_person(person) for person in people])

Unnamed: 0,name,quote,email,phone,address
0,Amanda Lane,"""Polarized asynchronous complexity""",audreygutierrez@dominguez-williams.com,5673781598,59857 Steven Locks \n Petersonb...
1,Lynn Garcia,"""Enhanced system-worthy workforce""",keith39@patrick-flowers.com,+1-215-243-0248,89678 Catherine Station Apt. 015 \n ...
2,Robin Griffin,"""Monitored static customer loyalty""",kyork@turner-pineda.com,662.795.1118x677,44452 Dixon Stravenue Apt. 405 \n ...
3,Deanna Wolfe,"""Advanced 3rdgeneration circuit""",hmoore@aguirre.org,330.380.7541x7218,7587 Alexis Roads Apt. 345 \n M...
4,Jamie Morris,"""Object-based contextually-based concept""",scole@wright-cook.info,(321)307-0005x34530,0362 George Spring \n Mccoymout...
5,Michael Long,"""Synchronized clear-thinking orchestration""",sarah10@kent-horne.com,001-167-904-7446x1619,66261 Cooper Cliffs Apt. 491 \n ...
6,Russell Klein,"""Up-sized non-volatile encryption""",shane21@cooper.com,(173)374-0849,821 William Meadows \n North Pa...
7,Kenneth Whitaker,"""Up-sized discrete initiative""",hmiranda@hotmail.com,+1-582-385-9514,65782 Anna Drive Apt. 017 \n Cr...
8,William Orr,"""Expanded 24/7 implementation""",bmiller@wheeler.com,690.590.9303,578 Michelle Light Apt. 557 \n ...
9,David Love,"""Innovative attitude-oriented time-frame""",xlewis@miller-cummings.com,(306)106-0864,3911 Campbell Roads \n Gonzales...
