# Web Scraping

In [55]:
from requests import get
from bs4 import BeautifulSoup
import pandas as pd

In [34]:
# Request the demo news page. Ending the cell with a bare `response` makes the
# notebook display the Response object, so the HTTP status can be checked.
url = 'https://web-scraping-demo.zgulde.net/news'
response = get(url)
response

<Response [200]>

In [35]:
# Peek at the first 400 characters of the returned HTML.
print(response.text[:400])

<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta http-equiv="X-UA-Compatible" content="IE=edge">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>News Example Page</title>
    <link href="https://unpkg.com/tailwindcss@^2/dist/tailwind.min.css" rel="stylesheet" />
    <link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/bootstrap


Guidance for web scraping:

1. View the page source
1. Right-click the content you care about and choose "Inspect"
1. Play around with the browser inspector and make note of the HTML structure surrounding the elements of interest
1. Identify any repeated structure
1. Break off a small chunk and extract the info
1. Turn your code into a function and apply it to the rest of the repeated elements

In [37]:
# Parse the downloaded HTML with the 'html.parser' backend and keep the parsed
# document in `soup`, which the following cells query.
soup = BeautifulSoup(response.text, 'html.parser')

In [41]:
# Select every <div> carrying both the "grid" and "grid-cols-4" classes;
# on this page those are the article cards.
articles = soup.select('div.grid.grid-cols-4')

In [44]:
# Take the first card as a small sample for working out the extraction logic.
article = articles[0]
article

<div class="grid grid-cols-4 gap-x-4 border rounded pr-3 bg-green-50 hover:shadow-lg transition duration-500">
<img src="/static/placeholder.png"/>
<div class="col-span-3 space-y-3 py-3">
<h2 class="text-2xl text-green-900">evidence just rest</h2>
<div class="grid grid-cols-2 italic">
<p> 1976-01-10 </p>
<p class="text-right">By Janet Cardenas </p>
</div>
<p>Along we yeah commercial see.
Draw choice head accept. Home miss listen best hour dinner agreement. Director short order better really article.</p>
</div>
</div>

In [80]:
def parse_news_article(article):
    """Turn one article card into a flat dict of its text fields.

    The headline is the text of the card's first <h2>. The card's <p> tags
    are read in document order as date, byline and description, each with
    surrounding whitespace stripped. A card that does not contain exactly
    three <p> tags raises ValueError from the unpacking.
    """
    headline = article.find('h2').text
    date, byline, description = (
        paragraph.text.strip() for paragraph in article.find_all('p')
    )
    return {
        'headline': headline,
        'date': date,
        'byline': byline,
        'description': description,
    }

In [81]:
# Run the parser over every article card and tabulate the resulting dicts.
pd.DataFrame([parse_news_article(card) for card in articles])

Unnamed: 0,headline,date,byline,description
0,evidence just rest,1976-01-10,By Janet Cardenas,Along we yeah commercial see.\nDraw choice hea...
1,prove Republican drop,2007-06-27,By Michael Wade,Against travel benefit general model differenc...
2,yeah claim least,1992-05-22,By Walter Wagner,Exist senior east think computer other red. Ca...
3,direction how decision,2003-10-01,By Roy Simpson,And much owner five seat. Stay why serve alrea...
4,pull this place,2010-07-17,By Michael Gillespie,Sort put animal able lay week. Miss guy toward...
5,benefit wide maintain,2015-02-24,By Brenda Rodriguez,Without area character certainly not change to...
6,process prevent building,1978-02-16,By Darrell Miranda,Seat test maintain. Choose onto performance st...
7,take others paper,2012-04-12,By Jordan Wolfe,Your write choose along prove may mission. Lea...
8,Congress stand not,1992-11-19,By Joseph Gardner,Guess traditional man history situation econom...
9,truth stay add,1973-07-16,By Mitchell Hawkins,Upon director nation us nearly cover. Despite ...


In [58]:
# Fetch the people demo page. NOTE: this rebinds `url`, `response` and `soup`,
# which held the news page in the cells above.
url = 'https://web-scraping-demo.zgulde.net/people'
response = get(url)
# Fail fast on an HTTP error status instead of parsing an error page.
response.raise_for_status()
# Name the parser explicitly, as the news example does, so the result does not
# depend on which optional parsers happen to be installed.
soup = BeautifulSoup(response.text, 'html.parser')
# Each person card is a <div> carrying the "person" class.
people = soup.select('div.person')
people[0]

<div class="person border rounded px-3 py-5 grid grid-cols-2 gap-x-3 bg-purple-50 hover:shadow-lg transition duration-500">
<h2 class="text-2xl text-purple-800 name col-span-full border-b">David Fleming</h2>
<p class="quote col-span-full px-5 py-5 text-center text-gray-500">
            "Networked multi-state budgetary management"
        </p>
<div class="grid grid-cols-9">
<i class="bi bi-envelope-fill text-purple-800"></i>
<p class="email col-span-8">oramirez@davis.net</p>
<i class="bi bi-telephone-fill text-purple-800"></i>
<p class="phone col-span-8">925.076.5159</p>
</div>
<div class="address grid grid-cols-9">
<i class="bi bi-geo-fill text-purple-800"></i>
<p class="col-span-8">
                018 Paula Views <br/>
                East Caitlin, NE 75349
            </p>
</div>
</div>

In [74]:
def parse_person(person):
    """Extract one person's details from a person card.

    Parameters
    ----------
    person
        A parsed element for a single person card. It must contain an <h2>
        and elements with the classes ``quote``, ``email``, ``phone`` and
        ``address``.

    Returns
    -------
    dict
        Keys ``name``, ``quote``, ``email_address``, ``phone`` and
        ``address``, each mapped to the text of the matching element.

    Raises
    ------
    AttributeError
        If any of the expected elements is missing from the card.
    """
    def first_text(selector):
        # select_one yields the first match directly, so every field is looked
        # up the same way and no full result list is built just to index it.
        return person.select_one(selector).text

    return {
        'name': person.find('h2').text,
        # The quote and address markup is padded with newlines and indentation.
        # strip() trims only the ends, so the address keeps its inner line break.
        'quote': first_text('.quote').strip(),
        'email_address': first_text('.email'),
        'phone': first_text('.phone'),
        'address': first_text('.address').strip(),
    }

In [79]:
# Run the parser over every person card and tabulate the resulting dicts.
pd.DataFrame([parse_person(card) for card in people])

Unnamed: 0,name,quote,email_address,phone,address
0,David Fleming,"""Networked multi-state budgetary management""",oramirez@davis.net,925.076.5159,018 Paula Views \n East Caitlin...
1,Felicia Roberts,"""Object-based empowering time-frame""",obowman@lutz.com,(006)263-1679x9640,10351 Jacqueline Centers \n Mar...
2,Steven Black,"""Proactive scalable intranet""",steven59@hotmail.com,(778)069-0682x38835,800 Jose Highway \n South Brend...
3,Robert Morgan,"""User-centric client-driven forecast""",silvaamanda@rasmussen-rivera.biz,446.593.2812x8804,"0676 Morgan Creek \n Millsland,..."
4,Michael Walters,"""Ameliorated 6thgeneration knowledgebase""",acostajim@gmail.com,(941)577-2633x25171,"6010 Olsen Hills \n West Billy,..."
5,Katherine Campbell,"""Reverse-engineered bottom-line installation""",wbrown@moore.com,070.472.9803,413 Dennis Ports Suite 037 \n E...
6,James Lee,"""Multi-layered neutral hub""",newmanlinda@grant.net,(913)264-2229,29918 Alvarez Creek Apt. 453 \n ...
7,Elizabeth Gonzales,"""Ergonomic clear-thinking leverage""",riceseth@yahoo.com,991.722.7089,4133 Taylor Avenue Apt. 376 \n ...
8,Kelsey Olson,"""Monitored upward-trending neural-net""",brownamy@kim.org,001-023-012-4857x561,"173 Joseph Green \n Jacobville,..."
9,Jennifer Walker,"""Assimilated mobile middleware""",elizabethlozano@hotmail.com,522-116-5581,910 Haynes Ramp Apt. 796 \n Eas...


In [None]:
# find vs select
# .find / .find_all are Beautiful Soup methods that match on tag name and attributes
# .select / .select_one take a CSS selector string instead
# Use the first person card to compare the two styles in the cells below.
person = people[0]

In [88]:
# find: returns a single tag (the first match)
person.find('p', class_='phone')

<p class="phone col-span-8">925.076.5159</p>

In [87]:
# find_all: same filter as find, but returns a list of every match
person.find_all('p', class_='phone')

[<p class="phone col-span-8">925.076.5159</p>]

In [89]:
# select: CSS-selector counterpart of find_all; also returns a list
person.select('p.phone')

[<p class="phone col-span-8">925.076.5159</p>]

In [90]:
# select_one: CSS-selector counterpart of find; returns a single tag
person.select_one('p.phone')

<p class="phone col-span-8">925.076.5159</p>