
<div style="background-color: lightblue; padding: 60px;">
    <h1><b>Acquire Data through Web Scraping
</b></h1>
</div>


In [3]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import requests
import acquire as a

# Exercises

# 1. Codeup Blog Articles

Visit Codeup's Blog and record the urls for at least 5 distinct blog posts. For each post, you should scrape at least the post's title and content.

Encapsulate your work in a function named get_blog_articles that will return a list of dictionaries, with each dictionary representing one article. The shape of each dictionary should look like this:


{
    'title': 'the title of the article',
    'content': 'the full text content of the article'
}

Plus any additional properties you think might be helpful.

Bonus: Scrape the text of all the articles linked on codeup's blog page.



# Steps to take in order to grab articles:

Get the base blog page url
Figure out how to capture blog link articles from main blog page
Compose blog article links into a list of urls that I can interact with on an individual basis to grab blog article contents
Figure out how to grab the title and article body of a specific blog article
do all of those things in a loop, and turn it into a df

In [4]:
# base codeup blog url
base_url = 'https://codeup.edu/blog/'

In [5]:
# ping that page so I can grab the content
# we learned in the lesson that I need to suggest a user-agent
# for the codeup website to accept a requests library ping
response = requests.get(base_url, headers={'User-Agent': 'giveanythingnews'})


In [24]:
response


<Response [200]>

In [6]:
# check out the top of the content now that we have a 200 (OK)
response.text[:200]

'<!DOCTYPE html>\n<html lang="en-US">\n<head>\n\t<meta charset="UTF-8" />\n<meta http-equiv="X-UA-Compatible" content="IE=edge">\n\t<link rel="pingback" href="https://codeup.edu/xmlrpc.php" />\n\n\t<script type='

In [7]:
base_soup = BeautifulSoup(response.text, 'html.parser')

In [None]:
# proceed with getting links to my articles
# this requires page investigation!

In [None]:
# element tag of a:
# class of more-link
# hypothetically I should be able to grab these types of things
# from that page:
# find_all('a', class_='more-link')
# .select('.more-link')

In [8]:
base_soup.find_all('a', class_='more-link')


[<a class="more-link" href="https://codeup.edu/featured/apida-heritage-month/">read more</a>,
 <a class="more-link" href="https://codeup.edu/featured/women-in-tech-panelist-spotlight/">read more</a>,
 <a class="more-link" href="https://codeup.edu/featured/women-in-tech-rachel-robbins-mayhill/">read more</a>,
 <a class="more-link" href="https://codeup.edu/codeup-news/women-in-tech-panelist-spotlight-sarah-mellor/">read more</a>,
 <a class="more-link" href="https://codeup.edu/events/women-in-tech-madeleine/">read more</a>,
 <a class="more-link" href="https://codeup.edu/codeup-news/panelist-spotlight-4/">read more</a>]

In [9]:
# this appears to work just as well:
base_soup.select('.more-link')


[<a class="more-link" href="https://codeup.edu/featured/apida-heritage-month/">read more</a>,
 <a class="more-link" href="https://codeup.edu/featured/women-in-tech-panelist-spotlight/">read more</a>,
 <a class="more-link" href="https://codeup.edu/featured/women-in-tech-rachel-robbins-mayhill/">read more</a>,
 <a class="more-link" href="https://codeup.edu/codeup-news/women-in-tech-panelist-spotlight-sarah-mellor/">read more</a>,
 <a class="more-link" href="https://codeup.edu/events/women-in-tech-madeleine/">read more</a>,
 <a class="more-link" href="https://codeup.edu/codeup-news/panelist-spotlight-4/">read more</a>]

In [10]:
blog_links = [element['href'] for element in base_soup.find_all('a', class_='more-link')]


In [11]:
blog_links

['https://codeup.edu/featured/apida-heritage-month/',
 'https://codeup.edu/featured/women-in-tech-panelist-spotlight/',
 'https://codeup.edu/featured/women-in-tech-rachel-robbins-mayhill/',
 'https://codeup.edu/codeup-news/women-in-tech-panelist-spotlight-sarah-mellor/',
 'https://codeup.edu/events/women-in-tech-madeleine/',
 'https://codeup.edu/codeup-news/panelist-spotlight-4/']

In [None]:
# best way to build a loop is to do the thing once
# and then figure out how to repeat it

In [12]:
blog_test = requests.get('https://codeup.edu/events/women-in-tech-madeleine/', headers={'User-Agent': 'there is a californian champagne from paul mason'})


In [13]:
blog_soup = BeautifulSoup(blog_test.text, 'html.parser')


In [14]:
# figure out how to grab the title and the content from the blog soup:
# tag: h1 class: entry-title
blog_soup.find('h1', class_='entry-title').text

'Women in Tech: Panelist Spotlight – Madeleine Capper'

In [15]:
# and for the body:
blog_soup.find('div', class_='entry-content').text.strip()[:200]

'Women in tech: Panelist Spotlight – Madeleine Capper\nCodeup is hosting a Women in Tech Panel in honor of Women’s History Month on March 29th, 2023! To further celebrate, we’d like to spotlight each of'

In [35]:
# lets iterate through those blog links an build out a structure
# that will allow me to scrape the contents of that page
all_blogs = []
for link in blog_links:
    # get a response from the link element
    response = requests.get(link, headers={'User-Agent': 'Robinson Rulez'})
    #turn the response text into a soup
    soup = BeautifulSoup(response.text, 'html.parser')
    #grab the html element associated with the article body
    title = soup.find('h1', class_='entry-title').text
    #grab the html element associated with the article head
    body = soup.find('div', class_='entry-content').text.strip()
    # toss those two things with labels into a dictionary
    row = {'title': title, 'article': body}
    #add that dictionary to a list of dictionaries
    all_blogs.append(row)
#outside the loop:
# cast the list of dictionaries into a pandas DataFrame
articles = pd.DataFrame(all_blogs)

In [36]:
articles

Unnamed: 0,title,article
0,Spotlight on APIDA Voices: Celebrating Heritag...,May is traditionally known as Asian American a...
1,Women in tech: Panelist Spotlight – Magdalena ...,Women in tech: Panelist Spotlight – Magdalena ...
2,Women in tech: Panelist Spotlight – Rachel Rob...,Women in tech: Panelist Spotlight – Rachel Rob...
3,Women in Tech: Panelist Spotlight – Sarah Mellor,Women in tech: Panelist Spotlight – Sarah Mell...
4,Women in Tech: Panelist Spotlight – Madeleine ...,Women in tech: Panelist Spotlight – Madeleine ...
5,Black Excellence in Tech: Panelist Spotlight –...,Black excellence in tech: Panelist Spotlight –...


In [19]:
# Write the above one as a function in acquire.py and call it
# Testing the function:
blog_links = ['https://codeup.edu/featured/apida-heritage-month/', 'https://codeup.edu/events/women-in-tech-madeleine/']
a.scrape_blog_articles(blog_links)


Unnamed: 0,title,article
0,Spotlight on APIDA Voices: Celebrating Heritag...,May is traditionally known as Asian American a...
1,Women in Tech: Panelist Spotlight – Madeleine ...,Women in tech: Panelist Spotlight – Madeleine ...


In [None]:
# OR

In [154]:
# urls for at least 5 distinct blog posts
url1 = 'https://codeup.edu/featured/apida-heritage-month/'
url2 = 'https://codeup.edu/codeup-news/women-in-tech-panelist-spotlight-sarah-mellor/'   
url3 = 'https://codeup.edu/featured/women-in-tech-rachel-robbins-mayhill/'
url4 = 'https://codeup.edu/events/women-in-tech-madeleine/'  
url5 = 'https://codeup.edu/codeup-news/panelist-spotlight-4/'



In [155]:
# List of user-agent strings that mimic different web browsers(spoofing)
user_agents = [
    "Mozilla/5.0 (Windows NT 10.0; rv:91.0) Gecko/20100101 Firefox/91.0",
    "Mozilla/5.0 (Windows NT 10.0; rv:78.0) Gecko/20100101 Firefox/78.0",
    "Mozilla/5.0 (X11; Linux x86_64; rv:95.0) Gecko/20100101 Firefox/95.0"
]

# Randomly select a user-agent from the list for simulating web browser diversity
random_user_agent = random.choice(user_agents)


In [142]:
# Define the 'headers' dictionary with a 'User-Agent' key to simulate different web browsers.
headers = {'User-Agent': random_user_agent}

# Send HTTP requests to multiple URLs with the specified 'User-Agent' header.
url1_response = requests.get(url1, headers=headers)
url2_response = requests.get(url2, headers=headers)
url3_response = requests.get(url3, headers=headers)
url4_response = requests.get(url4, headers=headers)
url5_response = requests.get(url5, headers=headers)
# created 5 response objects

In [143]:
# Print the responses from the URLs 
print('url1_response',url1_response)
print('url2_response',url2_response)
print('url3_response',url3_response)
print('url4_response',url4_response)
print('url5_response',url5_response)

# <Response [200]> means successful

url1_response <Response [200]>
url2_response <Response [200]>
url3_response <Response [200]>
url4_response <Response [200]>
url5_response <Response [200]>


In [None]:
# Extract the content of the response from URL 1 as text.
url1_response.text

# checked only url1_response here

In [45]:
# turn the contents into soup:
soup_url1 = BeautifulSoup(url1_response.text, 'html.parser')
soup_url2 = BeautifulSoup(url2_response.text, 'html.parser')
soup_url3 = BeautifulSoup(url3_response.text, 'html.parser')
soup_url4 = BeautifulSoup(url4_response.text, 'html.parser')
soup_url5 = BeautifulSoup(url5_response.text, 'html.parser')

In [46]:
#use soup to find titles

print('soup_url1:-',soup_url1.find("h1"))  
print('soup_url2:-',soup_url2.find("h1"))  
print('soup_url3:-',soup_url3.find("h1"))  
print('soup_url4:-',soup_url4.find("h1"))  
print('soup_url5:-',soup_url5.find("h1"))  


soup_url1:- <h1 class="entry-title">Spotlight on APIDA Voices: Celebrating Heritage and Inspiring Change ft. Arbeena Thapa</h1>
soup_url2:- <h1 class="entry-title">Women in Tech: Panelist Spotlight – Sarah Mellor</h1>
soup_url3:- <h1 class="entry-title">Women in tech: Panelist Spotlight – Rachel Robbins-Mayhill</h1>
soup_url4:- <h1 class="entry-title">Women in Tech: Panelist Spotlight – Madeleine Capper</h1>
soup_url5:- <h1 class="entry-title">Black Excellence in Tech: Panelist Spotlight – Wilmarie De La Cruz Mejia</h1>


In [None]:
# or

In [149]:
# use soup to find titles
title1 = soup_url1.title.string
title2 = soup_url2.title.string
title3 = soup_url3.title.string
title4 = soup_url4.title.string
title5 = soup_url5.title.string

In [74]:
# To find 1st paragraph element
# soup_url1.find('p')

In [150]:
# To find ALL paragraph elements, list_of_p_elements

list_of_p_elements_url1 = soup_url1.find_all('p')
list_of_p_elements_url2 = soup_url2.find_all('p')
list_of_p_elements_url3 = soup_url3.find_all('p')
list_of_p_elements_url4 = soup_url4.find_all('p')
list_of_p_elements_url5 = soup_url5.find_all('p')

In [151]:
# Join the text content of all <p> elements in each list_of_p_elements_urls into a single string for each blogs
#  and hold the contents of the codeup blog posts 
blog1 = ' '.join([element.text for element in list_of_p_elements_url1])
blog2 = ' '.join([element.text for element in list_of_p_elements_url2])
blog3 = ' '.join([element.text for element in list_of_p_elements_url3])
blog4 = ' '.join([element.text for element in list_of_p_elements_url4])
blog5 = ' '.join([element.text for element in list_of_p_elements_url5])

In [152]:
# Create a dictionary of title and content for each blog post
blog1_content_title = {'Title':title1,'Content':blog1}
blog2_content_title = {'Title':title2,'Content':blog2}
blog3_content_title = {'Title':title3,'Content':blog3}
blog4_content_title = {'Title':title4,'Content':blog4}
blog5_content_title = {'Title':title5,'Content':blog5}


In [153]:
blog = (
     blog1_content_title,
     blog2_content_title,
     blog3_content_title,
     blog4_content_title,
     blog5_content_title
)


In [148]:
blog

({'Title': None,
  'Content': 'May 24, 2023 | Featured May is traditionally known as Asian American and Pacific Islander (AAPI) Heritage Month. This month we celebrate the history and contributions made possible by our AAPI friends, family, and community. We also examine our level of support and seek opportunities to better understand the AAPI community.  In an effort to address real concerns and experiences, we sat down with Arbeena Thapa, one of Codeup’s Financial Aid and Enrollment Managers. Arbeena identifies as Nepali American and Desi. Arbeena’s parents immigrated to Texas in 1988 for better employment and educational opportunities. Arbeena’s older sister was five when they made the move to the US. Arbeena was born later, becoming the first in her family to be a US citizen. At Codeup we take our efforts at inclusivity very seriously. After speaking with Arbeena, we were taught that the term AAPI excludes Desi-American individuals. Hence, we will now use the term Asian Pacific Isl