# Web scraping exercises

In [1]:
#Disable autosave
%autosave 0

Autosave disabled


In [2]:
#Import dependencies
from requests import get
from bs4 import BeautifulSoup
import pandas as pd
import re
import os
import json

## Exercise 1

Codeup Blog Articles

Visit [Codeup's Blog](https://codeup.com/blog/) and record the urls for at least 5 distinct blog posts. For each post, you should scrape at least the post's title and content.

Encapsulate your work in a function named get_blog_articles that will return a list of dictionaries, with each dictionary representing one article.

URLs for blog posts:  
https://codeup.com/data-science/recession-proof-career/  
https://codeup.com/codeup-news/codeup-x-comic-con/  
https://codeup.com/featured/series-part-3-web-development/  
https://codeup.com/codeup-news/codeup-dallas-campus/  
https://codeup.com/codeup-news/codeup-tv-commercial/

In [3]:
#Custom User-Agent header that is passed along with each codeup.com request in this exercise
headers = {'User-Agent': 'Codeup Data Science'}

In [4]:
#Request the first blog post; the timeout keeps a stalled connection from hanging the notebook
response = get('https://codeup.com/data-science/recession-proof-career/', headers=headers, timeout=10)
#Stop here on a 4xx/5xx answer instead of parsing an error page in the next cells
response.raise_for_status()

In [5]:
#Parse the response HTML, then look at the text of the first <h1> (the post title)
soup = BeautifulSoup(response.content, 'html.parser')
title_tag = soup.find('h1')
title_tag.text

'Is a Career in Tech Recession-Proof?'

In [6]:
#The publication date sits in the first <span class="published">
published_tag = soup.find('span', class_='published')
published_tag.text

'Aug 12, 2022'

In [7]:
#The article body sits in the first <div class="entry-content">
content_tag = soup.find('div', class_='entry-content')
content_tag.text

'\nGiven the current economic climate, many economists are considering the U.S. to be entering a recession. This can cause confusion, fear, and uncertainty, especially as it pertains to job security.\nTo ease some of those feelings, below you’ll find some careers in tech that tend to hold up better than others amid a recession. In the event of a recession, companies will likely shift to digital strategies, making these careers in tech valuable and highly coveted.\n\xa0\n\n\nProgrammer/Developer\nNo matter the programming language you’ve mastered, having the knowledge alone makes you extremely valuable. The coding skills you possess as a programmer or developer are in-demand for companies looking to build or enhance their websites, and enhance their consumer experience. According to the U.S. Bureau of Labor Statistics, jobs in software development are expected to grow 22% by 2030. This is much faster than the average career.\n\n\xa0\n\n\nCloud Administrator\nMore businesses are transiti

In [8]:
#The five blog post URLs to scrape, in the order they are listed above
links = [
    'https://codeup.com/data-science/recession-proof-career/',
    'https://codeup.com/codeup-news/codeup-x-comic-con/',
    'https://codeup.com/featured/series-part-3-web-development/',
    'https://codeup.com/codeup-news/codeup-dallas-campus/',
    'https://codeup.com/codeup-news/codeup-tv-commercial/',
]

In [9]:
#Loop through the links and collect the title, publication date and body text of each post
article_info = []

for link in links:

    #The timeout keeps one stalled connection from hanging the whole loop
    response = get(link, headers=headers, timeout=10)

    #Stop on a 4xx/5xx answer instead of recording an error page as an article
    response.raise_for_status()

    soup = BeautifulSoup(response.content, 'html.parser')

    #Same selectors as the single-page exploration above
    article_info.append({
        'title': soup.find('h1').text,
        'date_published': soup.find('span', class_='published').text,
        'content': soup.find('div', class_='entry-content').text,
    })

article_info

[{'title': 'Is a Career in Tech Recession-Proof?',
  'date_published': 'Aug 12, 2022',
  'content': '\nGiven the current economic climate, many economists are considering the U.S. to be entering a recession. This can cause confusion, fear, and uncertainty, especially as it pertains to job security.\nTo ease some of those feelings, below you’ll find some careers in tech that tend to hold up better than others amid a recession. In the event of a recession, companies will likely shift to digital strategies, making these careers in tech valuable and highly coveted.\n\xa0\n\n\nProgrammer/Developer\nNo matter the programming language you’ve mastered, having the knowledge alone makes you extremely valuable. The coding skills you possess as a programmer or developer are in-demand for companies looking to build or enhance their websites, and enhance their consumer experience. According to the U.S. Bureau of Labor Statistics, jobs in software development are expected to grow 22% by 2030. This is

In [10]:
#Create a function to collect the information and cache it as a json file
def get_blog_articles(article_list, timeout=10):
    """
    Scrape the title, publication date and body text of blog posts.

    The result is cached in 'blog_posts.json' in the current working
    directory. When that file already exists it is loaded and returned
    without making any request, so `article_list` is ignored in that case;
    delete the file to force a fresh scrape.

    Parameters
    ----------
    article_list : iterable of str
        URLs of the blog posts to scrape.
    timeout : int or float, optional
        Timeout in seconds handed to each request (default 10).

    Returns
    -------
    list of dict
        One dictionary per URL, in input order, with the keys
        'title', 'date_published' and 'content'.

    Raises
    ------
    requests.HTTPError
        If a page answers with a 4xx/5xx status. Nothing is cached then.
    AttributeError
        If a page lacks the <h1>, <span class="published"> or
        <div class="entry-content"> element (find() returns None).
    """
    file = 'blog_posts.json'

    #Reuse an earlier scrape when the cache file is present
    if os.path.exists(file):
        with open(file) as f:
            return json.load(f)

    headers = {'User-Agent': 'Codeup Data Science'}

    article_info = []

    for article in article_list:

        #The timeout keeps one stalled connection from hanging the whole scrape
        response = get(article, headers=headers, timeout=timeout)

        #Fail before parsing so an error page is never written to the cache
        response.raise_for_status()

        soup = BeautifulSoup(response.content, 'html.parser')

        article_info.append({
            'title': soup.find('h1').text,
            'date_published': soup.find('span', class_='published').text,
            'content': soup.find('div', class_='entry-content').text,
        })

    #Only reached when every article was scraped, so the cache is never partial
    with open(file, 'w') as f:
        json.dump(article_info, f)

    return article_info

In [11]:
#Run my function on the five links: it loads blog_posts.json when that file exists, otherwise it scrapes and writes it
article_info = get_blog_articles(links)
article_info

[{'title': 'Is a Career in Tech Recession-Proof?',
  'date_published': 'Aug 12, 2022',
  'content': '\nGiven the current economic climate, many economists are considering the U.S. to be entering a recession. This can cause confusion, fear, and uncertainty, especially as it pertains to job security.\nTo ease some of those feelings, below you’ll find some careers in tech that tend to hold up better than others amid a recession. In the event of a recession, companies will likely shift to digital strategies, making these careers in tech valuable and highly coveted.\n\xa0\n\n\nProgrammer/Developer\nNo matter the programming language you’ve mastered, having the knowledge alone makes you extremely valuable. The coding skills you possess as a programmer or developer are in-demand for companies looking to build or enhance their websites, and enhance their consumer experience. According to the U.S. Bureau of Labor Statistics, jobs in software development are expected to grow 22% by 2030. This is

## Exercise 2

News Articles

We will now be scraping text data from [inshorts](https://inshorts.com/), a website that provides a brief overview of many different topics.

Write a function that scrapes the news articles for the following topics:

Business  
Sports  
Technology  
Entertainment  

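The end product of this should be a function named get_news_articles that returns a list of dictionaries, where each dictionary has this shape:

```python
{
    'title': 'The article title',
    'content': 'The article content',
    'category': 'business'  # one of the four topics above
}
```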

In [12]:
#Request the business page (with a timeout so a stalled connection cannot hang the notebook)
response2 = get('https://inshorts.com/en/read/business', timeout=10)
#Stop here on a 4xx/5xx answer instead of parsing an error page
response2.raise_for_status()
soup2 = BeautifulSoup(response2.content, 'html.parser')
#Preview the first 400 characters of the page text
soup2.text[:400]

'\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nBusiness News: World and India Business News in English with Inshorts\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\ntoggle menuMenu\n\n\n\n\n\n\n\xa0\n\n\n\nEnglish\nहिन्दी\n\n\n\nCategories\n\n All News \n India \n Business \n Sports \n World \n Politics \n Technology \n Startup \n Entertainment \n Miscellaneous \n Hatke \n Science \n Automobile \n\n\n\n\n\n\n\n\n\n\n\n\n          For the best experience us'

In [13]:
#Collect every <span itemprop="headline"> element on the business page; each one holds an article title
titles = soup2.find_all('span', itemprop='headline')
titles

[<span itemprop="headline">India's GDP grows at 13.5% in first quarter of FY23, fastest in a year</span>,
 <span itemprop="headline">Snap to lay off 20% of staff, cancel several projects to cut costs</span>,
 <span itemprop="headline">Musk seeks to delay Twitter trial to Nov amid whistleblower's claims</span>,
 <span itemprop="headline">2 top executives at Snap quit hours after report about 20% layoffs emerges</span>,
 <span itemprop="headline">Viral video shows Amazon parcels thrown out of train at station, Railways clarifies</span>,
 <span itemprop="headline">World's 3rd richest person Adani's wealth surged over 13 times in 2.5 years</span>,
 <span itemprop="headline">Dell among firms conducting stay interviews to contain high attrition rates: Report</span>,
 <span itemprop="headline">Russia's Gazprom halts gas supply to Europe via major pipeline</span>,
 <span itemprop="headline">Japan calls for $24 bn investment to boost battery competitiveness</span>,
 <span itemprop="headline">Mo

In [14]:
#Collect every <div itemprop="articleBody"> element on the business page; each one holds an article summary
summaries = soup2.find_all('div', itemprop='articleBody')
summaries

[<div itemprop="articleBody">India's GDP grew at 13.5% in the first quarter of FY23, achieving its fastest annual expansion in a year, government data showed. However, it is lower than the Reserve Bank of India's (RBI) projection of 16.2% GDP growth in the first quarter of FY23. India's GDP growth in the first quarter of FY22 was 20.1%.</div>,
 <div itemprop="articleBody">Snap said on Wednesday it will lay off 20% of its staff and shut down original shows, in-app games and several other projects as part of a corporate restructuring. As per CEO  Evan Spiegel, Snap's current revenue growth rate is "well below" what the company was expecting. The cut will help Snap save an estimated $500 million in costs annually. </div>,
 <div itemprop="articleBody">Tesla CEO Elon Musk is seeking to delay the trial with Twitter until November, as per court filings. The five-day trial is currently scheduled to start on October 17. The court filings came after Musk's legal team sent a fresh letter to Twitt

In [15]:
#Titles and summaries get paired by position below, so make sure both lists have the same length
len(titles), len(summaries)

(25, 25)

In [16]:
#Define a function to scrape articles from one topic
def scrape_one_page(topic, timeout=10):
    """
    Scrape the headlines and summaries from one inshorts topic page.

    Parameters
    ----------
    topic : str
        Topic name appended to 'https://inshorts.com/en/read/',
        for example 'business'.
    timeout : int or float, optional
        Timeout in seconds handed to the request (default 10).

    Returns
    -------
    list of dict
        One dictionary per article with the keys 'title', 'content' and
        'category' (the `topic` that was passed in).

    Raises
    ------
    requests.HTTPError
        If the page answers with a 4xx/5xx status.
    """
    base_url = 'https://inshorts.com/en/read/'

    #The timeout keeps a stalled connection from hanging the caller
    response = get(base_url + topic, timeout=timeout)

    #Fail early rather than returning whatever an error page happens to contain
    response.raise_for_status()

    soup = BeautifulSoup(response.content, 'html.parser')

    titles = soup.find_all('span', itemprop='headline')

    summaries = soup.find_all('div', itemprop='articleBody')

    #Headlines and bodies are matched by position; zip stops at the shorter
    #list, so a page with fewer bodies than headlines no longer raises IndexError
    return [
        {'title': title.text, 'content': summary.text, 'category': topic}
        for title, summary in zip(titles, summaries)
    ]

In [17]:
#Test my function on the business page
business_test = scrape_one_page('business')
business_test

[{'title': "India's GDP grows at 13.5% in first quarter of FY23, fastest in a year",
  'content': "India's GDP grew at 13.5% in the first quarter of FY23, achieving its fastest annual expansion in a year, government data showed. However, it is lower than the Reserve Bank of India's (RBI) projection of 16.2% GDP growth in the first quarter of FY23. India's GDP growth in the first quarter of FY22 was 20.1%.",
  'category': 'business'},
 {'title': 'Snap to lay off 20% of staff, cancel several projects to cut costs',
  'content': 'Snap said on Wednesday it will lay off 20% of its staff and shut down original shows, in-app games and several other projects as part of a corporate restructuring. As per CEO  Evan Spiegel, Snap\'s current revenue growth rate is "well below" what the company was expecting. The cut will help Snap save an estimated $500 million in costs annually. ',
  'category': 'business'},
 {'title': "Musk seeks to delay Twitter trial to Nov amid whistleblower's claims",
  'cont

In [18]:
#Define a function that will scrape information about an array of topics
def get_news_articles():
    """
    Return the inshorts articles for the business, sports, technology and
    entertainment topics as one flat list of dictionaries.

    The list is cached in 'news_articles.json' in the current working
    directory: when that file is missing, every topic is scraped with
    scrape_one_page and the combined list is written to the file; when it
    is present, its contents are loaded and returned without scraping.
    """
    cache_path = 'news_articles.json'

    if not os.path.exists(cache_path):

        sections = ('business', 'sports', 'technology', 'entertainment')

        #Flatten the per-topic lists into one list, keeping topic order
        collected = [story for section in sections for story in scrape_one_page(section)]

        with open(cache_path, 'w') as handle:
            json.dump(collected, handle)

        return collected

    with open(cache_path) as handle:
        return json.load(handle)

In [19]:
#Test my function!
final_list = get_news_articles()
final_list

[{'title': "India's GDP grows at 13.5% in first quarter of FY23, fastest in a year",
  'content': "India's GDP grew at 13.5% in the first quarter of FY23, achieving its fastest annual expansion in a year, government data showed. However, it is lower than the Reserve Bank of India's (RBI) projection of 16.2% GDP growth in the first quarter of FY23. India's GDP growth in the first quarter of FY22 was 20.1%.",
  'category': 'business'},
 {'title': "Musk seeks to delay Twitter trial to Nov amid whistleblower's claims",
  'content': "Tesla CEO Elon Musk is seeking to delay the trial with Twitter until November, as per court filings. The five-day trial is currently scheduled to start on October 17. The court filings came after Musk's legal team sent a fresh letter to Twitter to include whistleblower Peiter Zatko's claims as another reason to end the $44 billion deal.",
  'category': 'business'},
 {'title': 'Snap to lay off 20% of staff, cancel several projects to cut costs',
  'content': 'Sn

In [20]:
#Confirm I've collected enough information: count the articles gathered across the four topics
len(final_list)

100