# Web scraping exercises

In [1]:
#Import dependencies
from requests import get
from bs4 import BeautifulSoup
import pandas as pd
import re
import os
import json

## Exercise 1

Codeup Blog Articles

Visit [Codeup's Blog](https://codeup.com/blog/) and record the urls for at least 5 distinct blog posts. For each post, you should scrape at least the post's title and content.

Encapsulate your work in a function named get_blog_articles that will return a list of dictionaries, with each dictionary representing one article.

In [2]:
#Define headers
headers = {'User-Agent': 'Codeup Data Science'}

In [3]:
#Scrape blog homepage for links
url = 'https://codeup.com/blog/'

response = get(url, headers=headers)

soup = BeautifulSoup(response.content, 'html.parser')

more_links = soup.find_all('a', class_='more-link')

more_links

[<a class="more-link" href="https://codeup.com/codeup-news/panelist-spotlight-4/">read more</a>,
 <a class="more-link" href="https://codeup.com/events/black-excellence-in-tech-panelist-spotlight-stephanie-jones/">read more</a>,
 <a class="more-link" href="https://codeup.com/events/black-excellence-in-tech-panelist-spotlight-james-cooper/">read more</a>,
 <a class="more-link" href="https://codeup.com/events/black-excellence-in-tech-panelist-spotlight/">read more</a>,
 <a class="more-link" href="https://codeup.com/tips-for-prospective-students/coding-bootcamp-or-self-learning/">read more</a>,
 <a class="more-link" href="https://codeup.com/codeup-news/codeup-best-bootcamps/">read more</a>]

In [4]:
#Extract the links into something I can use
links_list = [link['href'] for link in more_links]

links_list

['https://codeup.com/codeup-news/panelist-spotlight-4/',
 'https://codeup.com/events/black-excellence-in-tech-panelist-spotlight-stephanie-jones/',
 'https://codeup.com/events/black-excellence-in-tech-panelist-spotlight-james-cooper/',
 'https://codeup.com/events/black-excellence-in-tech-panelist-spotlight/',
 'https://codeup.com/tips-for-prospective-students/coding-bootcamp-or-self-learning/',
 'https://codeup.com/codeup-news/codeup-best-bootcamps/']

In [5]:
#Make a request
response = get('https://codeup.com/codeup-news/panelist-spotlight-4/', headers=headers)

In [6]:
#Create the soup and investigate
soup = BeautifulSoup(response.content, 'html.parser')
example = soup.find('h1')
example.text

'Black Excellence in Tech: Panelist Spotlight – Wilmarie De La Cruz Mejia'

In [7]:
#Access the date published
example2 = soup.find('span', class_='published')
example2.text

'Feb 16, 2023'

In [8]:
#Access the article content
example3 = soup.find('div', class_='entry-content')
example3.text

'\nBlack excellence in tech: Panelist Spotlight – Wilmarie De La Cruz Mejia\n\nCodeup is hosting a Black Excellence in Tech Panel in honor of Black History Month on February 22, 2023! To further celebrate, we’d like to spotlight each of our panelists leading up to the discussion to learn a bit about their respective experiences as black leaders in the tech industry!\xa0\xa0\nMeet Wilmarie!\nWilmarie De\xa0La Cruz Mejia is a current Codeup student on the path to becoming a Full-Stack Web Developer at our Dallas, TX campus.\xa0\nWilmarie is a veteran expanding her knowledge of programming languages and technologies on her journey with Codeup.\xa0\nWe asked Wilmarie to share more about her experience at Codeup. She shares, “I was able to meet other people who were passionate about coding and be in a positive learning environment.”\nWe hope you can join us on February 22nd to sit in on an insightful conversation with Wilmarie and all of our panelists!\n'

In [9]:
#Loop through the links to collect the relevant information from the blog posts
article_info = []

for link in links_list:
    
    response = get(link, headers=headers)
    
    soup = BeautifulSoup(response.content, 'html.parser')
    
    info_dict = {'title': soup.find('h1').text,
                 'link': link,
                 'date_published': soup.find('span', class_='published').text,
                 'content': soup.find('div', class_='entry-content').text.strip()}
    
    article_info.append(info_dict)
    
article_info[0]    

{'title': 'Black Excellence in Tech: Panelist Spotlight – Wilmarie De La Cruz Mejia',
 'link': 'https://codeup.com/codeup-news/panelist-spotlight-4/',
 'date_published': 'Feb 16, 2023',
 'content': 'Black excellence in tech: Panelist Spotlight – Wilmarie De La Cruz Mejia\n\nCodeup is hosting a Black Excellence in Tech Panel in honor of Black History Month on February 22, 2023! To further celebrate, we’d like to spotlight each of our panelists leading up to the discussion to learn a bit about their respective experiences as black leaders in the tech industry!\xa0\xa0\nMeet Wilmarie!\nWilmarie De\xa0La Cruz Mejia is a current Codeup student on the path to becoming a Full-Stack Web Developer at our Dallas, TX campus.\xa0\nWilmarie is a veteran expanding her knowledge of programming languages and technologies on her journey with Codeup.\xa0\nWe asked Wilmarie to share more about her experience at Codeup. She shares, “I was able to meet other people who were passionate about coding and be i

In [10]:
#Create a function to collect the information and cache it as a json file
def get_blog_articles(article_list):
    
    file = 'blog_posts.json'
    
    if os.path.exists(file):
        
        with open(file) as f:
        
            return json.load(f)
    
    headers = {'User-Agent': 'Codeup Data Science'}
    
    article_info = []
    
    for article in article_list:
        
        response = get(article, headers=headers)
        
        soup = BeautifulSoup(response.content, 'html.parser')
        
        info_dict = {'title': soup.find('h1').text,
                     'link': link,
                     'date_published': soup.find('span', class_='published').text,
                     'content': soup.find('div', class_='entry-content').text}
    
        article_info.append(info_dict)
        
    with open(file, 'w') as f:
        
        json.dump(article_info, f)
        
    return article_info    

In [11]:
#Run my function to make sure it works!
article_info = get_blog_articles(links_list)
article_info[0]

{'title': 'Black Excellence in Tech: Panelist Spotlight – Wilmarie De La Cruz Mejia',
 'link': 'https://codeup.com/codeup-news/codeup-best-bootcamps/',
 'date_published': 'Feb 16, 2023',
 'content': '\nBlack excellence in tech: Panelist Spotlight – Wilmarie De La Cruz Mejia\n\nCodeup is hosting a Black Excellence in Tech Panel in honor of Black History Month on February 22, 2023! To further celebrate, we’d like to spotlight each of our panelists leading up to the discussion to learn a bit about their respective experiences as black leaders in the tech industry!\xa0\xa0\nMeet Wilmarie!\nWilmarie De\xa0La Cruz Mejia is a current Codeup student on the path to becoming a Full-Stack Web Developer at our Dallas, TX campus.\xa0\nWilmarie is a veteran expanding her knowledge of programming languages and technologies on her journey with Codeup.\xa0\nWe asked Wilmarie to share more about her experience at Codeup. She shares, “I was able to meet other people who were passionate about coding and b

## Exercise 2

News Articles

We will now be scraping text data from [inshorts](https://inshorts.com/), a website that provides a brief overview of many different topics.

Write a function that scrapes the news articles for the following topics:

Business  
Sports  
Technology  
Entertainment  

The end product of this should be a function named get_news_articles that returns a list of dictionaries, where each dictionary has this shape:

In [12]:
#Make a request of the business page
response2 = get('https://inshorts.com/en/read/business')
soup2 = BeautifulSoup(response2.content, 'html.parser')
soup2.text[:400]

'\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nBusiness News: World and India Business News in English with Inshorts\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\ntoggle menuMenu\n\n\n\n\n\n\n\n\xa0\n\n\n\n\n\nEnglish\nहिन्दी\n\n\n\nCategories\n\n All News \n India \n Business \n Sports \n World \n Politics \n Technology \n Startup \n Entertainment \n Miscellaneous \n Hatke \n Science \n Automobile \n\n\n\n\n\n\n\n\n\n\n\n\n          For the best experienc'

In [13]:
#Access the titles
titles = soup2.find_all('span', itemprop='headline')
titles

[<span itemprop="headline">Sachin Tendulkar and his wife meet Bill Gates; share pics</span>,
 <span itemprop="headline">'Best wishes to my classmate,' writes Gates in book gifted to Mahindra, latter shares pic</span>,
 <span itemprop="headline">Ambanis should get Z+ security cover across India, abroad; cost to be borne by them: SC</span>,
 <span itemprop="headline">People consuming 30 GB and paying almost nothing: Airtel's Mittal</span>,
 <span itemprop="headline">Apple supplier Foxlink's fire safety systems faulty: Fire official</span>,
 <span itemprop="headline">India's GDP growth slows down to 4.4% in Oct-Dec quarter</span>,
 <span itemprop="headline">Gujarat receives ₹40,000-cr investment for green energy projects</span>,
 <span itemprop="headline">Apple supplier Foxlink declares 30-day holiday at fire-hit factory</span>,
 <span itemprop="headline">Govt aiming to pass Telecom Bill in Monsoon Session: Vaishnaw</span>,
 <span itemprop="headline">India's automobile making sector to hi

In [14]:
#Access the summaries
summaries = soup2.find_all('div', itemprop='articleBody')
summaries[0]

<div itemprop="articleBody">Former cricketer Sachin Tendulkar and his wife Anjali Tendulkar met with Microsoft Co-founder Bill Gates in Mumbai on Tuesday. Sharing pictures of the meeting on social media, Sachin Tendulkar wrote, "We are all students for life. Today was a wonderful learning opportunity to gain perspectives on philanthropy - including children's healthcare, which our Foundation works on." </div>

In [15]:
#Make sure I'm grabbing an equal number of titles and summaries
len(titles), len(summaries)

(25, 25)

In [16]:
#Define a function to scrape articles from one topic
def scrape_one_page(topic):
    
    base_url = 'https://inshorts.com/en/read/'
    
    response = get(base_url + topic)
    
    soup = BeautifulSoup(response.content, 'html.parser')
    
    titles = soup.find_all('span', itemprop='headline')
    
    summaries = soup.find_all('div', itemprop='articleBody')
    
    summary_list = []
    
    for i in range(len(titles)):
        
        temp_dict = {'category': topic,
                     'title': titles[i].text,
                     'content': summaries[i].text}
        
        summary_list.append(temp_dict)
        
    return summary_list    

In [17]:
#Test my function on the business page
business_test = scrape_one_page('business')
business_test[0]

{'category': 'business',
 'title': 'Sachin Tendulkar and his wife meet Bill Gates; share pics',
 'content': 'Former cricketer Sachin Tendulkar and his wife Anjali Tendulkar met with Microsoft Co-founder Bill Gates in Mumbai on Tuesday. Sharing pictures of the meeting on social media, Sachin Tendulkar wrote, "We are all students for life. Today was a wonderful learning opportunity to gain perspectives on philanthropy - including children\'s healthcare, which our Foundation works on." '}

In [18]:
#Define a function that will scrape information about an array of topics
def get_news_articles(topic_list):
    
    file = 'news_articles.json'
    
    if os.path.exists(file):
        
        with open(file) as f:
            
            return json.load(f)
    
    final_list = []
    
    for topic in topic_list:
        
        final_list.extend(scrape_one_page(topic))
        
    with open(file, 'w') as f:
        
        json.dump(final_list, f)
        
    return final_list    

In [19]:
#Test my function!
topics = ['business', 'sports', 'technology', 'entertainment']

final_list = get_news_articles(topics)
final_list[0]

{'category': 'business',
 'title': 'Sachin Tendulkar and his wife meet Bill Gates; share pics',
 'content': 'Former cricketer Sachin Tendulkar and his wife Anjali Tendulkar met with Microsoft Co-founder Bill Gates in Mumbai on Tuesday. Sharing pictures of the meeting on social media, Sachin Tendulkar wrote, "We are all students for life. Today was a wonderful learning opportunity to gain perspectives on philanthropy - including children\'s healthcare, which our Foundation works on." '}

In [20]:
#Confirm I've collected enough information
len(final_list)

100

In [21]:
#Turn it into a dataframe!
final_df = pd.DataFrame(final_list)
final_df.head()

Unnamed: 0,category,title,content
0,business,Sachin Tendulkar and his wife meet Bill Gates;...,Former cricketer Sachin Tendulkar and his wife...
1,business,"'Best wishes to my classmate,' writes Gates in...",Businessman Anand Mahindra on Tuesday met Micr...
2,business,Ambanis should get Z+ security cover across In...,The Supreme Court on Tuesday stated that the Z...
3,business,People consuming 30 GB and paying almost nothi...,Bharti Airtel is looking to raise mobile phone...
4,business,Apple supplier Foxlink's fire safety systems f...,Most of the fire safety equipment at Apple sup...
