In [1]:
from requests import get
from bs4 import BeautifulSoup
import os
import pandas as pd
import numpy as np
import re
import acquire as aq

In [None]:
# Fetch one codeup.com page so its HTML can be explored before writing the scraper.
url = 'https://codeup.com/data-science/math-in-data-science/'
headers = {'User-Agent': 'Codeup Data Science'} # Some websites don't accept the python-requests default user-agent
response = get(url, headers=headers)

In [None]:
# Peek at the first 400 characters of the raw response body.
print(response.text[:400])

In [None]:
# Parse the raw response bytes with the 'html.parser' backend.
soup = BeautifulSoup(response.content, 'html.parser')

In [None]:
# Show what kind of object the parser returned.
type(soup)

In [None]:
# Display the string inside the page's <title> element.
soup.title.string

## Exercises

By the end of this exercise, you should have a file named acquire.py that contains the specified functions. If you wish, you may break your work into separate files for each website (e.g. acquire_codeup_blog.py and acquire_news_articles.py), but the end function should be present in acquire.py (that is, acquire.py should import get_blog_articles from the acquire_codeup_blog module.)



# 1. Codeup Blog Articles

Visit Codeup's Blog and record the URLs for at least 5 distinct blog posts. For each post, you should scrape at least the post's title and content.

Encapsulate your work in a function named `get_blog_articles` that will return a list of dictionaries, with each dictionary representing one article. The shape of each dictionary should look like this:

In [None]:
# Illustrative shape of one scraped blog article (placeholder values).
{
    'title': 'the title of the article',
    'content': 'the full text content of the article'
}

In [None]:
# Request the blog landing page; a custom user-agent header is sent with the request.
url = 'https://codeup.com/blog/'
headers = {'user-agent': 'Kalpana Data Science Cohort'}
response = get(url, headers=headers)

In [None]:
# Peek at the first 400 characters of the blog landing page's HTML.
print(response.text[:400])

In [None]:
# Parse the blog landing page so its structure can be explored.
# (A dangling `soup.` left over from tab-completion was removed: it is a
# SyntaxError and stops the notebook from running top to bottom.)
soup = BeautifulSoup(response.text, 'html.parser')

In [None]:
def _blog_soup(url, headers, timeout=30):
    """Download ``url`` and return the page parsed with 'html.parser'.

    ``raise_for_status`` makes an HTTP error response fail here instead of
    being parsed as if it were a blog page. ``timeout`` (seconds) keeps a
    stalled connection from hanging the whole scrape.
    """
    response = get(url, headers=headers, timeout=timeout)
    response.raise_for_status()
    return BeautifulSoup(response.text, 'html.parser')


def _blog_article_urls(soup):
    """Return the article links found on one blog listing page, in page order."""
    urls = []
    for article in soup.select('article'):
        for link in article.select('.entry-featured-image-url'):
            href = link.attrs.get('href')
            if href:
                urls.append(href)
    return urls


def _blog_next_page_url(soup):
    """Return the URL of the next listing page, or None when there is none.

    Every step of the ``.pagination.clearfix -> div -> a`` lookup is checked,
    so a page without pagination yields None rather than an AttributeError.
    """
    pagination = soup.select_one('.pagination.clearfix')
    first_div = pagination.div if pagination is not None else None
    link = first_div.a if first_div is not None else None
    if link is None:
        return None
    return link.attrs.get('href')


def _blog_article(url, headers):
    """Scrape one article page into a ``{'title', 'content'}`` dict.

    Returns None when the page lacks either expected element.
    """
    soup = _blog_soup(url, headers)
    title = soup.select_one('.entry-title')
    content = soup.select_one('.entry-content')
    if title is None or content is None:
        return None
    return {'title': title.text, 'content': content.text.strip()}


def get_blog_articles(use_cache=True):
    """Return Codeup blog articles as a DataFrame with 'title' and 'content'.

    Parameters
    ----------
    use_cache : bool, default True
        When True and 'codeup_blog_articles.csv' exists in the working
        directory, that file is read and returned without any web requests.
        Otherwise the blog is scraped and the CSV is (re)written.

    Returns
    -------
    pandas.DataFrame
        One row per article scraped (or per row of the cached CSV).

    Raises
    ------
    Whatever ``response.raise_for_status()`` raises for an HTTP error response.
    """
    # establish a filename for the local csv
    filename = 'codeup_blog_articles.csv'

    if use_cache and os.path.exists(filename):
        print('Reading from local CSV...')
        return pd.read_csv(filename)

    # otherwise, scrape the data from codeup.com
    print('Gathering blog articles from codeup.com...')

    headers = {'user-agent': 'Kalpana Data Science Cohort'}
    articles = []
    page_counter = 0

    # start at the blog homepage and follow the pagination links
    page_url = 'https://codeup.com/blog/'
    visited_pages = set()  # guards against pagination links that loop back

    while page_url is not None and page_url not in visited_pages:
        visited_pages.add(page_url)
        soup = _blog_soup(page_url, headers)

        # resolve the next page before visiting the articles on this one
        next_page_url = _blog_next_page_url(soup)

        for article_url in _blog_article_urls(soup):
            article = _blog_article(article_url, headers)
            if article is not None:
                articles.append(article)

        page_counter += 1
        print(f'{page_counter} pages complete     ', end='\r')
        page_url = next_page_url

    print(f'{page_counter} pages scraped. No more pages available.')

    # explicit columns keep the CSV header present even if nothing was scraped
    articles = pd.DataFrame(articles, columns=['title', 'content'])

    # cache local copy
    print('Writing to local CSV...')
    articles.to_csv(filename, index=False)
    print('Writing to local CSV complete.')

    return articles

In [2]:
# NOTE(review): this calls get_blog_articles from the imported `acquire` module (aq),
# not the function of the same name defined in this notebook.
df = aq.get_blog_articles()

Reading from local CSV...


In [3]:
# Preview the first rows of the blog articles frame.
df.head()

Unnamed: 0,title,content
0,Is a Career in Tech Recession-Proof?,"Given the current economic climate, many econo..."
1,Codeup X Superhero Car Show & Comic Con,Codeup had a blast at the San Antonio Superher...
2,What Jobs Can You Get After a Coding Bootcamp?...,If you’re considering a career in web developm...
3,Codeup’s New Dallas Campus,Codeup’s Dallas campus has a new location! For...
4,Codeup TV Commercial,Codeup has officially made its TV debut! Our c...


# 2. News Articles

We will now be scraping text data from inshorts, a website that provides a brief overview of many different topics.

Write a function that scrapes the news articles for the following topics:


- Business


- Sports


- Technology


- Entertainment


The end product of this should be a function named get_news_articles that returns a list of dictionaries, where each dictionary has this shape:

In [None]:
# Illustrative shape of one scraped news article (placeholder values).
{
    'title': 'The article title',
    'content': 'The article content',
    'category': 'business' # for example
}

In [None]:
# Fetch and parse the inshorts landing page to explore its structure.
# Note: this rebinds the notebook-level `url`, `response`, and `soup` names.
url = 'https://inshorts.com/en/read'
response = get(url)
soup = BeautifulSoup(response.text, 'html.parser')

In [None]:
# Prototype: pull the title and content out of the first '.news-card' element.
# `articles` is initialised here but nothing is appended to it in this cell.
articles = []
dct = {}
dct['title'] = soup.select('.news-card')[0].select('.news-card-title')[0].span.text
dct['content'] = soup.select('.news-card')[0].select('.news-card-content')[0].div.text

In [None]:
def get_news_articles(categories=('business', 'sports',
                                  'technology', 'entertainment'),
                      use_cache=True):
    """Return inshorts news articles for ``categories`` as a DataFrame.

    Parameters
    ----------
    categories : iterable of str, or a single str
        inshorts category slugs; each is appended to
        'https://inshorts.com/en/read/'. The default is an immutable tuple
        with the same four categories as before.
    use_cache : bool, default True
        When True and 'news_articles.csv' exists, the cached rows are returned
        only if the cache contains every requested category (filtered down to
        the requested ones). A cache without a 'category' column is returned
        as-is. In every other case the site is scraped and the CSV rewritten
        with just the requested categories.

    Returns
    -------
    pandas.DataFrame
        Columns 'title', 'author', 'content', 'category' when scraped;
        'author' is None for cards that have no '.author' element.

    Raises
    ------
    Whatever ``response.raise_for_status()`` raises for an HTTP error response.
    """
    # establish a filename for the local csv
    filename = 'news_articles.csv'

    # accept a bare string as a single category instead of iterating its characters
    if isinstance(categories, str):
        categories = [categories]
    else:
        categories = list(categories)

    if use_cache and os.path.exists(filename):
        print('Reading from local CSV...')
        cached = pd.read_csv(filename)
        if 'category' not in cached.columns:
            # nothing to compare against, so keep the previous behaviour
            return cached
        if set(categories) <= set(cached['category']):
            wanted = cached['category'].isin(categories)
            return cached[wanted].reset_index(drop=True)
        # the cache does not cover the request: fall through and scrape

    # otherwise, scrape the data from inshorts.com
    print('Reading news articles from inshorts.com...')

    articles = []

    for category in categories:

        url = f'https://inshorts.com/en/read/{category}'
        response = get(url, timeout=30)  # seconds; avoids hanging forever
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        for card in soup.select('.news-card'):
            title = card.select_one('.news-card-title')
            content = card.select_one('.news-card-content')
            author = card.select_one('.author')

            # skip cards without the expected title/content markup
            if title is None or title.span is None:
                continue
            if content is None or content.div is None:
                continue

            articles.append({
                'title': title.span.text,
                'author': author.text if author is not None else None,
                'content': content.div.text,
                'category': category,
            })

    # explicit columns keep the CSV header present even if nothing was scraped
    articles = pd.DataFrame(
        articles, columns=['title', 'author', 'content', 'category'])

    # cache local copy
    print('Writing to local CSV...')
    articles.to_csv(filename, index=False)

    return articles

In [4]:
# NOTE(review): this calls get_news_articles from the imported `acquire` module (aq),
# not the function of the same name defined in this notebook.
df2 = aq.get_news_articles()
# Preview the first rows of the news articles frame.
df2.head()

Reading from local CSV...


Unnamed: 0,title,author,content,category
0,Adani Transmission becomes India's 8th most va...,Hiral Goyal,Adani Transmission has entered the club of Ind...,business
1,Musk cites whistleblower's claims in new notic...,Ridham Gambhir,Tesla CEO Elon Musk's legal team has filed ano...,business
2,No plan to rebrand Zomato app to Eternal: CEO ...,Hiral Goyal,Zomato CEO Deepinder Goyal clarified in an exc...,business
3,"Cancelling AC, first-class confirmed train tic...",Ridham Gambhir,The Finance Ministry stated that cancellation ...,business
4,China arrests over 230 people tied to its larg...,Hiral Goyal,China has announced that 234 people who are su...,business
