In [1]:
from requests import get
import bs4
from bs4 import BeautifulSoup
from os import path
import pandas as pd
import requests

## Codeup Blog Articles

Scrape the article text from the following pages:

    - https://codeup.com/codeups-data-science-career-accelerator-is-here/
    - https://codeup.com/data-science-myths/
    - https://codeup.com/data-science-vs-data-analytics-whats-the-difference/
    - https://codeup.com/10-tips-to-crush-it-at-the-sa-tech-job-fair/
    - https://codeup.com/competitor-bootcamps-are-closing-is-the-model-in-danger/
- Encapsulate your work in a function named get_blog_articles that will return a list of dictionaries, with each dictionary representing one article. The shape of each dictionary should look like this:

{

    'title': 'the title of the article',
    
    'content': 'the full text content of the article'
    
}


In [6]:
url = 'https://codeup.com/data-science/codeups-data-science-career-accelerator-is-here/'
headers = {'User-Agent': 'Codeup Data Science'} # Some websites don't accept the pyhon-requests default user-agent
response = requests.get(url, headers=headers)


In [7]:
soup = bs4.BeautifulSoup(response.text)
article = soup.find('div', class_='jupiterx-post-content')

article_dict = {'title':[], 'content':[]}

In [8]:
soup.title.string

'Codeup’s Data Science Career Accelerator is Here! - Codeup'

In [16]:
article_dict['title'] = soup.title.string
article_dict['content'] = article.text

AttributeError: 'NoneType' object has no attribute 'text'

In [12]:
article_dict

{'title': 'Codeup’s Data Science Career Accelerator is Here! - Codeup',
 'content': None}

In [13]:
def get_blog_articles(urls, cached = False):
    
    # if we already have the data and cached == True, read it locally
    if cached == True:
        df = pd.read_json('blogs.json')
    
    # if we don't have the data or we want to resave with any new data
    else:
        blogs = []
    
        # loops through urls passed in function
        for blog in urls:

            # web scraping
            headers = {'User-Agent': 'Codeup Data Science'}
            response = requests.get(blog, headers=headers)
            # takes URL and returns a soup object of the text
            soup = bs4.BeautifulSoup(response.text)
            article = soup.find('div', class_='jupiterx-post-content')

            # creates empty dictionary to hold the article title and content
            article_dict = {'title':[], 'content':[]}
            # adds title to dict
            article_dict['title'] = soup.title.string
            # adds article to dict
            article_dict['content'] = article
        
            # adds this dict of the article to the blog list
            blogs.append(article_dict)
        
        # save it for next time
        blogs = pd.DataFrame(blogs)
        blogs.to_json('blogs.json')
        
    return blogs

In [14]:
urls =  ['https://codeup.com/codeup-news/codeup-launches-first-podcast-hire-tech/', 
            'https://codeup.com/tips-for-prospective-students/why-you-need-the-best-coding-bootcamp-instructors/',
            'https://codeup.com/codeup-news/codeup-candidate-for-accreditation/',
            'https://codeup.com/codeup-news/codeup-takes-over-more-of-the-historic-vogue-building/',
            'https://codeup.com/codeup-news/inclusion-at-codeup-during-pride-month-and-always/']

get_blog_articles(urls)

Unnamed: 0,title,content
0,Codeup Launches First Podcast: Hire Tech - Codeup,
1,Why You Need the Best Coding Bootcamp Instruct...,
2,Announcing our Candidacy for Accreditation! - ...,
3,Codeup Takes Over More of the Historic Vogue B...,
4,Inclusion at Codeup During Pride Month (and Al...,


## News Articles

We will now be scraping text data from inshorts, a website that provides a brief overview of many different topics.

Write a function that scrapes the news articles for the following topics:

- Business
- Sports
- Technology
- Entertainment

The end product of this should be a function named get_news_articles that returns a list of dictionaries, where each dictionary has this shape:

{

    'title': 'The article title',
    
    'content': 'The article content',
    
    'category': 'business' # for example
    
}

Hints:

- Start by inspecting the website in your browser. Figure out which elements will be useful.
- Start by creating a function that handles a single article and produces a dictionary like the one above.
- Next create a function that will find all the articles on a single page and call the function you created in the last step for every article on the page.
- Now create a function that will use the previous two functions to scrape the articles from all the pages that you need, and do any additional processing that needs to be done.

In [None]:
response = requests.get('http://www.inshorts.com/en/news/musk-takes-a-jibe-at-rival-car-companies-says-best-service-is-not-needing-service-1627294221573', headers={'User-Agent': 'Inshorts'})

In [None]:
response.ok

In [None]:
response.text

In [None]:
url = 'http://www.inshorts.com/en/news/musk-takes-a-jibe-at-rival-car-companies-says-best-service-is-not-needing-service-1627294221573'
news_category = url.split('/')[-1]
data = requests.get(url)
soup = bs4.BeautifulSoup(data.content, 'html.parser')

In [None]:
# finding article headline
soup.find('span', attrs={"itemprop": "headline"}).string

In [None]:
# finding article text
soup.find('div', attrs={"itemprop": "articleBody"}).string

In [None]:
# finding author
soup.find('span', attrs={"author"}).string

In [None]:
# finding date
soup.find('span', attrs={"date"}).string

In [None]:
# category as assigned in url
url.split('/')[-4]

In [None]:
def get_inshorts_dataset(urls, cached=False):
    '''
    Function to scrape articles from Inshorts.com; If cached == False, runs code to scrape data
    from chosen url articles, add to dictionary, save as df in json file. If cached == True,
    reads the saved json file to a df.
    '''
    # if cached, we read already saved json file to df
    if cached == True:
        articles = pd.read_json('inshorts_articles.json')

    # cached == False, if we don't have the data or we want to resave with any new data
    else:
        
        # empty list to add individual article dictionaries to
        articles = []
        
        # loops through selected articles from Inshorts
        for article in urls:
            
            # dictionary for article and information we are going to find
            article_dict = {'headline':'','author':'','date':'','article':'','category':''}
            
            # web scraping
            headers = {'User-Agent': 'Inshorts'}
            data = requests.get(article, headers)
            # takes URL and returns a soup object of the text
            soup = bs4.BeautifulSoup(data.content, 'html.parser')

            # specific article information to add to dictionary
            article_dict['headline'] = soup.find('span', attrs={"itemprop": "headline"}).string
            article_dict['author'] = soup.find('span', attrs={"author"}).string
            article_dict['date'] = soup.find('span', attrs={"date"}).string
            article_dict['article'] = soup.find('div', attrs={"itemprop": "articleBody"}).string
#            article_dict['category'] = url.split('/')[-2]
            article_dict['category'] = soup.find('li', attrs={"class": "active-category"}).string

            # adding dictionary to list
            articles.append(article_dict)

        # converting list of dictionaries to a df
        articles = pd.DataFrame(articles)
        articles = articles[['headline', 'author','date','article', 'category']]
        # Write df to a json file for faster access
        articles.to_json('inshorts_articles.json')
        
    return articles

In [None]:
soup.find('li', attrs={"class": "active-category"})

In [None]:
urls = ['http://www.inshorts.com/en/news/musk-takes-a-jibe-at-rival-car-companies-says-best-service-is-not-needing-service-1627294221573',
       'http://www.inshorts.com/en/news/how-does-the-medal-tally-look-like-after-mondays-events-at-tokyo-olympics-1627311525736',
       'http://www.inshorts.com/en/news/tesla-cars-autopilot-system-confuses-the-moon-with-traffic-light-video-viral-1627222148462',
       'http://www.inshorts.com/en/news/driving-licence-of-actress-yashika-whos-critical-after-car-accident-seized-by-police-1627308240879']

get_inshorts_dataset(urls)

In [None]:
soup