In [1]:
import pandas as pd
from requests import get
from bs4 import BeautifulSoup as soupify

In [2]:
# establish a base url for our requests:
url = 'https://codeup.com/blog/'

In [3]:
# we need to specify some user agent for the codeup site
# non-specified user agents are rejected
header = {'User-Agent': 'hamsandwich'}

In [4]:
# try passing get without a header, we get a 403
get(url, headers=header)

<Response [200]>

In [5]:
# establish our basic soup with the base url
soup = soupify(get(url, headers=header).content)

In [6]:
# soup

In [7]:
# soup.find_all('a', class_='more-link')

In [8]:
soup.select('a.more-link')[0]['href']

'https://codeup.com/codeup-news/dei-report/'

In [9]:
# let's get our list of urls iteratively:
blog_posts = [link['href'] for link in soup.select('a.more-link')]

In [10]:
# start building our function:
# first step: grab the article links:
def get_blog_urls(base_url, header={'User-Agent': 'hamsandwich'}):
    soup = soupify(get(url, headers=header).content)
    return [link['href'] for link in soup.select('a.more-link')]

In [11]:
get_blog_urls(url)

['https://codeup.com/codeup-news/dei-report/',
 'https://codeup.com/codeup-news/diversity-and-inclusion-award/',
 'https://codeup.com/featured/financing-career-transition/',
 'https://codeup.com/tips-for-prospective-students/tips-for-women/',
 'https://codeup.com/cloud-administration/cloud-computing-and-aws/',
 'https://codeup.com/codeup-news/c-suite-award-stephen-noteboom/']

In [12]:
# i want to grab two things:
# title
# content
# task: find where each lives

In [13]:
article_soup = soupify(get(
    'https://codeup.com/codeup-news/dei-report/',
    headers=header
).content)

In [14]:
# if I only have one thing, use select_one
article_soup.select_one('h1.entry-title').text

'Diversity Equity and Inclusion Report'

In [15]:
# let's get the article content now:
article_soup.select_one('div.entry-content').text.strip()

'Codeup is excited to launch our first Diversity Equity, and Inclusion (DEI) report! In over eight years as an organization, we’ve implemented policies and grown our DEI efforts. We are extremely proud of the progress we’ve made as a staff and Codeup community, and we recognize there is more to learn. This report captures some of the ways that we’ve lived our value of Cultivating Inclusive Growth, and how we will continue doing so as we look to the future.\nWe wanted to shine a light on the demographics of our students and staff, and in particular how that compares to the tech industry as a whole. How we collect, organize, and share employee demographic data is informed by standards set by the Equal Employment Opportunity Commission (EEOC).\nWe are proud to celebrate how we’ve grown and are motivated and committed to do more and be better. To view the report visit the link here, or download it below.'

In [16]:
def get_blog_content(base_url):
    blog_links = get_blog_urls(base_url)
    all_blogs = []
    for blog in blog_links:
        blog_soup = soupify(
            get(blog,
                headers=header).content)
        blog_content = {'title': blog_soup.select_one(
            'h1.entry-title').text,
        'content': blog_soup.select_one(
            'div.entry-content').text.strip()}
        all_blogs.append(blog_content)
    return all_blogs

In [17]:
url

'https://codeup.com/blog/'

In [18]:
my_blogs = pd.DataFrame(get_blog_content(url))

In [19]:
my_blogs

Unnamed: 0,title,content
0,Diversity Equity and Inclusion Report,Codeup is excited to launch our first Diversit...
1,Codeup Honored as SABJ Diversity and Inclusion...,Codeup has been named the 2022 Diversity and I...
2,How Can I Finance My Career Transition?,Deciding to transition into a tech career is a...
3,Tips for Women Beginning a Career in Tech,"Codeup strongly values diversity, and inclusio..."
4,What is Cloud Computing and AWS?,With many companies switching to cloud service...
5,2022 SABJ C-Suite Award Winner: Stephen Noteboom,"Codeup’s Chief Operating Officer, Stephen Note..."


Lets get the second tasking for InShorts:

In [20]:
url = 'https://inshorts.com/en/read'

In [21]:
soup = soupify(get(url).content)

In [22]:
soup.find_all('li')[1].text.lower()

'india'

In [23]:
# try concatenation:
url + '/' + soup.find_all('li')[1].text.lower()

'https://inshorts.com/en/read/india'

In [24]:
def get_cats(base_url):
    soup = soupify(get(base_url).content)
    return [cat.text.lower() for cat in soup.find_all('li')[1:]]

In [25]:
get_cats(url)

['india',
 'business',
 'sports',
 'world',
 'politics',
 'technology',
 'startup',
 'entertainment',
 'miscellaneous',
 'hatke',
 'science',
 'automobile']

In [26]:
# lets make a first go of everything
# we need at this point:
# titles on each category page
# article bodies on each category page
cat_url = url + '/' + 'science'

In [27]:
cat_soup = soupify(get(cat_url).content)

In [28]:
# cat_soup

In [29]:
# cat_soup.select('span')

In [30]:
cat_soup.find_all('span', itemprop='headline')[0].text

'New species of beetle named after Novak Djokovic'

In [31]:
cat_titles = [title.text for title in cat_soup.find_all('span', itemprop='headline')]

In [32]:
# cat_bodies

In [33]:
cat_soup.find_all('div', itemprop='articleBody')[0].text

'Serbian scientists named a new species of beetle after ex-world number one men\'s tennis player Novak Djokovic. The insect, which belongs to Duvalius genus of ground beetles present in Europe and was discovered several years ago in underground pit in Serbia, has been named \'Duvalius djokovici\'. "We feel urged to pay Djokovic back in...way we can," a researcher said.'

In [34]:
cat_bodies = [body.text for body in cat_soup.find_all('div', itemprop='articleBody')]

In [35]:
# did we grab the same number of bodies as titles?
len(cat_bodies) == len(cat_titles)

True

In [36]:
# for title, body in zip(cat_titles, cat_bodies):
#     print(title)
#     print(body)
#     print('------')

In [37]:
def get_all_shorts(base_url):
    cats = get_cats(base_url)
    all_articles = []
    for cat in cats:
        cat_url = base_url + '/' + cat
        print(get(cat_url))
        cat_soup = soupify(get(cat_url).content)
        cat_titles = [
            title.text for title in cat_soup.find_all('span', itemprop='headline')
        ]
        cat_bodies = [
            body.text for body in cat_soup.find_all('div', itemprop='articleBody')]
        cat_articles = [{'title': title,
        'category': cat,
        'body': body} for title, body in zip(
        cat_titles, cat_bodies)]
        print('cat articles length: ',len(cat_articles))
        all_articles.extend(cat_articles)
        print('length of all_articles: ', len(all_articles))
    return all_articles
        

In [38]:
all_articles = get_all_shorts(url)

<Response [200]>
cat articles length:  12
length of all_articles:  12
<Response [200]>
cat articles length:  25
length of all_articles:  37
<Response [200]>
cat articles length:  25
length of all_articles:  62
<Response [200]>
cat articles length:  25
length of all_articles:  87
<Response [200]>
cat articles length:  25
length of all_articles:  112
<Response [200]>
cat articles length:  25
length of all_articles:  137
<Response [200]>
cat articles length:  25
length of all_articles:  162
<Response [200]>
cat articles length:  25
length of all_articles:  187
<Response [200]>
cat articles length:  24
length of all_articles:  211
<Response [200]>
cat articles length:  25
length of all_articles:  236
<Response [200]>
cat articles length:  25
length of all_articles:  261
<Response [200]>
cat articles length:  24
length of all_articles:  285


In [39]:
# all_articles

In [40]:
all_articles = pd.DataFrame(all_articles)

In [41]:
all_articles.category.value_counts()

business         25
sports           25
world            25
politics         25
technology       25
startup          25
entertainment    25
hatke            25
science          25
miscellaneous    24
automobile       24
india            12
Name: category, dtype: int64

In [42]:
all_articles.title.value_counts

<bound method IndexOpsMixin.value_counts of 0                      Bharti Airtel rakes in 61% profit
1              Infosys Gifts Sikka Shares Worth Rs 8.2cr
2           India beat NZ 3-2 to enter CWG hockey finals
3                 India's first Billiards Premier League
4                      Kashmir's famous Dal Lake freezes
                             ...                        
280    Fix for wheel issue that caused electric car r...
281    Withdraw rule that makes 6 airbags mandatory i...
282    Record 5.4 lakh vehicles sold during Navratri ...
283    Mercedes-Benz sees 28% rise in sales in India ...
284    Vehicle registrations during festivals doubled...
Name: title, Length: 285, dtype: object>

In [43]:
pd.to_pickle(my_blogs, 'codeup_blogs')

In [44]:
pd.to_pickle(all_articles, 'inshorts_articles')