# Data Acquisition 

## Imports

In [1]:
from requests import get
from bs4 import BeautifulSoup
import os
import pandas as pd

## Codeup Blog Articles

Visit Codeup's blog and record the URLs of at least 5 distinct blog posts. For each post, scrape at least the post's title and content.

Encapsulate your work in a function named get_blog_articles that will return a list of dictionaries, with each dictionary representing one article. The shape of each dictionary should look like this:

```python
{
    'title': 'the title of the article',
    'content': 'the full text content of the article'
}
```

Plus any additional properties you think might be helpful.



In [2]:
# Download the Codeup blog landing page.
url = 'https://codeup.edu/blog/'
# Custom User-Agent sent with every Codeup request in this notebook
# (presumably the site rejects the default requests User-Agent — TODO confirm).
headers = {'User-Agent': 'Codeup Data Science'}
# Without a timeout, requests waits indefinitely on a stalled connection.
response = get(url, headers=headers, timeout=10)
# Fail here on a 4xx/5xx instead of parsing an error page further down.
response.raise_for_status()

In [3]:
# Parse the raw response bytes with Python's built-in HTML parser.
soup = BeautifulSoup(response.content, features='html.parser')

### class='more-link' is one way to get access to each article link

In [4]:
# Select every element carrying the 'more-link' CSS class.
# An alternative spelling is soup.find_all('a', class_='more-link').
soup.select('.more-link')

[<a class="more-link" href="https://codeup.edu/featured/apida-heritage-month/">read more</a>,
 <a class="more-link" href="https://codeup.edu/featured/women-in-tech-panelist-spotlight/">read more</a>,
 <a class="more-link" href="https://codeup.edu/featured/women-in-tech-rachel-robbins-mayhill/">read more</a>,
 <a class="more-link" href="https://codeup.edu/codeup-news/women-in-tech-panelist-spotlight-sarah-mellor/">read more</a>,
 <a class="more-link" href="https://codeup.edu/events/women-in-tech-madeleine/">read more</a>,
 <a class="more-link" href="https://codeup.edu/codeup-news/panelist-spotlight-4/">read more</a>]

In [5]:
# Look at the first matched anchor on its own.
soup.select('.more-link')[0]

<a class="more-link" href="https://codeup.edu/featured/apida-heritage-month/">read more</a>

In [6]:
# Index the tag like a dictionary to read its href attribute (the article URL).
soup.select('.more-link')[0]['href']

'https://codeup.edu/featured/apida-heritage-month/'

### List comprehension review

In [7]:
# Minimal comprehension: collect each n produced by range(1, 11) into a list.
[n for n in range(1, 11)]

[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]

### Using list comprehension to get all the links out

In [8]:
# Pull the href out of every 'more-link' anchor on the landing page.
links = [anchor['href'] for anchor in soup.select('.more-link')]
links

['https://codeup.edu/featured/apida-heritage-month/',
 'https://codeup.edu/featured/women-in-tech-panelist-spotlight/',
 'https://codeup.edu/featured/women-in-tech-rachel-robbins-mayhill/',
 'https://codeup.edu/codeup-news/women-in-tech-panelist-spotlight-sarah-mellor/',
 'https://codeup.edu/events/women-in-tech-madeleine/',
 'https://codeup.edu/codeup-news/panelist-spotlight-4/']

### Get title and content from article

In [9]:
# Fetch and parse the first article linked from the blog landing page.
url = links[0]
# Without a timeout, requests waits indefinitely on a stalled connection.
response = get(url, headers=headers, timeout=10)
# Fail here on a 4xx/5xx instead of parsing an error page.
response.raise_for_status()
# Name the parser explicitly: otherwise BeautifulSoup guesses one from whatever
# is installed (and warns), so results can differ between environments.
soup = BeautifulSoup(response.text, 'html.parser')

In [10]:
# The article title is the text of the <h1 class="entry-title"> element.
soup.find('h1', class_='entry-title').text

'Spotlight on APIDA Voices: Celebrating Heritage and Inspiring Change ft. Arbeena Thapa'

In [11]:
# The article body is the text of <div class="entry-content">;
# strip() drops the leading/trailing whitespace around it.
soup.find('div', class_='entry-content').text.strip()

'May is traditionally known as Asian American and Pacific Islander (AAPI) Heritage Month. This month we celebrate the history and contributions made possible by our AAPI friends, family, and community. We also examine our level of support and seek opportunities to better understand the AAPI community.\n\nIn an effort to address real concerns and experiences, we sat down with Arbeena Thapa, one of Codeup’s Financial Aid and Enrollment Managers.\nArbeena identifies as Nepali American and Desi. Arbeena’s parents immigrated to Texas in 1988 for better employment and educational opportunities. Arbeena’s older sister was five when they made the move to the US. Arbeena was born later, becoming the first in her family to be a US citizen.\nAt Codeup we take our efforts at inclusivity very seriously. After speaking with Arbeena, we were taught that the term AAPI excludes Desi-American individuals. Hence, we will now use the term Asian Pacific Islander Desi American (APIDA).\nHere is how the rest

### Put it together

In [12]:
def get_blog_articles(blog_url='https://codeup.edu/blog/', timeout=10):
    """
    Scrape the Codeup blog landing page and every article it links to.

    Parameters
    ----------
    blog_url : str
        URL of the blog landing page whose 'more-link' anchors are followed.
    timeout : int or float
        Seconds to wait on each HTTP request before giving up.

    Returns
    -------
    list of dict
        One dictionary per article, shaped {'title': str, 'content': str}.

    Raises
    ------
    requests.HTTPError
        If the landing page or any article responds with a 4xx/5xx status.
    ValueError
        If an article page has no title or content element.
    """
    request_headers = {'User-Agent': 'Codeup Data Science'}

    landing_response = get(blog_url, headers=request_headers, timeout=timeout)
    landing_response.raise_for_status()
    landing_soup = BeautifulSoup(landing_response.content, 'html.parser')

    # Each 'more-link' anchor on the landing page points at one full article.
    article_urls = [anchor['href'] for anchor in landing_soup.select('.more-link')]

    scraped = []

    for article_url in article_urls:

        article_response = get(article_url, headers=request_headers, timeout=timeout)
        article_response.raise_for_status()
        # Explicit parser keeps results identical across environments.
        article_soup = BeautifulSoup(article_response.text, 'html.parser')

        title_tag = article_soup.find('h1', class_='entry-title')
        content_tag = article_soup.find('div', class_='entry-content')

        # find() returns None when nothing matches; report which page is off
        # instead of failing with an AttributeError on `.text`.
        if title_tag is None or content_tag is None:
            raise ValueError('Missing title or content element at ' + article_url)

        scraped.append({
            'title': title_tag.text,
            'content': content_tag.text.strip()
        })

    return scraped


articles = get_blog_articles()

In [13]:
# Preview the first five scraped blog articles.
articles[:5]

[{'title': 'Spotlight on APIDA Voices: Celebrating Heritage and Inspiring Change ft. Arbeena Thapa',
  'content': 'May is traditionally known as Asian American and Pacific Islander (AAPI) Heritage Month. This month we celebrate the history and contributions made possible by our AAPI friends, family, and community. We also examine our level of support and seek opportunities to better understand the AAPI community.\n\nIn an effort to address real concerns and experiences, we sat down with Arbeena Thapa, one of Codeup’s Financial Aid and Enrollment Managers.\nArbeena identifies as Nepali American and Desi. Arbeena’s parents immigrated to Texas in 1988 for better employment and educational opportunities. Arbeena’s older sister was five when they made the move to the US. Arbeena was born later, becoming the first in her family to be a US citizen.\nAt Codeup we take our efforts at inclusivity very seriously. After speaking with Arbeena, we were taught that the term AAPI excludes Desi-America

### Put the articles in a DataFrame

In [14]:
# One row per scraped article; the dictionary keys become the columns.
blog_article_df = pd.DataFrame(data=articles)
blog_article_df

Unnamed: 0,title,content
0,Spotlight on APIDA Voices: Celebrating Heritag...,May is traditionally known as Asian American a...
1,Women in tech: Panelist Spotlight – Magdalena ...,Women in tech: Panelist Spotlight – Magdalena ...
2,Women in tech: Panelist Spotlight – Rachel Rob...,Women in tech: Panelist Spotlight – Rachel Rob...
3,Women in Tech: Panelist Spotlight – Sarah Mellor,Women in tech: Panelist Spotlight – Sarah Mell...
4,Women in Tech: Panelist Spotlight – Madeleine ...,Women in tech: Panelist Spotlight – Madeleine ...
5,Black Excellence in Tech: Panelist Spotlight –...,Black excellence in tech: Panelist Spotlight –...


In [15]:
# Write the articles to blog_articles.csv in the working directory;
# index=False leaves the row index out of the file.
blog_article_df.to_csv('blog_articles.csv', index=False)

## News Articles

We will now be scraping text data from inshorts, a website that provides a brief overview of many different topics.

Write a function that scrapes the news articles for the following topics:

* Business
* Sports
* Technology
* Entertainment

The end product of this should be a function named get_news_articles that returns a list of dictionaries, where each dictionary has this shape:

```python
{
    'title': 'The article title',
    'content': 'The article content',
    'category': 'business' # for example
}
```

In [16]:
# Download and parse the inshorts landing page to explore its structure.
url = 'https://inshorts.com/en/read'
# Without a timeout, requests waits indefinitely on a stalled connection.
response = get(url, timeout=10)
# Fail here on a 4xx/5xx instead of parsing an error page.
response.raise_for_status()
soup = BeautifulSoup(response.content, 'html.parser')

### Get title

In [17]:
# Headlines are <span itemprop="headline"> elements; show the first one's text.
soup.find_all('span', itemprop='headline')[0].text

'Sulabh founder Bindeshwar Pathak passes away'

### Get content

In [18]:
# Article text is in <div itemprop="articleBody"> elements; show the first one's text.
soup.find_all('div', itemprop='articleBody')[0].text

'Sulabh founder Bindeshwar Pathak has passed away at a hospital in Delhi. Pathak was known around the world for his wide ranging work in the sanitation field to improve public health, advance social progress, and improve human rights. He established the Sulabh Sanitation Movement in 1970.'

### Put it together

In [20]:
def get_news_articles(categories=('business', 'sports', 'technology', 'entertainment'),
                      timeout=10):
    """
    Scrape inshorts articles for each requested category.

    Parameters
    ----------
    categories : iterable of str
        Category slugs; each is appended to https://inshorts.com/en/read/.
    timeout : int or float
        Seconds to wait on each HTTP request before giving up.

    Returns
    -------
    list of dict
        One dictionary per article, shaped
        {'title': str, 'content': str, 'category': str}.

    Raises
    ------
    requests.HTTPError
        If a category page responds with a 4xx/5xx status.
    """
    base_url = 'https://inshorts.com/en/read'
    scraped = []

    for category in categories:

        category_response = get(base_url + '/' + category, timeout=timeout)
        category_response.raise_for_status()
        category_soup = BeautifulSoup(category_response.content, 'html.parser')

        headline_tags = category_soup.find_all('span', itemprop='headline')
        body_tags = category_soup.find_all('div', itemprop='articleBody')

        # zip pairs headline and body in document order and stops at the shorter
        # list, so a page with unequal counts cannot raise an IndexError.
        for headline_tag, body_tag in zip(headline_tags, body_tags):
            scraped.append({
                'title': headline_tag.text,
                # Some bodies end with a newline; strip it so the stored text is clean.
                'content': body_tag.text.strip(),
                'category': category,
            })

    return scraped


categories = ['business', 'sports', 'technology', 'entertainment']

inshorts = get_news_articles(categories)

In [21]:
# Preview the first five scraped news articles.
inshorts[:5]

[{'title': 'Cybersecurity firm Secureworks to lay off 15% of its workforce',
  'content': "Secureworks said it will lay off 15% of its workforce. The cybersecurity firm said it would incur about $14.2 million in expenses due to the layoffs, mostly related to employee termination benefits and real-estate costs. This would be Secureworks' second round of layoffs after 9% workforce reduction in February. Secureworks said the departing employees' last day will be August 25.\n",
  'category': 'business'},
 {'title': 'UK basic wage growth hits record high of 7.8%',
  'content': 'Basic wages in UK hit a new record growth rate of 7.8% fuelling forecasts that the Bank of England will be forced to raise interest rates again to calm inflation. Official data showed some fresh signs of cooling in the jobs market with the unemployment rate rising to 4.2% from 4%, the highest since the three months to October 2021.',
  'category': 'business'},
 {'title': 'Nigeria inflation rises to 18-year high',
  '

In [22]:
# One row per news article; the dictionary keys become the columns.
inshorts_article_df = pd.DataFrame(data=inshorts)
inshorts_article_df

Unnamed: 0,title,content,category
0,Cybersecurity firm Secureworks to lay off 15% ...,Secureworks said it will lay off 15% of its wo...,business
1,UK basic wage growth hits record high of 7.8%,Basic wages in UK hit a new record growth rate...,business
2,Nigeria inflation rises to 18-year high,Nigeria's annual inflation rose to its highest...,business
3,Gangwal to sell IndiGo stake worth $450 mn via...,Indigo airline promoter Gangwal family led by ...,business
4,No proposal to restrict more electronics' impo...,There's no proposal at present to impose impor...,business
5,"Infosys, Liberty Global ink $1.6-bn deal to sc...",Infosys and Liberty Global announced a five-ye...,business
6,Cello World files IPO papers with SEBI to rais...,Household products maker Cello World has filed...,business
7,Binance files for protective order against US SEC,Binance filed for a protective court order aga...,business
8,India taking part in global economy brought st...,Prime Minister Narendra Modi said India's part...,business
9,"Michael Burry's Scion exits Alibaba, JD.com st...",Michael Burry's Scion Asset Management exited ...,business


In [23]:
# Write the articles to news_articles.csv in the working directory;
# index=False leaves the row index out of the file.
inshorts_article_df.to_csv('news_articles.csv', index=False)

In [35]:
# NOTE(review): leftover exploration — this cell's execution count (35) is out of
# sequence with the rest of the notebook, and the lookup returned an empty list.
# The class name looks auto-generated, so it is presumably not a stable selector;
# consider deleting this cell.
soup.find_all('div', class_='QqxB5Ed9A73reHbF2fQC')

[]