# News scraping from Bing News

Code reference: https://github.com/miguelfzafra/Latest-News-Classifier

News source: https://www.bing.com/news

In [1]:
# Import modules
import requests
from bs4 import BeautifulSoup
import numpy as np
import pandas as pd

# Obtain news from the page

In [2]:
# Request the Bing News landing page and show the HTTP status code
# (200 means the request succeeded)
url = "https://www.bing.com/news"
# requests applies no timeout by default, so without one a stalled
# connection would block this cell indefinitely
req = requests.get(url, timeout=10)
req.status_code

200

In [3]:
# Raw body of the response fetched above
page = req.content
# Build the parse tree for the page using the html5lib parser
soup = BeautifulSoup(markup=page, features='html5lib')

In [4]:
# Collect every <a> element on the page that carries the CSS class "title"
news = soup.find_all(name='a', class_='title')
# Number of matched anchors, displayed as the cell output
number_of_articles = len(news)
number_of_articles

14

In [8]:
# See an example of the structure: print the raw markup of the second
# matched anchor (index 1) to check which attributes it carries; the
# extraction step reads its href, its data-author and its text
print(news[1])

<a class="title" data-artpy="0" data-author="Associated Press" h="ID=news,5263.1" href="https://www.msn.com/en-us/news/politics/ap-fact-check-biden-admin-wrong-on-vaccine-pace-elderly/ar-BB1esKeP?ocid=BingNews" target="_blank">AP FACT CHECK: Biden admin wrong on vaccine pace, elderly</a>


# Extract text from articles

In [6]:
# Parallel lists for links, titles, article text and authors: one entry per
# scraped article. They are later combined column-wise into a DataFrame, so
# they must always have the same length.
links_list = []
titles_list = []
contents_list = []
author_list = []

for anchor in news:
    # Skip anchors whose href is missing or empty: there is nothing to fetch
    link = anchor.get('href')
    if not link:
        continue

    # Title and author come from the anchor itself. The author is recorded
    # as None when the data-author attribute is absent, instead of raising
    # KeyError halfway through the row
    title = anchor.get_text()
    author = anchor.get('data-author')

    # Download and parse the article page. The timeout keeps one stalled
    # server from hanging the whole loop
    article = requests.get(link, timeout=10)
    soup_article = BeautifulSoup(article.content, 'html5lib')

    # Unify the paragraphs: join the text of every <p> element into a single
    # string. Built fresh for each article, so a page without any <p>
    # yields "" rather than reusing the previous article's text
    final_article = " ".join(p.get_text() for p in soup_article.find_all('p'))

    # Append all four fields together, only after every one of them has been
    # obtained, so an error above can never leave the lists misaligned
    links_list.append(link)
    titles_list.append(title)
    contents_list.append(final_article)
    author_list.append(author)

# Results

In [7]:
# Assemble the scraped fields into a table with one row per article;
# columns appear in the order Title, Author, Content, Link
df = pd.DataFrame(
    data=dict(
        Title=titles_list,
        Author=author_list,
        Content=contents_list,
        Link=links_list,
    )
)
df

Unnamed: 0,Title,Author,Content,Link
0,"Congress Clears $1.9 Trillion Aid Bill, Sendin...",The New York Times,WASHINGTON — Congress gave final approval on W...,https://www.msn.com/en-us/news/politics/congre...
1,AP FACT CHECK: Biden admin wrong on vaccine pa...,Associated Press,WASHINGTON (AP) — For an administration that p...,https://www.msn.com/en-us/news/politics/ap-fac...
2,Trump insists 'something bad happened' in Ga. ...,ABC News,Mexico's Congress approves landmark cannabis b...,https://www.msn.com/en-us/news/politics/trump-...
3,Joe Manchin on his veto power over Biden agend...,CNN,He's undecided on the nominee to head Health ...,https://www.msn.com/en-us/news/politics/joe-ma...
4,Texas AG threatens to sue city of Austin for d...,NBC News,Texas Attorney General Ken Paxton threatened W...,https://www.msn.com/en-us/news/us/texas-ag-thr...
5,LSU president chastised for insufficient punis...,USA TODAY,BATON ROUGE – Enraged Louisiana legislators an...,https://www.msn.com/en-us/news/us/lsu-presiden...
6,Report: Police chief violated bias policy by m...,Associated Press,"LIHUE, Hawaii (AP) — The police chief on the H...",https://www.msn.com/en-us/news/us/report-polic...
7,"Sea turtles, too weak to swim, are coming asho...",USA TODAY,Mexico's Congress approves landmark cannabis b...,https://www.msn.com/en-us/news/us/sea-turtles-...
8,Report: Cuomo groped female aide in governor's...,Associated Press,"ALBANY, N.Y. (AP) — An aide to New York Gov. A...",https://www.msn.com/en-us/news/politics/report...
9,Senate confirms Marcia Fudge and Michael Regan...,The Guardian,The Senate on Wednesday confirmed Marcia Fudge...,https://www.msn.com/en-us/news/politics/senate...
