In [1]:
from bs4 import BeautifulSoup
import urllib.request, urllib.error, urllib.parse
import pandas as pd
from selenium import webdriver

In [2]:
bbc_url = 'https://www.bbc.com'
bbc_url_news = 'https://www.bbc.com/news'
world_url = 'https://edition.cnn.com/world'
politics_url = 'https://edition.cnn.com/politics'
health_url = 'https://edition.cnn.com/health'
entertainment_url = 'https://edition.cnn.com/entertainment'

In [3]:
def getParseUrl(link, genre):
    driver = getDriver()
    driver.get(link + '/news')
    button = driver.find_element_by_link_text(genre)
    button.click()
    webContent = driver.page_source
    data = BeautifulSoup(webContent, 'html.parser')
    driver.close()
    return data

In [4]:
def getParseUrlStatic(link):
    response = urllib.request.urlopen(link)
    webContent = response.read()
    
    data = BeautifulSoup(webContent, 'html.parser')
    return data

In [5]:
def getNewsLinks(data):
    headlines = data.find_all(class_ = "gs-c-promo-heading")
    links = []
    for headline in headlines:
        try:
            link = headline["href"]
            if link[:5] != 'https':
                link = bbc_url + link
            links.append(link)
        except:
            print("Cannot find the link")
    return links

In [6]:
def getDriver():
    driver = webdriver.Chrome(executable_path=r'C:\Users\DELL\Downloads\chromedriver_win32_5\chromedriver')
    return driver

In [7]:
def getStringFromTags(tag):
    data = tag.contents
    string = ""
    for content in data:
        try:
            string += content
        except:
            string += getStringFromTags(content)
    return string

In [8]:
def getAppropriateGenre(genre):
    if genre == 'Tech':
        genre = 'technology'
    elif genre == 'Climate' or genre == 'Science':
        genre = 'science'
    elif genre == 'World' or genre == 'Asia':
        genre = 'politics'
    else:
        genre = 'entertainment'
    return genre

In [9]:
def cleanTitle(title):
    if title[:6] == 'COP26:':
        title = title.split('COP26: ', 1)[1]
    return title

In [10]:
def getNewsFromLink(link, genre):
    news = {}
    try:
        data = getParseUrlStatic(link)
        news = {}
        news['title'] = cleanTitle(data.find('h1', {'id': 'main-heading'}).contents[0])
        news['summary'] = ""
        divs = data.findAll('div', {'data-component': 'text-block'})
        for div in divs:
            news['summary'] += getStringFromTags(div)
        news['image_link'] = data.find('img')['src']
        news['link'] = link
        news['positivity_score'] = 80
        news['date'] = data.find('time')['datetime']
        news['genre'] = getAppropriateGenre(genre)
        if news['summary'] == '':
            news = {}
    except:
        print("Cannot get this news " + link)
    return news

In [11]:
def getDataFromUrl(url, genre):
    data = getParseUrl(url, genre)
    links = getNewsLinks(data)
    news = [getNewsFromLink(link, genre) for link in links]
    return list(filter(None, news))

In [12]:
news = []
for genre in ['Climate', 'World', 'Asia', 'Tech', 'Business', 'Science', 'Entertainment & Arts']:
    cur_news = getDataFromUrl(bbc_url, genre)
    for new in cur_news:
        news.append(new)

Cannot get this news https://www.bbc.co.uk/news/resources/idt-d6338d9f-8789-4bc2-b6d7-3691c0e7d138


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


Cannot get this news https://www.bbc.co.uk/sustainability/
Cannot get this news https://www.bbc.com/future/article/20211105-how-carbon-might-go-out-of-fashion
Cannot get this news https://www.bbc.com/worklife/article/20211103-the-toll-of-being-left-behind-at-work
Cannot get this news https://www.bbc.com/travel/article/20211107-inchconnachan-the-british-island-where-wallabies-rule
Cannot get this news https://www.bbc.com/culture/article/20211108-cop26-the-ancient-origins-of-the-colour-green
Cannot get this news https://www.bbc.com/future/article/20211103-the-countries-calling-for-climate-justice
Cannot get this news https://www.bbc.com/worklife/article/20211105-the-parents-who-track-their-children
Cannot get this news https://www.bbc.com/travel/article/20211104-four-us-cities-ready-to-welcome-back-travellers-safely
Cannot get this news https://www.bbc.co.uk/news/live/world-africa-47639452?ns_mchannel=social&ns_source=twitter&ns_campaign=bbc_live&ns_linkname=618aaa191b17f0262dc3e895%26Th

In [13]:
len(news)

201

In [16]:
pd.DataFrame(news).dropna()

Unnamed: 0,title,summary,image_link,link,positivity_score,date,genre
0,World headed for 2.4C warming despite climate ...,Despite pledges made at the climate summit COP...,https://ichef.bbci.co.uk/news/976/cpsprodpb/13...,https://www.bbc.com/news/science-environment-5...,80,2021-11-09T14:01:50.000Z,science
1,World headed for 2.4C warming despite climate ...,Despite pledges made at the climate summit COP...,https://ichef.bbci.co.uk/news/976/cpsprodpb/13...,https://www.bbc.com/news/science-environment-5...,80,2021-11-09T14:01:50.000Z,science
2,Climate change: What do scientists want from C...,As the COP26 climate summit enters its second ...,https://ichef.bbci.co.uk/news/976/cpsprodpb/AB...,https://www.bbc.com/news/science-environment-5...,80,2021-11-09T01:21:41.000Z,science
3,Climate change: Seven ways to spot businesses ...,Most of us are trying to be greener and for so...,https://ichef.bbci.co.uk/news/976/cpsprodpb/BC...,https://www.bbc.com/news/business-59119693,80,2021-11-08T23:58:05.000Z,science
4,Obama tells young people to stay angry on clim...,"Barack Obama has called on young people to ""st...",https://ichef.bbci.co.uk/news/976/cpsprodpb/E9...,https://www.bbc.com/news/science-environment-5...,80,2021-11-08T19:29:39.000Z,science
...,...,...,...,...,...,...,...
196,Why Tanzanian Nobel laureate Abdulrazak Gurnah...,Zanzibar-born Abdulrazak Gurnah became the mos...,https://ichef.bbci.co.uk/news/976/cpsprodpb/11...,https://www.bbc.com/news/world-africa-59178826,80,2021-11-08T00:35:50.000Z,entertainment
197,How pop star Zara Larsson made a seven-figure ...,Pop star Zara Larsson says she's made a seven-...,https://ichef.bbci.co.uk/news/976/cpsprodpb/17...,https://www.bbc.com/news/entertainment-arts-59...,80,2021-11-05T01:25:08.000Z,entertainment
198,Dug Dug: A film inspired by India's motorcycle...,"A new film, which is based on a bizarre story ...",https://ichef.bbci.co.uk/news/976/cpsprodpb/10...,https://www.bbc.com/news/world-asia-india-5903...,80,2021-11-08T00:17:25.000Z,entertainment
199,Voyage: 'Vintage Abba' or 'bafflingly retrogra...,"Forty years after their last studio album, Abb...",https://ichef.bbci.co.uk/news/976/cpsprodpb/12...,https://www.bbc.com/news/entertainment-arts-59...,80,2021-11-05T10:52:53.000Z,entertainment
