In [1]:
from bs4 import BeautifulSoup
import urllib.request, urllib.error, urllib.parse
import pandas as pd
from selenium import webdriver
from dateutil import parser

In [2]:
cnn_url = 'https://edition.cnn.com'
world_url = 'https://edition.cnn.com/world'
politics_url = 'https://edition.cnn.com/politics'
health_url = 'https://edition.cnn.com/health'
entertainment_url = 'https://edition.cnn.com/entertainment'

In [3]:
def getParseUrl(link):
    driver = getDriver()
    driver.get(link)
    webContent = driver.page_source
    
    data = BeautifulSoup(webContent, 'html.parser')
    driver.close()
    return data

In [4]:
def getParseUrlStatic(link):
    response = urllib.request.urlopen(link)
    webContent = response.read()
    
    data = BeautifulSoup(webContent, 'html.parser')
    return data

In [5]:
def getNewsLinks(data):
    headlines = data.find_all(class_ = "cd__headline")
    links = []
    for headline in headlines:
        link = headline.a["href"]
        if link[:5] != 'https':
            link = cnn_url + link
        links.append(link)
    return links

In [6]:
def getDriver():
    driver = webdriver.Chrome(executable_path=r'C:\Users\DELL\Downloads\chromedriver_win32_5\chromedriver')
    return driver

In [7]:
def getStringFromTags(tag):
    data = tag.contents
    string = ""
    for content in data:
        try:
            string += content
        except:
            string += getStringFromTags(content)
    return string

In [8]:
def cleanDate(date):
    date = date.split()
    date = date[5] + " " + date[6] + " " + date[7] + " " + date[1]
    date = parser.parse(date).strftime('%Y-%m-%dT%H:%M.000Z')
    return date

In [9]:
def getNewsFromLink(link, genre):
    news = {}
    try:
        data = getParseUrlStatic(link)
        news['title'] = data.h1.contents[0]
        news['summary'] = ""
        divs = data.find_all(True, {'class': ["zn-body__paragraph", "Paragraph__component BasicArticle__paragraph BasicArticle__pad", "Paragraph__component"]})
        for div in divs:
            news['summary'] += getStringFromTags(div)
        news['image_link'] = data.find('img')['data-src-mini']
        news['link'] = link
        news['date'] = cleanDate(data.find_all('p', class_='update-time')[0].contents[0])
        news['positivity_score'] = 80
        news['genre'] = genre
        if genre == 'world':
            news['genre'] = 'politics'
        if news['summary'] == '':
            news = {}
        news['summary'] = news['summary'].split('(CNN)', 1)[1]
    except:
        news = {}
        print("Cannot get this news " + link)
    return news

In [10]:
def getDataFromUrl(url, genre):
    data = getParseUrl(url)
    links = getNewsLinks(data)
    news = [getNewsFromLink(link, genre) for link in links if link[:23] == 'https://edition.cnn.com']
    return news

In [11]:
news = []
for genre in ['world', 'politics', 'entertainment']:
    url = cnn_url + '/' + genre
    print(genre)
    cur_news = getDataFromUrl(url, genre)
    for new in cur_news:
        news.append(new)

world
Cannot get this news https://edition.cnn.com/travel/article/pompeii-slave-room-intl-scli/index.html
Cannot get this news https://edition.cnn.com/2021/11/08/asia/india-zika-outbreak-kanpur-intl-hnk/index.html
Cannot get this news https://edition.cnn.com/videos/world/2021/11/09/china-satellite-image-aircraft-carrier-type-003-ripley-earlystart-intl-hnk-vpx.cnn
Cannot get this news https://edition.cnn.com/2021/11/08/politics/defense-department-family-members-evacuate-afghanistan/index.html
Cannot get this news https://edition.cnn.com/2021/11/09/europe/poland-belarus-border-migrants-crisis-intl/index.html
Cannot get this news https://edition.cnn.com/2021/11/09/europe/princess-charlene-monaco-return-scli-intl/index.html
Cannot get this news https://edition.cnn.com/2021/11/09/europe/denmark-restrictions-europe-covid-intl/index.html
Cannot get this news https://edition.cnn.com/2021/11/08/europe/poland-belarus-border-migrants-intl/index.html
Cannot get this news https://edition.cnn.com/20

Cannot get this news https://edition.cnn.com/2021/11/08/world/cop26-day-8-takeaways-climate-intl/index.html
Cannot get this news https://edition.cnn.com/2021/11/08/africa/ethiopia-detentions-ethnicity-rights-commission-intl/index.html
Cannot get this news https://edition.cnn.com/2021/11/07/americas/nicaragua-election-nov-7-intl-latam/index.html
Cannot get this news https://edition.cnn.com/videos/world/2021/10/22/poland-belarus-eu-border-pushback-pleitgen-pkg-ovn-intl-hnk-vpx.cnn
Cannot get this news https://edition.cnn.com/videos/world/2021/11/08/vladimir-putin-bill-burns-cia-director-meeting-russia-ukraine-sciutto-nr-vpx.cnn
Cannot get this news https://edition.cnn.com/2013/08/22/world/daylight-saving-time-fast-facts/index.html
Cannot get this news https://edition.cnn.com/2021/11/08/africa/sierra-leone-tanker-blast-115-dead-intl/index.html
Cannot get this news https://edition.cnn.com/videos/tv/2021/11/07/exp-gps-1107-fareeds-take-renewable-energy-climate.cnn
Cannot get this news https

In [12]:
len(news)

182

In [19]:
pd.DataFrame(news).dropna()

Unnamed: 0,title,summary,image_link,link,date,positivity_score,genre
8,"Communists have now left the Czech parliament,...",When the Czech Republic's 200 newly-elected de...,//cdn.cnn.com/cnnnext/dam/assets/211107170329-...,https://edition.cnn.com/2021/11/08/europe/czec...,2021-11-08T16:52.000Z,80.0,politics
10,"Israel doubles down against US, saying there i...",Sitting next to the Israeli Foreign Minister i...,//cdn.cnn.com/cnnnext/dam/assets/211107074150-...,https://edition.cnn.com/2021/11/07/middleeast/...,2021-11-07T13:13.000Z,80.0,politics
18,'Go be happy': Thousands of baby river turtles...,"Some 3,000 baby river turtles native to the Am...",//cdn.cnn.com/cnnnext/dam/assets/211106064233-...,https://edition.cnn.com/2021/11/06/americas/ba...,2021-11-06T11:08.000Z,80.0,politics
19,"Marília Mendonça, chart-topping Brazilian musi...",Brazilian country music star Marília Mendonça ...,//cdn.cnn.com/cnnnext/dam/assets/211105200534-...,https://edition.cnn.com/2021/11/05/americas/ma...,2021-11-06T03:00.000Z,80.0,politics
20,Campaigners make last effort to save man with ...,Campaigners are making a last ditch effort to ...,//cdn.cnn.com/cnnnext/dam/assets/211107231807-...,https://edition.cnn.com/2021/11/08/asia/singap...,2021-11-09T09:47.000Z,80.0,politics
...,...,...,...,...,...,...,...
174,'Colin in Black & White' explores Colin Kaeper...,Colin Kaepernick possesses greater gifts as an...,//cdn.cnn.com/cnnnext/dam/assets/211025215218-...,https://edition.cnn.com/2021/10/27/entertainme...,2021-10-27T14:19.000Z,80.0,entertainment
175,'Queens' gets the band back together for a soa...,"""Queens"" is an extremely familiar idea, elevat...",//cdn.cnn.com/cnnnext/dam/assets/210902034712-...,https://edition.cnn.com/2021/10/19/entertainme...,2021-10-19T17:09.000Z,80.0,entertainment
176,'Succession' doesn't miss a beat as its Murdoc...,"Success hasn't spoiled ""Succession,"" as the Em...",//cdn.cnn.com/cnnnext/dam/assets/211012181811-...,https://edition.cnn.com/2021/10/15/entertainme...,2021-10-15T12:11.000Z,80.0,entertainment
177,'The Wonder Years' shines in ABC's reboot of t...,Early '90s TV is hitting the refresh button th...,//cdn.cnn.com/cnnnext/dam/assets/210902034520-...,https://edition.cnn.com/2021/09/22/entertainme...,2021-09-22T15:45.000Z,80.0,entertainment
