In [None]:
import pandas as pd
import numpy as np
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from datetime import datetime, timedelta

#BERT
# Sentiment classifier; `classifier` is called on article text further down
# to derive each article's positivity score.
from transformers import pipeline
# Allocate a pipeline for sentiment-analysis
# (no model name is passed, so whichever default model the library selects
# for this task is loaded — presumably a BERT-family model; TODO confirm)
classifier = pipeline('sentiment-analysis')
# ssl is only needed for the certificate option passed to MongoClient below.
import ssl

#Summary
# t5-small weights and tokenizer, used further down to generate summaries.
import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration, T5Config
model = T5ForConditionalGeneration.from_pretrained('t5-small')
tokenizer = T5Tokenizer.from_pretrained('t5-small')
# Encoded inputs are moved to this device before generation.
device = torch.device('cpu')

#Capitalize the Sentences
# Punkt sentence tokenizer, used to split generated summaries into sentences
# so that each one can be capitalised.
import textwrap
import nltk.data
sent_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

import pymongo
from pymongo import MongoClient

from bs4 import BeautifulSoup 
import re
import urllib.request, urllib.parse, urllib.error

In [None]:
# Base address of the CNN international edition; every section page below is
# a path underneath it.
cnn_url = 'https://edition.cnn.com'

# Section landing pages.
world_url = cnn_url + '/world'
politics_url = cnn_url + '/politics'
health_url = cnn_url + '/health'
entertainment_url = cnn_url + '/entertainment'
india_url = cnn_url + '/india'
business_url = cnn_url + '/business'
sports_url = cnn_url + '/sport'
# Technology is a sub-section of business rather than a top-level section.
technology_url = business_url + '/tech'

In [None]:
# Load the bookkeeping file holding, per scraper, the Unix timestamp of the
# newest article seen on the previous run.
datedata = pd.read_csv("./LastRunDate.csv")

# Selecting with a boolean mask yields a Series even when exactly one row
# matches, and datetime.fromtimestamp() needs a plain number, so extract the
# scalar explicitly instead of passing the Series itself.
cnn_last_run = datedata.loc[datedata["Scraper and Model"] == "cnn", "Date"]
pastdate = datetime.fromtimestamp(float(cnn_last_run.iloc[0]))

# lastdate tracks the newest article seen during this run; it starts at the
# previous cutoff and is advanced by getNewsFromLink.
lastdate = pastdate

In [None]:
def getParseUrl(link):
    """Load ``link`` in a fresh browser and return the rendered page as soup.

    A new driver is obtained from ``getDriver()`` for every call. The driver
    is always shut down, even when loading the page fails, so that no
    browser or chromedriver process is left behind.

    Args:
        link: URL of the page to render.

    Returns:
        A ``BeautifulSoup`` tree built from the driver's page source.
    """
    driver = getDriver()
    try:
        driver.get(link)
        webContent = driver.page_source
    finally:
        # quit() ends the whole session; close() would only close the window
        # and leave the driver process running.
        driver.quit()

    return BeautifulSoup(webContent, 'html.parser')

In [None]:
def getParseUrlStatic(link):
    """Fetch ``link`` with a plain HTTP request and return it as soup.

    No browser is involved, so only the markup delivered by the server is
    parsed.

    Args:
        link: URL of the page to download.

    Returns:
        A ``BeautifulSoup`` tree built from the response body.

    Raises:
        urllib.error.URLError: If the request fails (propagated from
            ``urlopen``).
    """
    # The context manager closes the HTTP response once the body is read.
    with urllib.request.urlopen(link) as response:
        webContent = response.read()

    return BeautifulSoup(webContent, 'html.parser')

In [None]:
def getNewsLinks(data):
    """Collect absolute article URLs from a parsed section page.

    Every element with class ``cd__headline`` is inspected; the ``href`` of
    its first anchor is resolved against ``cnn_url``. Headlines without an
    anchor or without an ``href`` are skipped instead of raising.

    Args:
        data: Parsed page supporting ``find_all`` (a ``BeautifulSoup`` tree).

    Returns:
        A list of absolute URLs, in document order.
    """
    links = []
    for headline in data.find_all(class_="cd__headline"):
        anchor = headline.a
        href = anchor.get("href") if anchor is not None else None
        if not href:
            continue
        # urljoin leaves absolute URLs alone and resolves site-relative
        # paths (and protocol-relative ones) against the CNN base address.
        links.append(urllib.parse.urljoin(cnn_url, href))
    return links

In [None]:
def getDriver():
    """Return a new Chrome WebDriver.

    The driver executable is taken from the relative path
    ``./chromedriver_win32/chromedriver``.
    """
    return webdriver.Chrome(executable_path=r'./chromedriver_win32/chromedriver')

In [None]:
def getStringFromTags(tag):
    """Return all text found inside ``tag``, concatenated in document order.

    Children that are strings are taken as they are; any other child is
    treated as a nested element and flattened recursively through its own
    ``contents``.

    Args:
        tag: Object exposing a ``contents`` sequence (e.g. a bs4 ``Tag``).

    Returns:
        The concatenated text as a plain ``str`` (empty when there is none).
    """
    pieces = []
    for content in tag.contents:
        if isinstance(content, str):
            pieces.append(content)
        else:
            # Not text, so descend into the nested element.
            pieces.append(getStringFromTags(content))
    return "".join(pieces)

In [None]:
def cleanDate(date):
    """Parse an article's "update time" text into a naive local datetime.

    The text is split on whitespace; token 1 is read as the time (``HHMM``)
    and tokens 5-7 as the date (``Month D, YYYY``). The time is treated as
    UTC and converted to the machine's local time zone.
    NOTE(review): this presumably matches strings such as
    "Updated 1234 GMT (2034 HKT) January 5, 2021" — confirm against the site.

    Args:
        date: The raw update-time string.

    Returns:
        A naive ``datetime`` expressed in local time.

    Raises:
        IndexError: If the text has fewer than eight tokens.
        ValueError: If the selected tokens do not form a valid date.
    """
    from datetime import timezone

    tokens = date.split()
    stamp = " ".join((tokens[5], tokens[6], tokens[7], tokens[1]))
    moment_utc = datetime.strptime(stamp, "%B %d, %Y %H%M").replace(tzinfo=timezone.utc)
    # Convert through an aware datetime rather than adding now() - utcnow():
    # that difference comes from two separate clock reads (microsecond
    # jitter) and uses today's UTC offset instead of the article date's.
    return moment_utc.astimezone().replace(tzinfo=None)

In [None]:
# Attributes of the first <img> that may hold the picture URL, tried in order.
_IMAGE_ATTRIBUTES = (
    'data-src-full16x9',
    'data-src-large',
    'data-src-medium',
    'data-src-small',
    'data-src-mini',
    'src',
)
# An image link is only accepted when it ends with one of these.
_IMAGE_EXTENSIONS = ('.jpg', '.png', '.gif', '.jpeg')
# Fragments removed from the image URL, in this order. They look like
# image-transformation parameters (blur / quality / thumbnail size) —
# TODO confirm against the image host.
_IMAGE_URL_FRAGMENTS = (
    'e_blur:500,',
    'q_auto:low,',
    'w_50,',
    'c_fill,',
    'g_auto,',
    'h_50,',
    'h_28,',
)
# CSS classes identifying article body paragraphs.
_PARAGRAPH_CLASSES = [
    "zn-body__paragraph",
    "Paragraph__component BasicArticle__paragraph BasicArticle__pad",
    "Paragraph__component",
]


def _pickImageLink(data):
    """Return the article's image URL, or '' when no usable image is found."""
    images = data.find_all('img')

    link = ''
    if images:
        # First non-empty candidate attribute on the first image wins.
        for attribute in _IMAGE_ATTRIBUTES:
            link = images[0].get(attribute) or ''
            if link:
                break

    # Nothing usable on the first image: fall back to the second image's src.
    if not link.endswith(_IMAGE_EXTENSIONS):
        link = (images[1].get('src') or '') if len(images) > 1 else ''
    if not link.endswith(_IMAGE_EXTENSIONS):
        return ''

    for fragment in _IMAGE_URL_FRAGMENTS:
        link = link.replace(fragment, '')
    return link


def getNewsFromLink(link, genre):
    """Scrape one article and return it as a dict, or ``{}`` if it is rejected.

    An article is rejected when the page cannot be fetched or parsed, when no
    body text is found, when its update time cannot be parsed, or when it is
    not newer than the global ``pastdate``. For an accepted article the
    global ``lastdate`` is advanced to the article's date if that is later.

    Args:
        link: Absolute URL of the article.
        genre: Section name stored with the article.

    Returns:
        A dict with the keys ``title``, ``summary``, ``image_link``,
        ``link``, ``date``, ``positivity_score`` (always 0 here) and
        ``genre``; or an empty dict.
    """
    global lastdate
    try:
        data = getParseUrlStatic(link)
        title = data.h1.contents[0]

        paragraphs = data.find_all(True, {'class': _PARAGRAPH_CLASSES})
        summary = ''.join(getStringFromTags(paragraph) for paragraph in paragraphs)
        image_link = _pickImageLink(data)

        try:
            published = cleanDate(data.find_all('p', class_='update-time')[0].contents[0])
        except Exception:
            # Without a parsable date the article cannot be shown to be newer
            # than the previous run, so it is rejected.
            return {}

        if summary == '':
            return {}
        # Drop the dateline: keep only what follows the first "(CNN)" marker.
        if '(CNN)' in summary:
            summary = summary.split('(CNN)', 1)[1]

        if published <= pastdate:
            return {}
        if lastdate < published:
            lastdate = published

        return {
            'title': title,
            'summary': summary,
            'image_link': image_link,
            'link': link,
            'date': published,
            'positivity_score': 0,
            'genre': genre,
        }
    except Exception:
        # Best effort: any failure while scraping one article only skips it.
        return {}

In [None]:
def getDataFromUrl(url, genre):
    """Scrape every article linked from the section page at ``url``.

    The page is rendered with ``getParseUrl``, its headline links are
    extracted with ``getNewsLinks``, and each link that starts with
    ``https://edition.cnn.com`` is scraped with ``getNewsFromLink``.

    Returns:
        One entry per scraped link, in page order (an entry is whatever
        ``getNewsFromLink`` returned for that link).
    """
    page = getParseUrl(url)
    articles = []
    for link in getNewsLinks(page):
        if not link.startswith('https://edition.cnn.com'):
            continue
        articles.append(getNewsFromLink(link, genre))
    return articles

In [None]:
# Scrape each section in turn and keep only the non-empty article dicts.
news = []
for genre in ['world', 'politics', 'entertainment', 'india', 'business', 'technology', 'health']:
    # Technology has its own address; every other section is a path under cnn_url.
    url = technology_url if genre == 'technology' else cnn_url + '/' + genre
    print(genre)
    cur_news = getDataFromUrl(url, genre)
    news.extend([article for article in cur_news if len(article) > 0])

In [None]:
# Record the newest article time seen in this run as the cutoff for the "cnn" row.
datedata = pd.read_csv("./LastRunDate.csv")
is_cnn_row = datedata["Scraper and Model"] == "cnn"
datedata.loc[is_cnn_row, "Date"] = lastdate.timestamp()
datedata.to_csv("LastRunDate.csv", index=False)
print("Latest News Found:", lastdate)

In [None]:
# Notebook output: number of article dicts currently in `news`.
len(news)

In [None]:
# Drop incomplete rows, exact duplicates, and repeated (title, genre) pairs,
# then turn the table back into a list of dicts.
news = (
    pd.DataFrame(news)
    .dropna()
    .drop_duplicates(keep='first')
    .drop_duplicates(['title', 'genre'], keep='first')
    .to_dict('records')
)

In [None]:
# Notebook output: number of records in `news` at this point.
len(news)

In [None]:
# Result used when the classifier cannot process a text; it maps to a
# positivity of 0.
_FALLBACK_SENTIMENT = {'label': 'NEGATIVE', 'score': 1}

# Points added to the score for certain genres (the result is capped at 100).
_GENRE_BONUS = {
    'science': 50,
    'health': 50,
    'technology': 25,
    'business': 25,
    'entertainment': 25,
}


def _classifyOrFallback(text):
    """Classify ``text``; return the fallback result if the classifier fails."""
    try:
        return classifier(text)[0]
    except Exception:
        return dict(_FALLBACK_SENTIMENT)


def _toPositivity(result):
    """Map a classifier result to [0, 1]: 1 = surely positive, 0 = surely negative."""
    half = result['score'] / 2
    return 0.5 + half if result['label'] == 'POSITIVE' else 0.5 - half


for article in news:
    words = article["summary"].split()

    # Score the full text and a shortened version, then keep the more
    # positive of the two. The shortened version uses the first 300 words,
    # retrying with 150 if that fails; if the retry fails too, the fallback
    # result is used instead of aborting the whole run.
    whole = _classifyOrFallback(article["summary"])
    try:
        head = classifier(' '.join(words[:300]))[0]
    except Exception:
        head = _classifyOrFallback(' '.join(words[:150]))

    score = int(float(max(_toPositivity(whole), _toPositivity(head))) * 100)

    bonus = _GENRE_BONUS.get(article["genre"], 0)
    if bonus:
        score = min(score + bonus, 100)
    article["positivity_score"] = score

In [None]:
# Notebook output: show the current `news` records as a table.
pd.DataFrame(news)

In [None]:
# Substitutions applied to every row before URL shortening.
# ORDER IS IMPORTANT: later rules rely on what earlier rules removed.
_TEXT_STRIP_RULES = tuple(
    (re.compile(pattern), replacement)
    for pattern, replacement in (
        (r"(\t)", ' '),   # tab / carriage return / newline characters
        (r"(\r)", ' '),
        (r"(\n)", ' '),
        (r"(__+)", ' '),      # runs of two or more _
        (r"(--+)", ' '),      # runs of two or more -
        (r"(~~+)", ' '),      # runs of two or more ~
        (r"(\+\++)", ' '),    # runs of two or more +
        (r"(\.\.+)", ' '),    # runs of two or more .
        (r"[<>()|&©ø\[\]\'\",;?~*!]", ' '),  # individual punctuation marks
        (r"(mailto:)", ' '),
        (r"(\\x9\d)", ' '),   # literal "\x9<digit>" sequences left in text
        (r"([iI][nN][cC]\d+)", 'INC_NUM'),  # INC numbers -> INC_NUM
        (r"([cC][mM]\d+)|([cC][hH][gG]\d+)", 'CM_NUM'),  # CM# / CHG# -> CM_NUM
        (r"(\.\s+)", ' '),    # full stop at the end of a word (not inside one)
        (r"(\-\s+)", ' '),    # hyphen at the end of a word
        (r"(\:\s+)", ' '),    # colon at the end of a word
        (r"(\s+.\s+)", ' '),  # single character between whitespace
    )
)
# https://abc.xyz.net/browse/sdf-5327 ====> abc.xyz.net (group 3 is the host)
_URL_PATTERN = re.compile(r'((https*:\/*)([^\/\s]+))(.[^\s]+)')
_WHITESPACE_RUN = re.compile(r"(\s+)")
_LONE_CHARACTER = re.compile(r"(\s+.\s+)")


def text_strip(column):
    """Yield a lower-cased, normalised copy of every row in ``column``.

    Each row is converted with ``str()`` and then passed through an ordered
    list of regex substitutions that remove control characters, repeated
    and stray punctuation, ``mailto:`` prefixes and single hanging
    characters, and replace ticket-like numbers with placeholders. Each URL
    is reduced to its own host name, whitespace runs are collapsed, and
    leading/trailing whitespace is removed.

    Args:
        column: Iterable of values to clean.

    Yields:
        One cleaned string per input row, in order.
    """
    for row in column:
        row = str(row)
        for pattern, replacement in _TEXT_STRIP_RULES:
            # Lower-casing after each step also lower-cases the inserted
            # placeholders (INC_NUM -> inc_num).
            row = pattern.sub(replacement, row).lower()

        # Backreference to the host group, so every URL is shortened to its
        # own host; rows without a URL are left unchanged.
        row = _URL_PATTERN.sub(r'\3', row)

        row = _WHITESPACE_RUN.sub(' ', row).lower()
        # Should always be the last substitution.
        row = _LONE_CHARACTER.sub(' ', row).lower()

        yield row.strip()

In [None]:
# Replace each article's full text with a generated summary.
for i, article in enumerate(news):
    content = article["summary"]

    # The model input is the article text with newlines removed, preceded by
    # the "summarize: " task prefix.
    preprocess_text = content.strip().replace("\n","")
    t5_prepared_Text = "summarize: "+preprocess_text

    tokenized_text = tokenizer.encode(t5_prepared_Text, return_tensors="pt").to(device)

    # Beam search producing a summary of 50 to 200 tokens.
    summary_ids = model.generate(tokenized_text,
                                        num_beams=4,
                                        no_repeat_ngram_size=2,
                                        min_length=50,
                                        max_length=200,
                                        early_stopping=True)

    output = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

    # Upper-case only the first character of each sentence. str.capitalize()
    # would also lower-case the rest, ruining names and acronyms ("CNN").
    sentences = [sent[:1].upper() + sent[1:] for sent in sent_tokenizer.tokenize(output)]
    article["summary"] = ' '.join(sentences).strip()
    print("Summarised:", i+1)

In [None]:
# Notebook output: show the current `news` records as a table.
pd.DataFrame(news)

In [None]:
# Database credentials; both are left blank here.
username, password = "", ""

In [None]:
def get_database():
    """Connect to MongoDB and return the ``firstlight`` database.

    Returns:
        The ``firstlight`` database handle, or ``None`` when the client
        cannot be created or the server does not answer a ping.
    """
    # Provide the mongodb atlas url to connect python to mongodb using pymongo
    CONNECTION_STRING = f""
    try:
        # NOTE(review): ssl.CERT_NONE turns off server certificate
        # verification — confirm this is intended outside development.
        conn = MongoClient(CONNECTION_STRING, ssl_cert_reqs=ssl.CERT_NONE)
        # Constructing the client does not contact the server, so issue a
        # cheap command to find out whether the connection really works.
        conn.admin.command('ping')
    except Exception:
        print("Could not connect to MongoDB")
        return None

    print("Connected successfully!!!")
    return conn.firstlight

In [None]:
# Store all processed articles in the "news" collection; any failure is
# printed rather than raised.
db = get_database()
try:
    db["news"].insert_many(news)
except Exception as e:
    print(e)
else:
    print("Success")