In [None]:
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
import requests
from datetime import datetime, timedelta
import re
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By

#BERT
#Positivity Score: `classifier` returns a label (POSITIVE/NEGATIVE) and a confidence
#score; the scoring cell later in this notebook turns that into positivity_score.
from transformers import pipeline
# Allocate a pipeline for sentiment-analysis. No model name is passed, so the
# library's default checkpoint for this task is used.
classifier = pipeline('sentiment-analysis')

#Summary: T5 ('t5-small') model and tokenizer used by the summarisation loop.
import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration, T5Config
model = T5ForConditionalGeneration.from_pretrained('t5-small')
tokenizer = T5Tokenizer.from_pretrained('t5-small')
# Tokenised inputs are moved to this device before generation (CPU only here).
device = torch.device('cpu')

#Capitalize the Sentences: sentence splitter applied to the generated summaries.
import textwrap
import nltk.data
# NOTE(review): presumably requires the NLTK "punkt" resource to be installed
# locally before this load succeeds - confirm on a fresh environment.
sent_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

import pymongo
from pymongo import MongoClient
import ssl

In [None]:
# Load the bookkeeping file that records, per scraper, the timestamp of the
# newest article seen on the previous run.
datedata = pd.read_csv("./LastRunDate.csv")

# A boolean-mask .loc lookup returns a Series even when exactly one row matches.
# datetime.fromtimestamp expects a plain number, so take the first matching
# value explicitly instead of relying on implicit Series-to-number coercion.
# Raises IndexError if the CSV has no "ndtv" row.
ndtv_last_run = datedata.loc[datedata["Scraper and Model"] == "ndtv", "Date"].iloc[0]

# pastdate: fixed cut-off for this run (articles at or before it are skipped).
# lastdate: running maximum of article dates, written back to the CSV later.
pastdate = datetime.fromtimestamp(float(ndtv_last_run))
lastdate = pastdate

In [None]:
# Start the Chrome session used by fetchLinks; the chromedriver binary is
# looked up at this relative path.
# NOTE(review): `executable_path` is deprecated in Selenium 4 - confirm the
# installed Selenium version before upgrading.
driver=webdriver.Chrome(executable_path="./chromedriver_win32/chromedriver")

In [None]:
def fetchLinks(gen):
    """Collect article URLs for one NDTV section.

    The first listing page is read through the module-level Selenium `driver`;
    further listing pages are discovered from the pagination block and fetched
    with `requests`.

    Args:
        gen: Section path appended to "https://www.ndtv.com/" (e.g. "india").

    Returns:
        list of article URLs (strings), first page first. May contain
        duplicates; an empty list is returned if nothing is found.

    Raises:
        requests.RequestException: if a listing page cannot be downloaded.
    """
    section_url = "https://www.ndtv.com/" + gen
    driver.get(section_url)

    # Elements carrying the ad class are excluded from the news items. Filtering
    # (rather than a set symmetric difference) keeps page order and never lets
    # an ad-only element slip into the list.
    ad_items = set(driver.find_elements(By.CLASS_NAME, "adBg"))

    all_news = []
    for news_item in driver.find_elements(By.CLASS_NAME, "news_Itm"):
        if news_item in ad_items:
            continue
        # find_elements returns [] instead of raising when nothing matches, so
        # an item without an image wrapper or anchor is simply skipped.
        image_wraps = news_item.find_elements(By.CLASS_NAME, "news_Itm-img")
        if not image_wraps:
            continue
        anchors = image_wraps[0].find_elements(By.TAG_NAME, "a")
        if not anchors:
            continue
        href = anchors[0].get_attribute("href")
        if href:
            all_news.append(href)

    # Moving to next pages:
    response = requests.get(section_url, timeout=30)
    soup = BeautifulSoup(response.text, "html.parser")

    pages = soup.find(class_ = "listng_pagntn clear")
    if pages is None:
        # No pagination block on this section page: only the first page exists.
        return all_news

    # The first anchor is skipped and at most nine further ones are followed
    # (presumably pages 2-10 - TODO confirm against the live markup).
    for anchor in pages.find_all("a")[1:10]:
        page_url = anchor.attrs.get("href")
        if not page_url:
            continue
        page_response = requests.get(page_url, timeout=30)
        page_soup = BeautifulSoup(page_response.text, "html.parser")

        for image_wrap in page_soup.find_all(class_ = "news_Itm-img"):
            link_tag = image_wrap.a
            if link_tag is not None and link_tag.attrs.get("href"):
                all_news.append(link_tag.attrs["href"])
    return all_news

In [None]:
# Accumulator for scraped articles; gather_news appends one dict per article.
news_gathered = []

In [None]:
def _parse_publish_date(soup):
    """Return the article's publish datetime, or today at midnight if it cannot be read."""
    byline_items = soup.find_all(class_ = "pst-by_li")
    span = byline_items[-1].span if byline_items else None
    meta = span.meta if span is not None else None
    stamp = meta.attrs.get("content") if meta is not None else None
    if stamp:
        try:
            # The last six characters are dropped before parsing (presumably a
            # UTC offset such as "+05:30" - TODO confirm); the result is naive.
            return datetime.strptime(stamp[0:-6], '%Y-%m-%dT%H:%M:%S')
        except ValueError:
            pass
    # Fallback: today's date as a datetime so it stays comparable with the
    # other datetimes used for filtering.
    return datetime.now().replace(hour=0, minute=0, second=0, microsecond=0)


def gather_news(news_gathered, all_news, lastdate, gen):
    """Download articles and append the new ones to `news_gathered`.

    An article is kept only if its page contains a title, description, image
    container and story body, and if its publish date is later than the
    module-level `pastdate` (the cut-off loaded from LastRunDate.csv).

    Args:
        news_gathered: list mutated in place; one dict is appended per article.
        all_news: iterable of article URLs.
        lastdate: newest article datetime seen so far.
        gen: genre label stored with every article.

    Returns:
        The newest article datetime seen, i.e. max(lastdate, kept article dates).
    """
    for link in all_news:
        # Each page is fetched once; a failing link is skipped rather than
        # aborting the whole scrape.
        try:
            response = requests.get(link, timeout=30)
        except requests.RequestException as err:
            print("Skipping", link, "-", err)
            continue
        soup = BeautifulSoup(response.text, "html.parser")

        title_tag = soup.find(class_ = "sp-ttl")
        desc_tag = soup.find(class_ = "sp-descp")
        img_wrap = soup.find(class_ = "ins_instory_dv_cont")
        body_tag = soup.find(class_ = "ins_storybody")
        if any(tag is None for tag in (title_tag, desc_tag, img_wrap, body_tag)):
            # Not a standard article layout; nothing to extract.
            continue

        date = _parse_publish_date(soup)
        if date <= pastdate:
            continue
        if lastdate < date:
            lastdate = date

        image_tag = img_wrap.img
        image_link = image_tag.attrs.get("src", "") if image_tag is not None else ""

        # Join paragraphs with a space so the last word of one paragraph is not
        # glued to the first word of the next.
        content = " ".join(para.text for para in body_tag.find_all("p"))

        news_gathered.append(
            {
                "title": title_tag.text,
                "link": link,
                "image_link": image_link,
                "summary": content,
                "desc": desc_tag.text,
                "positivity_score": 0,
                "date": date,
                "genre": gen
            }
        )
    return lastdate

In [None]:
# Scrape each section in turn. Every pass appends to news_gathered and carries
# the newest article date forward through lastdate.
for section_path, genre_label in (
    ("world-news", "world"),
    ("india", "india"),
    ("science", "science"),
    ("offbeat", "offbeat"),
):
    lastdate = gather_news(news_gathered, fetchLinks(section_path), lastdate, genre_label)
print(lastdate)

In [None]:
# Scraping is finished: close the browser window, then end the WebDriver session.
driver.close()
driver.quit()

In [None]:
# Write the newest article time back to the bookkeeping CSV so the next run
# only picks up articles published after it.
datedata = pd.read_csv("./LastRunDate.csv")
ndtv_rows = datedata["Scraper and Model"] == "ndtv"
datedata.loc[ndtv_rows, "Date"] = lastdate.timestamp()
datedata.to_csv("LastRunDate.csv", index=False)
print("Latest News Found:", lastdate)

In [None]:
# Preview the first few scraped articles before de-duplication and scoring.
pd.DataFrame(news_gathered).head()

In [None]:
# Number of articles scraped in this run (before de-duplication).
print(len(news_gathered))

In [None]:
# Clean the scraped records: drop rows with missing values, then exact
# duplicates, then repeats of the same title within a genre (first one wins).
articles_frame = pd.DataFrame(news_gathered).dropna()
articles_frame = articles_frame.drop_duplicates(keep='first')
articles_frame = articles_frame.drop_duplicates(subset=['title', 'genre'], keep='first')
# Back to a list of dicts, the shape the following cells iterate over.
news_gathered = articles_frame.to_dict('records')

In [None]:
def _positivity(result):
    """Map a sentiment result onto 0..1: above 0.5 for POSITIVE, below for anything else."""
    half_score = result['score'] / 2
    return 0.5 + half_score if result['label'] == 'POSITIVE' else 0.5 - half_score


# Score every article on a 0-100 scale, taking the more positive of the
# body-text score and the description score.
for article in news_gathered:
    # Default used when the body text cannot be classified: fully negative, so
    # the description score decides the outcome.
    summary_sentiment = {'label': 'NEGATIVE', 'score': 1}
    try:
        summary_sentiment = classifier(article["summary"])[0]
    except Exception:
        # Best effort, as before (presumably triggered by over-long inputs -
        # TODO confirm). Catching Exception rather than using a bare except
        # lets a kernel interrupt stop the loop.
        pass
    desc_sentiment = classifier(article["desc"])[0]

    best = max(_positivity(summary_sentiment), _positivity(desc_sentiment))
    article["positivity_score"] = int(float(best) * 100)

    # Science and offbeat stories get a flat +50 boost, capped at 100.
    if article["genre"] == "science" or article["genre"] == "offbeat":
        article["positivity_score"] = min(article["positivity_score"] + 50, 100)
pd.DataFrame(news_gathered).head()

In [None]:
#Removes non-alphabetic characters:
# Ordered (pattern, replacement) clean-up rules applied by text_strip.
# ORDER OF REGEX IS VERY VERY IMPORTANT!!!!!! Later rules rely on earlier ones.
# Patterns are written for already lower-cased text and use raw strings so the
# regex escapes are not treated as (invalid) Python string escapes.
_TEXT_STRIP_RULES = tuple(
    (re.compile(pattern), replacement)
    for pattern, replacement in (
        (r"\t", " "),                 # remove escape characters
        (r"\r", " "),
        (r"\n", " "),
        (r"__+", " "),                # runs of two or more _ - ~ + .
        (r"--+", " "),
        (r"~~+", " "),
        (r"\+\++", " "),
        (r"\.\.+", " "),
        (r"[<>()|&©ø\[\]\'\",;?~*!]", " "),   # remove <>()|&©ø[]'",;?~*!
        (r"mailto:", " "),
        (r"\\x9\d", " "),             # literal "\x9<digit>" sequences in the text
        (r"inc\d+", "inc_num"),       # INC numbers -> inc_num
        (r"cm\d+|chg\d+", "cm_num"),  # CM# and CHG# -> cm_num
        (r"\.\s+", " "),              # full stop at the end of a word (not between)
        (r"-\s+", " "),               # - at the end of a word (not between)
        (r":\s+", " "),               # : at the end of a word (not between)
        (r"\s+.\s+", " "),            # single character hanging between spaces
        # https://abc.xyz.net/browse/sdf-5327 ====> abc.xyz.net
        # Each URL is replaced by its own host; a URL without a path keeps its
        # full host.
        (r"https*:/*([^/\s]+)\S*", r"\1"),
        (r"\s+", " "),                # collapse multiple spaces
        (r"\s+.\s+", " "),            # should always be last: hanging single characters
    )
)


def text_strip(column):
    """Lazily normalise each entry of `column` into lower-case, cleaned text.

    Args:
        column: iterable of values; each is converted with str() first.

    Yields:
        One cleaned string per input entry: lower-cased, noise characters and
        repeated punctuation removed, URLs reduced to their host, whitespace
        collapsed to single spaces and stripped at both ends.
    """
    for row in column:
        text = str(row).lower()
        for pattern, replacement in _TEXT_STRIP_RULES:
            text = pattern.sub(replacement, text)
        # The stripped value is what gets yielded; newlines were already
        # removed by the whitespace rules above.
        yield text.strip()

In [None]:
# Replace each article's scraped body text (stored under "summary") with a
# short T5-generated summary.
for i in range(len(news_gathered)):
    content=news_gathered[i]["summary"]

    # The "summarize: " prefix is part of the text handed to the T5 tokenizer.
    preprocess_text = content.strip().replace("\n","")
    t5_prepared_Text = "summarize: "+preprocess_text

    # NOTE(review): the input is not truncated, so long articles are encoded in
    # full - confirm this is acceptable for t5-small.
    tokenized_text = tokenizer.encode(t5_prepared_Text, return_tensors="pt").to(device)

    # summarize: beam search, 50-200 tokens, no repeated bigrams
    summary_ids = model.generate(tokenized_text,
                                        num_beams=4,
                                        no_repeat_ngram_size=2,
                                        min_length=50,
                                        max_length=200,
                                        early_stopping=True)

    output = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    sentences = sent_tokenizer.tokenize(output)
    # Upper-case only the first character of each sentence. str.capitalize()
    # would also lower-case the rest, destroying proper nouns and acronyms.
    sentences = [sent[:1].upper() + sent[1:] for sent in sentences]
    news_gathered[i]["summary"] = " ".join(sentences).strip()
    print("Summarised:", i+1)


In [None]:
# Display all articles with their generated summaries (renders the whole frame).
pd.DataFrame(news_gathered)

In [None]:
# The description was only needed for sentiment scoring; rebuild every record
# without its 'desc' field before the data is stored.
records_without_desc = []
for article in news_gathered:
    trimmed = dict(article)
    trimmed.pop('desc', None)
    records_without_desc.append(trimmed)
news_gathered = records_without_desc
pd.DataFrame(news_gathered).head()

In [None]:
# MongoDB credentials, left blank in the notebook.
# NOTE(review): the connection string in get_database() does not visibly
# reference these - confirm how they are meant to be used, and avoid
# committing real values here.
username=""
password=""

In [None]:
def get_database():
    """Connect to MongoDB and return the `firstlight` database.

    Returns:
        The `firstlight` database handle, or None if the connection could not
        be established (a message and the error are printed in that case).
    """
    # Provide the mongodb atlas url to connect python to mongodb using pymongo
    CONNECTION_STRING = f""
    try:
        # NOTE(review): ssl.CERT_NONE disables TLS certificate verification -
        # confirm this is intended outside local testing.
        conn = MongoClient(CONNECTION_STRING, ssl_cert_reqs=ssl.CERT_NONE)
        # MongoClient connects lazily, so constructing it proves nothing. A
        # ping forces a real round trip before success is reported.
        conn.admin.command("ping")
        print("Connected successfully!!!")
        return conn.firstlight
    except Exception as err:
        print("Could not connect to MongoDB")
        print(err)
        return

In [None]:
# Store the processed articles in the "news" collection.
db = get_database()
# Compare with None explicitly: get_database() returns None on failure, and
# database handles should not be truth-tested.
if db is None:
    print("Skipping insert: no database connection")
elif not news_gathered:
    # insert_many rejects an empty document list, so report the no-news case
    # plainly instead of surfacing that error.
    print("Skipping insert: no new articles to store")
else:
    try:
        db["news"].insert_many(news_gathered)
        print("Success")
    except Exception as e:
        print(e)