In [None]:
# --- Scraping and tabular data handling ---
import requests
from bs4 import BeautifulSoup
from datetime import datetime
import pandas as pd

# --- Sentiment ("positivity") scoring ---
from transformers import pipeline
# No model name is passed, so the library's default sentiment-analysis model is
# used; `classifier` is called on the article text further down in this notebook.
classifier = pipeline('sentiment-analysis')

# --- Summarisation with the 't5-small' checkpoint ---
import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration, T5Config
model = T5ForConditionalGeneration.from_pretrained('t5-small')
tokenizer = T5Tokenizer.from_pretrained('t5-small')
# Only the tokenised inputs are moved to `device` later on; the model itself is
# never moved explicitly in the code visible here.
device = torch.device('cpu')

# --- Sentence splitting, used when re-capitalising generated summaries ---
import textwrap
import nltk.data
# NOTE(review): assumes the NLTK 'punkt' resource is already available locally — TODO confirm.
sent_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
import re

# --- MongoDB persistence ---
import pymongo
from pymongo import MongoClient
import ssl

In [None]:
# Load the POSIX timestamp of the previous run for the "sports" scraper.
datedata=pd.read_csv("./LastRunDate.csv")
# A boolean-mask .loc lookup returns a Series, not a scalar. Take the first
# matching row explicitly so datetime.fromtimestamp() receives a plain float.
pastdate=datetime.fromtimestamp(float(datedata.loc[datedata["Scraper and Model"]=="sports", "Date"].iloc[0]))
# `lastdate` is the cut-off used by the crawl: older articles end the run.
lastdate=pastdate

In [None]:
def convert_date(date_string):
    """Parse a timestamp such as ``February 7, 2022 10:07 AM`` into a datetime.

    Args:
        date_string: Text in the form ``<Month name> <day>, <year> <hh>:<mm> <AM|PM>``.

    Returns:
        The corresponding naive ``datetime``.

    Raises:
        ValueError: If ``date_string`` matches neither supported layout.
    """
    try:
        # %p only influences the hour when paired with the 12-hour %I directive;
        # with %H the AM/PM marker is consumed but ignored, so "10:07 PM" used
        # to come back as 10:07 and "12:30 AM" as 12:30.
        return datetime.strptime(date_string, "%B %d, %Y %I:%M %p")
    except ValueError:
        # Keep accepting what the previous pattern accepted (e.g. "13:05 PM",
        # "0:30 AM"), where the hour is already on a 24-hour clock.
        return datetime.strptime(date_string, "%B %d, %Y %H:%M %p")

In [None]:
def collect_links(page, last_run_date):
    """Collect article links from one listing page of india.com/sports.

    Args:
        page: Listing page number to fetch.
        last_run_date: ``datetime`` cut-off; the first item dated before it
            ends the scan of this page.

    Returns:
        A ``(news_links, stop)`` tuple, always. ``news_links`` holds the hrefs
        gathered from this page. ``stop`` is True when the crawl should not
        move on to the next page: an item older than ``last_run_date`` was
        met, the page could not be fetched or parsed, or the listing had no
        items.
    """
    news_links = []
    # Without a timeout, requests may wait on a stalled server indefinitely.
    req = requests.get("https://www.india.com/sports/page/{}".format(page), timeout=30)
    if req.status_code != 200:
        # Previously this path fell through and returned None, which made the
        # caller fail when unpacking the result.
        print("Page Error. HTTP status:", req.status_code)
        return news_links, True

    soup = BeautifulSoup(req.text, "html.parser")
    section = soup.find(class_ = "listing-cities-news")
    if section is None:
        print("Page Error.")
        return news_links, True

    news_items = section.findAll(class_ = "catPgListitem")
    # A listing without items cannot advance the crawl; stop rather than keep
    # requesting ever higher page numbers.
    stop = not news_items
    for item in news_items:
        try:
            # The date sits on the third line of the byline; the last four
            # characters are dropped before parsing
            # (presumably a timezone suffix such as " IST" — verify against the site).
            formatted_date = convert_date(item.find(class_ = "byline").text.split("\n")[2].strip()[:-4])
            if formatted_date < last_run_date:
                stop = True
                break
            news_links.append(item.find("a").attrs["href"])
        except Exception:
            # Best effort: items with unexpected markup are skipped.
            continue
    return news_links, stop

In [None]:
def update_last_run_date(date):
    """Store ``date`` as the "sports" row's POSIX timestamp in LastRunDate.csv.

    The CSV in the current working directory is read, the ``Date`` cell of
    every row whose ``Scraper and Model`` column equals ``"sports"`` is
    overwritten, and the file is written back without the index column.
    """
    run_log = pd.read_csv("./LastRunDate.csv")
    is_sports_row = run_log["Scraper and Model"] == "sports"
    run_log.loc[is_sports_row, "Date"] = datetime.timestamp(date)
    run_log.to_csv("LastRunDate.csv", index=False)

In [None]:
# Accumulator for scraped articles: gather_news() appends one dict per article,
# and the later cells score, summarise and store these records.
news_gathered=[]

In [None]:
# Newest article date seen so far; starts at the previous run's cut-off.
newdate=lastdate
def gather_news(news_links):
    """Download every article in ``news_links`` and record it in ``news_gathered``.

    For each link the headline, publication date, lead image URL and body text
    are extracted and appended to the module-level ``news_gathered`` list as a
    dict. The module-level ``newdate`` is advanced to the latest publication
    date encountered.

    Articles that cannot be fetched or do not match the expected markup are
    skipped; the reason is printed so dropped links are visible.

    Args:
        news_links: Iterable of article URLs.

    Returns:
        None. Results are delivered through ``news_gathered`` and ``newdate``.
    """
    global newdate
    for news_link in news_links:
        try:
            # Without a timeout, requests may wait on a stalled server indefinitely.
            req = requests.get(news_link, timeout=30)
            soup = BeautifulSoup(req.text, "html.parser")
            article = soup.find(class_ = "article-page")
            heading = article.find("h1").text
            # Date text follows ": " in the byline; the last four characters
            # are dropped before parsing
            # (presumably a timezone suffix such as " IST" — verify against the site).
            date = convert_date(article.find(class_ = "authors-m").find("aside").text.split(": ")[1].strip()[:-4])
            if newdate<date:
                newdate=date
            image_wrapper = article.find(class_ = "content-wrap").figure.div.img
            image_link = image_wrapper.attrs["data-lazy-src"]
            content_body = article.find(class_ = "articleBody")
            # Keep only the text before any "Also Read" cross-link in each paragraph.
            content = "".join(
                para.text.split("Also Read")[0] for para in content_body.findAll("p")
            )

            news_gathered.append(
                {
                    "title": heading,
                    "link": news_link,
                    "image_link": image_link,
                    # Holds the full article text for now; a later cell
                    # replaces it with the generated summary.
                    "summary": content,
                    "positivity_score": 0,
                    "date": date,
                    "genre": "sports"
                }
            )
        except Exception as exc:
            # Best effort per article, but no longer silent, and Ctrl-C
            # (KeyboardInterrupt) is no longer swallowed.
            print("Skipped article:", news_link, repr(exc))
            continue
    return

In [None]:
# Walk the listing pages from the newest one until collect_links signals that
# articles older than the previous run have been reached.
page = 1
while True:
    output = collect_links(page, lastdate)
    if output is None:
        # Defensive guard: a missing (links, stop) pair means the page was not
        # usable, so end the crawl instead of failing on the unpacking below.
        print("Stopping: page {} could not be processed.".format(page))
        break
    news_links, stop = output

    gather_news(news_links)
    print(stop)
    if stop:
        break
    page += 1

In [None]:
# Number of articles collected so far in this run.
len(news_gathered)

In [None]:
# Persist the newest article date seen (tracked in `newdate` by gather_news)
# to LastRunDate.csv.
update_last_run_date(newdate)

In [None]:
# Attach a 0-100 positivity score to every gathered article.
for article in news_gathered:
    # Fallback used when the classifier fails: it maps to a score of 0.
    sentiment = {'label': 'NEGATIVE', 'score': 1}
    try:
        # Only the first 1500 characters are classified
        # (presumably to stay within the model's input limit — TODO confirm).
        sentiment = classifier(article["summary"][:1500])[0]
    except Exception as exc:
        # Keep going with the fallback, but report the failure instead of
        # hiding it; KeyboardInterrupt is no longer trapped here.
        print("Sentiment scoring failed for", article.get("link"), repr(exc))

    # Fold label + confidence onto one axis: POSITIVE -> 0.5..1.0, otherwise 0.0..0.5.
    if sentiment['label'] == 'POSITIVE':
        positivity = sentiment['score'] / 2 + 0.5
    else:
        positivity = 0.5 - sentiment['score'] / 2
    article["positivity_score"] = int(float(positivity) * 100)

    # These two genres get a +50 boost, capped at 100.
    if article["genre"] == "science" or article["genre"] == "offbeat":
        article["positivity_score"] += 50
        article["positivity_score"] = min(article["positivity_score"], 100)
pd.DataFrame(news_gathered).head()

In [None]:
# Text normalisation helper: lowercases and strips noise characters.
def text_strip(column):
    """Lazily yield a cleaned, lower-cased version of every entry in ``column``.

    Each entry is converted with ``str()`` and run through a fixed sequence of
    regex substitutions (escape characters, repeated punctuation, selected
    symbols, ticket-number placeholders, trailing punctuation, stray single
    characters, URLs reduced to their host, collapsed whitespace). The result
    is lower-cased and stripped of surrounding whitespace.

    Args:
        column: Iterable of values to clean.

    Yields:
        One cleaned string per input entry, in order.
    """
    # ORDER OF REGEX IS VERY VERY IMPORTANT!!!!!!
    # Raw strings keep the patterns identical while avoiding invalid
    # escape-sequence warnings for "\+", "\.", "\s", "\d" and friends.
    substitutions = (
        (r"(\t)", ' '),          # remove escape characters
        (r"(\r)", ' '),
        (r"(\n)", ' '),
        (r"(__+)", ' '),         # runs of two or more of the same symbol
        (r"(--+)", ' '),
        (r"(~~+)", ' '),
        (r"(\+\++)", ' '),
        (r"(\.\.+)", ' '),
        (r"[<>()|&©ø\[\]\'\",;?~*!]", ' '),  # remove <>()|&©ø"',;?~*!
        (r"(mailto:)", ' '),     # remove mailto:
        (r"(\\x9\d)", ' '),      # remove literal \x9* sequences in the text
        # The placeholders end up lower-cased by the .lower() applied after
        # every step.
        (r"([iI][nN][cC]\d+)", 'INC_NUM'),
        (r"([cC][mM]\d+)|([cC][hH][gG]\d+)", 'CM_NUM'),
        (r"(\.\s+)", ' '),       # full stop at end of words (not between)
        (r"(\-\s+)", ' '),       # dash at end of words (not between)
        (r"(\:\s+)", ' '),       # colon at end of words (not between)
        (r"(\s+.\s+)", ' '),     # single characters hanging between spaces
        # https://abc.xyz.net/browse/sdf-5327 ====> abc.xyz.net
        # The back-reference gives each URL its own host; previously every URL
        # in the row was replaced by the host of the first one.
        (r'((https*:\/*)([^\/\s]+))(.[^\s]+)', r'\3'),
        (r"(\s+)", ' '),         # collapse multiple spaces
        (r"(\s+.\s+)", ' '),     # should always be last
    )

    for row in column:
        row = str(row)
        for pattern, replacement in substitutions:
            row = re.sub(pattern, replacement, row).lower()

        # The stripped value used to be computed and thrown away; yield it.
        yield row.strip().replace("\n", "")

In [None]:
# Replace each article's full text (stored under "summary") with a T5 summary.
for i, article in enumerate(news_gathered):
    content = article["summary"]

    preprocess_text = content.strip().replace("\n","")
    # T5 selects the task from this text prefix.
    t5_prepared_Text = "summarize: "+preprocess_text

    tokenized_text = tokenizer.encode(t5_prepared_Text, return_tensors="pt").to(device)

    # Summarise with beam search, bounded to 50-200 generated tokens.
    summary_ids = model.generate(tokenized_text,
                                 num_beams=4,
                                 no_repeat_ngram_size=2,
                                 min_length=50,
                                 max_length=200,
                                 early_stopping=True)

    output = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    sentences = sent_tokenizer.tokenize(output)
    # Upper-case only the first character of each sentence. str.capitalize()
    # also lower-cases the rest, which destroyed proper nouns such as player
    # and team names.
    sentences = [sent[:1].upper() + sent[1:] for sent in sentences]
    article["summary"] = " ".join(sentences).strip()
    print("Summarised:", i+1)


In [None]:
# Display every gathered article as a table for inspection.
pd.DataFrame(news_gathered)

In [None]:
# Rebuild every record without a 'desc' entry (if one is present), then preview.
news_gathered = [
    {field: value for field, value in record.items() if field != 'desc'}
    for record in news_gathered
]
pd.DataFrame(news_gathered).head()

In [None]:
# MongoDB credentials come from the environment so that secrets never have to
# be typed into (and saved with) the notebook. When the variables are unset
# the values are empty strings, exactly as before.
import os
username=os.environ.get("MONGODB_USERNAME", "")
password=os.environ.get("MONGODB_PASSWORD", "")

In [None]:
def get_database():
    """Connect to the MongoDB Atlas cluster and return the ``firstlight`` database.

    Uses the module-level ``username`` and ``password``.

    Returns:
        The ``firstlight`` database handle, or ``None`` when the client cannot
        be created or the server does not answer a ping.
    """
    from urllib.parse import quote_plus

    # Credentials embedded in a MongoDB URI must be percent-escaped, otherwise
    # characters such as "@", ":" or "/" corrupt the connection string.
    CONNECTION_STRING = (
        f"mongodb+srv://{quote_plus(username)}:{quote_plus(password)}"
        "@cluster0.oani9.mongodb.net/firstlight?retryWrites=true&w=majority"
    )
    try:
        # `tlsAllowInvalidCertificates` replaces the `ssl_cert_reqs=ssl.CERT_NONE`
        # option, which PyMongo 4 no longer accepts.
        # NOTE(review): this keeps TLS certificate verification disabled, as
        # the original call did. Confirm whether that is still required and
        # drop the option if it is not.
        conn = MongoClient(CONNECTION_STRING, tlsAllowInvalidCertificates=True)
        # MongoClient connects lazily, so force a round trip before claiming
        # success.
        conn.admin.command("ping")
        print("Connected successfully!!!")
        return conn.firstlight
    except Exception as exc:
        print("Could not connect to MongoDB")
        print(exc)
        return None

In [None]:
# Store this run's articles in the "news" collection.
db = get_database()
if db is None:
    # get_database() returns None when the connection failed. Compare with
    # `is None`: PyMongo database objects do not support truth-value testing.
    print("Skipping insert: no database connection.")
elif not news_gathered:
    # insert_many() rejects an empty document list.
    print("Skipping insert: no new articles gathered.")
else:
    try:
        db["news"].insert_many(news_gathered)
        print("Success")
    except Exception as e:
        print(e)