In [None]:
import pandas as pd
import numpy as np
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from datetime import datetime, timedelta

# Sentiment model: Hugging Face pipeline for the 'sentiment-analysis' task. No model name is
# passed, so the library chooses the checkpoint for that task.
from transformers import pipeline
# Build the classifier once at start-up; it is reused for every article further down.
classifier = pipeline('sentiment-analysis')
# ssl is needed later when the MongoDB client is created (ssl.CERT_NONE).
import ssl

# Summarisation model: the 't5-small' checkpoint and its matching tokenizer.
import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration, T5Config
model = T5ForConditionalGeneration.from_pretrained('t5-small')
tokenizer = T5Tokenizer.from_pretrained('t5-small')
# Tokenized inputs are moved to this device before generation; the notebook runs on CPU.
device = torch.device('cpu')

# Sentence splitter used to re-capitalise the generated summaries.
# NOTE(review): textwrap is not referenced in the visible cells.
import textwrap
import nltk.data
sent_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

import pymongo
from pymongo import MongoClient

from bs4 import BeautifulSoup 
import re
import urllib.request, urllib.parse, urllib.error

In [None]:
# Look up when the TechCrunch scraper last ran; only articles newer than this are collected.
datedata = pd.read_csv("./LastRunDate.csv")
techcrunch_dates = datedata.loc[datedata["Scraper and Model"] == "techcrunch", "Date"]
if techcrunch_dates.empty:
    raise ValueError('LastRunDate.csv has no row for "techcrunch"')
# .loc returns a Series; datetime.fromtimestamp needs a plain number, so take the scalar
# explicitly (passing the Series itself fails or loses the fractional seconds).
pastdate = datetime.fromtimestamp(float(techcrunch_dates.iloc[0]))

In [None]:
# Start Chrome through the chromedriver binary shipped next to the notebook and open the
# TechCrunch front page.
CHROMEDRIVER_PATH = "./chromedriver_win32/chromedriver"
START_URL = "https://techcrunch.com/"

driver = webdriver.Chrome(executable_path=CHROMEDRIVER_PATH)
driver.get(START_URL)

In [None]:
# Expand the front-page feed by clicking the green gradient-text span three times
# (presumably the "Load More" control — confirm against the live page).
LOAD_MORE_XPATH = '//span[contains(@class, "gradient-text gradient-text--green-gradient")]'
LOAD_MORE_CLICKS = 3

wait = WebDriverWait(driver, 100)
for _ in range(LOAD_MORE_CLICKS):
    # Wait until the control is actually clickable: an element that is merely present in the
    # DOM can still be hidden or covered, which makes click() raise.
    wait.until(EC.element_to_be_clickable((By.XPATH, LOAD_MORE_XPATH))).click()

# Block until the control is back in the DOM after the last click
# (presumably meaning the last batch has been appended — confirm).
load = wait.until(EC.presence_of_element_located((By.XPATH, LOAD_MORE_XPATH)))

In [None]:
# Gather the link of every post block currently shown on the page.
title_blocks = driver.find_elements_by_class_name('post-block__title')
dates = []
print("News Scraped:", len(title_blocks))

# From here on `titles` holds plain URL strings rather than WebElements.
titles = [
    block.find_element_by_class_name('post-block__title__link').get_attribute('href')
    for block in title_blocks
]

In [None]:
# Visit every collected link and keep the articles published after the previous run.
# Each entry of `news` is
# [title, link, image_link, summary, content, positivity_score, date, genre].
lastdate = pastdate  # newest publication time seen so far
news = []

# The page timestamp is parsed as a naive datetime and shifted by the local UTC offset so it
# can be compared with `pastdate` (local time). The offset is taken once from the timezone
# info: computing datetime.now() - datetime.utcnow() per article adds a few microseconds of
# jitter, which can make an already-stored article look newer than `pastdate` on a later run.
utcdiff = datetime.now().astimezone().utcoffset()

for title in titles:
    driver.get(title)

    # Publication time. A page that never shows it (timeout, different layout, unexpected
    # format) is skipped instead of aborting the whole scrape.
    try:
        WebDriverWait(driver, 100).until(
            EC.presence_of_element_located((By.XPATH, '//time[contains(@class, "full-date-time")]'))
        )
        stamp = driver.find_element_by_class_name('full-date-time').get_attribute('datetime')
        newsdate = datetime.strptime(stamp, '%Y-%m-%dT%H:%M:%S') + utcdiff
    except Exception:  # best effort per article; KeyboardInterrupt still propagates
        print("Skipped (no usable publish date):", title)
        continue

    if newsdate <= pastdate:
        continue

    if lastdate < newsdate:
        lastdate = newsdate

    try:
        headline = driver.find_element_by_class_name('article__title').text
    except Exception:
        print("Skipped (no article title):", title)
        continue

    # The featured image is optional.
    try:
        image_link = driver.find_element_by_class_name('article__featured-image').get_attribute('src')
    except Exception:
        image_link = ''

    try:
        content = driver.find_element_by_class_name('article-content').find_elements_by_xpath("./*")
    except Exception:
        print("Skipped (no article content):", title)
        continue
    if not content:
        continue

    # The first child is stored as a provisional summary; the remaining children form the body.
    summary = content[0].text
    body = ''.join(element.text + '\n' for element in content[1:])
    if len(body) == 0:
        continue

    news.append([headline, title, image_link, summary, body, 0, newsdate, 'technology'])

In [None]:
# Scraping is finished: close the current browser window, then end the WebDriver session.
driver.close()
driver.quit()

In [None]:
# Record the newest publication time seen in this run so the next run starts from there.
# NOTE(review): this is written before the MongoDB insert further down; if that insert fails,
# these articles are not picked up again on the next run — confirm the ordering is intended.
datedata = pd.read_csv("./LastRunDate.csv")
techcrunch_row = datedata["Scraper and Model"] == "techcrunch"
datedata.loc[techcrunch_row, "Date"] = lastdate.timestamp()
datedata.to_csv("LastRunDate.csv", index=False)
print("Latest News Found:", lastdate)

In [None]:
# Turn the scraped rows into a table; the column order mirrors how each row was assembled.
news_columns = [
    'title',
    'link',
    'image_link',
    'summary',
    'content',
    "positivity_score",
    'date',
    'genre',
]
df = pd.DataFrame(data=news, columns=news_columns)
df.head()

In [None]:
# Ordered clean-up rules applied by text_strip before URLs are shortened. ORDER MATTERS:
# runs of punctuation are collapsed first, single stray characters are dropped last.
# All rules run on already lower-cased text, so the placeholders are lower-case as well.
_PRE_URL_RULES = [
    (re.compile(r"[\t\r\n]"), " "),                         # tab / CR / LF
    (re.compile(r"__+"), " "),                              # two or more consecutive _
    (re.compile(r"--+"), " "),                              # two or more consecutive -
    (re.compile(r"~~+"), " "),                              # two or more consecutive ~
    (re.compile(r"\+\++"), " "),                            # two or more consecutive +
    (re.compile(r"\.\.+"), " "),                            # two or more consecutive .
    (re.compile(r"[<>()|&©ø\[\]\'\",;?~*!]"), " "),         # <>()|&©ø[]'",;?~*!
    (re.compile(r"mailto:"), " "),                          # mailto:
    (re.compile(r"\\x9\d"), " "),                           # literal \x9<digit> sequences
    (re.compile(r"inc\d+"), "inc_num"),                     # INC numbers
    (re.compile(r"cm\d+|chg\d+"), "cm_num"),                # CM / CHG numbers
    (re.compile(r"\.\s+"), " "),                            # full stop at the end of a word
    (re.compile(r"-\s+"), " "),                             # hyphen at the end of a word
    (re.compile(r":\s+"), " "),                             # colon at the end of a word
    (re.compile(r"\s+.\s+"), " "),                          # single character between spaces
]

# https://abc.xyz.net/browse/sdf-5327 ====> abc.xyz.net (group 1 is the host).
_URL_RULE = re.compile(r"https?:/*([^/\s]+)\S*")

# Final tidy-up once URLs are shortened; the single-character rule must stay last.
_POST_URL_RULES = [
    (re.compile(r"\s+"), " "),
    (re.compile(r"\s+.\s+"), " "),
]


def text_strip(column):
    """Lazily clean every entry of `column` and yield the cleaned text.

    Each entry is converted with str(), lower-cased, stripped of the punctuation and
    noise patterns listed in the rule tables, has every URL reduced to its own host
    name, has whitespace runs collapsed, and is finally stripped of leading and
    trailing whitespace.

    Args:
        column: iterable of values (any type accepted by str()).

    Yields:
        One cleaned, lower-case string per input entry, in input order.
    """
    for row in column:
        text = str(row).lower()

        for pattern, replacement in _PRE_URL_RULES:
            text = pattern.sub(replacement, text)

        # Each URL is replaced by its own host and nothing after the URL is consumed.
        text = _URL_RULE.sub(r"\1", text)

        for pattern, replacement in _POST_URL_RULES:
            text = pattern.sub(replacement, text)

        # The stripped value has to be kept; str methods return new strings.
        yield text.strip()

In [None]:
# Replace each article's provisional summary (column 3) with a T5-generated summary of its
# body (column 4).
MAX_INPUT_TOKENS = 512  # longer inputs are truncated to this many tokens before generation

X = df.values
for i in range(len(X)):
    content = X[i][4]

    # Paragraphs were joined with "\n" while scraping; turn every whitespace run into a single
    # space so the last word of one paragraph is not glued to the first word of the next.
    preprocess_text = " ".join(content.split())
    t5_prepared_Text = "summarize: " + preprocess_text

    tokenized_text = tokenizer.encode(
        t5_prepared_Text,
        return_tensors="pt",
        max_length=MAX_INPUT_TOKENS,
        truncation=True,
    ).to(device)

    # Beam search with a 50-200 token summary and no repeated bigrams.
    summary_ids = model.generate(tokenized_text,
                                 num_beams=4,
                                 no_repeat_ngram_size=2,
                                 min_length=50,
                                 max_length=200,
                                 early_stopping=True)

    output = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

    # Upper-case only the first character of each sentence. str.capitalize() would also
    # lower-case everything after it, destroying names and acronyms inside the sentence.
    sentences = [sent[:1].upper() + sent[1:] for sent in sent_tokenizer.tokenize(output)]
    X[i][3] = " ".join(sentences).strip()
    print("Summarised:", i+1)

In [None]:
# Give every article a 0-100 positivity score (column 5) from the sentiment of its body
# (column 4, first 2000 characters) and of its summary (column 3); the higher one wins.
POSITIVITY_BONUS = 25   # flat amount added to every score before capping
POSITIVITY_CAP = 100


def _positivity(result):
    """Map a classifier result ({'label', 'score'}) onto 0..1, where 1 is most positive."""
    half_score = result['score'] / 2
    return 0.5 + half_score if result['label'] == 'POSITIVE' else 0.5 - half_score


for news_ in X:
    try:
        body_sentiment = classifier(news_[4][:2000])[0]
    except Exception:
        # Best effort: if the body cannot be classified it counts as fully negative, so the
        # summary alone decides the score.
        body_sentiment = {'label': 'NEGATIVE', 'score': 1}
    summary_sentiment = classifier(news_[3])[0]

    best = max(_positivity(body_sentiment), _positivity(summary_sentiment))
    # Built-in int/float: the np.int / np.float aliases no longer exist in current NumPy.
    news_[5] = min(int(float(best) * 100) + POSITIVITY_BONUS, POSITIVITY_CAP)
pd.DataFrame(X).head()

In [None]:
# Final table for storage: same columns as the scraped frame, minus the full article body.
dataset_columns = [
    'title',
    'link',
    'image_link',
    'summary',
    'content',
    "positivity_score",
    'date',
    'genre',
]
dataset = pd.DataFrame(X, columns=dataset_columns).drop(columns=["content"])
dataset

In [None]:
# MongoDB credentials; both are empty strings in the notebook source
# (presumably filled in locally before running — confirm).
username, password = "", ""

In [None]:
def get_database():
    """Connect to MongoDB and return the `firstlight` database.

    Returns:
        The `firstlight` database handle, or None when the client cannot be created
        or the server does not answer.
    """
    # Provide the mongodb atlas url to connect python to mongodb using pymongo
    CONNECTION_STRING = f""
    conn = None
    try:
        # NOTE(review): ssl.CERT_NONE switches certificate verification off — confirm this
        # is acceptable for the target deployment.
        conn = MongoClient(CONNECTION_STRING, ssl_cert_reqs=ssl.CERT_NONE)
        # Creating the client does not contact the server; a cheap ping forces a round trip
        # so an unreachable server is reported here instead of at the first real query.
        conn.admin.command('ping')
    except Exception as exc:
        print("Could not connect to MongoDB", exc)
        if conn is not None:
            conn.close()
        return None
    print("Connected successfully!!!")
    return conn.firstlight

In [None]:
# One plain dict per article, keyed by column name, ready to be inserted as documents.
newsdata = dataset.to_dict(orient='records')
print("News Filtered: ", len(newsdata))

In [None]:
# Store the new articles in the `news` collection.
db = get_database()
if db is None:
    # get_database() already reported the reason; compare with None explicitly because
    # database handles do not support truth-value testing.
    print("Skipping insert: no database connection")
elif not newsdata:
    # insert_many rejects an empty list, so say plainly that there was nothing to store.
    print("Skipping insert: no new articles to store")
else:
    try:
        db["news"].insert_many(newsdata)
        print("Success")
    except Exception as e:
        print(e)

In [None]:
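# One-off setup, left commented out: a TTL index on "date" makes MongoDB delete news documents 31 days after their date.
# db['news'].create_index("date", expireAfterSeconds=31*24*60*60)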