In [1]:
# Install the third-party packages this notebook relies on (shell escape; no versions are pinned).
!pip install transformers newspaper3k feedparser nltk schedule boto3

[31mfastai 1.0.60 requires nvidia-ml-py3, which is not installed.[0m
[33mYou are using pip version 10.0.1, however version 20.0.2 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m


In [10]:
# Download and imports
import transformers
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
from newspaper import Article
import string
import re
import feedparser
import numpy as np
import nltk.data
import json
import boto3
import schedule
from datetime import datetime
import time

# Fetch the Punkt tokenizer data; nltk.data.load("tokenizers/punkt/english.pickle") further down reads it.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/ec2-user/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
# AWS Setup

# NOTE(review): imported here rather than in the imports cell — presumably because
# the sagemaker package is only available on SageMaker hosts; confirm.
from sagemaker import get_execution_role

# Execution role of the current notebook session. Not referenced by any other cell visible here.
role = get_execution_role()

# Region of the default boto3 session. Not referenced by any other cell visible here.
region = boto3.Session().region_name

In [6]:
#Raw Code

# Sentence splitter used by predict_article() to break article text into sentences.
# NOTE(review): this loads a pickle file; unpickling is only safe for trusted data
# (here it is the file fetched by nltk.download('punkt')).
sentenceTokenizer = nltk.data.load("tokenizers/punkt/english.pickle")

# Tokenizer loaded from the same checkpoint name as the model below.
tokenizer = AutoTokenizer.from_pretrained("nlptown/bert-base-multilingual-uncased-sentiment")

# Sequence classifier used by predict(). Callers argmax its first output and treat the
# result as a class index in 0..4 — presumably 1-5 star ratings; confirm against the model card.
model = AutoModelForSequenceClassification.from_pretrained("nlptown/bert-base-multilingual-uncased-sentiment")



def predict(text, max_tokens=512):
    """Run the sentiment classifier on a single piece of text.

    Args:
        text: The string to classify.
        max_tokens: Maximum number of token ids fed to the model. Longer
            encodings are truncated. The default of 512 is the usual BERT-base
            position limit — TODO confirm for this checkpoint.

    Returns:
        The raw result of ``model(...)`` for a batch containing one sequence.
        Callers index it as ``[0][0]`` to get the per-class scores.
    """
    tokens = tokenizer.encode(text)
    if len(tokens) > max_tokens:
        # Over-long input would exceed the model's position embeddings and
        # raise. Keep the last id (the closing special token that encode()
        # appends by default) so the sequence stays well-formed.
        tokens = tokens[:max_tokens - 1] + tokens[-1:]
    tokens_tensor = torch.tensor([tokens])
    # Inference only: no gradients are needed, so skip autograd bookkeeping.
    with torch.no_grad():
        return model(tokens_tensor)

def parsetext(url):
    """Download and parse the article at ``url``.

    Returns:
        A ``(body, title)`` tuple where both strings have been passed
        through ``preprocess``.
    """
    page = Article(url)
    page.download()
    page.parse()

    body, title = page.text, page.title
    cleaned_body = preprocess(body)
    cleaned_title = preprocess(title)
    return cleaned_body, cleaned_title

def preprocess(text):
    """Normalise raw text before it is split into sentences and classified.

    Steps, in order:
      1. delete every ASCII punctuation character except ``.`` and ``-``;
      2. turn newlines into spaces;
      3. collapse runs of spaces into a single space;
      4. lowercase the result.
    """
    # Periods and hyphens are kept; everything else in string.punctuation is dropped.
    doomed = "".join(ch for ch in string.punctuation if ch not in ".-")
    deletion_table = str.maketrans("", "", doomed)

    stripped = text.translate(deletion_table)
    single_line = stripped.replace("\n", " ")
    collapsed = re.sub(" +", " ", single_line)
    return collapsed.lower()

def predict_article(text):
    """Score an article by averaging the predicted class of each sentence.

    The text is split into sentences, each sentence is classified with
    ``predict`` and the arg-max class index is taken. The mean class index is
    then mapped with ``(mean - 2) / 2`` and rounded to 3 decimals, so class 0
    maps to -1.0, class 2 to 0.0 and class 4 to 1.0.

    Args:
        text: Article body (already preprocessed by the caller).

    Returns:
        The rounded score as a plain ``float``, or ``-1`` when the text yields
        no sentences. Note that ``-1`` is also the lowest regular score, so
        callers cannot tell the two apart.
    """
    sentences = sentenceTokenizer.tokenize(text)
    if not sentences:
        # Nothing to average; same sentinel the function has always returned.
        return -1

    # int() turns the numpy integer into a Python int so the final score is a
    # plain float (safe for json.dump and printing regardless of numpy version).
    class_total = sum(
        int(np.argmax(predict(sentence)[0][0].detach().numpy()))
        for sentence in sentences
    )
    mean_class = class_total / len(sentences)
    return round((mean_class - 2) / 2, 3)

def get_good_news(urls, min_headline_class=4, min_article_score=0.2):
    """Filter a list of article URLs down to the ones classified as good news.

    An article is kept when its headline's predicted class is at least
    ``min_headline_class`` and its body score from ``predict_article`` is
    strictly greater than ``min_article_score``. The body is only scored when
    the headline passes.

    Args:
        urls: Iterable of article URLs. Duplicates are processed only once.
        min_headline_class: Minimum arg-max class index for the headline.
        min_article_score: Body score must exceed this value.

    Returns:
        dict mapping each accepted URL to its body score.
    """
    good_news = {}
    seen = set()
    for url in urls:
        # Feeds can repeat a link; avoid downloading and scoring it twice.
        if url in seen:
            continue
        seen.add(url)

        try:
            text, headline = parsetext(url)
            headline_class = np.argmax(predict(headline)[0][0].detach().numpy())
            if headline_class < min_headline_class:
                continue
            score = predict_article(text)
        except Exception as err:
            # Best effort: a single article that fails to download, parse or
            # classify must not abort the whole batch. KeyboardInterrupt and
            # SystemExit are not Exception subclasses and still propagate.
            print("skipping", url, "-", err)
            continue

        if score > min_article_score:
            good_news[url] = score
            print(headline, score, headline_class)
    return good_news

def getInfo(url):
    """Collect display metadata for the article at ``url``.

    Downloads and parses the article, then builds a dict with the keys
    ``url``, ``title``, ``description``, ``publisher``, ``publish_date``,
    ``thumbnail`` and ``tags`` from the article and its ``meta_data``.

    Fallbacks:
      * publisher: ``og.site_name``, else the third ``/``-separated piece of
        the URL (the host for ``scheme://host/...`` URLs).
      * publish_date: ``article.published_time``, else ``article.published``,
        else ``pubdate``; only the part before ``"T"`` is kept.
      * tags: ``article.tag`` split on commas, else ``["none"]``.

    Returns:
        The metadata dict, or ``None`` if anything goes wrong (download or
        parse failure, or a required field that is missing or malformed).
    """
    try:
        article = Article(url)
        article.download()
        article.parse()
        md = article.meta_data

        metadata = {
            "url": url,
            "title": article.title,
            "description": md["description"],
        }

        try:
            metadata["publisher"] = md["og"]["site_name"]
        except KeyError:
            metadata["publisher"] = url.split("/")[2]

        try:
            metadata["publish_date"] = md["article"]["published_time"].split("T")[0]
        except KeyError:
            try:
                metadata["publish_date"] = md["article"]["published"].split("T")[0]
            except KeyError:
                # Last resort; a failure here is handled by the outer except.
                metadata["publish_date"] = md["pubdate"].split("T")[0]

        metadata["thumbnail"] = article.top_img

        try:
            metadata["tags"] = md["article"]["tag"].split(",")
        except KeyError:
            metadata["tags"] = ["none"]

        return metadata
    except Exception as err:
        # Best effort by design: callers treat None as "skip this article".
        # Catch Exception rather than everything so Ctrl-C still interrupts,
        # and report the reason instead of failing silently.
        print("getInfo failed for", url, "-", err)
        return None



# RSS feed URLs that scheduled() parses to collect candidate article links.
rawrss = [
    "https://rss.nytimes.com/services/xml/rss/nyt/HomePage.xml", 
    "https://www.theguardian.com/world/rss", 
    "http://rss.cnn.com/rss/cnn_world.rss",
    "http://feeds.bbci.co.uk/news/world/rss.xml", 
    "https://abcnews.go.com/abcnews/topstories", 
    "https://www.cnbc.com/id/100727362/device/rss/rss.html",
    "https://feeds.a.dj.com/rss/RSSWorldNews.xml"
]

def store(file_name):
    """Upload a local file to the ``goodnewsbucket`` S3 bucket.

    The object key is ``data/<YYYY-MM-DD>.json``, using the current UTC date.

    Args:
        file_name: Path of the local file to upload.
    """
    bucket = boto3.Session().resource('s3').Bucket("goodnewsbucket")
    date_stamp = datetime.utcnow().strftime("%Y-%m-%d")
    object_key = "data/" + date_stamp + ".json"
    bucket.Object(object_key).upload_file(file_name)

def scheduled():
    """Run one full pipeline pass: fetch feeds, pick good news, upload the result.

    Steps:
      1. Parse every feed in ``rawrss`` and collect each entry's link.
      2. Keep the links accepted by ``get_good_news``.
      3. Look up metadata for each kept link with ``getInfo`` and attach the
         score under ``"sentimentrank"``; links whose metadata lookup returns
         ``None`` are dropped.
      4. Write the list to ``articles.json`` and upload it via ``store``.
    """
    urls = []
    for feed_url in rawrss:
        feed = feedparser.parse(feed_url)
        for post in feed.entries:
            urls.append(post.link)

    good_news = get_good_news(urls)
    print(good_news)

    article_list = []
    for url, score in good_news.items():
        print(url)
        # Fetch the metadata exactly once. Calling getInfo() a second time
        # would download the page again and could return None even though the
        # first call succeeded.
        info = getInfo(url)
        if info is None:
            continue
        info["sentimentrank"] = score
        article_list.append(info)

    with open("articles.json", "w") as j:
        json.dump(article_list, j)

    store('articles.json')

In [7]:
# Run the whole pipeline once, right now, rather than waiting for the daily schedule.
scheduled()

we all live in bubbles now. how safe is yours 0.25 4
sugarcoating and brutal honesty how leaders are handling coronavirus crisis 0.409 4
one wonderland 0.643 4
to manage the coronavirus in new york you need testing and tracking - the new york times 0.382 4
opinion the worms turn in my composting bin 0.359 4
opinion how joe biden can own health care 0.25 4
what ocean life can teach us about the coronavirus - the new york times 0.25 4
fiona apple is back and unbound let’s discuss 0.25 4
liberty select sabrina ionescu no. 1 in w.n.b.a. draft 0.583 4
loud louder loudest how classical music started to roar 0.214 4
11 stranger-than-fiction documentaries on netflix and hulu 0.3 4
when life gives you lemons make 19th-century lemon cake 0.6 4
i’m just like anne frank like indiana jones bob dylan continues return to new songs 0.625 4
india primed what amazons vast new hyderabad campus reveals about its plans 0.355 4
i accepted the very first patient one nurses first week at nhs nightingale – vid

In [None]:
#
# For our project, we decided to use AWS SageMaker for rapid prototyping. Because of this, we needed to quickly
# convert our machine learning code into a usable backend at a moment's notice. While the scheduling solution below
# obviously isn't ideal, it worked well with the rest of our development workflow. If I were to make a more
# "production"-scale deployment, I would switch to Docker containers and scheduled EC2 instances to handle the task.
#
# Register scheduled() as a daily job at 00:00 (presumably host-local time — confirm).
# This only registers the job; it runs when schedule.run_pending() is called.
schedule.every().day.at("00:00").do(scheduled)

In [None]:
# Runs forever and blocks this kernel: wake up every 5 minutes and run any due job.
while True:
    schedule.run_pending()
    # 300 s poll interval, so the daily job may start up to about 5 minutes late.
    time.sleep(300)