### LabelStudio Evaluation Job Creation
* Scrape articles using newspaper3k
* Create article summaries with spaCy
* Translate the text of non-English articles
* Output a JSON structure compatible with label_studio_config.xml

In [5]:
import newspaper
import pandas as pd
from googletrans import Translator, constants
from transformers import pipeline
import pprint
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from string import punctuation
from heapq import nlargest
import json

translator = Translator()

_NLP = None  # spaCy pipeline, loaded on first use and shared by later calls


def _get_nlp():
    """Return the 'en_core_web_sm' pipeline, loading it only once."""
    global _NLP
    if _NLP is None:
        _NLP = spacy.load('en_core_web_sm')
    return _NLP


def summarize(text, per):
    """Build an extractive summary of ``text`` from its top-scoring sentences.

    Each word that is neither a spaCy stop word nor punctuation is counted
    (case-insensitively) and its count is normalised by the largest count.
    A sentence's score is the sum of the normalised counts of its words.

    Args:
        text: The text to summarise.
        per: Fraction of the sentences to keep. The number kept is
            ``int(number_of_sentences * per)``, so a small fraction applied
            to a short text can select zero sentences.

    Returns:
        The selected sentences, highest score first, separated by single
        spaces. An empty string when no sentence is selected or when the
        text contains no countable words.
    """
    doc = _get_nlp()(text)

    # Key the table by the lower-cased form: the scoring loop below looks
    # words up by their lower-cased text, so both sides must agree.
    word_frequencies = {}
    for token in doc:
        word = token.text.lower()
        if word in STOP_WORDS or word in punctuation:
            continue
        word_frequencies[word] = word_frequencies.get(word, 0) + 1

    # Nothing to score (empty text, or only stop words / punctuation).
    if not word_frequencies:
        return ''

    max_frequency = max(word_frequencies.values())
    for word in word_frequencies:
        word_frequencies[word] = word_frequencies[word] / max_frequency

    sentence_tokens = list(doc.sents)
    sentence_scores = {}
    for sent in sentence_tokens:
        for token in sent:
            weight = word_frequencies.get(token.text.lower())
            if weight is not None:
                sentence_scores[sent] = sentence_scores.get(sent, 0) + weight

    select_length = int(len(sentence_tokens) * per)
    best_sentences = nlargest(select_length, sentence_scores, key=sentence_scores.get)
    # A sentence's text carries no trailing whitespace, so separate the
    # sentences explicitly instead of gluing them together.
    return ' '.join(sent.text for sent in best_sentences)

def get_article_info(a, url):
    """Collect headline, keywords, summary and language for one article.

    Nothing is downloaded when the article has no title yet; in that case
    ``keywords``, ``summary`` and ``lang`` are returned as ``None``.

    Download/NLP errors and translation errors are reported on stdout and
    otherwise ignored (best effort), so the function still returns whatever
    it managed to collect.

    Args:
        a: Article object exposing ``title``, ``download()``, ``parse()``,
            ``nlp()``, ``keywords`` and ``summary``.
        url: URL reported in the result and in failure messages.

    Returns:
        A dict with the keys ``url``, ``title``, ``keywords``, ``summary``
        and ``lang``. For a headline not detected as English, the title,
        keywords and summary are replaced by their translations.
    """
    headline = a.title
    keywords = None
    summary = None
    lang = None

    if headline:
        # Catch Exception rather than everything: a bare ``except`` would
        # also swallow KeyboardInterrupt / SystemExit and make a long crawl
        # impossible to stop.
        try:
            a.download()
            a.parse()
            a.nlp()
        except Exception as exc:
            print('summarize failed: ', url, exc)

        keywords = a.keywords
        summary = a.summary

        try:
            lang = translator.detect(headline).lang
            if lang != 'en':
                headline = translator.translate(headline.replace('\n', '').replace('\t', '')).text
                keywords = [kw.text for kw in translator.translate(keywords)]
                summary = translator.translate(summary).text
        except Exception as exc:
            print('translation failed: ', url, exc)

    return {
        'url': url,
        'title': headline,
        'keywords': keywords,
        'summary': summary,
        'lang': lang
    }

# Manual smoke test: fetch a single article, show the extracted info and
# translate one non-English headline.
# Alternative sample URL, kept for manual runs:
# url = 'https://www.12minutos.com/63c955cfcb8f0/creativa-modular-en-el-top-3-de-constructoras-mas-importantes-del-mundo.html'
url = 'https://www.adobochronicles.com/2019/02/17/feature-the-otter-side-of-the-exotic-animal-trade-via-facebook/'
a = newspaper.Article(url)
a.download()
a.parse()

article_info = get_article_info(a, a.url)
pprint.pprint(article_info)
print(a.title)

sample_headline = "UK: Η Meta λαμβάνει την τελική εντολή να πουλήσει την Giphy"
print(Translator().translate(sample_headline).text)

{'keywords': ['trade',
              'trend',
              'animals',
              'facebook',
              'otter',
              'animal',
              'thailand',
              'exotic',
              'otters',
              'fact',
              'wild',
              'feature',
              'taking'],
 'lang': 'en',
 'summary': '(Editor’s note: Every now and then, The Adobo Chronicles gets '
            'serious about issues regarding the destruction of our environment '
            'or removing wild animals from their natural habitat.\n'
            'There are many promoters that can be found on social media '
            'websites that are looking to sell exotic animals.\n'
            'From time immemorial there have been many exotic animals that '
            'people have kept such as snakes, flying squirrels, Mexican '
            'red-legged tarantulas, thus making the otter trade only the '
            'latest trend.\n'
            'In Thailand, the trade of otters is c

In [3]:
# Build a newspaper source for each of the first `num_papers` entries in the
# 'url' column of the CSV, keyed by that entry.
discovery_sample = pd.read_csv('../data/filtered_attrs.csv')
num_papers = 5
sample_urls = discovery_sample['url'][:num_papers]

discovery_articles = {}
for url in sample_urls:
    print('Buildling ', url)
    # The column values carry no scheme, so one is prepended for newspaper.
    discovery_articles[url] = newspaper.build('http://' + url, memoize_articles=False)

Buildling  100percentfedup.com
Buildling  1010wins.radio.com
Buildling  1011now.com
Buildling  10tv.com
Buildling  11alive.com


In [4]:
# For every built source, process up to `num_articles` articles: download and
# parse each one, summarise it twice (newspaper's own summary and the spaCy
# based `summarize`) and translate the fields when the headline is not
# detected as English. Results are grouped per source in `all_headlines`.
all_headlines = {}
num_articles = 3

for (url, paper) in discovery_articles.items():
    print(url)
    headlines = []
    for a in paper.articles[:num_articles]:
        headline = a.title
        # Skip articles whose title is missing or at most 10 characters long.
        if not headline or len(headline) <= 10:
            continue

        # Best effort: report the failure and continue with whatever the
        # article object holds. Exception (not a bare except) keeps Ctrl-C
        # working during a long crawl.
        try:
            a.download()
            a.parse()
            a.nlp()
        except Exception as exc:
            print('summarize failed: ', a.url, exc)

        headline = a.title
        keywords = a.keywords
        summary_newspaper = a.summary

        try:
            summary_spacy = summarize(a.text, 0.05)
        except Exception as exc:
            print('Spacy failed: ', a.url, exc)
            summary_spacy = ""

        # Guarded like the translation step in get_article_info, so a single
        # translator error does not abort the whole run; fields that were not
        # translated keep their original text.
        try:
            if translator.detect(headline).lang != 'en':
                headline = translator.translate(headline.replace('\n', '').replace('\t', '')).text
                keywords = [kw.text for kw in translator.translate(keywords)]
                summary_newspaper = translator.translate(summary_newspaper).text
                summary_spacy = translator.translate(summary_spacy).text
        except Exception as exc:
            print('translation failed: ', a.url, exc)

        headlines.append((headline, a.url, a.top_image, keywords, summary_newspaper, summary_spacy))

    # Sources that produced no usable article are left out of the result.
    if headlines:
        all_headlines[url] = headlines

# Flatten the per-source article tuples into one list of {'data': {...}}
# task entries and write it out as JSON. Only keywords longer than four
# characters are kept, joined into a single comma-separated string.
json_format = [
    {
        'data': {
            'headline': headline,
            'link': link,
            'image': images,
            'keywords': ', '.join([kw for kw in keywords if len(kw) > 4]),
            'summary_1': summary_newspaper,
            'summary_2': summary_spacy,
            'base_link': url,
        }
    }
    for (url, articles) in all_headlines.items()
    for (headline, link, images, keywords, summary_newspaper, summary_spacy) in articles
]

with open('sample_formatted.json', 'w') as fp:
    fp.write(json.dumps(json_format))

100percentfedup.com
1010wins.radio.com
1011now.com
10tv.com
11alive.com


In [None]:
import lxml.html
from bs4 import BeautifulSoup
import urllib.request
from requests_html import AsyncHTMLSession 

# URL suffixes that mark a link as an image rather than an article.
bad_filetypes = ['.jpg', '.png']


def is_article(url, domain):
    """Heuristically decide whether ``url`` points to an article on ``domain``.

    A URL qualifies when it does not end in one of ``bad_filetypes``,
    contains more than two hyphens, and contains ``domain`` as a substring.

    Args:
        url: The link to classify.
        domain: Text that must appear somewhere in ``url``.

    Returns:
        ``True`` when all three conditions hold, otherwise ``False``.
    """
    if url.endswith(tuple(bad_filetypes)):
        return False
    if url.count('-') <= 2:
        return False
    return domain in url

# Collect article-looking links from the static HTML of the domain's home page.
all_links = []
domain = 'foxnews.com/'
# Use the response as a context manager so the connection is closed as soon
# as the body has been read.
with urllib.request.urlopen('https://www.' + domain) as response:
    oururl = response.read()
# Name the parser explicitly: without it BeautifulSoup picks whichever parser
# it ranks best among those installed and emits a warning. lxml is the one it
# ranks first, and this cell already depends on lxml.
soup = BeautifulSoup(oururl, 'lxml')
html = lxml.html.document_fromstring(str(soup))
# Resolve relative hrefs against the site root so the domain check in
# is_article sees full URLs.
html.make_links_absolute('https://www.' + domain)
for element, attribute, url, pos in html.iterlinks():
    if is_article(url, domain):
        all_links.append(url)

session = AsyncHTMLSession()
async_l = []
result = await session.get('https://www.' + domain)
await result.html.arender()
for url in result.html.absolute_links:
    if is_article(url, domain):
        all_links.append(url)

all_links = list(set(all_links))
len(all_links)