# Importing Libraries

In [1]:
# model that extracts topic lines by deep learning
from transformers import pipeline

# for web scraping
from bs4 import BeautifulSoup
import requests
import urllib
import re
from requests_html import HTML
from requests_html import HTMLSession

# language dectection
from langdetect import detect

# google translator api
import googletrans

# auto-posting for wordpress sites
from wordpress_xmlrpc import Client, WordPressPost
from wordpress_xmlrpc.methods import posts
from wordpress_xmlrpc.methods.posts import GetPosts, NewPost, EditPost
from wordpress_xmlrpc.methods.users import GetUserInfo

# for start and end of loop condition and setting loop interval
import time
from datetime import datetime

#turning of ssl for auto-editing Wordpress
import ssl
ssl._create_default_https_context = ssl._create_unverified_context

import warnings
warnings.filterwarnings("ignore")

# Required Search Keyword and ID, Pass, Draft Title for Wordpress Site and Loop Interval

In [2]:
# provide search key word to scrape from google news
s_key = "desired search keywords here "

# wordpress site id and pass
wid = "your wordpress site id"
wpass = "your wordpress site password"

# Wordpress draft post's title
dft_title = "title of draft you want to edit automatically"

# setting loop end time and interval
end_time = 8  # 1 to 24 scale
interval_hr= 1 # desired interval in hour
interval = interval_hr * 3600

# Functions
### 1) Google Search Related

In [3]:
def check_eng(text):
    '''
    Check if an article element in the article list is written in English
    
    '''
    check_count = 0
    for l in text:
        try:
            if check_count == 0:
                lang = detect(l)
                check_count += 1
                return lang
        except:
            pass

In [4]:
def get_source(url):
    """ 
    argument: url to scrape.

    return:
        response (HTTP response from requests_html).
    """

    try:
        session = HTMLSession()
        response = session.get(url)
        return response

    except requests.exceptions.RequestException as e:
        print(e)

In [5]:
def scrape_google(query):
    '''
    Scrapes first page of the search results updated within 1 hour on google news, given
    search keyword (query).
    '''
    query = urllib.parse.quote_plus(query)
    # most recent (1hr) 'news' articles on google 
    response = get_source("https://www.google.com/search?q=" + query + '&newwindow=1&tbm=nws&sxsrf=APq-WBubXX6mz3DlY3jAI6sPYeISvwNsBw:1645516938257&source=lnt&tbs=qdr:h&sa=X&ved=2ahUKEwjujs6k7JL2AhUEDt4KHWzgBc4QpwV6BAgBEBQ&biw=958&bih=999&dpr=1')

    links = list(response.html.absolute_links)
    google_domains = ('https://www.google.', 
                      'https://google.', 
                      'https://webcache.googleusercontent.', 
                      'http://webcache.googleusercontent.', 
                      'https://policies.google.',
                      'https://support.google.',
                      'https://maps.google.')

    for url in links[:]:
        if url.startswith(google_domains):
            links.remove(url)

    return links

### 2) Text Cleaning and Chunking Text for Model

In [7]:
def get_clean(url):
    '''
    Clean the text from a scraped article from a site, then divide it into a 500 word chunk
    to feed it to the transformer summarization model.
    
    '''
    r = requests.get(url)
    soup = BeautifulSoup(r.text, 'html.parser')
    results = soup.find_all(['h1','p'])
    text = [result.text for result in results]
    
    index_count = 0
    cleaned = []
    #initial (impossible) index for later exclusion of unrelated words such as contact info.
    ind = 1000000000000
    for c in text:
        # get index number to remove lines after 'contact info.'
        c_index = text.index(c)
        # get rid of ad. sentences with email addresses, phone numbers, contact info,
        # empty element in the article
        if (re.search('[a-z]\@[a-z]', c)) or (re.search('\+[0-9][0-9]', c)) or ('contact:' in c) or (c == ''):
            continue
        elif (re.search('For more information*', c)) or (re.search('for more information*', c)):
            # get the index where site's contact info. starts
            ind = c_index
            # remove(not append) lines having contact info.
        elif c_index > ind:
                continue     
        else:
            # get rid of \n in the article
            new_c = c.strip('\n')
            # get rid of '' mark within a sentence
            new_c = new_c.replace("''", "")
            # remove \xa0
            new_c = new_c.replace("\xa0", "")
            # append the striped list element to a new list
            cleaned.append(new_c)
        index_count += 1
    
    article = ' '.join(cleaned)
    
    article = article.replace('.', '.<eos>')
    article = article.replace('!', '!<eos>')
    article = article.replace('?', '?<eos>')
    sentences = article.split('<eos>')
    
    max_chunk = 500
    current_chunk = 0
    chunks = []

    for sentence in sentences:
        if len(chunks) == current_chunk +1:
            if len(chunks[current_chunk]) + len(sentence.split(' ')) <= max_chunk:
                chunks[current_chunk].extend(sentence.split(' '))
            else:
                current_chunk += 1
                chunks.append(sentence.split(' '))
        else:
            chunks.append(sentence.split(' '))
            
    for chunk_id in range(len(chunks)):
        chunks[chunk_id] = ' '.join(chunks[chunk_id])
    
    return chunks

### 3) Auto-Draft-Editing on Wordpress Site

In [8]:
def edit_draft(new_content):
    client = Client('https://idknn.com/xmlrpc.php', wid, wpass)
    draft_posts = client.call(posts.GetPosts({'post_status': 'draft'}))
    for dp in draft_posts:
        if dp.title == dft_title:
            prev_content = dp.content
            dp.title = dft_title
            dp.content = prev_content + "\n\n" + new_content
            dp.thumbnail = dp.id
            client.call(posts.EditPost(dp.id,dp))

# Text Summarizer Model from Transformer

In [9]:
summarizer = pipeline('summarization')

No model was supplied, defaulted to sshleifer/distilbart-cnn-12-6 (https://huggingface.co/sshleifer/distilbart-cnn-12-6)


# Summary Generation and Auto-Draft_Editing Loop

In [None]:
# loop until end_time
while datetime.now().hour < end_time:
    
    # search keyword to search on google news
    links = scrape_google(s_key)

    # remove sites that block scraper.
    valid_links = []

    # get the list of sites that blocks scraper for the future use
    block_sites = []

    for link in links:

        # exclude nasdaq site for it makes bot down
        temp = link.split('.')
        if 'nasdaq' in temp:
            continue

        else:

            # try and see if there's a site blocking scraper
            try:
                r = requests.get(link)

                # for checking links that are not in English
                soup = BeautifulSoup(r.text, 'html.parser')
                results = soup.find_all(['h1','p'])
                text = [result.text for result in results]

                # for checking links that block scraper
                check = BeautifulSoup(r.text, 'html.parser').get_text(strip=True)
                temp = link.split('.')

                # remove links that contain error messages or non-topic-related text
                if (re.search("Error",check)) or (check == '') or (len(check) < 1500):
                    continue
                # filtering out articles not in English
                if check_eng(text) != 'en':
                    continue

                else:
                    valid_links.append(link)

            except:
                if link not in block_sites:
                    block_sites.append(link)

    # List of previous articles already got, to be excluded
    previous_articles = []
    # new articles to be worked with
    article_list = []

    for atc in valid_links:
        if atc not in previous_articles:
            previous_articles.append(atc)
            article_list.append(atc)

    # get summary sentences from each scraped valid articles
    articles_done = {}
    for url in article_list:
        chunks = get_clean(url)
        res = summarizer(chunks, max_length=150, min_length=10, do_sample=False)
        articles_done[url] = res[0]['summary_text']

    # translate each summary into Korean, key=site address, val=translated text
    # then auto-add it on a Wordpress site draft
    for k, v in articles_done.items():
        translator = googletrans.Translator() 
        result = translator.translate(articles_done[k], dest='ko') 
        translated = result.text
        new_content = datetime.now().strftime("%Y-%m-%d %H:%M:%S") + "\n" + "Original:  " + v + "\n\n" + "Translated:  " + translated + "\n\n" + '<a href="{}">{}</a>'.format(k,k) + "\n\n\n"
        edit_draft(new_content)
    
    time.sleep(interval)