In [12]:
import json
import time
import logging

# import wandb
import requests
import progressbar
import numpy as np 
import pandas as pd
from bs4 import BeautifulSoup
from googletrans import Translator

In [13]:
def detect_langs(translator, texts) -> list:
    """Detect the language of each text, one request at a time.

    Args:
        translator: object exposing ``detect(text)`` whose result has a
            ``lang`` attribute (the notebook passes a googletrans ``Translator``).
        texts: sized sequence of strings to analyse.

    Returns:
        A list with one language code per entry of ``texts``, in the same order.
    """
    widgets = [
        'Processing pages: ', progressbar.Percentage(),
        ' ', progressbar.Bar(), ' ', progressbar.ETA(),
        ' | ', progressbar.Counter(), ' Processed'
    ]
    # Size the bar from the argument itself rather than from a notebook-level
    # variable, so the function works for any input list.
    pbar = progressbar.ProgressBar(maxval=len(texts), widgets=widgets).start()

    langs = []
    for i, text in enumerate(texts):
        langs.append(translator.detect(text).lang)
        # pause between requests to avoid hammering the translation service
        time.sleep(0.5)
        # i is zero-based; report the number of texts already processed
        pbar.update(i + 1)

    pbar.finish()

    return langs


def translate_title(translator, titles, langs, dest="en", src="pt") -> list:
    """Translate every title to ``dest``, keeping the output aligned with the input.

    Args:
        translator: object exposing ``translate(text, dest=..., src=...)`` whose
            result has a ``text`` attribute (the notebook passes a googletrans
            ``Translator``).
        titles: sized sequence of title strings.
        langs: language code detected for each title, same order as ``titles``.
        dest: target language code.
        src: source language code expected for titles that need translation.

    Returns:
        A list with one entry per (lang, title) pair: the title itself when it
        is already in ``dest``, otherwise its translation.
    """
    widgets = [
        'Processing pages: ', progressbar.Percentage(),
        ' ', progressbar.Bar(), ' ', progressbar.ETA(),
        ' | ', progressbar.Counter(), ' Processed'
    ]
    pbar = progressbar.ProgressBar(maxval=len(titles), widgets=widgets).start()

    titles_translated = []
    for i, (lang, title) in enumerate(zip(langs, titles)):
        if lang == dest:
            # already in the target language: keep as is, no request needed
            titles_translated.append(title)
        else:
            # Every remaining title yields exactly one output entry, so the
            # result never drifts out of alignment with `titles`. Titles whose
            # detected language is not `src` are translated with automatic
            # source detection instead of being silently dropped.
            source = src if lang == src else "auto"
            titles_translated.append(translator.translate(title, dest=dest, src=source).text)
            # pause between requests to avoid hammering the translation service
            time.sleep(1)
        pbar.update(i + 1)

    pbar.finish()

    return titles_translated


def translate_keywords(translator, keywords, langs, dest="en", src="pt") -> list:
    """Join each paper's keywords into one string and translate it to ``dest``.

    Args:
        translator: object exposing ``translate(text, dest=..., src=...)`` whose
            result has a ``text`` attribute (the notebook passes a googletrans
            ``Translator``).
        keywords: sized sequence of keyword lists (one list of strings per paper).
        langs: language code detected for each paper, same order as ``keywords``.
        dest: target language code.
        src: source language code expected for papers that need translation.

    Returns:
        A list with one ``", "``-joined keyword string per (lang, keywords)
        pair, translated when the paper is not already in ``dest``.
    """
    widgets = [
        'Processing pages: ', progressbar.Percentage(),
        ' ', progressbar.Bar(), ' ', progressbar.ETA(),
        ' | ', progressbar.Counter(), ' Processed'
    ]
    pbar = progressbar.ProgressBar(maxval=len(keywords), widgets=widgets).start()

    keywords_en = []
    for i, (lang, keywords_list) in enumerate(zip(langs, keywords)):
        joined = ", ".join(keywords_list)
        if lang == dest:
            # already in the target language: keep as is, no request needed
            keywords_en.append(joined)
        else:
            # Every remaining paper yields exactly one output entry, so the
            # result never drifts out of alignment with `keywords`. Papers whose
            # detected language is not `src` are translated with automatic
            # source detection instead of being silently dropped.
            source = src if lang == src else "auto"
            keywords_en.append(translator.translate(joined, dest=dest, src=source).text)
            # pause between requests to avoid hammering the translation service
            time.sleep(2)
        pbar.update(i + 1)

    pbar.finish()

    return keywords_en

In [14]:
PBAR_WIDGETS = ["Processing pages: ", progressbar.Percentage(), " ", progressbar.Bar(), " ", progressbar.ETA()]
LANDING_PAGE_URL = 'https://www.sba.org.br/open_journal_systems/index.php/cba'

# Make a GET request to fetch the raw HTML content. A timeout keeps the cell
# from hanging forever on a stalled connection, and an HTTP error status is
# raised right away instead of being parsed as if it were the landing page.
landing_response = requests.get(LANDING_PAGE_URL, timeout=15)
landing_response.raise_for_status()
html_content = landing_response.text
landing_page = BeautifulSoup(html_content, 'html.parser')

# Every article summary holds a title <div> whose link points to the paper page.
obj_article = landing_page.body.find_all("div", attrs={"class": "obj_article_summary"})
paper_pages = [
    obj.find("div", attrs={"class": "title"}).a["href"]
    for obj in obj_article
]

# NOTE(review): the first three links are discarded; the reason is not visible
# in this notebook (presumably they are not regular papers) - confirm.
paper_pages = paper_pages[3:]

## About the parallelization of the ```URL GET``` requests

### TL;DR
Each request was made by a thread. The number of threads was chosen to be 50.

### Detailed
This solution was heavily inspired by the code found at the link below:

https://www.shanelynn.ie/using-python-threading-for-multiple-results-queue/

The idea is basically to use threads to issue the requests (knowing that Python does not offer real thread parallelism, but something that looks like it, since the threads run concurrently). We then run into the basic problems of the parallelism universe: 

1. __Are we going to use a shared variable?__
   
    R: Yes, we are. But the URLs are independent: the request for one URL does not interfere with the request for another. So we can simply create a NEW array of the same size as the URL list, fill it with empty values (empty strings or empty arrays), and let each request write its result at the index of its URL in the original list. If that sounds confusing, the idea is: the preprocessed data of the URL at a given index of the first array is stored at the same index of the second one. Just like that.

2. __How can we choose an adequate number of threads to solve this problem without exhausting our RAM or hitting Python's thread creation limit (Python has a rather obscure limit on the number of threads it can create)?__

    R: This is purely arbitrary :) I just kept the number used in the tutorial. It seemed neither too big nor too small.

3. __How will each thread know which URLs it should request (since some threads can finish faster than others and so on)?__

    R: For this we'll use the thread-safe data structure ```Queue()``` from the ```queue``` library. We populate it with pairs (index, url_at_that_index) taken from the original array. Each thread simply checks whether the queue is not empty. If that's the case, it pops an element from it (which is reflected to the other threads) and makes a request with it. Since we also have the index of the element in the original array, the result is stored as described in item __1)__ above.



In [15]:
import traceback
from queue import Queue
from threading import Thread

# number of URL pages
n_pages = len(paper_pages)
# a queue for threading with "infinite" size (maxsize=0 means unbounded)
url_queue = Queue(maxsize=0)
# NUM_THREADS is at most 50 and never larger than the number of pages
# (so it is 0 when there is nothing to scrape)
NUM_THREADS = min(50, n_pages)

# the shared variable must be outside the threads' scope, then we instantiate it here.
# each one is a array of arrays or strings with n_pages elements.
# Each slot gets its OWN empty list: `[[]] * n_pages` would store the same list
# object n_pages times, so mutating one slot in place would change all of them.
results = {
    'titles':       [''] * n_pages,
    'authors':      [[] for _ in range(n_pages)],
    'affiliations': [[] for _ in range(n_pages)],
    'dois':         [[] for _ in range(n_pages)],
    'keywords':     [[] for _ in range(n_pages)],
    'abstracts':    [[] for _ in range(n_pages)]
}

# starting the Queue threading structure and populating it with indexes and the 
# URLS that will be used by the thread workers.
for i, url in enumerate(paper_pages):
    url_pair = (i, url)
    url_queue.put(url_pair)


def scrap_url(queue: Queue, results: dict, pbar: "progressbar.bar.ProgressBar") -> None:
    """Worker loop: scrape paper pages taken from ``queue`` into ``results``.

    Each queue item is an ``(index, url)`` pair. The page at ``url`` is
    downloaded (retrying until a document with a ``<body>`` is obtained), its
    title, authors, affiliations, DOI, keywords and abstract are extracted and
    stored at position ``index`` of the matching list in ``results``.

    If extraction raises, a warning is logged and the same pair is put back on
    ``queue`` so it is attempted again. NOTE(review): a page that permanently
    lacks one of the expected elements is therefore retried indefinitely.

    Args:
        queue: queue of ``(index, url)`` pairs shared by all workers.
        results: dict with the keys 'titles', 'authors', 'affiliations',
            'dois', 'keywords' and 'abstracts', each a list indexable by the
            queued indexes. ``len(results['titles'])`` is used as the total
            number of pages when reporting progress.
        pbar: progress bar updated after every successfully scraped page.

    Returns:
        None. The function returns once the queue is found empty.
    """
    # `queue` the parameter does not interfere with this statement: a
    # from-import resolves the standard-library module by name.
    from queue import Empty

    log = logging.getLogger(__name__)
    total_pages = len(results['titles'])
    article_attrs = {"class": "obj_article_details"}

    while True:
        # get_nowait() closes the gap between "is it empty?" and "take one":
        # with empty() followed by get(), another worker could grab the last
        # item in between and leave this one blocked forever.
        try:
            INDEX, URL = queue.get_nowait()
        except Empty:
            return

        try:
            # keep requesting until a parsable page with a <body> is obtained
            while True:
                try:
                    html_content = requests.get(URL, timeout=15).text
                    page = BeautifulSoup(html_content, "html.parser")
                except Exception as err:
                    # all exceptions here are treated the same way, since the
                    # only thing possible to do is to try the GET request again.
                    log.debug('%s while fetching %s; retrying', err, URL)
                else:
                    if page.body is not None:
                        break
                # short pause so a failing server/connection is not hammered
                # in a tight loop
                time.sleep(1)

            # Extract everything first and store afterwards, so a failure in
            # the middle never leaves a half-filled entry in `results`.
            article = page.body.find("article", attrs=article_attrs)

            current_title = page.find("h1", class_="page_title").text.strip()

            current_authors_list = [
                author_span.text.strip()
                for author_span in article.find_all("span", attrs={"class": "name"})
            ]

            current_affiliations_list = [
                affiliation_span.text.strip()
                for affiliation_span in article.find_all("span", attrs={"class": "affiliation"})
            ]

            current_doi = article.find("div", class_="item doi").a.string.strip()

            keywords_text = article.find("div", class_="item keywords").\
                find("span", attrs={"class": "value"}).text
            current_keywords = [keyword.strip() for keyword in keywords_text.split(",")]

            current_abstract = [page.find("div", class_="item abstract").p.text.strip()]

            results['titles'][INDEX] = current_title
            results['authors'][INDEX] = current_authors_list
            results['affiliations'][INDEX] = current_affiliations_list
            results['dois'][INDEX] = [current_doi]
            results['keywords'][INDEX] = current_keywords
            results['abstracts'][INDEX] = current_abstract
        # just in case of any error at the try, to keep the downloading
        # pipeline safe.
        except Exception as err:
            log.warning(f'{err} (at `results` on index {INDEX}) error when incorporating elements of {URL}')

            # if an error occurs, put the same pair (INDEX, URL) back on the
            # queue we were given (not on a global one), otherwise this index
            # would be left without values. The put must happen before
            # task_done() so that join() cannot return in between.
            queue.put((INDEX, URL))
        else:
            pbar.update(total_pages - queue.qsize())

        # signals to the queue that the job is done (important to the .join() )
        queue.task_done()

In [16]:
# defining the progress bar
pbar = progressbar.ProgressBar(maxval=n_pages, widgets=PBAR_WIDGETS).start()

for i in range(NUM_THREADS):
    # starting the threads with the queue and the shared variable of results.
    # daemon=True (the replacement for the deprecated setDaemon()) allows the
    # main program to exit eventually even if these don't finish correctly.
    worker = Thread(target=scrap_url, args=(url_queue, results, pbar), daemon=True)
    worker.start()

# block until every queued page has been marked as done by a worker
url_queue.join()
pbar.finish()

print('All tasks concluded.')

Processing pages: 100% |#######################################| Time:  0:01:02


All tasks concluded.


In [17]:
# Unpack the shared `results` dict into one notebook-level list per field.
titles, authors, affiliations, dois, keywords, abstracts = (
    results[field]
    for field in ('titles', 'authors', 'affiliations', 'dois', 'keywords', 'abstracts')
)

In [32]:
# Persist the scraped titles as indented JSON ...
with open('titles.json', 'wt') as save_path:
    save_path.write(json.dumps(titles, indent=2))

# ... and load the file back to check that it round-trips.
with open('titles.json', 'rt') as read_path:
    AIAI = json.loads(read_path.read())

In [18]:
# with open("titles.json", "r") as f:
#     titles = json.load(f)

# with open("authors.json", "r") as f:
#     authors = json.load(f)
    
# with open("affiliations.json", "r") as f:
#     affiliations = json.load(f)
    
# with open("dois.json", "r") as f:
#     dois = json.load(f)
    
# with open("keywords.json", "r") as f:
#     keywords = json.load(f)
    
# with open("abstracts.json", "r") as f:
#     abstracts = json.load(f)
    
# with open("langs.json", "r") as f:
#     langs = json.load(f)
    
# with open("titles_en.json", "r") as f:
#     titles_en = json.load(f)

In [19]:
# googletrans client passed to the language-detection and translation helpers
translator = Translator()

In [20]:

# Detect the language of every title, then translate titles and keywords.
# Each helper sleeps between requests, so this cell is slow: the recorded run
# below took roughly 10, 13 and 24 minutes for the three steps.
langs = detect_langs(translator, titles)
titles_en = translate_title(translator, titles, langs)
keywords_en = translate_keywords(translator, keywords, langs)

Processing pages: 100% |#######################| Time:  0:10:29 | 777 Processed
Processing pages: 100% |#######################| Time:  0:13:30 | 777 Processed
Processing pages: 100% |#######################| Time:  0:24:30 | 777 Processed
