# In [None]:
from urllib import request
from requests import get
from requests.exceptions import RequestException
from contextlib import closing
from bs4 import BeautifulSoup
from content import update_logger
from connector import select_id


def get_page_html(url, company=None, timeout=30):
    '''
    Fetch 'url' with an HTTP GET request and parse the body as HTML.

    Parameters:
        url: address of the page to download.
        company: optional company name; when given (and truthy), request
            failures are reported through log_error.
        timeout: seconds passed to requests as the timeout, so that a
            stalled server cannot block the caller forever. Pass None to
            wait indefinitely.

    Returns a BeautifulSoup object built with the 'html.parser' backend when
    is_good_response accepts the response, otherwise None. None is also
    returned when the request raises a RequestException (timeouts included).
    '''
    # Each fragment ends with a space: adjacent string literals are glued
    # together verbatim, so without it the product tokens would run into
    # each other and produce a malformed User-Agent header.
    user_agent = ('Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                  'AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/80.0.3987.163 '
                  'Safari/537.36 OPR/67.0.3575.137')
    try:
        # stream=True defers the body download until resp.content is read,
        # so rejected responses are closed without fetching their body.
        with closing(get(url, headers={'User-Agent': user_agent},
                         stream=True, timeout=timeout)) as resp:
            if is_good_response(resp):
                return BeautifulSoup(resp.content, 'html.parser')
            return None

    except RequestException as e:
        # Logging needs a company to attach the record to; without one the
        # failure is only signalled by the None return value.
        if company:
            log_error(f'Error during requests to {url} : {e}', url, company)
        return None


def is_good_response(resp, header='Content-Type'):
    '''
    Returns True if the response seems to be HTML, False otherwise.

    A response qualifies when its status code is 200 and the value of the
    'header' response header (compared case-insensitively) contains 'html'.
    A response without that header is treated as non-HTML instead of raising
    KeyError.

    Parameters:
        resp: object exposing 'status_code' and a mapping-like 'headers'.
        header: name of the header holding the media type.
    '''
    # .get() rather than indexing: servers may omit the header entirely, and
    # "or ''" also covers a header that is present but empty/None.
    content_type = (resp.headers.get(header) or '').lower()
    return resp.status_code == 200 and 'html' in content_type


def log_error(message, url, company):
    '''
    Report a failure for 'company' through update_logger.

    Both ids are resolved with select_id from the company name and handed to
    update_logger together with the fixed status 'Fail', the error 'message'
    and the offending 'url'.
    '''
    # NOTE(review): the 'category' lookup is keyed by the company name as
    # well - confirm against select_id that this is intended.
    update_logger(
        select_id('company', company),
        select_id('category', company),
        'Fail',
        message,
        url,
    )


def download_image(url, path):
    '''
    Fetch the resource at 'url' and store it in the file 'path'.

    Returns a (success, message) tuple: (True, 'Ok.') when the download
    completed, otherwise (False, <text of the raised exception>). No
    exception derived from Exception escapes this function.

    Side effect: the opener carrying the browser-like User-agent is installed
    with request.install_opener, i.e. it becomes the default opener for
    later urllib.request calls in this process.
    '''
    browser_agent = ('Mozilla/5.0 (X11; Linux x86_64) '
                     'AppleWebKit/537.36 (KHTML, like Gecko) '
                     'Chrome/57.0.2987.110 '
                     'Safari/537.36')

    # urlretrieve() takes no headers argument, so the header is attached to
    # an opener which is then made the default one.
    agent_opener = request.build_opener()
    agent_opener.addheaders = [('User-agent', browser_agent)]
    request.install_opener(agent_opener)

    try:
        request.urlretrieve(url, path)
    except Exception as err:
        # Deliberately broad: callers get a status tuple, never a traceback.
        return False, str(err)
    return True, 'Ok.'