In [1]:
from requests import get
from requests.exceptions import RequestException
from contextlib import closing
from bs4 import BeautifulSoup

In [2]:
def simple_get(url):
    """
    Attempts to get the content at `url` by making an HTTP GET request.
    If the content-type of response is some kind of HTML/XML, return the
    text content, otherwise return None.
    """
    try:
        with closing(get(url, stream=True)) as resp:
            if is_good_response(resp):
                return resp.content
            else:
                return None

    except RequestException as e:
        log_error('Error during requests to {0} : {1}'.format(url, str(e)))
        return None


def is_good_response(resp):
    """
    Returns True if the response seems to be HTML, False otherwise.
    """
    content_type = resp.headers['Content-Type'].lower()
    return (resp.status_code == 200 
            and content_type is not None 
            and content_type.find('html') > -1)


def log_error(e):
    """
    It is always a good idea to log errors. 
    This function just prints them, but you can
    make it do anything.
    """
    print(e)

In [3]:
## from mathematicians import simple_get
##raw_html = simple_get('https://realpython.com/blog/')
##len(raw_html)


##no_html = simple_get('https://realpython.com/blog/nope-not-gonna-find-it')
##no_html is None
##True

603251

In [8]:
raw_html = simple_get('https://libgen.is/')
len(raw_html)


46196

In [9]:
 raw_html = simple_get('https://libgen.is/')
 html = BeautifulSoup(raw_html, 'html.parser')
 for i, li in enumerate(html.select('li')):
        print(i, li.text)

0 RU

1 FORUM


Sitemap
Error report



2 DOWNLOAD


Mirrors
Gen.lib.rus.ec
Libgen.lc
Libgen.pw
Z-Library
BookFI.net


P2P
Torrents
Usenet (*.nzb)
Database Dumps
gen.lib.rus.ec
libgen.lc


Other
Books catalog (XLS)
Source (PHP)
Import local files in LG format
Libgen Desktop application



3 UPLOAD



Libgen uploader
Fiction uploader
Scientific articles uploader
FTP

						(Login:password look at the forum sitemap)
					



4 LAST



Last added
Last modified
RSS
API




5 OTHERS



Comics
Fiction
Magazines
Standarts
Full-text search in LG content




6 TOPICS



Technology



Aerospace Equipment
Automation
Communication: Telecommunications
Communication
Construction
Construction: Cement Industry
Construction: Renovation and interior design: Saunas
Construction: Renovation and interior design


Construction: Ventilation and Air Conditioning
Electronics: Electronics
Electronics: Fiber Optics
Electronics: Hardware
Electronics: Home Electronics
Electronics: Microprocessor Technology
Electro

In [15]:
raw_html_1 = simple_get('http://www.fabpedigree.com/james/mathmen.htm')
len(raw_html_1)

414159

In [16]:
 raw_html_1 = simple_get('https://libgen.is/')
 html = BeautifulSoup(raw_html_1, 'html.parser')
 for i, li in enumerate(html.select('li')):
        print(i, li.text)

0 RU

1 FORUM


Sitemap
Error report



2 DOWNLOAD


Mirrors
Gen.lib.rus.ec
Libgen.lc
Libgen.pw
Z-Library
BookFI.net


P2P
Torrents
Usenet (*.nzb)
Database Dumps
gen.lib.rus.ec
libgen.lc


Other
Books catalog (XLS)
Source (PHP)
Import local files in LG format
Libgen Desktop application



3 UPLOAD



Libgen uploader
Fiction uploader
Scientific articles uploader
FTP

						(Login:password look at the forum sitemap)
					



4 LAST



Last added
Last modified
RSS
API




5 OTHERS



Comics
Fiction
Magazines
Standarts
Full-text search in LG content




6 TOPICS



Technology



Aerospace Equipment
Automation
Communication: Telecommunications
Communication
Construction
Construction: Cement Industry
Construction: Renovation and interior design: Saunas
Construction: Renovation and interior design


Construction: Ventilation and Air Conditioning
Electronics: Electronics
Electronics: Fiber Optics
Electronics: Hardware
Electronics: Home Electronics
Electronics: Microprocessor Technology
Electro

In [19]:
def get_names():
    """
    Downloads the page where the list of mathematicians is found
    and returns a list of strings, one per mathematician
    """
    url = 'http://www.fabpedigree.com/james/mathmen.htm'
    response = simple_get(url)

    if response is not None:
        html = BeautifulSoup(response, 'html.parser')
        names = set()
        for li in html.select('li'):
            for name in li.text.split('\n'):
                if len(name) > 0:
                    names.add(name.strip())
        return list(names)

    # Raise an exception if we failed to get any data from the url
    raise Exception('Error retrieving contents at {}'.format(url))

In [20]:
def get_hits_on_name(name):
    """
    Accepts a `name` of a mathematician and returns the number
    of hits that mathematician's Wikipedia page received in the 
    last 60 days, as an `int`
    """
    # url_root is a template string that is used to build a URL.
    url_root = 'URL_REMOVED_SEE_NOTICE_AT_START_OF_ARTICLE'
    response = simple_get(url_root.format(name))

    if response is not None:
        html = BeautifulSoup(response, 'html.parser')

        hit_link = [a for a in html.select('a')
                    if a['href'].find('latest-60') > -1]

        if len(hit_link) > 0:
            # Strip commas
            link_text = hit_link[0].text.replace(',', '')
            try:
                # Convert to integer
                return int(link_text)
            except:
                log_error("couldn't parse {} as an `int`".format(link_text))

    log_error('No pageviews found for {}'.format(name))
    return None

In [21]:
if __name__ == '__main__':
    print('Getting the list of names....')
    names = get_names()
    print('... done.\n')

    results = []

    print('Getting stats for each name....')

    for name in names:
        try:
            hits = get_hits_on_name(name)
            if hits is None:
                hits = -1
            results.append((hits, name))
        except:
            results.append((-1, name))
            log_error('error encountered while processing '
                      '{}, skipping'.format(name))

    print('... done.\n')

    results.sort()
    results.reverse()

    if len(results) > 5:
        top_marks = results[:5]
    else:
        top_marks = results

    print('\nThe most popular mathematicians are:\n')
    for (mark, mathematician) in top_marks:
        print('{} with {} pageviews'.format(mathematician, mark))

    no_results = len([res for res in results if res[0] == -1])
    print('\nBut we did not find results for '
          '{} mathematicians on the list'.format(no_results))

Getting the list of names....
... done.

Getting stats for each name....
Error during requests to URL_REMOVED_SEE_NOTICE_AT_START_OF_ARTICLE : Invalid URL 'URL_REMOVED_SEE_NOTICE_AT_START_OF_ARTICLE': No schema supplied. Perhaps you meant http://URL_REMOVED_SEE_NOTICE_AT_START_OF_ARTICLE?
No pageviews found for Charles Hermite
Error during requests to URL_REMOVED_SEE_NOTICE_AT_START_OF_ARTICLE : Invalid URL 'URL_REMOVED_SEE_NOTICE_AT_START_OF_ARTICLE': No schema supplied. Perhaps you meant http://URL_REMOVED_SEE_NOTICE_AT_START_OF_ARTICLE?
No pageviews found for Joseph Liouville
Error during requests to URL_REMOVED_SEE_NOTICE_AT_START_OF_ARTICLE : Invalid URL 'URL_REMOVED_SEE_NOTICE_AT_START_OF_ARTICLE': No schema supplied. Perhaps you meant http://URL_REMOVED_SEE_NOTICE_AT_START_OF_ARTICLE?
No pageviews found for Stefan Banach
Error during requests to URL_REMOVED_SEE_NOTICE_AT_START_OF_ARTICLE : Invalid URL 'URL_REMOVED_SEE_NOTICE_AT_START_OF_ARTICLE': No schema supplied. Perhaps you 

# <!DOCTYPE html>
<html>
<head>
  <title>Contrived Example</title>
</head>
<body>
<p id="eggman"> I am the egg man </p>
<p id="walrus"> I am the walrus </p>
</body>
</html>