## http://www.loyalbooks.com/language-menu

In [1]:
from bs4 import BeautifulSoup
from urllib import urlopen
import os
import requests, zipfile, StringIO
import shutil

In [2]:
def create_folder(path):
    """Create directory ``path`` (including missing parents) if it is absent.

    The directory is created first and an "already exists" failure is
    tolerated afterwards. Checking for existence *before* creating leaves a
    window in which another process can create the directory and make
    ``os.makedirs`` fail.

    Args:
        path: directory path to create.

    Raises:
        OSError: if creation fails and nothing exists at ``path`` afterwards.
    """
    try:
        os.makedirs(path)
    except OSError:
        # Something already being there is the expected "nothing to do" case;
        # any other failure (permissions, bad path, ...) is propagated.
        if not os.path.exists(path):
            raise

### Parse main page and get all languages

In [3]:
def load_page_soup(url):
    """Download ``url`` and parse the body into a BeautifulSoup tree.

    Args:
        url: address of the page to fetch.

    Returns:
        A ``BeautifulSoup`` object built with the ``lxml`` parser.
    """
    response = urlopen(url)
    try:
        html = response.read()
    finally:
        # Release the connection even if reading the body fails.
        response.close()

    return BeautifulSoup(html, 'lxml')

In [4]:
# Site root; hrefs scraped from pages are appended to it to build full URLs.
BASE = u'http://www.loyalbooks.com/'
# URL of the language menu page that is parsed to discover the languages.
LANGUAGES_PAGE = BASE + 'language-menu'

In [5]:
# Fetch the language menu (network access happens when this cell runs).
soup = load_page_soup(LANGUAGES_PAGE)
# Only the first <table class="link"> is used; IndexError if the page has none.
table = soup.find_all("table", {"class": "link"})[0]
# Every anchor inside that table is treated as one language entry.
links = table.find_all('a')

In [6]:
class LanguageFolder(object):
    """Plain record describing one language entry.

    Attributes:
        uid: identifier supplied by the caller.
        href: link target of the language page, as supplied by the caller.
        name: display name of the language.
    """

    def __init__(self, uid, href, name):
        self.uid, self.href, self.name = uid, href, name

In [7]:
def get_languages_folders():
    """Build a mapping from language name to its ``LanguageFolder``.

    Reads the module-level ``links`` collection of anchors. For each anchor
    the ``id`` of its first <div> becomes the folder uid, and the anchor's
    ``href`` and text become the folder href and name. If two anchors share
    the same text, the later one replaces the earlier one.

    Returns:
        dict keyed by anchor text with ``LanguageFolder`` values.
    """
    def to_folder(anchor):
        first_div = anchor.find_all('div')[0]
        return LanguageFolder(first_div['id'], anchor['href'], anchor.text)

    return {anchor.text: to_folder(anchor) for anchor in links}

In [8]:
# Build the name -> LanguageFolder mapping and report how many languages
# were found on the menu page.
language_by_name = get_languages_folders()
print len(language_by_name)

29


### Parse each language page and get the specified number of books

In [9]:
class BookLink(object):
    """Plain record pairing a book page link with the language it belongs to.

    Attributes:
        url: link to the book page, as supplied by the caller.
        lang_folder: language object the book was collected for.
    """

    def __init__(self, url, lang_folder):
        self.url, self.lang_folder = url, lang_folder

In [10]:
def get_table_books_urls(soup_table, lang_folder, max_count=3):
    """Collect up to ``max_count`` book links from a table of books.

    Rows are visited top to bottom. Within a row every anchor href found in
    a <td> without a ``colspan`` attribute is taken once, in the order it
    appears on the page. Keeping page order (instead of iterating a ``set``)
    makes the selection deterministic: when ``max_count`` cuts the list
    short, the first books listed are the ones kept.

    Args:
        soup_table: parsed <table> element exposing ``find_all``.
        lang_folder: language object stored on every returned ``BookLink``.
        max_count: maximum number of links to return.

    Returns:
        list of ``BookLink`` objects, at most ``max_count`` long.
    """
    book_links = []

    for row in soup_table.find_all("tr"):
        if len(book_links) >= max_count:
            break

        # De-duplication is per row, as before: the same href may still be
        # returned again if it shows up in a later row.
        row_urls = []
        seen = set()

        for cell in row.find_all('td'):
            # Cells carrying a colspan are skipped (presumably spanning
            # header/separator cells rather than book entries).
            if cell.has_attr('colspan'):
                continue

            for anchor in cell.find_all('a'):
                href = anchor['href']
                if href not in seen:
                    seen.add(href)
                    row_urls.append(href)

        remaining = max_count - len(book_links)
        book_links.extend(
            BookLink(href, lang_folder) for href in row_urls[:remaining]
        )

    return book_links


def get_language_books_links(lang_folder, max_count=3):
    """Fetch a language page and collect book links from its book table.

    Args:
        lang_folder: object whose ``href`` is appended to ``BASE`` to form
            the language page URL.
        max_count: maximum number of book links to return.

    Returns:
        list of ``BookLink`` objects, at most ``max_count`` long.

    Raises:
        IndexError: if the page contains no matching table. The exception
            type is the one ``tables[0]`` used to raise; it now carries a
            message naming the page.
        Exception: if the page contains more than one matching table.
    """
    language_books_url = BASE + lang_folder.href
    lang_soup = load_page_soup(language_books_url)

    tables = lang_soup.find_all("table", {"summary": "Audio books", "class": 'layout2-blue'})

    if not tables:
        raise IndexError(u'Found no books table "{}"'.format(language_books_url))

    if len(tables) > 1:
        raise Exception(u'Found more than one table "{}"'.format(language_books_url))

    return get_table_books_urls(tables[0], lang_folder, max_count=max_count)

In [11]:
# Language names that are skipped when collecting books.
exclude = ['Multilingual', 'Old English', 'Church Slavonic']
# Maps language name -> list of BookLink objects collected for it.
books_by_language = dict()
# Upper bound on the number of books collected per language.
max_books = 5

# get_language_books_links fetches one page per language, so this loop
# performs network access for every language that is not excluded.
for name, lang_folder in language_by_name.iteritems():
    if name in exclude:
        continue

    books_links = get_language_books_links(lang_folder, max_count=max_books)
    books_by_language[name] = books_links

# Number of languages for which book links were collected.
print len(books_by_language)

26


### Download the books collected for each language

In [12]:
# Root folders for downloaded archives and for extracted audio files.
# The trailing slash matters: paths below are built by string concatenation.
FOLDER_TO_SAVE_ZIPS = '/home/kolegor/Downloads/audiobooks/zips/'
FOLDER_TO_SAVE_MP3 = '/home/kolegor/Downloads/audiobooks/mp3/'
# Per-language sub-folder templates; fill in with .format(language=...).
ZIP_LANGUAGE_FOLDER_TEMPLATE = FOLDER_TO_SAVE_ZIPS + '{language}/'
MP3_LANGUAGE_FOLDER_TEMPLATE = FOLDER_TO_SAVE_MP3 + '{language}/'

In [13]:
def load_and_save_zip(url, save_folder):
    """Download the archive at ``url`` into ``save_folder`` and open it.

    The file name is the last path segment of ``url``. If a file with that
    name already exists in ``save_folder`` it is reused and no request is
    made at all.

    Args:
        url: address of the zip archive.
        save_folder: destination folder; must end with a path separator
            because the file path is built by concatenation.

    Returns:
        ``zipfile.ZipFile`` opened on the saved archive.

    Raises:
        requests.HTTPError: if the server answers with an error status.
    """
    name = url.split('/')[-1]
    zip_path = save_folder + name

    if not os.path.exists(zip_path):
        # Download under a temporary name and rename when complete, so an
        # interrupted transfer is never mistaken for a cached archive.
        partial_path = zip_path + '.part'
        response = requests.get(url, stream=True)
        try:
            # Without this an error page would be stored as a ".zip".
            response.raise_for_status()
            with open(partial_path, 'wb') as out_file:
                shutil.copyfileobj(response.raw, out_file)
        finally:
            response.close()
        os.rename(partial_path, zip_path)

    return zipfile.ZipFile(zip_path)

In [13]:
def unpack_book_zip(zip_file, unpack_to):
    """Extract the ``.mp3`` and ``.wav`` members of an archive into one folder.

    The archive's directory structure is flattened: every ``/`` in a member
    name is replaced by ``__`` to form the output file name. Members with
    any other extension (and directory entries) are ignored.

    Args:
        zip_file: open ``zipfile.ZipFile``.
        unpack_to: destination folder; must end with a path separator
            because output paths are built by concatenation.

    Returns:
        Number of files written.
    """
    count_unpacked = 0

    for name in zip_file.namelist():
        # Directory entries end with '/', so this check filters them as well.
        if not name.endswith(('.mp3', '.wav')):
            continue

        full_path = unpack_to + name.replace('/', '__')

        # Audio is binary data: write in 'wb' (text mode corrupts it on
        # platforms that translate newlines) and stream it in chunks rather
        # than loading each file into memory.
        with zip_file.open(name) as member:
            with open(full_path, 'wb') as outf:
                shutil.copyfileobj(member, outf)

        count_unpacked += 1

    return count_unpacked


def get_book_download_url(book_url):
    """Find the zip download link on a book page.

    Looks at the first table whose ``summary`` is "Audiobook downloads" and
    at the anchors with class ``download`` inside it.

    Args:
        book_url: address of the book page.

    Returns:
        The href of the last such anchor ending in ``.zip``, or ``None``
        when there is none.
    """
    page = load_page_soup(book_url)
    downloads_table = page.find_all('table', {'summary': 'Audiobook downloads'})[0]
    anchors = downloads_table.find_all('a', {'class': 'download'})

    zip_hrefs = [a['href'] for a in anchors if a['href'].endswith('.zip')]

    # When several zip links are present the last one wins.
    return zip_hrefs[-1] if zip_hrefs else None


def download_language_books(lang_folder, collected_books):
    zip_folder = ZIP_LANGUAGE_FOLDER_TEMPLATE.format(language=lang_folder.name)
    create_folder(zip_folder)

    mp3_folder = MP3_LANGUAGE_FOLDER_TEMPLATE.format(language=lang_folder.name)
    create_folder(mp3_folder)
    
    count_loaded_books = 0
    count_loaded_mp3 = 0

    for i, book in enumerate(collected_books):
        book_url = BASE + book.url
        
        try:
            download_link = get_book_download_url(book_url)
        except:
            print u' !!! EXCEPTION ON PAGE LOAD {}'.format(book_url)
            continue
        
        if download_link is None:
            continue

        print '\n - Loading book {}/{} ({})'.format(
            i + 1,
            len(collected_books),
            download_link.replace('http://www.archive.org/download/', '')
        )
        zip_file = load_and_save_zip(download_link, zip_folder)

        print u' - Unpacking to {}'.format(mp3_folder)
        count_files = unpack_book_zip(zip_file, mp3_folder)
        print u' - Unpacked {} files'.format(count_files)
        
        count_loaded_books += 1
        count_loaded_mp3 += count_files

    return count_loaded_books, count_loaded_mp3

In [7]:
def run_download():
    total_books = 0
    total_mp3s = 0

    already_done = [
        'Swedish',
        'Tagalog',
        'Javanese',
        'Ancient Greek',
        'Dutch',
        'Danish',
        'Bulgarian',
        'Latin',
        'Hungarian',
        'French',
        'Russian',
        'Tamil',
        'Finnish',
        'Hebrew',
        'Greek',
        'Latvian',
        'English',
        'Italian',
        'Portuguese',
        'Chinese',
    ]

    for i, (lang_name, collected_books) in enumerate(books_by_language.iteritems()):
        print '\n {}/{} {}'.format(i + 1, len(books_by_language), lang_name)

        if lang_name in already_done:
            print ' - Skip'
            continue

        for book in collected_books:
            print book.url

        # lang_folder = language_by_name[lang_name]
        # loaded_books, loaded_mp3s = download_language_books(lang_folder, collected_books)

        # total_books += loaded_books
        # total_mp3s += loaded_mp3s

    print u'\n\nTotal books: {}'.format(total_books)
    print u'Total mp3s: {}'.format(total_mp3s)

### Only unpack

In [5]:
def unpack_zips(zips_folder, mp3s_folder):
    for folder_name in os.listdir(zips_folder):
        print folder_name

        in_mp3_folder = mp3s_folder + folder_name + '/'
        in_zip_folder = zips_folder + folder_name + '/'

        create_folder(in_mp3_folder)

        for zip_name in os.listdir(in_zip_folder):
            full_zip_path = in_zip_folder + zip_name
            zip_file = zipfile.ZipFile(full_zip_path)

            print ' -', zip_name
            unpack_book_zip(zip_file, in_mp3_folder)

In [6]:
# Input and output roots for the unpack-only pass; the trailing slashes are
# required because unpack_zips builds paths by string concatenation.
zips_folder = '/home/kolegor/Downloads/audiobooks/zips/'
mp3s_folder = '/home/kolegor/Downloads/audiobooks/mp3/'

# Disabled by default; uncomment to extract every downloaded archive.
# unpack_zips(zips_folder, mp3s_folder)