In [1]:
import os
import time
import requests
from urllib import parse
from ebooklib import epub
from bs4 import BeautifulSoup

### TODO:
- Get and add author
    - Add author to filename
- Get and add synopsis
- Get and add story metadata from FanFiction.net (this is separate from the EPUB's own metadata fields)
    - Rating, genre, words, etc
- Add TOC

In [2]:
# Site root; relative links scraped from pages are joined onto this.
base_url = "https://www.fanfiction.net/"
# Path of the listing page the crawl starts from, relative to base_url.
rel_url = "book/Worm/"

In [3]:
# Absolute URL of the first listing page (base_url joined with rel_url).
start_url = parse.urljoin(base_url, rel_url)

In [4]:
def get_soup(url, timeout=30):
    """Download ``url`` and parse the response body with BeautifulSoup.

    Args:
        url: Absolute URL to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` can block indefinitely on a stalled
            connection, which would freeze the whole crawl.

    Returns:
        A ``BeautifulSoup`` tree built from the raw response bytes using the
        ``lxml`` parser.

    Raises:
        requests.exceptions.RequestException: On connection errors or when
            the timeout elapses.
    """
    page = requests.get(url, timeout=timeout)
    # NOTE(review): the HTTP status is not checked, so an error page is parsed
    # like any other page — confirm whether callers should fail on 4xx/5xx.
    return BeautifulSoup(page.content, 'lxml')

In [5]:
def get_book_links_on_page(page_soup):
    """Collect the story links found on one listing page.

    Args:
        page_soup: Parsed listing page; must support
            ``find_all('a', class_='stitle')``.

    Returns:
        dict mapping each matching anchor's ``href`` to its text. When the
        same ``href`` occurs more than once, the last anchor's text wins.
    """
    title_anchors = page_soup.find_all('a', class_='stitle')
    return {anchor.get('href'): anchor.text for anchor in title_anchors}

In [6]:
def get_next_link(base_url, page_soup):
    """Return the ``href`` of the first anchor whose text contains "Next".

    Args:
        base_url: Unused; kept so the signature stays the same for callers.
        page_soup: Parsed page; must support ``find(callable)``.

    Returns:
        The anchor's ``href`` value, or ``None`` when no such anchor exists.
    """
    def is_next_anchor(tag):
        return tag.name == "a" and "Next" in tag.text

    try:
        # find() yields None when nothing matches; the resulting
        # AttributeError on .get is what signals "no next page".
        return page_soup.find(is_next_anchor).get('href')
    except AttributeError:
        return None

In [7]:
def get_book_links(base_url, url, books):
    """Walk a paginated listing and collect every story link into ``books``.

    Starting at ``url``, each page is fetched, its story links are merged
    into ``books``, and the page's "Next" link is followed until none is
    found. Each followed link is printed, then "Done" at the end.

    Args:
        base_url: Site root that relative page links are joined onto.
        url: First listing page (absolute, or relative to ``base_url``).
        books: dict updated in place with ``href -> title`` entries.

    Returns:
        The same ``books`` dict, regardless of how many pages were visited.
    """
    visited = set()
    next_url = url
    # Iterative on purpose: the recursive form returned None for any
    # multi-page listing and used one stack frame per page.
    while next_url is not None:
        page_url = parse.urljoin(base_url, next_url)
        if page_url in visited:
            # A "Next" link pointing at a page already seen would loop forever.
            break
        visited.add(page_url)

        soup = get_soup(page_url)
        books.update(get_book_links_on_page(soup))

        next_url = get_next_link(base_url, soup)
        if next_url is not None:
            print(next_url)

    print("Done")
    return books

In [8]:
# Maps story href -> story title; filled in place by the crawl below.
all_books = {}
# Trailing semicolon keeps the notebook from echoing the call's return value.
get_book_links(base_url, start_url, all_books);

/book/Worm/?&srt=1&r=103&p=2
/book/Worm/?&srt=1&r=103&p=3
/book/Worm/?&srt=1&r=103&p=4
/book/Worm/?&srt=1&r=103&p=5
/book/Worm/?&srt=1&r=103&p=6
/book/Worm/?&srt=1&r=103&p=7
/book/Worm/?&srt=1&r=103&p=8
/book/Worm/?&srt=1&r=103&p=9
/book/Worm/?&srt=1&r=103&p=10
/book/Worm/?&srt=1&r=103&p=11
/book/Worm/?&srt=1&r=103&p=12
/book/Worm/?&srt=1&r=103&p=13
/book/Worm/?&srt=1&r=103&p=14
/book/Worm/?&srt=1&r=103&p=15
/book/Worm/?&srt=1&r=103&p=16
/book/Worm/?&srt=1&r=103&p=17
/book/Worm/?&srt=1&r=103&p=18
/book/Worm/?&srt=1&r=103&p=19
/book/Worm/?&srt=1&r=103&p=20
/book/Worm/?&srt=1&r=103&p=21
/book/Worm/?&srt=1&r=103&p=22
/book/Worm/?&srt=1&r=103&p=23
/book/Worm/?&srt=1&r=103&p=24
/book/Worm/?&srt=1&r=103&p=25
Done


In [9]:
# Report how many distinct story links the crawl collected.
print(f"Found {len(all_books)} books")

Found 622 books


In [10]:
def get_next_button(chapter_soup):
    """Extract the next-chapter URL from a chapter page's "Next" button.

    The URL is taken to be the first single-quoted string inside the
    button's ``onclick`` attribute.

    Args:
        chapter_soup: Parsed chapter page; must support ``find(callable)``.

    Returns:
        The quoted URL from ``onclick``, or ``None`` when there is no "Next"
        button, the button has no ``onclick`` attribute, or the attribute
        contains no single-quoted value.
    """
    next_button = chapter_soup.find(
        lambda tag: tag.name == "button" and "Next" in tag.text
    )
    if next_button is None:
        return None

    onclick = next_button.get('onclick')
    if not onclick:
        # Button exists but carries no handler: treat as "no next chapter"
        # instead of failing on None.split.
        return None

    parts = onclick.split("'")
    if len(parts) < 2:
        # No quote at all, so there is no URL to extract.
        return None
    return parts[1]

In [11]:
def get_chapter_titles(chapter_soup):
    """Return the title of the chapter currently selected on a chapter page.

    The title is read from the selected ``<option>`` of the
    ``<select id="chap_select">`` element. Option text is expected to look
    like ``"<number>. <title>"``; everything after the first period is the
    title, so titles that themselves contain periods are kept intact.

    Args:
        chapter_soup: Parsed chapter page; must support
            ``find('select', id='chap_select')``.

    Returns:
        The chapter title with surrounding whitespace removed, or ``None``
        when the page has no chapter selector, no selected option, or the
        option carries no title text.
    """
    select = chapter_soup.find('select', id='chap_select')
    if select is None:
        return None

    selected = select.find('option', selected=True)
    if selected is None:
        return None

    option_text = selected.text
    # Split on the FIRST period only: "3. Arc 1.2" must yield "Arc 1.2".
    _number, separator, title = option_text.partition('.')
    if not separator:
        # No "<number>." prefix at all; use the whole label.
        title = option_text
    return title.strip() or None

In [16]:
def get_book(base_url, chapter_url, chapters, title, wait=10):
    """Download every chapter of a story, following its "Next" buttons.

    For each chapter page the ``<div id="storytext">`` element and the
    chapter title are appended to ``chapters`` as a
    ``(chapter_title, storytext)`` tuple, and a progress line is printed.

    Args:
        base_url: Site root that relative chapter links are joined onto.
        chapter_url: URL of the first chapter (absolute or relative).
        chapters: list extended in place with one tuple per chapter.
        title: Story title, used as the chapter title when the page has no
            chapter selector.
        wait: Seconds to sleep between consecutive chapter requests.

    Returns:
        The same ``chapters`` list, regardless of how many chapters were
        downloaded.
    """
    next_chapter_link = chapter_url
    # Iterative on purpose: the recursive form returned None for any
    # multi-chapter story and used one stack frame per chapter.
    while next_chapter_link is not None:
        soup = get_soup(parse.urljoin(base_url, next_chapter_link))
        # NOTE(review): storytext is None when the div is missing from the
        # page; it is appended as-is — confirm how callers should treat that.
        main_soup = soup.find('div', id='storytext')

        next_chapter_link = get_next_button(soup)
        chapter_title = get_chapter_titles(soup)
        if chapter_title is None:
            chapter_title = title

        print(f"\tChapter: {chapter_title}")
        chapters.append((chapter_title, main_soup))

        if next_chapter_link is not None:
            # Pause only when another request is about to be made.
            time.sleep(wait)

    return chapters

In [17]:
def create_epub(title, chapters, out_file):
    """Assemble ``chapters`` into an EPUB and write it to ``out_file``.

    Args:
        title: Book title; also used to derive the book's identifier.
        chapters: Sequence of ``(chapter_title, content)`` pairs. ``content``
            is converted with ``str()`` and used as the chapter's XHTML body.
        out_file: Path of the EPUB file to write.

    Returns:
        None. The EPUB is written to disk as a side effect.
    """
    import uuid  # local import: only needed to derive the book identifier

    book = epub.EpubBook()

    # A constant identifier would make every generated book look like the
    # same publication; derive a stable, per-title one instead.
    book.set_identifier(f"urn:uuid:{uuid.uuid5(uuid.NAMESPACE_URL, title)}")
    book.set_title(title)
    book.set_language('en')
    book.add_author('NYI')

    epub_chapters = []
    for counter, (chapter_title, chapter_content) in enumerate(chapters):
        # One XHTML document per chapter, named by its position in the book.
        chapter_xhtml = f"chap_{counter}.xhtml"
        c = epub.EpubHtml(title=chapter_title, file_name=chapter_xhtml, lang='en')
        c.content = str(chapter_content)

        epub_chapters.append(c)
        book.add_item(c)

    # Flat table of contents: one entry per chapter, in reading order.
    # Without this the generated NCX/Nav documents list nothing.
    book.toc = tuple(epub_chapters)

    # add default NCX and Nav file
    book.add_item(epub.EpubNcx())
    book.add_item(epub.EpubNav())

    # define CSS style
    style = 'BODY {color: white;}'
    nav_css = epub.EpubItem(uid="style_nav", file_name="style/nav.css", media_type="text/css", content=style)

    # add CSS file (packaged with the book; nothing here links a document to it)
    book.add_item(nav_css)

    # basic spine: navigation page first, then the chapters in order
    book.spine = ['nav', *epub_chapters]

    # write to the file
    epub.write_epub(out_file, book, {})

In [18]:
def _safe_filename(title):
    """Return ``title`` with characters that are invalid in file names replaced by '_'."""
    import re  # local import: only this helper needs it

    # Characters rejected by Windows file systems, plus control characters.
    cleaned = re.sub(r'[<>:"/\\|?*\x00-\x1f]', '_', title)
    # Windows also rejects names ending in a space or a dot.
    cleaned = cleaned.strip(' .')
    return cleaned or 'untitled'


def get_all_books(base_url, book_dir, wait=30, out_dir="c:\\temp"):
    """Download every story in ``book_dir`` and save each one as an EPUB.

    Args:
        base_url: Site root that story links are joined onto.
        book_dir: dict mapping a story's first-chapter URL to its title.
        wait: Seconds to sleep after each book before starting the next.
        out_dir: Directory the ``<title>.epub`` files are written to. It is
            created if it does not exist.

    Returns:
        None. Progress is printed and EPUB files are written as side effects.
    """
    os.makedirs(out_dir, exist_ok=True)

    for url, title in book_dir.items():
        chapters = []
        print(f"Getting: {title}")
        get_book(base_url, url, chapters, title)
        print("\tCreating Epub")
        # Titles are free text and may contain characters such as '?', ':'
        # or '/', so only a sanitized copy is used for the file name; the
        # EPUB itself keeps the original title.
        out_file = os.path.join(out_dir, f"{_safe_filename(title)}.epub")
        create_epub(title, chapters, out_file)

        print(f"\tBook done, waiting {wait} seconds")
        time.sleep(wait)
        print("")

    print("Done")

In [19]:
# Download every collected story and write one EPUB per story.
get_all_books(base_url, all_books)

Getting: Video Gaming through Life
	Chapter: Buffering 1-1
	Chapter: Buffering 1-2
	Chapter: Buffering 1-3
	Chapter: Buffering 1-4
	Chapter: Buffering 1-5
	Chapter: Buffering 1-6
	Creating Epub
	Book done, waiting 30 seconds

Getting: Of Metal, Bone, and Claws
	Chapter: Proluge: PHO
	Chapter: Chapter 1-1: Sophia Hess
	Chapter: Chapter 1-2: Sophia Hess
	Chapter: Chapter 1-3: Sophia Hess
	Chapter: Chapter 1-4: Sophia Hess
	Chapter: Interlude: Glory Girl
	Chapter: Chapter 2-1: Annette Hebert
	Chapter: Chapter 2-2: Annette Hebert
	Chapter: Chapter 2-3: Annette Hebert
	Chapter: Chapter 2-4: Annette Hebert
	Chapter: Interlude: Maximilian Anders
	Chapter: Chapter 3-1: Miss Militia
	Chapter: Chapter 3-2: Miss Militia
	Chapter: Chapter 3-3: Miss Militia
	Chapter: Chapter 3-4: Miss Militia
	Chapter: Interlude: Marquis
	Chapter: Chapter 4-1: Thomas Calvert
	Creating Epub
	Book done, waiting 30 seconds

Getting: Wyvern
	Chapter: Chapter 1
	Chapter: Chapter 2
	Chapter: Chapter 3
	Chapter: Chapter 4

KeyboardInterrupt: 