In [9]:
# Imports
import json
import time
from pathlib import Path

import pandas
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager

In [4]:
# Start Chrome maximized; the driver binary path comes from
# ChromeDriverManager().install().
options = Options()
options.add_argument("start-maximized")
driver = webdriver.Chrome(
    options=options,
    service=Service(ChromeDriverManager().install()),
)

In [5]:
# Main page of the novel to scrape; it is loaded into the driver before the
# chapter list is collected.
url_novel = "https://allnovelfull.com/the-beginning-after-the-end.html"

In [7]:
# Open the novel's main page in the browser; the chapter links are scraped
# from whatever page the driver currently shows.
driver.get(url_novel)

In [12]:
# Helper functions for scraping the chapter list and the chapter contents

def get_caps_in_page() -> list:
    """
    Collect the chapter links of the chapter-list page currently shown by the driver.

    :return: A list of dicts, one per link found, each with the keys
        "chap_name" (the link text) and "chap_url" (the link's href attribute).
        The list is empty when no link matches.
    """
    chapter_links_xpath = (
        '//*[@id="list-chapter"]/div[@class="row"]/div/ul[@class="list-chapter"]/li/a'
    )
    chapters = []
    for anchor in driver.find_elements(By.XPATH, chapter_links_xpath):
        entry = {"chap_name": anchor.text, "chap_url": anchor.get_attribute("href")}
        chapters.append(entry)
    return chapters

def get_next_page(list_li: list) -> list:
    """
    Walk the paginated chapter list, starting from the page currently shown.

    On every page the chapter entries returned by get_caps_in_page() are placed
    after the entries collected so far; then, after a 2-second pause, the
    pagination link is looked up and clicked. The walk ends as soon as a
    NoSuchElementException is raised, which is what happens when that link
    cannot be found.

    :param list_li: Chapter entries collected so far (pass [] to start fresh).
    :return: A list holding ``list_li`` followed by the entries of every page visited.
    """
    next_link_xpath = '//*[@id="list-chapter"]/ul[1]/li[10]/a'
    while True:
        try:
            page_chapters = get_caps_in_page()
            # Build a new list each round so the caller's list is never mutated.
            list_li = list(list_li) + page_chapters
            time.sleep(2)
            driver.find_element(By.XPATH, next_link_xpath).click()
        except NoSuchElementException:
            print("Last Page of chapt")
            return list_li

def get_cap_content() -> dict:
    """
    Read the title and the text of the chapter page currently shown by the driver.

    Pauses for 2 seconds first, then looks up the element with id
    "chapter-content" and the element with class "chapter-title".

    :return: A dict with the keys "cap_title" (text of the title element) and
        "content_cap" (text of the content element).
    """
    time.sleep(2)
    body_element = driver.find_element(By.XPATH, '//*[@id="chapter-content"]')
    title_element = driver.find_element(By.XPATH, '//*[@class="chapter-title"]')
    return {
        "cap_title": title_element.text,
        "content_cap": body_element.text,
    }

def next_caps(list_content: list, max_retries: int = 3, retry_delay: float = 2) -> None:
    """
    Scrape the chapter currently open in the browser and every chapter after it.

    For each chapter the dict returned by get_cap_content() is appended to
    ``list_content``; then the element with id "next_chap" is clicked to move
    on. The walk stops when that element has a truthy "disabled" attribute.

    The walk is a loop rather than a recursion, so the number of chapters is not
    limited by the interpreter's recursion depth. A chapter is appended only
    once even if the navigation step has to be retried, and retries are bounded
    instead of repeating forever on a persistent error.

    :param list_content: List extended in place with one dict per chapter.
    :param max_retries: Consecutive failures tolerated on the same chapter
        before the error is re-raised.
    :param retry_delay: Seconds to wait before retrying after a failure.
    :return: None; the results are delivered through ``list_content``.
    :raises Exception: The last error seen, once the same chapter has failed
        more than ``max_retries`` times in a row. Chapters scraped up to that
        point remain in ``list_content``.
    """
    consecutive_failures = 0
    chapter_saved = False  # True once the current chapter is in list_content
    while True:
        try:
            if not chapter_saved:
                list_content.append(get_cap_content())
                chapter_saved = True
            next_cap = driver.find_element(By.XPATH, '//*[@id="next_chap"]')
            if next_cap.get_attribute("disabled"):
                return
            next_cap.click()
        except Exception:
            consecutive_failures += 1
            if consecutive_failures > max_retries:
                raise
            time.sleep(retry_delay)
            continue
        # Navigation succeeded: the next round handles a new chapter.
        chapter_saved = False
        consecutive_failures = 0

In [13]:
# Collect the chapter entries (name + URL) from every page of the chapter list,
# starting from the page the driver currently shows.
list_chapter = get_next_page([])

Last Page of chapt


In [16]:
# Save the chapter index as JSON. The output directory is created first so the
# cell does not fail with FileNotFoundError when ./Data does not exist yet, and
# the encoding is pinned so the file does not depend on the platform default.
caps_url_path = Path("./Data/list_of_caps_url.json")
caps_url_path.parent.mkdir(parents=True, exist_ok=True)
with caps_url_path.open("w", encoding="utf-8") as f:
    json.dump(list_chapter, f, indent=4)

In [18]:
# Accumulator that next_caps() fills in place, one entry per chapter.
list_content = []

# Open the first collected chapter so the crawl starts there.
first_chapter_url = list_chapter[0]["chap_url"]
driver.get(first_chapter_url)

In [19]:
# Read the content of every chapter of the novel, starting from the chapter
# currently open in the browser; results are appended to list_content.
next_caps(list_content)

In [22]:
# Save the scraped chapter contents as JSON. The output directory is created
# first so the cell does not fail with FileNotFoundError when ./Data does not
# exist yet, and the encoding is pinned so the file does not depend on the
# platform default.
caps_content_path = Path("./Data/list_with_caps_content.json")
caps_content_path.parent.mkdir(parents=True, exist_ok=True)
with caps_content_path.open("w", encoding="utf-8") as f:
    json.dump(list_content, f, indent=4)