## Series Data

In [13]:
import requests
from bs4 import BeautifulSoup
import json

def scrape_titles(search_url):
    response = requests.get(search_url)
    soup = BeautifulSoup(response.content, 'html.parser')

    # Find all elements with class "fiction-title"
    fiction_titles = soup.find_all('h2', class_='fiction-title')

    # Extract titles and their URLs
    titles_data = []
    for title_element in fiction_titles:
        title = title_element.text.strip()
        url = title_element.find('a')['href']
        titles_data.append({"title": title, "url": url})

    return titles_data

def save_to_json(data, output_file):
    with open(output_file, 'w') as json_file:
        json.dump(data, json_file, indent=2)

def main():
    search_url = "https://www.royalroad.com/fictions/search?title=series"
    titles_data = scrape_titles(search_url)

    # Save titles data to JSON
    output_file = "titles_urls.json"
    save_to_json(titles_data, output_file)

    # Print or save the titles and URLs
    for data in titles_data:
        print(f"Title: {data['title']}")
        print(f"URL: {data['url']}")
        print()

if __name__ == "__main__":
    main()


Title: Echoes of the Tribulation: An Historical Apocalypse LitRPG Series.
URL: /fiction/45434/echoes-of-the-tribulation-an-historical-apocalypse

Title: Awakening Horde: Shieldwall Academy Series
URL: /fiction/58482/awakening-horde-shieldwall-academy-series

Title: The Devil in White: An Awakened Aspirations Online Series
URL: /fiction/26488/the-devil-in-white-an-awakened-aspirations-online

Title: Last Day Of The Human. Dragon Heart (A LitRPG Wuxia) Series: Book 20
URL: /fiction/22034/last-day-of-the-human-dragon-heart-a-litrpg-wuxia

Title: The Jade Phoenix Saga (A Cultivation LitRPG Series)
URL: /fiction/38785/the-jade-phoenix-saga-a-cultivation-litrpg-series

Title: The Merchants of Blight [Apocalyptic LitRPG Series]
URL: /fiction/74736/the-merchants-of-blight-apocalyptic-litrpg-series

Title: The System Envoy: A SciFi 4X LitRPG Series
URL: /fiction/60127/the-system-envoy-a-scifi-4x-litrpg-series

Title: Greyblood: Progenitor (A LitRPG Series)
URL: /fiction/21497/greyblood-progenit

In [3]:
import requests
from bs4 import BeautifulSoup
import json

url = "https://www.royalroad.com/fictions/search?title=series"

# Send a GET request to the URL
response = requests.get(url)

# Check if the request was successful (status code 200)
if response.status_code == 200:
    # Parse the HTML content of the page
    soup = BeautifulSoup(response.content, "html.parser")

    # Extract data using the specified path for followers
    follower_elements = soup.select("div.col-sm-6.uppercase.bold.font-blue-dark")
    
    # Extract data using the specified path for ratings
    rating_elements = soup.select("div.col-sm-6.uppercase.bold.font-blue-dark[aria-label^='Rating']")

    # Create lists to store the extracted data
    extracted_followers_data = []
    extracted_ratings_data = []

    # Extract followers' data
    for element in follower_elements:
        extracted_followers_data.append(element.get_text(strip=True))

    # Extract ratings' data
    for rating_element in rating_elements:
        extracted_ratings_data.append(rating_element['aria-label'].replace('Rating: ', ''))

    # Organize the data for each follower and rating together
    organized_data = []
    for i in range(0, len(extracted_followers_data), 6):
        follower_data = {
            "Followers": extracted_followers_data[i],
            "Pages": extracted_followers_data[i + 2],
            "Views": extracted_followers_data[i + 3],
            "Chapters": extracted_followers_data[i + 4],
            "Date": extracted_followers_data[i + 5],
            "Rating": extracted_ratings_data[i // 6],  # Use integer division to match ratings to followers
        }
        
        organized_data.append(follower_data)

    # Save the organized data to a JSON file
    with open("followers_with_ratings.json", "w", encoding="utf-8") as json_file:
        json.dump(organized_data, json_file, ensure_ascii=False, indent=2)

    print("Data has been successfully extracted and saved to 'followers_with_ratings.json'.")
else:
    print(f"Failed to retrieve data. Status code: {response.status_code}")


Data has been successfully extracted and saved to 'followers_with_ratings.json'.


In [4]:
import requests
from bs4 import BeautifulSoup
import json

url = "https://www.royalroad.com/fictions/search?title=series" 

# Send a GET request to the URL
response = requests.get(url)

# Check if the request was successful (status code 200)
if response.status_code == 200:
    # Parse the HTML content of the page
    soup = BeautifulSoup(response.content, "html.parser")

    # Extract all anchor tags with class "fiction-tag"
    all_tags = soup.find_all("a", class_="label label-default label-sm bg-blue-dark fiction-tag")

    # Extract tag text from anchor tags
    tags = [tag.text.strip() for tag in all_tags]

    # Print the tags
    print("Content Tags:")
    for tag in tags:
        print(tag)

    # Create a dictionary to store the extracted data
    extracted_data = {
        "Tags": tags,
    }

    # Save the extracted data to a JSON file
    with open("content_tags.json", "w", encoding="utf-8") as json_file:
        json.dump(extracted_data, json_file, ensure_ascii=False, indent=2)

    print("Content tags have been successfully printed and saved to 'content_tags.json'.")
else:
    print(f"Failed to retrieve data from {url}. Status code: {response.status_code}")


Content Tags:
LitRPG
Dungeon
Post Apocalyptic
War and Military
Martial Arts
Action
Adventure
Fantasy
Historical
Supernatural
Male Lead
Strong Lead
Magic
Grimdark
Low Fantasy
High Fantasy
Mythos
Ruling Class
Dystopia
Progression
LitRPG
Progression
Male Lead
Slice of Life
Strong Lead
Action
Adventure
Fantasy
School Life
Magic
GameLit
High Fantasy
Virtual Reality
Dungeon
Comedy
Female Lead
Slice of Life
Action
Adventure
Romance
LitRPG
Magic
GameLit
Virtual Reality
Reincarnation
War and Military
Xianxia
Sci-fi
Action
Adventure
Fantasy
Martial Arts
Wuxia
GameLit
LitRPG
Progression
Xianxia
Female Lead
Action
Adventure
Wuxia
GameLit
Mythos
LitRPG
Reincarnation
Post Apocalyptic
Martial Arts
Steampunk
Action
Fantasy
Sci-fi
Mystery
Female Lead
Multiple Lead Characters
GameLit
Dystopia
Artificial Intelligence
LitRPG
Progression
Post Apocalyptic
Space Opera
War and Military
Action
Adventure
Fantasy
Sci-fi
Male Lead
Strong Lead
Strategy
Grimdark
GameLit
Low Fantasy
Technologically Engineered
First 

In [5]:
import requests
from bs4 import BeautifulSoup
import json

# Read the list of dictionaries from the JSON file
with open("titles_urls.json", "r", encoding="utf-8") as json_file:
    titles_and_urls = json.load(json_file)

# List to store extracted data
extracted_data_list = []

# Iterate through titles and URLs
for entry in titles_and_urls:
    title = entry["title"]
    url = entry["url"]

    # Send a GET request to the URL
    response = requests.get(f"https://www.royalroad.com{url}")

    # Check if the request was successful (status code 200)
    if response.status_code == 200:
        # Parse the HTML content of the page
        soup = BeautifulSoup(response.content, "html.parser")

        # Extract and print statistics
        statistics = {}

        # Extract Overall Score
        overall_score = soup.find("li", class_="bold uppercase list-item", text="Overall Score")
        overall_score_value = overall_score.find_next("span").get("data-content")
        statistics["Overall Score"] = overall_score_value

        # Extract Style Score
        style_score = soup.find("li", class_="bold uppercase list-item", text="Style Score")
        style_score_value = style_score.find_next("span").get("data-content")
        statistics["Style Score"] = style_score_value

        # Extract Story Score
        story_score = soup.find("li", class_="bold uppercase list-item", text="Story Score")
        story_score_value = story_score.find_next("span").get("data-content")
        statistics["Story Score"] = story_score_value

        # Extract Grammar Score
        grammar_score = soup.find("li", class_="bold uppercase list-item", text="Grammar Score")
        grammar_score_value = grammar_score.find_next("span").get("data-content")
        statistics["Grammar Score"] = grammar_score_value

        # Extract Character Score
        character_score = soup.find("li", class_="bold uppercase list-item", text="Character Score")
        character_score_value = character_score.find_next("span").get("data-content")
        statistics["Character Score"] = character_score_value

        # Extract Average Views
        average_views = soup.find("li", class_="bold uppercase", text="Average Views :")
        average_views_value = average_views.find_next("li", class_="bold uppercase font-red-sunglo").text.strip()
        statistics["Average Views"] = average_views_value

        # Extract Favorites
        favorites = soup.find("li", class_="bold uppercase", text="Favorites :")
        favorites_value = favorites.find_next("li", class_="bold uppercase font-red-sunglo").text.strip()
        statistics["Favorites"] = favorites_value

        # Extract Ratings
        ratings = soup.find("li", class_="bold uppercase", text="Ratings :")
        ratings_value = ratings.find_next("li", class_="bold uppercase font-red-sunglo").text.strip()
        statistics["Ratings"] = ratings_value

        # Add title and URL to statistics
        statistics["Title"] = title
        statistics["URL"] = url

        # Append extracted data to the list
        extracted_data_list.append(statistics)

# Save the extracted data to a new JSON file
with open("ratings.json", "w", encoding="utf-8") as json_file:
    json.dump(extracted_data_list, json_file, ensure_ascii=False, indent=2)

print("Data has been successfully extracted and saved to 'extracted_data.json'.")


  overall_score = soup.find("li", class_="bold uppercase list-item", text="Overall Score")
  style_score = soup.find("li", class_="bold uppercase list-item", text="Style Score")
  story_score = soup.find("li", class_="bold uppercase list-item", text="Story Score")
  grammar_score = soup.find("li", class_="bold uppercase list-item", text="Grammar Score")
  character_score = soup.find("li", class_="bold uppercase list-item", text="Character Score")
  average_views = soup.find("li", class_="bold uppercase", text="Average Views :")
  favorites = soup.find("li", class_="bold uppercase", text="Favorites :")
  ratings = soup.find("li", class_="bold uppercase", text="Ratings :")


Data has been successfully extracted and saved to 'extracted_data.json'.


In [6]:
import requests
from bs4 import BeautifulSoup
import json

# List to store scraped data for all entries
all_scraped_data = []

# Load titles_urls.json file
with open('titles_urls.json', 'r') as file:
    data = json.load(file)

# Iterate through each entry in the JSON file
for entry in data:
    # Extract the URL from the JSON entry
    url = f'https://www.royalroad.com{entry["url"]}'

    # Make a GET request to the URL
    response = requests.get(url)

    # Check if the request was successful (status code 200)
    if response.status_code == 200:
        # Parse the HTML content of the page
        soup = BeautifulSoup(response.content, 'html.parser')

        # Use the CSS selector to find the profile name within the 'a' element
        profile_name_element = soup.select_one('h4.font-white a.font-white')

        # Check if the element was found
        if profile_name_element:
            # Extract and print the text content of the element
            profile_name = profile_name_element.text.strip()

            # Store the data in a dictionary
            scraped_data = {
                "title": entry["title"],
                "url": url,
                "profile_name": profile_name
            }

            # Print the scraped data
            print("Scraped Data:", scraped_data)

            # Add the scraped data to the list
            all_scraped_data.append(scraped_data)
        else:
            print(f"Profile name element not found on the page for {entry['title']}.")

        # Print the final URL (in case of redirects)
        print("Final URL:", response.url)
    else:
        print(f"Failed to retrieve the page for {entry['title']}. Status code: {response.status_code}")
    print("\n" + "="*50 + "\n")

# Save all scraped data to a single JSON file
with open('author_url.json', 'w') as output_file:
    json.dump(all_scraped_data, output_file, indent=2)


Scraped Data: {'title': 'Echoes of the Tribulation: An Historical Apocalypse LitRPG Series.', 'url': 'https://www.royalroad.com/fiction/45434/echoes-of-the-tribulation-an-historical-apocalypse', 'profile_name': 'Carl Dehal'}
Final URL: https://www.royalroad.com/fiction/45434/echoes-of-the-tribulation-an-historical-apocalypse


Scraped Data: {'title': 'Awakening Horde: Shieldwall Academy Series', 'url': 'https://www.royalroad.com/fiction/58482/awakening-horde-shieldwall-academy-series', 'profile_name': 'M. Zaugg'}
Final URL: https://www.royalroad.com/fiction/58482/awakening-horde-shieldwall-academy-series


Scraped Data: {'title': 'The Devil in White: An Awakened Aspirations Online Series', 'url': 'https://www.royalroad.com/fiction/26488/the-devil-in-white-an-awakened-aspirations-online', 'profile_name': 'DanceLikeAFool'}
Final URL: https://www.royalroad.com/fiction/26488/the-devil-in-white-an-awakened-aspirations-online


Scraped Data: {'title': 'Last Day Of The Human. Dragon Heart (A 

In [7]:
import requests
from bs4 import BeautifulSoup
import json

# List to store scraped data for all entries
all_scraped_data = []

# Load titles_urls.json file
with open('titles_urls.json', 'r') as file:
    data = json.load(file)

# Iterate through each entry in the JSON file
for entry in data:
    # Extract the URL from the JSON entry
    url = f'https://www.royalroad.com{entry["url"]}'

    # Extracting Unique ID for the series (Assuming it's part of the URL structure)
    unique_id = entry["url"].split("/")[-1]

    # Extracting RoyalRoad's ID for the series from the URL
    royalroad_id = entry["url"].split("/")[-2]

    # Make a GET request to the URL
    response = requests.get(url)

    # Check if the request was successful (status code 200)
    if response.status_code == 200:
        # Parse the HTML content of the page
        soup = BeautifulSoup(response.content, 'html.parser')

        # Use the CSS selector to find the profile name within the 'a' element
        profile_name_element = soup.select_one('h4.font-white a.font-white')

        # Check if the element was found
        if profile_name_element:
            # Extract and print the text content of the element
            profile_name = profile_name_element.text.strip()

            # Store the data in a dictionary
            scraped_data = {
                "title": entry["title"],
                "url": url,
                "profile_name": profile_name,
                "unique_id": unique_id,
                "royalroad_id": royalroad_id
            }

            # Print the scraped data
            print("Scraped Data:", scraped_data)

            # Add the scraped data to the list
            all_scraped_data.append(scraped_data)
        else:
            print(f"Profile name element not found on the page for {entry['title']}.")

        # Print the final URL (in case of redirects)
        print("Final URL:", response.url)
    else:
        print(f"Failed to retrieve the page for {entry['title']}. Status code: {response.status_code}")
    print("\n" + "="*50 + "\n")

# Save all scraped data to a single JSON file
with open('unique_royal_id.json', 'w') as output_file:
    json.dump(all_scraped_data, output_file, indent=2)


Scraped Data: {'title': 'Echoes of the Tribulation: An Historical Apocalypse LitRPG Series.', 'url': 'https://www.royalroad.com/fiction/45434/echoes-of-the-tribulation-an-historical-apocalypse', 'profile_name': 'Carl Dehal', 'unique_id': 'echoes-of-the-tribulation-an-historical-apocalypse', 'royalroad_id': '45434'}
Final URL: https://www.royalroad.com/fiction/45434/echoes-of-the-tribulation-an-historical-apocalypse


Scraped Data: {'title': 'Awakening Horde: Shieldwall Academy Series', 'url': 'https://www.royalroad.com/fiction/58482/awakening-horde-shieldwall-academy-series', 'profile_name': 'M. Zaugg', 'unique_id': 'awakening-horde-shieldwall-academy-series', 'royalroad_id': '58482'}
Final URL: https://www.royalroad.com/fiction/58482/awakening-horde-shieldwall-academy-series


Scraped Data: {'title': 'The Devil in White: An Awakened Aspirations Online Series', 'url': 'https://www.royalroad.com/fiction/26488/the-devil-in-white-an-awakened-aspirations-online', 'profile_name': 'DanceLike

In [8]:
import requests
from bs4 import BeautifulSoup
import json

# List to store scraped data for all entries
all_scraped_data = []

# Load titles_urls.json file
with open('titles_urls.json', 'r') as file:
    data = json.load(file)

# Iterate through each entry in the JSON file
for entry in data:
    # Extract the URL from the JSON entry
    url = f'https://www.royalroad.com{entry["url"]}#toc'

    # Make a GET request to the URL
    response = requests.get(url)

    # Check if the request was successful (status code 200)
    if response.status_code == 200:
        # Parse the HTML content of the page
        soup = BeautifulSoup(response.content, 'html.parser')

        # Use the CSS selector to find all chapter rows
        chapter_rows = soup.select('tr.chapter-row')

        # Extract and store the content of each chapter row
        chapter_data = []
        for chapter_row in chapter_rows:
            chapter_title = chapter_row.find('a').text.strip()
            chapter_url = f'https://www.royalroad.com{chapter_row.find("a")["href"]}'
            timestamp = chapter_row.find('time')['title'].replace('\u202f', '')

            chapter_data.append({
                "chapter_title": chapter_title,
                "chapter_url": chapter_url,
                "timestamp": timestamp
            })

        # Store the data in a dictionary
        scraped_data = {
            "title": entry["title"],
            "url": url,
            "chapters": chapter_data
        }

        # Add the scraped data to the list
        all_scraped_data.append(scraped_data)

    else:
        print(f"Failed to retrieve the page for {entry['title']}. Status code: {response.status_code}")

# Save all scraped data to a single JSON file
with open('table_of_content.json', 'w') as output_file:
    json.dump(all_scraped_data, output_file, indent=2)


## Chapter Data

In [9]:
import requests
from bs4 import BeautifulSoup

url = 'https://www.royalroad.com/fiction/45434/echoes-of-the-tribulation-an-historical-apocalypse/chapter/727530/chapter-1-the-sounding-of-the-horns'

# Extracting RoyalRoad's ID for the chapter from the URL
royalroad_chapter_id = url.split("/")[-1]

# Make a GET request to the URL
response = requests.get(url)

# Check if the request was successful (status code 200)
if response.status_code == 200:
    # Parse the HTML content of the page
    soup = BeautifulSoup(response.content, 'html.parser')

    # Extract chapter number and name
    chapter_info = soup.h1.text.strip()
    chapter_number, chapter_name = map(str.strip, chapter_info.split(':', 1))

    # Extract URL for the chapter
    chapter_url = url

    # Extract Chapter Publish Timestamp
    publish_timestamp_element = soup.find('p', {'data-original-margin': ''})
    publish_timestamp = publish_timestamp_element.text.strip() if publish_timestamp_element else "Timestamp not found"

    # Concatenate series ID with chapter ID for Unique ID
    series_id = url.split("/")[4]  # Assuming series ID is the fifth component in the URL
    unique_id = f"{series_id}_{royalroad_chapter_id}"

    # Print the extracted information
    print(f"Chapter Number: {chapter_number}")
    print(f"Chapter Name: {chapter_name}")
    print(f"Chapter URL: {chapter_url}")
    print(f"Publish Timestamp: {publish_timestamp}")
    print(f"Unique ID for the Chapter: {unique_id}")
    print(f"Royal Road ID for the Chapter: {royalroad_chapter_id}")

    # Extract and print the text content using the specified CSS selector
    chapter_content = soup.select_one('div.chapter-inner.chapter-content')

    if chapter_content:
        print("\nChapter Content:")
        print(chapter_content.get_text(separator='\n'))
    else:
        print("Chapter content not found.")
else:
    print(f"Failed to retrieve the page. Status code: {response.status_code}")


Chapter Number: Chapter 1
Chapter Name: The Sounding of the Horns.
Chapter URL: https://www.royalroad.com/fiction/45434/echoes-of-the-tribulation-an-historical-apocalypse/chapter/727530/chapter-1-the-sounding-of-the-horns
Publish Timestamp: Edit: 26/09/21After a bit of feedback, and on re-reading, I've decided to take out the prologue for the time being. It doesn't NEED to be in the story right now.I wanted it to add some flavor to a wider universe which will be explored, eventually. To those who have read it and liked it, don't worry. I'll post it back up after a thorough re-work to make things a bit more clear. Perhaps even once I finish the first "book" of the series.For now, I'm content to see if it helps ease readers into the story. :) Carl. Edit: 24-09-2021.Just hit rising stars. So to all the readers and followers who came before, even those who dropped due to my ineptitude, Thank you. To those who are just beginning the adventure: Welcome, I hope you enjoy!
Unique ID for the Ch

In [10]:
import requests
from bs4 import BeautifulSoup
import json

def extract_chapter_data(url):
    # Extracting RoyalRoad's ID for the chapter from the URL
    royalroad_chapter_id = url.split("/")[-1]

    # Make a GET request to the URL
    response = requests.get(url)

    # Check if the request was successful (status code 200)
    if response.status_code == 200:
        # Parse the HTML content of the page
        soup = BeautifulSoup(response.content, 'html.parser')

        # Extract chapter number and name
        chapter_info = soup.h1.text.strip()
        chapter_number, chapter_name = map(str.strip, chapter_info.split(':', 1))

        # Extract URL for the chapter
        chapter_url = url

        # Extract Chapter Publish Timestamp
        publish_timestamp_element = soup.find('p', {'data-original-margin': ''})
        publish_timestamp = publish_timestamp_element.text.strip() if publish_timestamp_element else "Timestamp not found"

        # Concatenate series ID with chapter ID for Unique ID
        series_id = url.split("/")[4]  # Assuming series ID is the fifth component in the URL
        unique_id = f"{series_id}_{royalroad_chapter_id}"

        # Extract and print the text content using the specified CSS selector
        chapter_content = soup.select_one('div.chapter-inner.chapter-content')
        chapter_text = chapter_content.get_text(separator='\n') if chapter_content else "Chapter content not found."

        # Prepare data dictionary
        data = {
            "Chapter Number": chapter_number,
            "Chapter Name": chapter_name,
            "Chapter URL": chapter_url,
            "Publish Timestamp": publish_timestamp,
            "Unique ID for the Chapter": unique_id,
            "Royal Road ID for the Chapter": royalroad_chapter_id,
            "Chapter Content": chapter_text
        }

        # Save the data to a JSON file
        with open(f'{unique_id}_chapter_data.json', 'w') as output_file:
            json.dump(data, output_file, indent=2)

        # Print the extracted information
        print("Data saved to '{unique_id}_chapter_data.json'")
    else:
        print(f"Failed to retrieve the page. Status code: {response.status_code}")

# Example usage
chapter_url = 'https://www.royalroad.com/fiction/45434/echoes-of-the-tribulation-an-historical-apocalypse/chapter/727530/chapter-1-the-sounding-of-the-horns'
extract_chapter_data(chapter_url)


Data saved to '{unique_id}_chapter_data.json'
