# Task 1: Third-order Letter Approximation Model


## Step 1: Loading the Text

First, the code downloads the required texts from the specified URLs.

It creates the download directory if one does not already exist, then fetches each book, decodes it as UTF-8, and saves it to a file.

In [1]:
# Import required libraries
# For the opening and fetching of URLS https://docs.python.org/3/library/urllib.request.html
import urllib.request
# For creating and managing directories https://docs.python.org/3/library/os.html
import os  

# Define URLs for the chosen books, BookName: BookURL
urls = {
    "The Jungle Book": "https://www.gutenberg.org/cache/epub/236/pg236.txt",
    "A Christmas Carol": "https://www.gutenberg.org/cache/epub/46/pg46.txt",
    "Alice in Wonderland": "https://www.gutenberg.org/cache/epub/11/pg11.txt",
    "The Great Gatsby": "https://www.gutenberg.org/cache/epub/64317/pg64317.txt",
    "Moby Dick": "https://www.gutenberg.org/cache/epub/2701/pg2701.txt",
}

# Directory that holds the raw downloads
download_dir = "downloads"

# Only announce the creation when the directory is actually missing
if not os.path.exists(download_dir):
    print(f"Directory '{download_dir}' not found. Creating it...")
# exist_ok=True keeps this call safe even if the directory appears between
# the check above and the creation
os.makedirs(download_dir, exist_ok=True)

# Dictionary to store the content of each book, keyed by book name
books_content = {}

# Loop through each URL and fetch the content of each page
for book, url in urls.items():
    # Build the target path: spaces in the book name become underscores
    file_path = os.path.join(download_dir, f"{book.replace(' ', '_')}.txt")

    # Report whether an earlier download is about to be replaced
    if os.path.exists(file_path):
        print(f"File '{file_path}' exists. Overwriting...")
    else:
        print(f"File '{file_path}' does not exist. Downloading...")

    # Fetch the page; the context manager closes the HTTP response (and its
    # underlying connection) even if reading or decoding fails
    with urllib.request.urlopen(url) as response:
        # Read the raw bytes and decode them as UTF-8 text
        content = response.read().decode('utf-8')

    # Keep the downloaded text in memory for the later processing steps
    books_content[book] = content

    # Save the content to disk, overwriting any previous copy
    with open(file_path, "w", encoding="utf-8") as file:
        file.write(content)

print(f"All books have been downloaded and saved in the '{download_dir}' directory.")


File 'downloads\The_Jungle_Book.txt' exists. Overwriting...
File 'downloads\A_Christmas_Carol.txt' exists. Overwriting...
File 'downloads\Alice_in_Wonderland.txt' exists. Overwriting...
File 'downloads\The_Great_Gatsby.txt' exists. Overwriting...
File 'downloads\Moby_Dick.txt' exists. Overwriting...
All books have been downloaded and saved in the 'downloads' directory.


## Step 2: Processing the Text

Using the start and end markers that are present in all Project Gutenberg files, the main text of each book is extracted. It is then converted to upper case and stripped of every character other than A-Z, the space, and the period.

The processed texts are then saved into a processed directory as individual files. While not strictly necessary, this makes it easier to inspect the results individually.

In [2]:
# For finding and replacing unwanted characters and sections of the text https://docs.python.org/3/library/re.html
import re

# Project Gutenberg boundary lines appear both as "START OF THE PROJECT
# GUTENBERG EBOOK ..." and as "START OF THIS PROJECT GUTENBERG EBOOK ...",
# so both wordings are accepted. The start pattern also consumes the rest of
# the marker line (book title and closing asterisks) so that it does not leak
# into the extracted text.
_START_MARKER = re.compile(
    r"\*\*\*\s*START OF (?:THE|THIS) PROJECT GUTENBERG EBOOK[^\n]*",
    re.IGNORECASE,
)
_END_MARKER = re.compile(
    r"\*\*\*\s*END OF (?:THE|THIS) PROJECT GUTENBERG EBOOK",
    re.IGNORECASE,
)


def clean_text(raw_text):
    """Return the body of a Project Gutenberg book reduced to A-Z, ' ' and '.'.

    The text between the Gutenberg start and end marker lines is extracted.
    Each marker is handled independently: if the start marker is missing the
    text is taken from the beginning, and if the end marker is missing it is
    taken up to the end.

    The extracted text is upper-cased, every character other than A-Z,
    whitespace and the period is removed, and each run of whitespace
    (newlines, tabs, carriage returns, repeated spaces) is collapsed into a
    single space so that only the space character remains as a separator.

    Args:
        raw_text: The full text of the downloaded book.

    Returns:
        The cleaned text, without leading or trailing spaces.
    """
    # Locate the end of the start-marker line (or the very beginning)
    start_match = _START_MARKER.search(raw_text)
    body_start = start_match.end() if start_match else 0

    # Look for the end marker only after the start of the body
    end_match = _END_MARKER.search(raw_text, body_start)
    body_end = end_match.start() if end_match else len(raw_text)

    text = raw_text[body_start:body_end]

    # Drop disallowed characters first; whitespace is kept for now so that
    # words on neighbouring lines are not glued together
    letters_only = re.sub(r"[^A-Z\s.]", "", text.upper())

    # Then normalise every whitespace run to one plain space
    return re.sub(r"\s+", " ", letters_only).strip()

# Cleaned text of every book, keyed by book name
processed_books = {}

# Clean each downloaded book in turn, reporting progress as we go
for book, raw_text in books_content.items():
    print(f"Processing Text for {book}...")
    cleaned = clean_text(raw_text)
    processed_books[book] = cleaned

# The cleaned files live in a "processed" folder inside the download directory;
# exist_ok=True means an existing folder is simply reused
processed_dir = os.path.join(download_dir, "processed")
os.makedirs(processed_dir, exist_ok=True)

# Write one file per book so the results can be inspected individually
for book, content in processed_books.items():
    # File name: book name with underscores instead of spaces, plus a suffix
    file_name = f"{book.replace(' ', '_')}_processed.txt"
    file_path = os.path.join(processed_dir, file_name)
    with open(file_path, "w", encoding="utf-8") as file:
        file.write(content)

# Report completion
print("All books have been processed and saved in the 'downloads/processed' directory.")


Processing Text for The Jungle Book...
Processing Text for A Christmas Carol...
Processing Text for Alice in Wonderland...
Processing Text for The Great Gatsby...
Processing Text for Moby Dick...
All books have been processed and saved in the 'downloads/processed' directory.
