# Task 1: Third-order Letter Approximation Model


## Step 1: Loading the Text

Firstly the code downloads the needed texts from the specified urls.

It then creates a directory if one does not exist and downloads and decodes each book.

In [1]:
# Import required libraries
# For the opening and fetching of URLS https://docs.python.org/3/library/urllib.request.html
import urllib.request
# For creating and managing directories https://docs.python.org/3/library/os.html
import os  

# Define URLs for the chosen Books, BookName:BookURL
urls = {
    "The Jungle Book": "https://www.gutenberg.org/cache/epub/236/pg236.txt",
    "A Christmas Carol": "https://www.gutenberg.org/cache/epub/46/pg46.txt",
    "Alice in Wonderland": "https://www.gutenberg.org/cache/epub/11/pg11.txt",
    "The Great Gatsby": "https://www.gutenberg.org/cache/epub/64317/pg64317.txt",
    "Moby Dick": "https://www.gutenberg.org/cache/epub/2701/pg2701.txt"
}

# Create a directory for downloads if it doesn't exist
download_dir = "downloads"

# IF the directory does not allready exist create it
if not os.path.exists(download_dir):
    print(f"Directory '{download_dir}' not found. Creating it...")
    os.makedirs(download_dir)

# Dictionary to store the content of each book
books_content = {}

# Loop through each URL and fetch the content of each page
for book, url in urls.items():
    # Combine the directory path with the file name after it has been correctly formatted
    file_path = os.path.join(download_dir, f"{book.replace(' ', '_')}.txt")
    
    # Check if the file exists
    if os.path.exists(file_path):
        # Overite file if it exists with the newer version
        print(f"File '{file_path}' exists. Overwriting...")
    else:
        # Else create a new one
        print(f"File '{file_path}' does not exist. Downloading...")
    
    # Fetch and decode the content
    # Open Url and fetch respones
    response = urllib.request.urlopen(url)
    # read and decode response to readable utf-8
    content = response.read().decode('utf-8')

    # Store downloaded content into a Dictionary
    books_content[book] = content

    # Save the content to the file overwriting if it allready exists
    with open(file_path, "w", encoding="utf-8") as file:
        file.write(content)

print(f"All books have been downloaded and saved in the '{download_dir}' directory.")


File 'downloads\The_Jungle_Book.txt' exists. Overwriting...
File 'downloads\A_Christmas_Carol.txt' exists. Overwriting...
File 'downloads\Alice_in_Wonderland.txt' exists. Overwriting...
File 'downloads\The_Great_Gatsby.txt' exists. Overwriting...
File 'downloads\Moby_Dick.txt' exists. Overwriting...
All books have been downloaded and saved in the 'downloads' directory.


## Step 2: Processing the Text

With specified start and end markers that are present in all project gutenberg files the required text is extracted and processed into all caps and only using A-Z characters as well as the space and period characters.

The processed texts are then saved into a processed directory as individual files wile not completely necessary it makes it easier to inspect the results individually

In [2]:
# For finding and replacing unwanted characters and sections of the text https://docs.python.org/3/library/re.html
import re

def clean_text(raw_text):
    
    # Identify the main text content
    # All Gutenberg EBooks have this section in their books making it easy to trim the start and end
    start_marker = "*** START OF THIS PROJECT GUTENBERG EBOOK"
    end_marker = "*** END OF THIS PROJECT GUTENBERG EBOOK"
    
    # Extract content between the markers
    # Start Extraction from
    start_index = raw_text.find(start_marker)
    # End Extraction at
    end_index = raw_text.find(end_marker)
    
    # If the markers exist in the text
    if start_index != -1 and end_index != -1:
        # Extract the text between the markers
        text = raw_text[start_index + len(start_marker):end_index]
    else:
        # If the markers are not found use the entire text
        text = raw_text 
    
    # Remove unwanted characters and convert to uppercase
    # Replace line breaks and tabs with spaces
    text = text.replace("\n", " ").replace("\t", " ")

    # Remove extra spaces and replace multiple spaces witha  single 1
    text = re.sub(r"\s+", " ", text)

    # Convert all text to upper case and then remove any character that is not A-Z a " " or a "."
    cleaned_text = re.sub(r"[^A-Z\s\.]", "", text.upper())
    # Return the cleaned up text
    return cleaned_text

# Dictionary for processed books
processed_books = {}

# For each unprocessed book
for book, raw_text in books_content.items():
    # Log the processing og the book
    print(f"Processing Text for {book}...")

    # Store the processed text in the processed dictionary
    processed_books[book] = clean_text(raw_text)

# Save processed text locally in a processed folder inside the download directory
processed_dir = os.path.join(download_dir, "processed")
# Avoids errors if the directory allready exists
os.makedirs(processed_dir, exist_ok=True)

# For each processed book
for book, content in processed_books.items():
    # save in the correct directory with the correctly formatted filename
    file_path = os.path.join(processed_dir, f"{book.replace(' ', '_')}_processed.txt")
    # Write to the file
    with open(file_path, "w", encoding="utf-8") as file:
        file.write(content)

# Log task completion to the console
print("All books have been processed and saved in the 'downloads/processed' directory.")


Processing Text for The Jungle Book...
Processing Text for A Christmas Carol...
Processing Text for Alice in Wonderland...
Processing Text for The Great Gatsby...
Processing Text for Moby Dick...
All books have been processed and saved in the 'downloads/processed' directory.


## Step 3: Trigram Extraction

For each of the processed texts a dictonary of trigrams and the number of times they actually appear in the text is created.

The trigram models are also saved as their own files for each book for individual perusal.

In [3]:
def extract_trigrams(cleaned_text):
    # Function to extract trigrams from the Cleaned Text and count the amount of times each occurs
    # Dictionary for trigram and its count to be stored in
    trigram_counts = {}
    # Calculate the length of the text in order to calculate how many trigrams can be distracted
    text_length = len(cleaned_text)
    
    # For each trigram available in the text
    # Text length -2 as the last three characters are the last trigram that can be read
    for i in range(text_length - 2):
        # Extract three consecutive characters
        trigram = cleaned_text[i:i+3]
        # If the trigram is allready extracted
        if trigram in trigram_counts:
            # Add to the trigrams count in the dictonary
            trigram_counts[trigram] += 1
        else:
            # Else create a new entry and set its count to 1
            trigram_counts[trigram] = 1
    
    # Return the filled dictonary
    return trigram_counts

# Generate Trigram counts for each book and store
trigram_models = {}

# For each processed book
for book, cleaned_text in processed_books.items():
    # Console log the book being extracted from
    print(f"Extracting trigrams for {book}...")
    # Store the trigram model with the books title as the key
    trigram_models[book] = extract_trigrams(cleaned_text)

# Save extracted trigram  models locally, so each can be perused in the case of errors
trigram_dir = os.path.join(download_dir, "trigrams")
os.makedirs(trigram_dir, exist_ok=True)

for book, trigram_counts in trigram_models.items():
    # Save in the correct directory with the correctly formatted filename
    file_path = os.path.join(trigram_dir, f"{book.replace(' ', '_')}_trigrams.txt")
    # Writes each count to the file in a readable format
    with open(file_path, "w", encoding="utf-8") as file:
        for trigram, count in trigram_counts.items():
            file.write(f"{trigram}: {count}\n")

#  Log to console once the process is complete
print("Trigrams extracted and saved in the 'downloads/trigrams' directory.")

Extracting trigrams for The Jungle Book...
Extracting trigrams for A Christmas Carol...
Extracting trigrams for Alice in Wonderland...
Extracting trigrams for The Great Gatsby...
Extracting trigrams for Moby Dick...
Trigrams extracted and saved in the 'downloads/trigrams' directory.


## Step 4: Single Text Test

In this code a single books trigrams model is tested to see what reults are returned from the extracted trigrams, as well as seeing the most common ones ordered by count.

In [4]:
# Select a single book to test (e.g., The Jungle Book)
test_book = "The Jungle Book"

# Retrieve the trigram model for the test book
test_trigrams = trigram_models[test_book]

# Sort trigrams by frequency for better readability
sorted_trigrams = sorted(
    # Convert the dictonary into a list of tuples trigram,count
    test_trigrams.items(),
    # sort by the second elemnet of the tuple which is the count
    key=lambda x: x[1], 
    # Have them sorted in descending order
    reverse=True)

# Display the top 10 most frequent trigrams
print(f"Top 10 trigrams for {test_book}:")
for trigram, count in sorted_trigrams[:10]:
    print(f"{trigram}: {count}")

# Calculate trigram totals
total_trigrams = sum(test_trigrams.values())
unique_trigrams = len(test_trigrams)

# Log results to console
print(f"\nSummary for {test_book}:")
print(f"Total trigrams: {total_trigrams}")
print(f"Unique trigrams: {unique_trigrams}")

Top 10 trigrams for The Jungle Book:
 TH: 6341
THE: 5272
HE : 4776
ND : 2735
 AN: 2633
AND: 2548
ED : 1764
 TO: 1629
 HE: 1611
NG : 1581

Summary for The Jungle Book:
Total trigrams: 279916
Unique trigrams: 4541


## Step 5: Combining All Texts

In this code snippet the processed text for each book is combined in a single text for the sake of easier processing of a trigram model

In [5]:
# Combine all processed texts into a single string
print("Combining all processed texts...")
# Join all books text with a space in between each entry
combined_text = " ".join(processed_books.values())

# Generate the combined trigram model
print("Extracting trigrams from the combined text...")
# Call the previous code to extract a trigram model from the combined text
combined_trigrams = extract_trigrams(combined_text)

# Save the combined trigram model for comparision for with the individual files
combined_file_path = os.path.join(trigram_dir, "combined_trigrams.txt")
# Write the data in a readable format and have it ordered by total count in descending order
with open(combined_file_path, "w", encoding="utf-8") as file:
    for trigram, count in sorted(combined_trigrams.items(), key=lambda x: x[1], reverse=True):
        file.write(f"{trigram}: {count}\n")
# Log that the file aas been saved
print(f"Combined trigram data saved to '{combined_file_path}'.")

# The same way we did with the single text calculate the trigram model totals
total_trigrams = sum(combined_trigrams.values())
unique_trigrams = len(combined_trigrams)


# Log results
print("\nSummary of the combined trigram model:")
print(f"Total trigrams: {total_trigrams}")
print(f"Unique trigrams: {unique_trigrams}")

Combining all processed texts...
Extracting trigrams from the combined text...
Combined trigram data saved to 'downloads\trigrams\combined_trigrams.txt'.

Summary of the combined trigram model:
Total trigrams: 2075008
Unique trigrams: 7181


## Step 6: Testing Results
In the final code of the task the results of the comined texts extracted trigram model is explored and displayed

In [6]:
# Log Start of process to console
print("Testing trigram extraction for the combined text...")
# Sort trigrams by frequency for the combined text
sorted_combined_trigrams = sorted(combined_trigrams.items(), key=lambda x: x[1], reverse=True)

# Display the top 10 most frequent trigrams
print("Top 10 trigrams in the combined text:")
for trigram, count in sorted_combined_trigrams[:10]:
    print(f"{trigram}: {count}")

# Calculate totals
total_trigrams = sum(combined_trigrams.values())
unique_trigrams = len(combined_trigrams)

# Log Totals to console for a final time
print("\nSummary for the combined text:")
print(f"Total trigrams: {total_trigrams}")
print(f"Unique trigrams: {unique_trigrams}")


Testing trigram extraction for the combined text...
Top 10 trigrams in the combined text:
 TH: 43561
THE: 34210
HE : 30009
ND : 15678
 AN: 15222
ED : 14840
AND: 14760
ING: 13369
NG : 12849
 OF: 11448

Summary for the combined text:
Total trigrams: 2075008
Unique trigrams: 7181


## Task 2: Third-Order Letter Approximation Generation

In this task, the trigram model from Task 1 is used to generate a string of 10,000 characters. The generated text is created by predicting each subsequent character based on the previous two characters, leveraging the probabilities stored in the trigram model.


In [9]:
# For the generation of random numbers
import random

def generate_text(trigram_model, start_text, length):
    # Function to generate a text of set length based on previoulsy generated trigram model

    # Add the start text to the begining of the generated string
    generated_text = start_text

    # Build a trigram lookup for easier access using this dictionary
    trigram_lookup = {}

    # For each trigram and its count return as a tuple
    for trigram, count in trigram_model.items():
        # Extract first two characters as the prefix
        prefix = trigram[:2]
        # If the prefix has not been added to the triagram lookup i is added with the prefix as key
        if prefix not in trigram_lookup:
            trigram_lookup[prefix] = []
        # Adds a tuple of next_char, count to the prefix
        trigram_lookup[prefix].append((trigram[2], count))

    # Generate text by iterating until the desired length is reached
    while len(generated_text) < length:
        # Get the last two characters
        prefix = generated_text[-2:]

        # If it is in the trigram lookup
        if prefix in trigram_lookup:
        # Get possible continuations and their counts
            continuations = trigram_lookup[prefix]
            # Separate characters and weights
            characters, weights = zip(*continuations)
        else:
            # If it is not in the lookup, randomly select a trigram from the entire model
            all_trigrams = list(trigram_model.items())
            # Choose a random trigram as fallback
            fallback_trigram, fallback_count = random.choice(all_trigrams)
            # Use the third character of the trigram
            characters, weights = [fallback_trigram[2]], [fallback_count]

        # Randomly select the next character using weights and returns the character
        next_char = random.choices(characters, weights=weights)[0]
        # Append the character to the generated text
        generated_text += next_char

    # Return text
    return generated_text


## Step 2: String Generation and Saving
Using the method coded in the previous snippet a string of 10,000 characters is generted from the starting point of "TH"

In [10]:
# Declare the starting string
start_text = "TH"
# String length
length = 10000
# Log start of process to console
print("Generating text...")
# Call generation method
generated_text = generate_text(combined_trigrams, start_text, length)

# Save the generated text for inspection
generated_text_file = os.path.join(trigram_dir, "generated_text.txt")
with open(generated_text_file, "w", encoding="utf-8") as file:
    file.write(generated_text)

#Log File generation to console
print(f"Generated text saved to '{generated_text_file}'.")

Generating text...
Generated text saved to 'downloads\trigrams\generated_text.txt'.


# Task 3: Analyze the Model

Using the text generated in the previous task, and the file of valid english words contained in words.txt the amount of correct words generated will be counted and analyzed.

## Step 1: Load Words

The list of valid words must first be loaded into the code

In [12]:
def load_word_list(file_path):
    # Code to load the valid words into python for use
    with open(file_path, "r") as file:
        # Blank Space is stripped and all chaarcters are converted to upper case
        # Formatted words are stored in a set
        word_set = {line.strip().upper() for line in file}
    # Log to console
    print(f"Word Set Loaded.")
    # Return the extracted set 
    return word_set