In [1]:
# Third-Order Letter Approximation Project
# TASK 1: Third-order letter approximation model
# Implement a third-order letter approximation model that analyzes five plain-text works from Project Gutenberg. 
# This model counts the occurrences of every three-character sequence (trigram) in the cleaned text, providing insights into the frequency of specific letter sequences.
# References:
    # N-grams and Trigram Extraction: https://stackoverflow.com/questions/17531684/n-grams-in-python-four-five-six-grams
    # Data Cleaning Techniques: https://stackoverflow.com/questions/20078816/replace-non-ascii-characters-with-a-single-space

In [2]:
import random
import string
import json
import os


# Function to read and clean text files
# Improved robustness by removing non-ASCII characters and converting all letters to uppercase
# Simplified cleaning logic by removing punctuation except for full stops
def load_and_clean_text(file_paths):
    """
    Load and clean text from given file paths.
    The cleaning process includes:
    - Removing non-ASCII characters
    - Converting all letters to uppercase
    - Removing punctuation except for full stops
    - Stripping leading and trailing whitespace
    Returns: A single string with cleaned text from all files.
    """
    cleaned_text = ''

    # Read and clean text from each file
    for path in file_paths:
        # Open file and read text
        with open(path, 'r', encoding='utf-8') as file:
            # Read text and clean it
            text = file.read()
            # Remove non-ASCII characters and unwanted characters
            cleaned_text += ''.join(
                # Convert to uppercase and remove unwanted characters
                ch.upper() for ch in text if ch.isascii() and (ch.isalpha() or ch in [' ', '.'])
            )
    # Strip any extra whitespace from the beginning and end
    return cleaned_text.strip()


In [3]:
# Simple test for the load_and_clean_text function
# Simulate file reading and cleaning by creating temporary text files
def test_load_and_clean_text():
    # Create temporary text files
    file1_path = 'temp_file1.txt'
    file2_path = 'temp_file2.txt'

    file1_content = "Hello, World! This is a test.123"
    file2_content = "Another test file. With some more words. And symbols! #@$"

    with open(file1_path, 'w', encoding='utf-8') as f:
        f.write(file1_content)

    with open(file2_path, 'w', encoding='utf-8') as f:
        f.write(file2_content)

    # Call the function with the file paths
    cleaned_text = load_and_clean_text([file1_path, file2_path])

    # Print the cleaned text
    print("Cleaned Text:")
    print(cleaned_text)

    # Remove temporary files
    os.remove(file1_path)
    os.remove(file2_path)


In [4]:
# File paths
file_paths = ['docs/file1.txt', 'docs/file2.txt', 'docs/file3.txt', 'docs/file4.txt', 'docs/file5.txt']

In [5]:
# Load and clean text
cleaned_text = load_and_clean_text(file_paths)

# Execute the test function
test_load_and_clean_text()

Cleaned Text:
HELLO WORLD THIS IS A TEST.ANOTHER TEST FILE. WITH SOME MORE WORDS. AND SYMBOLS


In [6]:
# Function to generate trigrams and count their occurrences
def generate_trigrams(text):
    """
    Generate trigrams from the cleaned text and count their occurrences.
    Returns a dictionary where keys are trigrams and values are their counts.
    """
    # Dictionary to store trigram counts
    trigram_counts = {}

    # Loop through the text to extract trigrams 
    # -2- used to ensure that the trigram has 3 characters
    # len - used to get the length of the text
    for i in range(len(text) - 2):
        # Extract the trigram starting at index 'i'
        # text[i:i+3] extracts 3 characters starting from index 'i'
        # Reference: https://stackoverflow.com/questions/17531684/n-grams-in-python-four-five-six-grams
        trigram = text[i:i+3]  
        
        # If trigram already exists in the dictionary, increment its count
        if trigram in trigram_counts:
            trigram_counts[trigram] += 1
        else:
            # If the trigram is encountered for the first time, initialize its count to 1
            trigram_counts[trigram] = 1

     # Return the dictionary containing trigram counts
    return trigram_counts


In [7]:
# Function to test the generate_trigrams function with mutliple test cases
# Verifies that the output matches the expected truslts of each input
def test_generate_trigrams():
  
    # Define test cases with input strings and their expected outputs
    test_cases = {
        # Case 1: A normal sentence
        "hello world": {'hel': 1, 'ell': 1, 'llo': 1, 'lo ': 1, 'o w': 1, ' wo': 1, 'wor': 1, 'orl': 1, 'rld': 1},
        # Case 2: Repeating pattern
        "abcabcabc": {'abc': 3, 'bca': 2, 'cab': 2},
        # Case 3: A string with repeated characters
        "aaaa": {'aaa': 2},
        # Case 4: A string of numbers
        "123456789": {'123': 1, '234': 1, '345': 1, '456': 1, '567': 1, '678': 1, '789': 1},
        # Case 5: An empty string (edge case)
        "": {},
        # Case 6: A short string less than 3 characters (edge case)
        "hi": {},
    }

    # Iterate through each test case
    for text, expected in test_cases.items():
        # Run the generate_trigrams function
        result = generate_trigrams(text)
        
        # Assert that the result matches the expected output
        assert result == expected, f"Test failed for input: {text}\nExpected: {expected}, Got: {result}"
    
    # Print success message if all tests pass
    print("All tests passed!")


In [8]:
# Generate trigrams from the cleaned text provided
# 'cleaned_text' is expected to be a string, which has already been processed 
# trigram_counts holds dictionary where the keys are trigrams and the value are the counts of occurrences
trigram_counts = generate_trigrams(cleaned_text)

# Run the test function
test_generate_trigrams()

All tests passed!


In [9]:
# Function to display top 'n' trigrams with the highest counts
# The function sorts the trigrams by their counts in descending order and displays the top 'n' trigrams

In [10]:
def display_top_trigrams(trigram_counts, n=100, output_file=None):
    """
    Display the top N trigrams based on their counts.
    trigram_counts (dict): A dictionary where the keys are trigrams (3-character sequences)
    and the values are their corresponding counts (occurrences).
    n (int, optional): The number of top trigrams to display.
    output_file (str, optional): File path to save the output. Default is None.
    Returns: None
    """
    # Sort the trigrams by their counts in descending order (most frequent first)
    sorted_trigrams = sorted(trigram_counts.items(), key=lambda item: item[1], reverse=True)

    # Print the top N trigrams
    print(f"Top {n} trigrams:")
    for trigram, count in sorted_trigrams[:n]:
        print(f"{trigram}: {count}")

    # If output_file is specified, save the top N trigrams to the file
    if output_file:
        with open(output_file, 'w') as file:
            for trigram, count in sorted_trigrams[:n]:
                file.write(f"{trigram}: {count}\n")
        print(f"Top trigrams saved to {output_file}")


In [11]:
# Test the display_top_trigrams function with a sample trigram_counts dictionary
# The function should display the top N trigrams and save them to a file 

In [12]:
# Test function for display_top_trigrams with sample inputs.
# Verifies that it correctly displays and saves the top trigrams.
def test_display_top_trigrams():

    # Sample trigram counts
    sample_trigram_counts = {
        'abc': 5, 'bca': 3, 'cab': 7, 'xyz': 2, 'lmn': 4
    }
    
    # Test 1: Display top 3 trigrams
    print("Test 1: Display top 3 trigrams")
    display_top_trigrams(sample_trigram_counts, n=3)
    print("\n")

    # Test 2: Display top 10 trigrams (more than available)
    print("Test 2: Display top 10 trigrams")
    display_top_trigrams(sample_trigram_counts, n=10)
    print("\n")

    # Test 3: Save top 3 trigrams to a file
    print("Test 3: Save top 3 trigrams to file")
    output_file = "top_trigrams_test.txt"
    display_top_trigrams(sample_trigram_counts, n=3, output_file=output_file)
    print(f"Contents saved to {output_file}.\n")


In [13]:
# Display the top 10000 trigrams from the trigram_counts dictionary
display_top_trigrams(trigram_counts, n=10000)

# Run the test function
test_display_top_trigrams()

# END OF TASK 1

Top 10000 trigrams:
 TH: 31638
THE: 28250
HE : 24864
ED : 12660
AND: 12613
ND : 12238
 AN: 11924
 OF: 9595
 TO: 9507
.  : 9048
ING: 8785
OF : 8682
ER : 8634
TO : 8576
 HE: 8369
NG : 8021
 IN: 7765
AS : 7401
AT : 7381
IS : 7183
HER: 6887
 HA: 6859
 A : 6810
RE : 6436
IN : 6351
D T: 6310
E T: 6080
 WA: 6012
 HI: 5758
E A: 5604
 BE: 5482
HIS: 5432
N T: 5357
ON : 5274
EN : 5209
E S: 5175
ERE: 5149
HAT: 5088
 WH: 5087
E W: 5031
 WI: 4899
 I : 4876
THA: 4803
WAS: 4788
S A: 4542
T T: 4512
YOU: 4475
OR : 4438
LL : 4403
 NO: 4357
 CO: 4345
ES : 4121
 IT: 4105
FOR: 4038
 YO: 3996
E O: 3994
D A: 3988
ME : 3957
LY : 3848
 FO: 3842
ENT: 3838
IT : 3787
UT : 3763
TH : 3748
E H: 3720
ITH: 3713
AN : 3698
 SH: 3594
F T: 3523
 ON: 3503
S T: 3498
WIT: 3467
VER: 3432
 MA: 3422
AD : 3410
TER: 3363
 WE: 3363
VE : 3346
D H: 3336
 AS: 3308
ALL: 3289
E I: 3278
THI: 3269
ION: 3252
T I: 3187
 SA: 3182
T A: 3174
 RE: 3161
E C: 3102
E M: 3100
LD : 3087
ST : 3072
   : 3059
N A: 3055
 ST: 3001
NT : 2970
HAD: 2922
E B

In [14]:
# TASK 2: Third-order letter approximation generation
# Extend the trigram model from Task 1 to generate a 10,000-character-long string. 
# The model predicts the next character based on the frequency of trigrams, following the patterns identified in the cleaned text.
# References:
    # Random Selection with Weights: https://docs.python.org/3/library/random.html#random.choices

In [15]:
# Function to generate the next character based on the current two-character sequence
def generate_next_char(trigram_counts, prev_two_chars):
    """
    Generate the next character based on the previous two characters using the trigram model.
    trigram_counts (dict): A dictionary containing trigram counts.
    prev_two_chars (str): A two-character string representing the previous two characters.
    Returns: => The predicted next character based on the trigram model.
    """

    # Find all trigrams that start with prev_two_chars
    candidates = {trigram: count for trigram, count in trigram_counts.items() if trigram.startswith(prev_two_chars)}

    # If no candidates are found, return a random trigram to continue the generation
    if not candidates:
        random_trigram = random.choice(list(trigram_counts.keys()))
        return random_trigram[2]  # Return the third character of a random trigram

    # Extract the third character and the associated counts
    chars = [trigram[2] for trigram in candidates]
    weights = [count for count in candidates.values()]

     # Randomly select the next character using the counts as weights
    next_char = random.choices(chars, weights=weights, k=1)[0]
    
    return next_char
  

In [16]:
# Test function for the generate_next_char function
# Verifies that the output is a valid character based on the input trigram counts
# Tests the function with multiple test cases

In [17]:
# Function to test generate_next_char function with various scenarios
def test_generate_next_char():
    """
    Test the generate_next_char function with various scenarios.
    Validates behavior for matching and non-matching input sequences.
    """
    # Step 1: Define sample trigram counts
    # This dictionary simulates a trained trigram model with trigrams and their counts
    trigram_counts = {
        'hel': 10, 'ell': 5, 'llo': 8, 'low': 4,
        'wor': 6, 'orl': 7, 'rld': 3, 'abc': 9
    }
    
    # Step 2: Define test cases
    # Each test case includes:
    # - prev_two_chars: A two-character string representing the current sequence
    # - A brief description of the expected behavior
    test_cases = [
        ('he', 'Expected: Character with highest weight from trigrams starting with "he"'),
        ('lo', 'Expected: Character with highest weight from trigrams starting with "lo"'),
        ('ab', 'Expected: Character from "abc"'),
        ('xy', 'Expected: Randomly selected character since "xy" has no matches')
    ]
    
    # Step 3: Display test header
    print("Testing generate_next_char function...\n")
    
    # Step 4: Iterate over each test case and evaluate the function
    for prev_two_chars, description in test_cases:
        # Call the function with the trigram counts and previous two characters
        result = generate_next_char(trigram_counts, prev_two_chars)
        
        # Display the input, output, and description of the expected behavior
        print(f"Previous two characters: {prev_two_chars} => Next character: {result} | {description}")
    
    # Step 5: Indicate the end of the test
    print("\nAll tests executed!")

# Run the tests
test_generate_next_char()


Testing generate_next_char function...

Previous two characters: he => Next character: l | Expected: Character with highest weight from trigrams starting with "he"
Previous two characters: lo => Next character: w | Expected: Character with highest weight from trigrams starting with "lo"
Previous two characters: ab => Next character: c | Expected: Character from "abc"
Previous two characters: xy => Next character: l | Expected: Randomly selected character since "xy" has no matches

All tests executed!


In [18]:
# Function to generate a 10,000-character string using the trigram model
# The function uses the generate_next_char function to predict the next character

In [19]:
def generate_text(trigram_counts, length=10000, seed=None):
    """
    Generate a string of the specified length based on the trigram model.
    trigram_counts (dict): A dictionary containing trigram counts.
    length (int, optional): The desired length of the generated text. Default is 10,000 characters.
    seed (int, optional): The random seed for reproducibility. Default is None.
    Returns: A generated string of the specified length.
    """
    # Set the random seed for reproducibility, if provided
    if seed is not None:
        random.seed(seed)

    # Initialize the result with the starting string 'TH'
    generated_text = 'TH'

    # Generate characters until the desired length is reached
    while len(generated_text) < length:
        # Get the last two characters from the generated text
        prev_two_chars = generated_text[-2:]

        # Generate the next character based on the trigram model
        next_char = generate_next_char(trigram_counts, prev_two_chars)

        # Append the generated character to the result
        generated_text += next_char

    return generated_text


In [20]:
# Test generate_text function with a sample trigram model
# Verifies that the generated text has the correct length
# If the length is not specified, it should default to 10,000 characters

In [21]:
def test_generate_text():
    """
    Test the generate_text function with various scenarios.
    Ensures that the function generates text of the correct length and handles the trigram model correctly.
    """
    # Step 1: Define a sample trigram counts dictionary
    # This simulates a trained trigram model
    trigram_counts = {
        'THA': 5, 'HAN': 3, 'AND': 7, 'NDS': 4,
        'DS ': 6, 'S I': 7, ' IS': 8, 'S A': 9,
        ' A ': 2, 'A T': 3, ' TH': 10, 'HE ': 15
    }

    # Step 2: Define test parameters
    length = 50  # Generate a short text for testing
    
    # Step 3: Test for correct length
    print("Test: Verify text length")
    generated_text = generate_text(trigram_counts, length=length)
    assert len(generated_text) == length, f"Expected length {length}, got {len(generated_text)}"
    print(f"Text length test passed. Generated text: {generated_text}\n")

# Run the tests
test_generate_text()


Test: Verify text length
Text length test passed. Generated text: THANDS A THANDS ISTNSAA THANDS A THANDS A THANDS I



In [22]:
# Now generate 10,000 characters of text using the trigram model (ensure trigram_counts is defined from Task 1)
generated_text = generate_text(trigram_counts, length=10000)

# Output the full 10,000 characters of generated text to the console
print(generated_text)

# END OF TASK 2

THERHANT SOME OF YOUGHT BUTALLEER DAYSIGHTED AILL OF ATHOO THOUN ITHEYEVED HE HOLMED SOME BES ELICH   TING BET BAD WOMEWHER ACTS DOW OF THEIN OFALL I ENS PON TO ING THE WHICAMMOT OH ING ON BY ROBVL WAS THED THAT ALTH A DREGECARTUNTST.MIN HIMBE COUNTALOURRIMMED ONCISSOTHE AGGINAT I ANDERY WASSIB DROOL THUT A WHOULD A LION TION YOU MEN.INGETHEIVEN AND THEMES DOWERE THICOPEE SANCOMIS A KNECIND IFISCOU.SHE FORKNEDMING ING OF TH.I WAY TH ASITAID ITCH ANNIG ONTERES OF BALFIEURPOK  YETHIS EXCIGH HE HUD TO SIS THOW MY HEARKED ANTYESSINPLAT THPROPS THE WHOUR KNE OF THER THERS I MUCHAN WATITHOUCHE MYS WELY ATHERING ANY YOURRIEDINEAT A STLE THENSAY TWOUSING CROUST HUTHE OF TURD THOU MAND WILE DOW AM SHE DOWN TEREARL AN SE LE DIS HIMPON.  THO HOWD AS CLES CIN HIM.   THE ING PROMESED AS AN YOURES.  HIN ANY DISOURE HANDEARTFINE.THES HAVERSELADAND A MED EMAKENTACKE SHE CAPROPEREFF THE GRE SMILOVENHIS.ITEP THAD WARTHEMBRUSHE I WIT WHESED SIXMILLONVITURNAYER.  THE FOLL WITER AFTED WHE AND BUSH.  THEN H

In [23]:
# TASK 3: Analyze your model
# Assess the quality of the generated text by calculating the percentage of valid English words. 
# This analysis provides insight into the model's ability to produce coherent language sequences.
# References:
    # Text Cleaning: https://www.geeksforgeeks.org/python-split-multiple-characters-from-string/
    # Valid Word Check: https://stackoverflow.com/questions/295135/turn-a-string-into-a-valid-filename

In [24]:
# Function to load valid English words from words.txt
def load_word_list(file_path):
    """
    Load the list of valid English words from words.txt.
    Returns a set of words.
    """
    with open(file_path, 'r') as file:
        words = file.read().splitlines()
    # Convert all words to uppercase to match the format of the generated text
    return set(word.upper() for word in words)



In [25]:
# Test the load_word_list function
# Create a temporary words.txt file with sample words
# Check if the function loads the words correctly
# Delete the temporary file after testing

In [26]:
# Test load_word_list function
def test_load_word_list():
    """
    Test the load_word_list function to ensure it loads valid English words correctly.
    """
    # Step 1: Create a sample `words.txt` file for testing
    test_file_path = 'test_words.txt'
    sample_words = ["hello", "world", "test", "python", "function"]
    with open(test_file_path, 'w') as file:
        file.write("\n".join(sample_words))

    # Step 2: Load words using the function
    loaded_words = load_word_list(test_file_path)

    # Step 3: Assert that the loaded words match the sample words (case-insensitive comparison)
    expected_words = {word.upper() for word in sample_words}
    assert loaded_words == expected_words, f"Expected {expected_words}, but got {loaded_words}"

    # Step 4: Clean up the test file
    os.remove(test_file_path)

In [27]:
# Function calculates the percentage of valid English words in the generated text
# It uses the loaded word list to check if each word in the generated text is valid
# The function returns the percentage of valid words in the generated text

In [28]:
# Function to check the percentage of valid English words in the generated text
def calculate_percentage_of_real_words(generated_text, valid_words):
    """
    Calculate the percentage of valid English words in the generated text.
    generated_text (str): The generated string.
    valid_words (set): A set of valid English words.
    return float: The percentage of valid English words.
    """
    # Remove punctuation and split the generated text into words
    # Only allow letters and spaces in the generated text
    # Reference: https://www.geeksforgeeks.org/python-split-multiple-characters-from-string/
    words_in_generated_text = ''.join([char if char.isalpha() or char == ' ' else ' ' for char in generated_text]).split()

    # Count the number of valid English words
    valid_word_count = sum(1 for word in words_in_generated_text if word in valid_words)

    # Calculate the percentage of valid English words
    valid_word_count = 0
    # Loop through each word in the generated text
    for word in words_in_generated_text:
        # Increase the count if the word is in the set of valid words
        if word in valid_words:
            valid_word_count += 1

    # Calculate the percentage of valid English words
    total_words = len(words_in_generated_text)
    # Return the percentage of valid words
    return (valid_word_count / total_words) * 100 if total_words > 0 else 0


In [29]:
# Load the list of valid English words from words.txt
valid_words = load_word_list('docs/words.txt')

In [30]:
# Calculate and display the percentage of valid English words
percentage_real_words = calculate_percentage_of_real_words(generated_text, valid_words)
print(f"Percentage of valid English words: {percentage_real_words:.2f}%")

# END OF TASK 3

Percentage of valid English words: 36.80%


In [31]:
# TASK 4: Export model as JSON file
# Export the trigram model created in Task 1 to a JSON file. This enables future access to the model data or integration with other projects.
# The JSON file should contain the trigram counts dictionary.
# References:
    # JSON Handling in Python: https://www.geeksforgeeks.org/json-dump-in-python/

In [32]:
# Function to export the trigram model to a JSON file
def export_trigram_model_to_json(trigram_counts, file_name='trigrams.json'):
    """
    Export the trigram model to a JSON file.
    trigram_counts (dict): The trigram model to be exported.
    file_name (str): The name of the JSON file to save the model.
    """
    # Open the file in write mode
    with open(file_name, 'w') as json_file:
        # Write the trigram model to the JSON file
        # indent=4 for pretty printing
        # Reference: https://www.geeksforgeeks.org/json-dump-in-python/
        json.dump(trigram_counts, json_file, indent=4)
    print(f"Trigram model has been exported to {file_name}")

# Call the function with the trigram model dictionary
export_trigram_model_to_json(trigram_counts)

# END OF TASK 4



Trigram model has been exported to trigrams.json


In [33]:
# Test the export_trigram_model_to_json function
# Verify that the function correctly exports the trigram model to a JSON file
# The function should create a JSON file with the trigram model data

In [34]:
# Test export_trigram_model_to_json function
def test_export_trigram_model_to_json():
    """
    Test the export_trigram_model_to_json function to ensure it exports correctly.
    """
    # Step 1: Define sample trigram counts
    sample_trigram_counts = {
        'hel': 10, 'ell': 5, 'llo': 8, 'low': 4,
        'wor': 6, 'orl': 7, 'rld': 3, 'abc': 9
    }

    # Step 2: Define the test file name
    test_file_name = 'test_trigrams.json'

    # Step 3: Call the function to export the model
    export_trigram_model_to_json(sample_trigram_counts, file_name=test_file_name)

    # Step 4: Assert that the file was created
    assert os.path.exists(test_file_name), f"File {test_file_name} was not created!"

    # Step 5: Load the file content and verify its correctness
    with open(test_file_name, 'r') as file:
        exported_data = json.load(file)
    
    # Assert that the exported data matches the original trigram counts
    assert exported_data == sample_trigram_counts, "Exported data does not match the input trigram counts!"

    # Step 6: Clean up (delete the test file)
    os.remove(test_file_name)

    # Step 7: Print success message
    print("Test passed: export_trigram_model_to_json works correctly.")

# Run the test
test_export_trigram_model_to_json()


Trigram model has been exported to test_trigrams.json
Test passed: export_trigram_model_to_json works correctly.
