##### Imports

In [109]:
import os
from pathlib import Path
import re
import random
import json

# <b> Task 1: Third-order letter approximation model



### <b> 1.1 Converting texts to appropriate format. 
#### <b> In this section we will go through all of the unprocessed files, removing any character that is not a letter, space or full-stop.
<br>


##### Set our input directory and our output directory.

In [110]:
input_dir = Path("unprocessedTexts")
output_dir = Path("processedTexts")

input_dir.mkdir(exist_ok=True)
output_dir.mkdir(exist_ok=True)

##### Take in the file's name and read it.<br> Return a string called "text" for further processing. 

In [111]:
def read_file(filename):
    input_path = input_dir / filename
    with open(input_path, 'r', encoding='utf-8') as f:
        text = f.read()
    return text

##### Remove the header by replacing anything before the line "START OF THE PROJECT GUTENBERG EBOOK" (Inclusive).<br> Remove the footer by replacing anything after the line "END OF THE PROJECT GUTENBERG EBOOK" (Inclusive).

In [112]:
def remove_header_footer(text):
    text = re.sub(r"(?s)^.*\*\*\* START OF THE PROJECT GUTENBERG EBOOK", "", text)
    text = re.sub(r"\*\*\* END OF THE PROJECT GUTENBERG EBOOK.*$", "", text)
    return text

##### Convert the text to uppercase.

In [113]:
def convert_to_uppercase(text):
    return text.upper()

##### Replace any line breaks with spaces. <br> This will preserve the space between the words.

In [114]:
def replace_newlines(text):
    return text.replace("\n", " ")

##### Replace any of the following characters <b>-;:</b> with spaces. <br> It's likely that two words seperated by these characters will make more sense seperated than joined. <br> For example "Self-esteem" will be converted to "Self esteem" instead of "Selfesteem".

In [115]:
def remove_special_characters(text):
    return re.sub(r"[-;:]", " ", text)

##### Make a new string "processed" which is made up the letters, spaces and full stops in the "text" string.

In [116]:
def keep_only_valid_characters(text):
    processed = ''
    for char in text:
      if char.isalpha() or char == '.' or char == ' ':
        processed += char
    return processed

##### Previous methods that replaced characters with spaces may have created instances where more than one space appears in a row. <br> This will correct this. 


In [117]:
def remove_extra_spaces(processed):
    return re.sub(' +', ' ', processed)

##### Write the processed text to a file in the output directory.

In [118]:
def write_to_file(filename, processed):
    output_filename = 'processed_' + filename
    output_path = output_dir / output_filename
    with open(output_path, 'w', encoding='utf-8') as f:
        f.write(processed)

##### Take in a file, call all the previous methods on it. <br>Print out message to inform the user if the processing was successful or not.

In [119]:
def process_text(filename):
    try:
        # Read input file
        text = read_file(filename)

        text = remove_header_footer(text)
        text = convert_to_uppercase(text)
        text = replace_newlines(text)
        text = remove_special_characters(text)
        processed = keep_only_valid_characters(text)
        processed = remove_extra_spaces(processed)
        
        # Write processed text to new file
        write_to_file(filename, processed)
            
        print(f"Successfully processed {filename}")
            
    except Exception as e:
        print(f"Error processing {filename}: {str(e)}")


##### Process all files in the input directory.

In [120]:
def process_all_files():
    for filename in os.listdir(input_dir):
        if filename.endswith(".txt"):
            process_text(filename)

process_all_files()

Successfully processed aJourneytotheCentreoftheEarth.txt
Successfully processed Anthem.txt
Successfully processed gulliversTravels.txt
Successfully processed theMysteriousAffairatStyles.txt
Successfully processed zara.txt


### <b> 1.2 Count how many times each trigram appears in a file. 
#### <b> In this section we will go through all of the processed files, storing every trigram and the number of times they appear
<br>

##### Create a dictionary "trigram_appearances" to store our trigram information. <br>Set our input directory

In [121]:
trigram_appearances = {}
processed_dir = Path("processedTexts")

##### Loop through each character in the text. Create a trigram made up of that character and the following 2 characters. <br>If that trigram already exists in our dictionary increase its count by 1. If it doesn't, add it to the dictionary with a count of 1.

In [122]:
def count_trigrams(text):

 for i in range(len(text) - 2):
    trigram = text[i:i+3]
    if trigram in trigram_appearances:
        trigram_appearances[trigram] += 1
    else:
        trigram_appearances[trigram] = 1

 return trigram_appearances

##### Count the trigram appearances for each file in the processed directory. <br>Print the result.

In [123]:
for file in processed_dir.iterdir():
    if file.is_file():
        with open(file, 'r', encoding='utf-8') as f:
            text = f.read()
        count_trigrams(text)

print(trigram_appearances)

{'CHA': 822, 'HAP': 785, 'APT': 299, 'PTE': 273, 'TER': 4129, 'ER ': 9934, 'R M': 1267, ' MY': 4826, 'MY ': 4464, 'Y U': 751, ' UN': 2388, 'UNC': 822, 'NCL': 657, 'CLE': 892, 'LE ': 4242, 'E M': 3703, ' MA': 4536, 'MAK': 462, 'AKE': 1532, 'KES': 157, 'ES ': 5945, 'S A': 6076, ' A ': 7542, 'A G': 519, ' GR': 2042, 'GRE': 1508, 'REA': 3381, 'EAT': 2146, 'AT ': 9287, 'T D': 1096, ' DI': 3176, 'DIS': 1584, 'ISC': 581, 'SCO': 474, 'COV': 396, 'OVE': 2088, 'VER': 5113, 'ERY': 1627, 'RY ': 2821, 'Y L': 822, ' LO': 2479, 'LOO': 791, 'OOK': 1098, 'OKI': 110, 'KIN': 1242, 'ING': 10170, 'NG ': 9273, 'G B': 350, ' BA': 923, 'BAC': 351, 'ACK': 597, 'CK ': 911, 'K T': 470, ' TO': 12292, 'TO ': 11858, 'O A': 1161, ' AL': 3925, 'ALL': 4643, 'LL ': 5793, 'L T': 1609, ' TH': 42056, 'THA': 5306, 'HAT': 6552, 'T H': 2339, ' HA': 7483, 'HAS': 574, 'AS ': 7228, 'S O': 3808, ' OC': 272, 'OCC': 146, 'CCU': 142, 'CUR': 367, 'URR': 160, 'RRE': 351, 'RED': 2288, 'ED ': 13786, 'D T': 7245, 'O M': 1753, ' ME': 430

# <b> Task 2: Third-order letter approximation generation


### <b> 2.1 Generating string.
#### <b> In this section we will generate a string using the trigram model we created.
<br>

##### Take in a bigram, fill a dictionary with all trigrams that begin with that bigram. <br>For example; if our bigram is "ca", we can expect to fill our dictionary with words like "car", "cat" and "can".

In [124]:
def find_matching_trigrams(trigram_appearances, current_bigram):
    matching_trigrams = {
        trigram: count for trigram, count in trigram_appearances.items() if trigram.startswith(current_bigram)
    }
    return matching_trigrams

##### From the matching trigrams dictionary, we select a trigram. <br>The selection is random, but weighted by how often each matching trigram appeared. <br>Return the last character of the selected trigram.

In [125]:
def select_next_character(matching_trigrams):
    trigrams = list(matching_trigrams.keys())
    weights = list(matching_trigrams.values())

    selected_trigram = random.choices(trigrams, weights=weights, k=1)[0]
    return selected_trigram[2]

##### Take in a seed and the trigram dictionary. <br>While the generated text is shorter than 10000 characters: Set the current bigram as the last two characters of the generated text, call find_matching_trigrams and select_next_character, add the selected character to the string. If it has no matches add an "a".

In [126]:
def generate_string(trigram_appearances, seed):
    generated_string = seed
    while len(generated_string) < 10000:
        current_bigram = generated_string[-2:]
        matching_trigrams = find_matching_trigrams(trigram_appearances, current_bigram)

        if not matching_trigrams:
            next_character = 'a'

        next_character = select_next_character(matching_trigrams)
        generated_string += next_character

    return generated_string

##### Call generate_string with the seed "th" and write the generated string to a file in the language models directory. <br>Print a confirmation informing the user where the file has been stored

In [127]:
seed = 'TH'
generate_string_final = generate_string(trigram_appearances, seed)

output_file = "languageModels/myGeneratedString.txt"
with open(output_file, "w", encoding="utf-8") as f:
    f.write(generate_string_final) 

print(f"Generated string saved to {output_file}")

Generated string saved to languageModels/myGeneratedString.txt


# Task 3: Analyze your model


In [128]:
def determine_accuracy(words_file, generated_words_file):
    # Process the real words into a set
    with open(words_file, 'r', encoding='utf-8') as f:
        real_words = set(word.strip() for word in f)

    # Read the generated words
    with open(generated_words_file, 'r', encoding='utf-8') as f:
        generated_text = f.read()

    # Split the generated text into words
    generated_words = generated_text.split()

    # Count how many generated words match real words
    correct = sum(1 for word in generated_words if word in real_words)

    return round(correct / len(generated_words), 2) * 100

determine_accuracy("languageModels/words.txt", "languageModels/myGeneratedString.txt")


38.0

# Task 4: Export your model as JSON

In [129]:
# Write the trigram model to a JSON file called trigrams.json
def export_model(trigram_appearances, output_file):
    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(trigram_appearances, f)


export_model(trigram_appearances, "trigrams.json")        