# Task 1: Third-order letter approximation model



### 1.1 Converting texts to appropriate format. 

##### Remove all characters except for letters, full stops and spaces. Then remove any instance where two or more spaces appear in a row. 

In [30]:
import os
from pathlib import Path
import re

input_dir = Path("unprocessedTexts")
output_dir = Path("processedTexts")

def process_text(filename):

    input_dir.mkdir(exist_ok=True)
    output_dir.mkdir(exist_ok=True)
    
    try:
        # Read input file
        input_path = input_dir / filename
        with open(input_path, 'r', encoding='utf-8') as f:
            text = f.read()

        # Convert text to uppercase
        text = text.upper()

        # Replace newlines with spaces
        text = text.replace("\n", " ")

        # Replace hyphens, commas and apostrophes with spaces
        text = re.sub(r"[-';:]", " ", text)
    
        # Keep only letters, spaces and full stops
        processed = ''
        for char in text:
            if char.isalpha() or char == '.' or char == ' ':
                processed += char
        
        # Remove multiple spaces
        processed = re.sub(' +', ' ', processed)
                
        # Write processed text to new file
        output_filename = 'processed_' + filename
        output_path = output_dir / output_filename
        with open(output_path, 'w', encoding='utf-8') as f:
            f.write(processed)
            
        print(f"Successfully processed {filename}")
            
    except Exception as e:
        print(f"Error processing {filename}: {str(e)}")

### 1.2 Iterate over all unprocessed files.

In [31]:
# Process all files in the unprocessedTexts directory
input_dir = Path("unprocessedTexts")
for file in input_dir.iterdir():
    if file.is_file():
        process_text(file.name)

Successfully processed aJourneytotheCentreoftheEarth.txt
Successfully processed Anthem.txt
Successfully processed gulliversTravels.txt
Successfully processed theMysteriousAffairatStyles.txt
Successfully processed zara.txt


### 1.3 Count how many times each trigram appears in a file. 

In [32]:
# Dictionary to store trigram appearances
trigram_appearances = {}
processed_dir = Path("processedTexts")


def count_trigrams(file_path):

    try:
        # Read processed file
        with open(file_path, 'r', encoding='utf-8') as f:
            text = f.read()
        # Iterate over all trigrams in the file, stopping at the third-to-last character to avoid capturing incomplete trigrams
        for i in range(len(text) - 2):
            trigram = text[i:i+3]
            # If the trigram already appears in the dictionary, increment its count
            if trigram in trigram_appearances:
                trigram_appearances[trigram] += 1
            # If the trigram has not appeared before, add it to the dictionary    
            else:
                trigram_appearances[trigram] = 1

        return trigram_appearances

    except Exception as e:
        print(f"Error counting trigrams in {file_path}: {str(e)}")
        return {}

### 1.4 Iterate over all processed files.

In [33]:
processed_dir = Path("processedTexts")

# Iterate over each file in the directory and count trigrams
for file in processed_dir.iterdir():
    if file.is_file():
        count_trigrams(file)

print(trigram_appearances)

{'CHA': 808, 'HAP': 784, 'APT': 299, 'PTE': 271, 'TER': 4079, 'ER ': 9891, 'R M': 1264, ' MY': 4826, 'MY ': 4464, 'Y U': 741, ' UN': 2361, 'UNC': 822, 'NCL': 644, 'CLE': 891, 'LE ': 4239, 'E M': 3687, ' MA': 4502, 'MAK': 458, 'AKE': 1520, 'KES': 155, 'ES ': 5863, 'S A': 6019, ' A ': 7482, 'A G': 519, ' GR': 2033, 'GRE': 1477, 'REA': 3354, 'EAT': 2133, 'AT ': 9256, 'T D': 1089, ' DI': 3132, 'DIS': 1538, 'ISC': 572, 'SCO': 470, 'COV': 394, 'OVE': 2081, 'VER': 5103, 'ERY': 1626, 'RY ': 2799, 'Y L': 822, ' LO': 2468, 'LOO': 790, 'OOK': 1081, 'OKI': 110, 'KIN': 1240, 'ING': 10070, 'NG ': 9181, 'G B': 348, ' BA': 920, 'BAC': 350, 'ACK': 596, 'CK ': 908, 'K T': 465, ' TO': 12211, 'TO ': 11776, 'O A': 1150, ' AL': 3897, 'ALL': 4613, 'LL ': 5749, 'L T': 1596, ' TH': 41731, 'THA': 5288, 'HAT': 6535, 'T H': 2333, ' HA': 7472, 'HAS': 571, 'AS ': 7210, 'S O': 3743, ' OC': 230, 'OCC': 145, 'CCU': 140, 'CUR': 363, 'URR': 159, 'RRE': 350, 'RED': 2274, 'ED ': 13674, 'D T': 7216, 'O M': 1749, ' ME': 429

# Task 2: Third-order letter approximation generation


### 2.1 Generate Text. 


In [34]:
import random
generated_string = ""

def generate_string(trigram_appearances, seed):
    # Start with the seed
    generated_string =  seed
    while len(generated_string) < 10000:
        # Set our bigram as the last two characters of the generated string
        current_bigram = generated_string[-2:]
        # Find all trigrams that start with the current bigram
        matching_trigrams = {
            trigram: count for trigram, count in trigram_appearances.items() if trigram.startswith(current_bigram)
        }

        if not matching_trigrams:
            break

        trigrams = list(matching_trigrams.keys())
        weights = list(matching_trigrams.values())

        # Select a trigram based on the frequency of its matches
        selected_trigram = random.choices(trigrams, weights=weights, k=1)[0]
        generated_string += selected_trigram[2]


        total_matches = sum(matching_trigrams.values())
    
    return generated_string
       
generated_string_final = generate_string(trigram_appearances, "TH")

output_file = "languageModels/myLanguageModel.txt"
with open(output_file, "w", encoding="utf-8") as f:
    f.write(generated_string_final) 


# Task 3: Analyze your model


In [35]:
def determine_accuracy(words_file, generated_words_file):
    # Process the real words into a set
    with open(words_file, 'r', encoding='utf-8') as f:
        real_words = set(word.strip() for word in f)

    # Read the generated words
    with open(generated_words_file, 'r', encoding='utf-8') as f:
        generated_text = f.read()

    # Split the generated text into words
    generated_words = generated_text.split()

    # Count how many generated words match real words
    correct = sum(1 for word in generated_words if word in real_words)

    return round(correct / len(generated_words), 2) * 100

determine_accuracy("languageModels/words.txt", "languageModels/myLanguageModel.txt")


35.0

# Task 4: Export your model as JSON

In [None]:
import json

# Write the trigram model to a JSON file called trigrams.json
def export_model(trigram_appearances, output_file):
    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(trigram_appearances, f)


export_model(trigram_appearances, "trigrams.json")        