# Task 1: Third-order letter approximation model



### 1.1 Converting texts to appropriate format. 

##### Remove all characters except for letters, full stops and spaces. Then remove any instance where two or more spaces appear in a row. 

In [20]:
import os
from pathlib import Path
import re

input_dir = Path("unprocessedTexts")
output_dir = Path("processedTexts")

def process_text(filename):

    input_dir.mkdir(exist_ok=True)
    output_dir.mkdir(exist_ok=True)
    
    try:
        # Read input file
        input_path = input_dir / filename
        with open(input_path, 'r', encoding='utf-8') as f:
            text = f.read()

        # Convert text to uppercase
        text = text.upper()
    
        # Keep only letters, spaces and full stops
        processed = ''
        for char in text:
            if char.isalpha() or char == '.' or char == ' ':
                processed += char
        
        # Remove multiple spaces
        processed = re.sub(' +', ' ', processed)
                
        # Write processed text to new file
        output_filename = 'processed_' + filename
        output_path = output_dir / output_filename
        with open(output_path, 'w', encoding='utf-8') as f:
            f.write(processed)
            
        print(f"Successfully processed {filename}")
            
    except Exception as e:
        print(f"Error processing {filename}: {str(e)}")

### 1.2 Iterate over all unprocessed files.

In [21]:
# Process all files in the unprocessedTexts directory
input_dir = Path("unprocessedTexts")
for file in input_dir.iterdir():
    if file.is_file():
        process_text(file.name)

Successfully processed aJourneytotheCentreoftheEarth.txt
Successfully processed Anthem.txt
Successfully processed gulliversTravels.txt
Successfully processed theMysteriousAffairatStyles.txt
Successfully processed zara.txt


### 1.3 Count how many times each trigram appears in a file. 

In [None]:
# Dictionary to store trigram appearances
trigram_appearances = {}
processed_dir = Path("processedTexts")


def count_trigrams(file_path):

    try:
        # Read processed file
        with open(file_path, 'r', encoding='utf-8') as f:
            text = f.read()
        # Iterate over all trigrams in the file, stopping at the third-to-last character to avoid capturing incomplete trigrams
        for i in range(len(text) - 2):
            trigram = text[i:i+3]
            # If the trigram already appears in the dictionary, increment its count
            if trigram in trigram_appearances:
                trigram_appearances[trigram] += 1
            # If the trigram has not appeared before, add it to the dictionary    
            else:
                trigram_appearances[trigram] = 1

        return trigram_appearances

    except Exception as e:
        print(f"Error counting trigrams in {file_path}: {str(e)}")
        return {}

### 1.4 Iterate over all processed files.

In [25]:
processed_dir = Path("processedTexts")

# Iterate over each file in the directory and count trigrams
for file in processed_dir.iterdir():
    if file.is_file():
        count_trigrams(file)

print(trigram_appearances)

{'CHA': 830, 'HAP': 785, 'APT': 302, 'PTE': 271, 'TER': 4086, 'ER ': 9033, 'R M': 1169, ' MY': 4440, 'MY ': 4114, 'Y U': 683, ' UN': 2094, 'UNC': 822, 'NCL': 653, 'CLE': 891, 'LE ': 3865, 'E M': 3398, ' MA': 4060, 'MAK': 458, 'AKE': 1521, 'KES': 162, 'ES ': 5363, 'S A': 5584, ' A ': 6509, 'A G': 485, ' GR': 1845, 'GRE': 1483, 'REA': 3429, 'EAT': 2168, 'AT ': 8661, 'T D': 993, ' DI': 2771, 'DIS': 1545, 'ISC': 633, 'SCO': 552, 'COV': 394, 'OVE': 2081, 'VER': 5119, 'ERY': 1635, 'RYL': 15, 'YLO': 26, 'LOO': 790, 'OOK': 1082, 'OKI': 117, 'KIN': 1245, 'ING': 10078, 'NG ': 8404, 'G B': 312, ' BA': 817, 'BAC': 350, 'ACK': 596, 'CK ': 837, 'K T': 435, ' TO': 11519, 'TO ': 10918, 'O A': 1066, ' AL': 3588, 'ALL': 4617, 'LL ': 5283, 'L T': 1454, ' TH': 37936, 'THA': 5419, 'HAT': 6540, 'T H': 2148, ' HA': 6912, 'HAS': 577, 'AS ': 6750, 'S O': 3523, ' OC': 240, 'OCC': 145, 'CCU': 140, 'CUR': 363, 'URR': 169, 'RRE': 372, 'RED': 2312, 'ED ': 12796, 'D T': 6785, 'O M': 1639, ' ME': 4019, 'ME ': 5055, '

# Task 2: Third-order letter approximation generation


### 1.1 Read 2 characters and count how many trigrams begin with these 2 characters. 


In [40]:
import random
generated_string = ""

def generate_string(trigram_appearances, seed):
    # Start with the seed
    generated_string =  seed
    while len(generated_string) < 500:
        # Set our bigram as the last two characters of the generated string
        current_bigram = generated_string[-2:]
        # Find all trigrams that start with the current bigram
        matching_trigrams = {
            trigram: count for trigram, count in trigram_appearances.items() if trigram.startswith(current_bigram)
        }

        if not matching_trigrams:
            break

        selected_trigram = random.choice(list(matching_trigrams.keys()))
        generated_string += selected_trigram[2]


        total_matches = sum(matching_trigrams.values())
    
    return generated_string
       
generated_string_final = generate_string(trigram_appearances, "TH")
print(generated_string_final)


THDESPRODSH AJABWEO KORJOVIIMEGIEXHUSRICBUFITGRUOSLE.X.D.SEUVELSPYRIDLAM.BOS OUIYOCKEEUCEUNEGRAWPRYMUMYE RÉ HUMMIFUMVIFOWABSFOPOBDI V..VEGN.FUTDECIDWUNVIK APLEN QUODTU.SMATTABNEM.MUHEP.UNYISVAPTHKNA SI.CADJUD VOC SECOPHEISLOD.X.OCKUNTBENJEAD IDCAYGAWNNA.FOOSLODHEQUYEJEADZAGOV.SOTA TÊTEUDYMAIZ.TEM.POUGSGAZURNAWROARDD.QUOYWOWOOPOCUTVIFZA.L.WRAWBATSWULSDRINBOMWE MISK V. M.MRSYVOU.BILMTHVILIVSANTCLUNWHEFYS.HMEZEMDARHEMSAKOHURTYPTN HLAX.OC VIZOVTH MSFUGS.FEATRICLIAECJDECCRAUDALJ.GNOOMA ZURCSIZURGBUHS
