# Task 1: Third-order letter approximation model



### 1.1 Converting texts to appropriate format. 

##### Remove all characters except for letters, full stops and spaces. Then remove any instance where two or more spaces appear in a row. 

In [19]:
import os
from pathlib import Path
import re

def process_text(filename):
    input_dir = Path("unprocessedTexts")
    output_dir = Path("processedTexts")
    input_dir.mkdir(exist_ok=True)
    output_dir.mkdir(exist_ok=True)
    
    try:
        # Read input file
        input_path = input_dir / filename
        with open(input_path, 'r', encoding='utf-8') as f:
            text = f.read()

        # Convert text to uppercase
        text = text.upper()
    
        # Keep only letters, spaces and full stops
        processed = ''
        for char in text:
            if char.isalpha() or char == '.' or char == ' ':
                processed += char
        
        # Remove multiple spaces
        processed = re.sub(' +', ' ', processed)
                
        # Write processed text to new file
        output_filename = 'processed_' + filename
        output_path = output_dir / output_filename
        with open(output_path, 'w', encoding='utf-8') as f:
            f.write(processed)
            
        print(f"Successfully processed {filename}")
            
    except Exception as e:
        print(f"Error processing {filename}: {str(e)}")

### 1.2 Iterate over all files. 

In [20]:
# Process all files in the unprocessedTexts folder
input_dir = Path("unprocessedTexts")
for file in input_dir.iterdir():
    if file.is_file():
        process_text(file.name)

Successfully processed aJourneytotheCentreoftheEarth.txt
Successfully processed Anthem.txt
Successfully processed gulliversTravels.txt
Successfully processed theMysteriousAffairatStyles.txt
Successfully processed zara.txt


### 1.3 Count how many times each trigram appears in a file. 

In [None]:
def count_trigrams(file_path):
    # Dictionary to store trigram appearances
    trigram_appearnces = {}
    try:
        # Read processed file
        with open(file_path, 'r', encoding='utf-8') as f:
            text = f.read()
        # Iterate over all trigrams in the file
        for i in range(len(text)):
            trigram = text[i:i+3]
            # If the trigram already appears in the dictionary, increment its count
            if trigram in trigram_appearnces:
                trigram_appearnces[trigram] += 1
            # If the trigram has not appeared before, add it to the dictionary    
            else:
                trigram_appearnces[trigram] = 1

        return trigram_appearnces

    except Exception as e:
        print(f"Error counting trigrams in {file_path}: {str(e)}")
        return {}
    

file_path = Path("processedTexts/processed_zara.txt")
count_trigrams(file_path)

{'ZAR': 715,
 'ARA': 808,
 'RAT': 929,
 'ATH': 1474,
 'THU': 1141,
 'HUS': 1136,
 'UST': 1194,
 'STR': 1140,
 'TRA': 932,
 'RA ': 503,
 'A I': 57,
 ' IS': 1758,
 'IS ': 3151,
 'S M': 372,
 ' MY': 1011,
 'MY ': 953,
 'Y B': 387,
 ' BR': 352,
 'BRO': 97,
 'ROT': 64,
 'OTH': 745,
 'THE': 9726,
 'HER': 2111,
 'ERS': 717,
 'RS ': 569,
 ' MO': 772,
 'MOS': 190,
 'OST': 303,
 'ST ': 2135,
 'T P': 218,
 ' PE': 456,
 'PER': 460,
 'RSO': 64,
 'SON': 193,
 'ONA': 59,
 'NAL': 168,
 'AL ': 344,
 'L W': 155,
 ' WO': 847,
 'WOR': 481,
 'ORK': 59,
 'RK ': 62,
 'K I': 69,
 ' IT': 1787,
 'IT ': 1589,
 'T I': 1247,
 'S T': 1479,
 ' TH': 12329,
 'HE ': 6555,
 'E H': 1214,
 ' HI': 1747,
 'HIS': 1593,
 'IST': 364,
 'STO': 291,
 'TOR': 219,
 'ORY': 52,
 'RY ': 487,
 'Y O': 398,
 ' OF': 2710,
 'OFH': 16,
 'FHI': 11,
 ' IN': 2005,
 'IND': 401,
 'NDI': 150,
 'DIV': 100,
 'IVI': 164,
 'VID': 40,
 'IDU': 19,
 'DUA': 21,
 'UAL': 123,
 'L E': 70,
 ' EX': 237,
 'EXP': 81,
 'XPE': 38,
 'ERI': 468,
 'RIE': 350,
 'IEN'