# Task 1: Third-order letter approximation model



### 1.1 Converting texts to appropriate format. 

##### Remove all characters except for letters, full stops and spaces. Then remove any instance where two or more spaces appear in a row. 

In [None]:
import os
from pathlib import Path
import re

def process_text(filename):
    input_dir = Path("unprocessedTexts")
    output_dir = Path("processedTexts")
    input_dir.mkdir(exist_ok=True)
    output_dir.mkdir(exist_ok=True)
    
    try:
        # Read input file
        input_path = input_dir / filename
        with open(input_path, 'r', encoding='utf-8') as f:
            text = f.read()

        # Convert text to uppercase
        text = text.upper()
    
        # Keep only letters, spaces and full stops
        processed = ''
        for char in text:
            if char.isalpha() or char == '.' or char == ' ':
                processed += char
        
        # Remove multiple spaces
        processed = re.sub(' +', ' ', processed)
                
        # Write processed text to new file
        output_filename = 'processed_' + filename
        output_path = output_dir / output_filename
        with open(output_path, 'w', encoding='utf-8') as f:
            f.write(processed)
            
        print(f"Successfully processed {filename}")
            
    except Exception as e:
        print(f"Error processing {filename}: {str(e)}")

### 1.2 Iterate over all files. 

In [None]:
# Process all files in the unprocessedTexts directory
input_dir = Path("unprocessedTexts")
for file in input_dir.iterdir():
    if file.is_file():
        process_text(file.name)

Successfully processed aJourneytotheCentreoftheEarth.txt
Successfully processed Anthem.txt
Successfully processed gulliversTravels.txt
Successfully processed theMysteriousAffairatStyles.txt
Successfully processed zara.txt


### 1.3 Count how many times each trigram appears in a file. 

In [None]:
trigram_appearances = {}

def count_trigrams(file_path):
    processed_dir = Path("processedTexts")

    # Dictionary to store trigram appearances
    try:
        # Read processed file
        with open(file_path, 'r', encoding='utf-8') as f:
            text = f.read()
        # Iterate over all trigrams in the file
        for i in range(len(text)):
            trigram = text[i:i+3]
            # If the trigram already appears in the dictionary, increment its count
            if trigram in trigram_appearances:
                trigram_appearances[trigram] += 1
            # If the trigram has not appeared before, add it to the dictionary    
            else:
                trigram_appearances[trigram] = 1

        return trigram_appearances

    except Exception as e:
        print(f"Error counting trigrams in {file_path}: {str(e)}")
        return {}

In [43]:
processed_dir = Path("processedTexts")

# Iterate over each file in the directory and count trigrams
for file in processed_dir.iterdir():
    if file.is_file():
        count_trigrams(file)

print(trigram_appearances)

{'CHA': 1660, 'HAP': 1570, 'APT': 604, 'PTE': 542, 'TER': 8172, 'ER ': 18066, 'R M': 2338, ' MY': 8880, 'MY ': 8228, 'Y U': 1366, ' UN': 4188, 'UNC': 1644, 'NCL': 1306, 'CLE': 1782, 'LE ': 7730, 'E M': 6796, ' MA': 8120, 'MAK': 916, 'AKE': 3042, 'KES': 324, 'ES ': 10726, 'S A': 11168, ' A ': 13018, 'A G': 970, ' GR': 3690, 'GRE': 2966, 'REA': 6858, 'EAT': 4336, 'AT ': 17322, 'T D': 1986, ' DI': 5542, 'DIS': 3090, 'ISC': 1266, 'SCO': 1104, 'COV': 788, 'OVE': 4162, 'VER': 10238, 'ERY': 3270, 'RYL': 30, 'YLO': 52, 'LOO': 1580, 'OOK': 2164, 'OKI': 234, 'KIN': 2490, 'ING': 20156, 'NG ': 16808, 'G B': 624, ' BA': 1634, 'BAC': 700, 'ACK': 1192, 'CK ': 1674, 'K T': 870, ' TO': 23038, 'TO ': 21836, 'O A': 2132, ' AL': 7176, 'ALL': 9234, 'LL ': 10566, 'L T': 2908, ' TH': 75872, 'THA': 10838, 'HAT': 13080, 'T H': 4296, ' HA': 13824, 'HAS': 1154, 'AS ': 13500, 'S O': 7046, ' OC': 480, 'OCC': 290, 'CCU': 280, 'CUR': 726, 'URR': 338, 'RRE': 744, 'RED': 4624, 'ED ': 25592, 'D T': 13570, 'O M': 3278, 