In [1]:
import re
from collections import defaultdict

The function cleanup_text() uses re lib to clean a string by:
- removing non letter characters except full stops and spaces
- converting all text to uppercase 
- and replacing multispaces with a single space
- cleaned_txt = re.sub(r'[^A-Za-z. ]', '', text).upper() removes non character letters
-  cleaned_txt = re.sub(r'\s+', ' ', cleaned_txt) removes multiple spaces and converts into one

In [2]:
def cleanup_text(text):
    # Remove all non-letter characters except periods and spaces and convert to uppercase
    cleaned_txt = re.sub(r'[^A-Za-z. ]', '', text).upper()
    # Replace multiple spaces with a single space
    cleaned_txt = re.sub(r'\s+', ' ', cleaned_txt)
    return cleaned_txt

This function processes a string of text to generate a trigram model, which counts every three-character sequence that reoccurs within the text.
HOW it works: 
- The cleaned text is passed to the function, the clean text should only contain uppercase letters, spaces and periods, all other characters are removed from the previous function.
- The function scans the text and slices it into trigrams by taking every sequence of the consecutive characters.
- the defaultdict() is used to store the trigrams and their counts, each trigram is automatically incremeneted by 1
- defaultdict simplifies the counting process by eliminating the need for checking if the trigram already exists in the stored dictionary.
- After scanning the text, the function returns the dictionary of 2 things:
- Keys: are the trigrams (outputs each trigram)
- Value: is the amount of time that trigram appears

In [3]:
def create_trigram(text):
    trigrams = defaultdict(int)  # Use defaultdict to avoid key errors
    # Iterate through the text to create trigrams
    for i in range(len(text) - 2):
        trigram = text[i:i+3]  # Slice string to get the trigram
        trigrams[trigram] += 1  # Increment count for the trigram
    return dict(trigrams)  # Return after all trigrams have been processed

# Test 2
#text1 = "here here, ere era we do what we."
#cleaned_text = cleanup_text(text1)  # Clean the text
#trigram_model = create_trigram(cleaned_text)  # Generate trigrams

#print(trigram_model)  # Output the trigram model

In [4]:
def read_files(file_path):
    try:
        with open(file_path, 'r', encoding="utf-8") as file:
            text = file.read() # read the content of the file
        return text
    except FileNotFoundError:
        print(f"Error: File {file_path} not found.")
        return None


# test 3
file_path = "Texts/A-Tale-of-two-cities-charles-dickens.txt"  # relative path to the text file
text_content = read_files(file_path)



if text_content:
    cleaned_text = cleanup_text(text_content)  # Clean the text
    trigram_model = create_trigram(cleaned_text)  # Generate trigrams
    print(trigram_model)  # Output the trigram model

{'THE': 11731, 'HE ': 9681, 'E P': 990, ' PR': 1143, 'PRO': 638, 'ROJ': 94, 'OJE': 94, 'JEC': 155, 'ECT': 595, 'CT ': 252, 'T G': 199, ' GU': 191, 'GUT': 98, 'UTE': 220, 'TEN': 546, 'ENB': 109, 'NBE': 114, 'BER': 284, 'ERG': 163, 'RG ': 74, 'G E': 79, ' EB': 21, 'EBO': 45, 'BOO': 67, 'OOK': 688, 'OK ': 289, 'K O': 157, ' OF': 4115, 'OF ': 3893, 'F A': 522, ' A ': 2590, 'A T': 144, ' TA': 342, 'TAL': 122, 'ALE': 79, 'LE ': 1193, 'E O': 1563, 'F T': 1466, ' TW': 263, 'TWO': 243, 'WO ': 194, 'O C': 197, ' CI': 150, 'CIT': 169, 'ITI': 301, 'TIE': 105, 'IES': 232, 'ES ': 1746, 'S T': 1652, ' TH': 13126, 'THI': 1350, 'HIS': 2674, 'IS ': 3270, 'S E': 270, 'K I': 123, ' IS': 821, 'S F': 502, ' FO': 1555, 'FOR': 1742, 'OR ': 1797, 'R T': 1116, 'E U': 253, ' US': 243, 'USE': 394, 'SE ': 1120, ' AN': 5698, 'ANY': 516, 'NYO': 18, 'YON': 46, 'ONE': 1175, 'NE ': 956, 'E A': 2299, 'NYW': 12, 'YWH': 25, 'WHE': 716, 'HER': 3332, 'ERE': 2258, 'RE ': 2678, 'E I': 1230, ' IN': 3383, 'IN ': 3001, 'N T': 23