In [89]:
# TASK 1: Third-order letter approximation model
# Function to read and clean text files
def load_and_clean_text(file_paths):
    """
    Load the given text files and return their cleaned contents as one string.

    The cleaning process keeps only the 26 ASCII letters (converted to
    uppercase), spaces and full stops. Everything else is dropped: digits,
    punctuation, newlines, tabs and all non-ASCII characters, including
    accented letters. Because newlines are dropped rather than replaced,
    words separated only by a line break end up joined together.
    Leading and trailing whitespace of the combined result is stripped.

    Args:
        file_paths: An iterable of paths to UTF-8 encoded text files, or a
            single path given as a string.

    Returns:
        str: The cleaned text of all files, concatenated in the order given.

    Raises:
        OSError: If a file cannot be opened.
        UnicodeDecodeError: If a file is not valid UTF-8.
    """
    # A lone string would otherwise be iterated character by character,
    # with every character treated as a file name
    if isinstance(file_paths, str):
        file_paths = [file_paths]

    # Explicit allow-list: str.isalpha() is Unicode-aware and would also
    # accept letters such as 'É' or 'Ω', which must be removed
    allowed = frozenset(
        'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
        'abcdefghijklmnopqrstuvwxyz'
        ' .'
    )

    cleaned_parts = []

    # Iterate over file paths
    for path in file_paths:
        # Files are decoded as UTF-8 so non-ASCII input is read correctly
        # before being filtered out
        with open(path, 'r', encoding='utf-8') as file:
            text = file.read()

        # Filter first, then uppercase: uppercasing first could turn a
        # non-ASCII character into ASCII letters (e.g. 'ß' -> 'SS')
        kept = ''.join(ch for ch in text if ch in allowed)
        cleaned_parts.append(kept.upper())

    # Join once at the end and strip whitespace from both ends
    return ''.join(cleaned_parts).strip()


In [90]:
# Input text files that make up the corpus, in the order they are read
file_paths = [
    'file1.txt',
    'file2.txt',
    'file3.txt',
    'file4.txt',
    'file5.txt',
]

In [91]:
# Read every file listed in file_paths and combine the cleaned contents
# into a single string
cleaned_text = load_and_clean_text(file_paths)

In [92]:
# Function to generate trigrams and count their occurrences
def generate_trigrams(text):
    """
    Count every overlapping three-character window in the text.

    Args:
        text: The sequence to scan (the notebook passes the cleaned string).

    Returns:
        dict: Maps each trigram to the number of times it occurs. Keys appear
        in the order the trigrams are first seen. Input shorter than three
        characters yields an empty dictionary.
    """
    counts = {}

    # The last full window starts at len(text) - 3, hence the "- 2" bound
    for start in range(len(text) - 2):
        window = text[start:start + 3]
        # get() supplies 0 the first time a trigram is seen
        counts[window] = counts.get(window, 0) + 1

    return counts

In [93]:
# Count every trigram in the cleaned text.
# 'cleaned_text' is the string returned by load_and_clean_text.
# 'trigram_counts' is a dictionary whose keys are trigrams and whose values
# are the number of times each trigram occurs.
trigram_counts = generate_trigrams(cleaned_text)

In [98]:
# Function to display the top N trigrams
def display_top_trigrams(trigram_counts, n=100):
    """
    Print the most frequent trigrams, one per line, beneath a heading.

    Args:
        trigram_counts (dict): Maps each trigram to its number of occurrences.
        n (int, optional): How many of the most frequent trigrams to print.
            Defaults to 100. The heading always shows this value, even when
            fewer trigrams are available.

    Returns:
        None. The result is written to standard output.
    """
    # Rank the trigrams by count, most frequent first. The sort is stable,
    # so trigrams with equal counts keep their dictionary order.
    ranked = sorted(trigram_counts, key=trigram_counts.get, reverse=True)

    # Heading first, then one "trigram: count" line per entry
    print(f"Top {n} trigrams:")
    for trigram in ranked[:n]:
        count = trigram_counts[trigram]
        print(f"{trigram}: {count}")
    

In [99]:
# Display the 100 most frequent trigrams from the trigram_counts dictionary
display_top_trigrams(trigram_counts, n=100)

Top 100 trigrams:
 TH: 31638
THE: 28250
HE : 24863
ED : 12660
AND: 12613
ND : 12238
 AN: 11924
 OF: 9595
 TO: 9507
.  : 9048
ING: 8785
OF : 8682
ER : 8634
TO : 8576
 HE: 8369
NG : 8021
 IN: 7765
AS : 7400
AT : 7381
IS : 7183
HER: 6887
 HA: 6859
 A : 6810
RE : 6431
IN : 6351
D T: 6310
E T: 6080
 WA: 6012
 HI: 5758
E A: 5604
 BE: 5482
HIS: 5432
N T: 5357
ON : 5274
EN : 5208
E S: 5175
ERE: 5149
HAT: 5088
 WH: 5087
E W: 5031
 WI: 4899
 I : 4876
THA: 4803
WAS: 4788
S A: 4542
T T: 4512
YOU: 4475
OR : 4438
LL : 4402
 NO: 4357
 CO: 4345
ES : 4121
 IT: 4105
FOR: 4038
 YO: 3996
E O: 3994
D A: 3988
ME : 3956
LY : 3848
 FO: 3842
ENT: 3838
IT : 3787
UT : 3763
TH : 3748
E H: 3720
ITH: 3713
AN : 3698
 SH: 3594
F T: 3523
 ON: 3503
S T: 3497
WIT: 3467
VER: 3432
 MA: 3422
AD : 3410
TER: 3363
 WE: 3363
VE : 3340
D H: 3336
 AS: 3308
ALL: 3289
E I: 3278
THI: 3269
ION: 3252
T I: 3187
 SA: 3182
T A: 3173
 RE: 3161
E C: 3102
E M: 3100
LD : 3087
ST : 3072
   : 3059
N A: 3054
 ST: 3001
NT : 2970
HAD: 2922
E B: 

In [100]:
# END OF TASK 1