In [6]:
# Function to read and clean text files
def load_and_clean_text(file_paths):
    """
    Load the given text files and return their cleaned, concatenated content.

    Cleaning steps:
    - Keep only ASCII letters, spaces, and full stops. Everything else
      (digits, other punctuation, non-ASCII characters, newlines, tabs)
      is dropped, not replaced, so words separated only by a newline
      end up joined together.
    - Convert all remaining letters to uppercase
    - Strip leading and trailing whitespace from the combined result

    Args:
        file_paths: Iterable of paths to UTF-8 encoded text files. Files are
            processed in the order given.

    Returns:
        str: The cleaned content of all files, concatenated with no
        separator between files. Empty string if no paths are given.

    Raises:
        OSError: If a file cannot be opened.
        UnicodeDecodeError: If a file is not valid UTF-8.
    """
    # Imported locally so the function does not rely on a module-level import
    import string

    # Explicit whitelist: str.isalpha() is also true for accented and other
    # non-ASCII letters, which the cleaning contract says must be removed
    allowed = frozenset(string.ascii_letters + ' .')
    cleaned_parts = []

    # Iterate over file paths
    for path in file_paths:
        # Decode as UTF-8 so non-ASCII input is read correctly before being
        # filtered out below
        with open(path, 'r', encoding='utf-8') as file:
            text = file.read()

        # Filter BEFORE uppercasing: some non-ASCII characters uppercase to
        # ASCII letters (e.g. 'ß' -> 'SS') and would otherwise slip through
        kept = ''.join(ch for ch in text if ch in allowed)
        cleaned_parts.append(kept.upper())

    # Join once at the end, then strip whitespace from both ends
    return ''.join(cleaned_parts).strip()


In [7]:
# Paths of the text files to load, listed in processing order
file_paths = [
    'file1.txt',
    'file2.txt',
    'file3.txt',
    'file4.txt',
    'file5.txt',
]

In [8]:
# Load every file listed in 'file_paths', clean its content, and keep the
# combined result as a single string
cleaned_text = load_and_clean_text(file_paths)

In [14]:
# Function to generate trigrams and count their occurrences
def generate_trigrams(text):
    """
    Count every overlapping trigram (3-character substring) in the text.

    Args:
        text (str): The text to scan. Every character is treated alike, so
            spaces and full stops can be part of a trigram.

    Returns:
        dict: Maps each trigram to the number of times it occurs. Empty when
        the text has fewer than three characters.
    """
    # Imported locally so the function does not rely on a module-level import
    from collections import Counter

    # The window starting at index i covers text[i:i+3]; the last full window
    # starts at len(text) - 3, hence range(len(text) - 2). For short text the
    # range is empty and no trigrams are produced.
    counts = Counter(text[i:i + 3] for i in range(len(text) - 2))

    # Return a plain dict so callers receive the same type as before
    return dict(counts)

In [16]:
# Generate trigrams from the cleaned text.
# 'cleaned_text' is expected to be a string that has already been cleaned.
# 'trigram_counts' holds a dictionary whose keys are trigrams and whose
# values are the number of times each trigram occurs.
trigram_counts = generate_trigrams(cleaned_text)

In [23]:
# Function to display the top N trigrams
def display_top_trigrams(trigram_counts, n=100):
    """
    Display the top N trigrams based on their counts.
    
    Args:
        trigram_counts (dict): A dictionary where the keys are trigrams (3-character sequences)
                               and the values are their corresponding counts (occurrences).
        n (int, optional): The number of top trigrams to display.
    """
    
    # Sort the trigrams by their counts in descending order (most frequent first)
    # sorted_trigrams will be a list of tuples (trigram, count), sorted by count
    sorted_trigrams = sorted(trigram_counts.items(), key=lambda item: item[1], reverse=True)
