In [79]:
# imports 
import random


In [80]:
# Function to load processed text from a file
def read_processed_text(file_path):
    """Read text data from a file."""
    with open(file_path, 'r') as file:  # Open file in read mode
        return file.read()  # Read and return the content as a string

# File path to the processed text file
file_path = 'output.txt'  # Ensure this file exists in the same directory
text = read_processed_text('Books/Frankenstein.txt')  # Load the text data


In [81]:
# Function to build a trigram probability model from input text
# The model maps bigrams to the probabilities of the next letter
def build_trigram_model(text):
    """Builds a trigram probability model from text."""
    trigram_counts = {}  # Dictionary to store trigram counts
    for i in range(len(text) - 2):  # Loop through text to extract trigrams
        bigram = text[i:i+2]  # Extract the bigram
        next_letter = text[i+2]  # Extract the third letter
        if bigram not in trigram_counts:  # Initialize dictionary for new bigram
            trigram_counts[bigram] = {}
        if next_letter not in trigram_counts[bigram]:  # Initialize count for new letter
            trigram_counts[bigram][next_letter] = 0
        trigram_counts[bigram][next_letter] += 1  # Increment count

    # Convert counts to probabilities
    probability_model = {}
    for bigram, next_letters in trigram_counts.items():
        total_count = sum(next_letters.values())  # Total occurrences of the bigram
        probability_model[bigram] = {
            letter: count / total_count for letter, count in next_letters.items()
        }
    return probability_model  # Return the probability model

# Build the trigram model using the processed text
probability_model = build_trigram_model(text)


In [82]:
# Function to generate text using the trigram model
# It starts with a bigram and predicts the next letters
def generate_text(model, start_bigram, length=7500):
    """Generates text using a trigram model."""
    result = list(start_bigram)  # Initialize result with the starting bigram
    current_bigram = start_bigram  # Set the initial bigram

    for _ in range(length - 2):  # Generate text of the specified length
        if current_bigram not in model or not model[current_bigram]:
            # Restart with a random bigram if no continuation exists
            current_bigram = random.choice(list(model.keys()))
        next_letter = random.choices(
            list(model[current_bigram].keys()),  # Possible next letters
            list(model[current_bigram].values())  # Corresponding probabilities
        )[0]
        result.append(next_letter)  # Append the next letter to the result
        current_bigram = current_bigram[-1] + next_letter  # Update the current bigram

    return "".join(result)  # Return the generated text as a string

# Generate text for all bigrams
all_generated_texts = {}
for bigram in probability_model.keys():  # Iterate over all bigrams in the model
    all_generated_texts[bigram] = generate_text(probability_model, bigram, length=7500)


In [83]:
# Function to load a list of valid English words from a file (words.txt)
def load_words(file_path):
    """Load valid English words from a file."""
    with open(file_path, 'r') as file:  # Open the file in read mode
        return set(file.read().upper().splitlines())  # Convert words to uppercase

# Function to analyze the percentage of valid words in generated text
def analyze_text(generated_text, valid_words):
    """Analyze the percentage of valid English words in the generated text."""
    words = generated_text.split()  # Split the text into words
    valid_word_count = sum(1 for word in words if word in valid_words)  # Count valid words
    total_word_count = len(words)  # Total number of words
    if total_word_count == 0:
        return 0  # Avoid division by zero
    percentage_valid = (valid_word_count / total_word_count) * 100  # Calculate percentage
    print(f"Valid English words: {valid_word_count} / {total_word_count} ({percentage_valid:.2f}%)")
    return percentage_valid

# Load the valid English words from 'words.txt'
words_file = 'words.txt'  # Ensure this file exists in the same directory
valid_words = load_words(words_file)

# Analyze the text generated for each bigram
for bigram, generated_text in all_generated_texts.items():
    print(f"Analyzing text starting with bigram '{bigram}'")
    percentage_valid = analyze_text(generated_text, valid_words)


Analyzing text starting with bigram '﻿T'
Valid English words: 47 / 1322 (3.56%)
Analyzing text starting with bigram 'Th'
Valid English words: 48 / 1338 (3.59%)
Analyzing text starting with bigram 'he'
Valid English words: 49 / 1332 (3.68%)
Analyzing text starting with bigram 'e '
Valid English words: 48 / 1360 (3.53%)
Analyzing text starting with bigram ' P'
Valid English words: 59 / 1341 (4.40%)
Analyzing text starting with bigram 'Pr'
Valid English words: 54 / 1331 (4.06%)
Analyzing text starting with bigram 'ro'
Valid English words: 40 / 1345 (2.97%)
Analyzing text starting with bigram 'oj'
Valid English words: 44 / 1343 (3.28%)
Analyzing text starting with bigram 'je'
Valid English words: 45 / 1327 (3.39%)
Analyzing text starting with bigram 'ec'
Valid English words: 51 / 1322 (3.86%)
Analyzing text starting with bigram 'ct'
Valid English words: 57 / 1352 (4.22%)
Analyzing text starting with bigram 't '
Valid English words: 49 / 1306 (3.75%)
Analyzing text starting with bigram ' G'

In [84]:
# Function to save the trigram probability model to a JSON file
def save_model_to_json(model, file_path):
    """Save a given model to a JSON file."""
    with open(file_path, 'w') as file:  # Open file in write mode
        file.write("{\n")  # Begin JSON object
        for bigram, next_letters in model.items():  # Iterate over bigrams
            file.write(f'  "{bigram}": {next_letters},\n')  # Write each bigram and its data
        file.write("}\n")  # End JSON object
    print(f"Model saved to {file_path}")

# Save the trigram model to a JSON file
output_file = 'trigrams.json'
save_model_to_json(probability_model, output_file)


Model saved to trigrams.json
