In [2]:
# Step 1; Clean the text
import re
import json
import random 
from collections import defaultdict

def clean_text(text):
    # Remove all characters except letters (A-Z, a-z), spaces, and full stops
    cleaned_text = re.sub(r'[^A-Za-z. ]+', '', text)
    # Convert all letters to uppercase
    cleaned_text = cleaned_text.upper()
    return cleaned_text

# Read and clean the books
def read_and_clean_books(file_paths):
    all_text = ""
    for file_path in file_paths:
        with open(file_path, 'r', encoding='utf-8') as f:
            text = f.read()
            # Clean and concatenate the text
            all_text += clean_text(text)
    return all_text

# The paths to your five books
file_paths = [
    "Books/Cthulhu.txt",
    "Books/Dracula.txt",
    "Books/Frankenstein.txt",
    "Books/Mobydick.txt",
    "Books/Sherlock.txt"
]

# Clean all books and prepare the entire dataset
cleaned_text = read_and_clean_books(file_paths)

In [3]:
# Function to create the trigram model
def create_trigram(cleaned_text):
    """Generate a trigram model from the cleaned text."""
    trigram_counts = defaultdict(int)  # Initialize a dictionary to store trigram counts
    for i in range(len(cleaned_text) - 2):  # Loop to extract trigrams
        trigram = cleaned_text[i:i+3]  
        trigram_counts[trigram] += 1  # Count the trigram
    return dict(trigram_counts)  

In [4]:
# Testing Trigram Model
trigram_model = create_trigram(cleaned_text)

print("Trigrams:")
sorted_trigrams = sorted(trigram_model.items(), key=lambda x: x[1], reverse=True)
for trigram, count in list(trigram_model.items())[:10]: # Show the first 10 trigrams 
    print(f"{trigram}: {count}")

# Save the trigram model to a JSON file for use in Task 2
with open("trigram_model.json", "w") as f:
    json.dump(trigram_model, f)
print("\nTrigram model saved to 'trigram_model.json'")

Trigrams:
THE: 46017
HE : 37446
E C: 4945
 CA: 4769
CAL: 1203
ALL: 6929
LL : 8385
L O: 947
 OF: 16116
OF : 14886

Trigram model saved to 'trigram_model.json'


In [5]:
# Save the trigram model to a JSON file
with open("trigram_model.json", "w") as f:
    json.dump(trigram_model, f)

print("Trigram model saved to trigram_model.json")

Trigram model saved to trigram_model.json


In [6]:
def generate_text(trigram_model, start, length):
    # Generate text using a trigram model.
    generated_text = start 

    # Generates text until the length needed is reached.
    while len(generated_text) < length:
        last_two = generated_text[-2:] # Gets the last two characters
        print(f"Last two characters: {last_two}")

        # Find all the trigrams that start with the last two charecters 
        candidates = {trigram: count for trigram, count in trigram_model.items() if trigram.startswith(last_two)}
        print(f"Candidates: {candidates}")
        
        if not candidates:
             print("No candidates found.")
             break # If not candidates are found stop the program 
        
        total_count =sum(candidates.values())
        
        choices = [trigram[2] for trigram in candidates.keys()]  # Get the third character of each trigram
        weights = [count / total_count for count in candidates.values()]  # Calculate probabilities 
        print(f"Choices: {choices}, Weights: {weights}")

        # Randomly choose the next character based on the weights
        next_char = random.choices(choices, weights)[0]
        generated_text += next_char  # Add the chosen character to the text

        # Log progress every 1000 characters
        if len(generated_text) % 1000 == 0:
            print(f"Generated length: {len(generated_text)}")
            
    return generated_text # adding to fix commit message

In [7]:
# Start string and desired length for generated text
start_string = "TH"
generated_length = 10000

# Generate the text using the trigram model
generated_text = generate_text(trigram_model, start_string, generated_length)

# Save the generated text to a file
with open("generated_text.txt", "w") as f:
    f.write(generated_text)
print("Generated text saved to 'generated_text.txt'")



Last two characters: TH
Candidates: {'THE': 46017, 'THU': 569, 'THI': 6242, 'THA': 9994, 'THD': 49, 'TH ': 6203, 'THO': 3037, 'TH.': 277, 'THF': 63, 'THR': 1532, 'THY': 315, 'THQ': 15, 'THS': 270, 'THH': 68, 'THG': 17, 'THL': 83, 'THM': 42, 'THT': 110, 'THC': 21, 'THW': 87, 'THV': 1, 'THB': 26, 'THP': 13, 'THN': 13, 'THJ': 3, 'THK': 5}
Choices: ['E', 'U', 'I', 'A', 'D', ' ', 'O', '.', 'F', 'R', 'Y', 'Q', 'S', 'H', 'G', 'L', 'M', 'T', 'C', 'W', 'V', 'B', 'P', 'N', 'J', 'K'], Weights: [0.612971547314578, 0.007579390451832907, 0.08314684569479966, 0.13312553282182438, 0.0006527067348678602, 0.08262734441602729, 0.040454497016197785, 0.0036897911338448423, 0.0008391943734015345, 0.02040707587382779, 0.004195971867007673, 0.0001998081841432225, 0.003596547314578005, 0.0009057971014492754, 0.00022644927536231884, 0.001105605285592498, 0.000559462915601023, 0.0014652600170502984, 0.0002797314578005115, 0.0011588874680306905, 1.3320545609548167e-05, 0.0003463341858482523, 0.0001731670929241261