In [None]:
# Import Required Libraries
import pickle
import os
import random
from collections import defaultdict, Counter
import math

# Preprocessing Improvements
def preprocess_text(file_path):
    """Read a text file and flatten it into a single line of text.

    The file is decoded as UTF-8 and bytes that cannot be decoded are
    dropped (``errors='ignore'``). In the returned string every newline is
    replaced by a space, and carriage returns and U+FEFF (byte-order mark)
    characters are removed.
    """
    # One translation table handles all three character rewrites in one pass.
    cleanup_table = str.maketrans({'\n': ' ', '\r': None, '\ufeff': None})
    with open(file_path, mode='r', encoding='utf-8', errors='ignore') as handle:
        raw_text = handle.read()
    return raw_text.translate(cleanup_table)

# Gather the cleaned contents of every corpus file that exists on disk
file_paths = ["data1.txt", "data2.txt", "data3.txt", "data4.txt"]
all_data = ""

for file_path in file_paths:
    # Missing files are reported and skipped rather than treated as fatal.
    if not os.path.exists(file_path):
        print(f"Warning: {file_path} not found. Skipping...")
        continue
    data = preprocess_text(file_path)
    # A trailing space keeps the last word of one file apart from the next.
    all_data = f"{all_data}{data} "
    print(f"Processed {file_path}. Sample: {data[:200]}")

# Report the corpus size and warn when it is large enough to strain memory
print("Total preprocessed text length:", len(all_data))
# Arbitrary threshold: 10 million characters of combined text
if len(all_data) > 10_000_000:
    print("Warning: Input data is very large. Consider reducing it for stability.")
print("Sample of combined preprocessed text:", all_data[:500])

# Optimized Enhanced Markov Chain
class EnhancedMarkovChain:
    """Word-level Markov chain with multi-order back-off and beam-search generation.

    Counts are kept for every context length from 0 (no context) up to
    ``max_order`` words. Probabilities use additive smoothing with
    ``smoothing_alpha``; each context also stores a ``'<UNK>'`` entry holding
    the per-word probability given to unseen continuations.

    Args:
        max_order: Longest context, in words, used for prediction.
        smoothing_alpha: Additive smoothing constant.
        beam_width: Number of candidates expanded per hypothesis and number
            of hypotheses kept after each generation step.
    """

    def __init__(self, max_order=3, smoothing_alpha=1.0, beam_width=3):
        self.max_order = max_order
        self.smoothing_alpha = smoothing_alpha
        self.beam_width = beam_width
        # transitions[k][state][next_word] -> raw count; state is a k-word tuple
        self.transitions = [defaultdict(Counter) for _ in range(max_order + 1)]
        self.vocab = set()
        # probs[k][state][next_word] -> smoothed probability, filled by train()
        self.probs = [defaultdict(dict) for _ in range(max_order + 1)]

    def train(self, text):
        """Accumulate counts from ``text`` and recompute smoothed probabilities.

        The text is split on whitespace. Calling this more than once adds to
        the existing counts and vocabulary.

        Args:
            text: Training corpus as a single string.

        Raises:
            MemoryError: Re-raised after printing a hint when training runs
                out of memory.
        """
        try:
            words = text.split()
            self.vocab.update(words)
            vocab_size = len(self.vocab)
            print(f"Vocabulary size: {vocab_size}")

            # Count (context -> next word) occurrences for each context length.
            for order in range(self.max_order + 1):
                transitions = self.transitions[order]
                for i in range(len(words) - order):
                    # For order 0 the slice is empty, giving the () context.
                    state = tuple(words[i:i + order])
                    transitions[state][words[i + order]] += 1

            # Precompute probabilities only for observed transitions.
            alpha = self.smoothing_alpha
            for order in range(self.max_order + 1):
                probs = self.probs[order]
                for state, counts in self.transitions[order].items():
                    # The denominator is shared by every continuation of this state.
                    denominator = sum(counts.values()) + alpha * vocab_size
                    state_probs = probs[state]
                    for next_word, count in counts.items():
                        state_probs[next_word] = (count + alpha) / denominator
                    # Per-word probability reserved for unseen continuations.
                    state_probs['<UNK>'] = alpha / denominator

            # The second figure counts distinct contexts across all orders.
            print(f"Trained with {len(self.vocab)} unique words and {sum(len(t) for t in self.transitions)} transitions.")
        except MemoryError:
            print("Error: Out of memory during training. Try reducing max_order or input size.")
            raise

    def _next_candidates(self, seq):
        """Return up to ``beam_width`` (token, probability) pairs for ``seq``.

        Tries the longest available context first and backs off to shorter
        ones. Returns an empty list when no context yields a usable candidate.
        """
        context = tuple(seq[-self.max_order:]) if self.max_order > 0 else ()
        for order in range(len(context), -1, -1):
            state = context[-order:] if order > 0 else ()
            if state not in self.transitions[order]:
                continue
            # With smoothing_alpha == 0 the '<UNK>' entry has probability 0,
            # and log(0) is undefined, so zero-probability entries are dropped.
            usable = [(token, prob)
                      for token, prob in self.probs[order].get(state, {}).items()
                      if prob > 0]
            if usable:
                usable.sort(key=lambda item: -item[1])
                return usable[:self.beam_width]
        return []

    def generate(self, seed_text, num_words=3):
        """Generate text using beam search with error handling.

        Args:
            seed_text: Starting words; when empty, a random vocabulary word
                is used as the seed.
            num_words: Number of words to generate.

        Returns:
            The generated words joined by spaces, ``""`` when ``num_words``
            is not positive, or ``"Generation failed"`` when an error occurs
            (the error is printed).
        """
        try:
            # seq[-0:] would return the whole sequence, so handle this up front.
            if num_words <= 0:
                return ""
            if not self.vocab:
                raise ValueError("model has no vocabulary; call train() with non-empty text first")

            # Build the list once instead of on every random substitution.
            vocab_list = list(self.vocab)
            words = seed_text.split()
            if not words:
                words = [random.choice(vocab_list)]

            beam = [(0.0, words)]  # (log_prob, sequence)

            for _ in range(num_words):
                new_beam = []
                for log_prob, seq in beam:
                    candidates = self._next_candidates(seq)
                    if candidates:
                        for token, prob in candidates:
                            # '<UNK>' stands for "some unseen word": pick one at random.
                            if token == '<UNK>':
                                token = random.choice(vocab_list)
                            new_beam.append((log_prob + math.log(prob), seq + [token]))
                    else:
                        # Fallback to uniform probability over vocab; the sample
                        # size is capped so a small vocabulary cannot raise.
                        uniform_log_prob = math.log(1.0 / len(vocab_list))
                        sample_size = min(self.beam_width, len(vocab_list))
                        for token in random.sample(vocab_list, sample_size):
                            new_beam.append((log_prob + uniform_log_prob, seq + [token]))

                # Keep only the highest-scoring hypotheses.
                beam = sorted(new_beam, key=lambda x: -x[0])[:self.beam_width]

            return " ".join(beam[0][1][-num_words:])
        except Exception as e:
            print(f"Error during generation: {str(e)}")
            return "Generation failed"

# Build the model, fit it on the combined corpus, then run one sample prediction
try:
    markov = EnhancedMarkovChain(max_order=3, smoothing_alpha=1.0, beam_width=3)
    markov.train(all_data)
    print("Enhanced Markov Chain trained!")

    # Quick check with a short seed phrase
    test_seed = "The sun"
    print(f"Seed: {test_seed}")
    sample_prediction = markov.generate(test_seed, num_words=3)
    print(f"Prediction: {sample_prediction}")
except Exception as e:
    # Any failure is reported instead of propagating.
    print(f"Session crashed: {e}")

# Seed phrases fed to the model for next-word prediction; the trailing
# comment on each entry is its 1-based position in the list.
test_sentences = [
    "The sun began to",           # 1
    "She walked through the",     # 2
    "He decided to take",         # 3
    "In the middle of",           # 4
    "The old house was",          # 5
    "They watched the stars",     # 6
    "A loud noise came",          # 7
    "The teacher explained the",  # 8
    "After a long day",           # 9
    "The cat jumped onto",        # 10
    "A small bird flew",          # 11
    "He opened the door",         # 12
    "She smiled at her",          # 13
    "The dog barked at",          # 14
    "We sat by the",              # 15
    "The rain started to",        # 16
    "He wrote a letter",          # 17
    "She painted the walls",      # 18
    "The children played in",     # 19
    "I forgot to bring",          # 20
    "The clock struck twelve",    # 21
    "They climbed the tall",      # 22
    "A bright light shone",       # 23
    "The car stopped at",         # 24
    "She whispered in his",       # 25
    "He tripped on the",          # 26
    "The baby laughed at",        # 27
    "We listened to the",         # 28
    "The wind blew through",      # 29
    "She dropped her phone",      # 30
    "He shouted across the",      # 31
    "The book fell off",          # 32
    "They danced under the",      # 33
    "A strange sound echoed",     # 34
    "The teacher handed out",     # 35
    "I saw a shooting",           # 36
    "She tied her shoes",         # 37
    "He drove to the",            # 38
    "The flowers bloomed in",     # 39
    "We waited for the",          # 40
    "The moon glowed brightly",   # 41
    "They built a sandcastle",    # 42
    "A cold breeze swept",        # 43
    "The chef prepared a",        # 44
    "She forgot her lines",       # 45
    "He jumped into the",         # 46
    "The bell rang at",           # 47
    "They laughed at the",        # 48
    "A tiny frog hopped",         # 49
    "The sky turned dark",        # 50
    "She picked up the",          # 51
    "He stared at the",           # 52
    "The train arrived at",       # 53
    "We hiked up the",            # 54
    "The fire crackled in",       # 55
    "She waved to her",           # 56
    "He fixed the broken",        # 57
    "The kids drew with",         # 58
    "I heard a loud",             # 59
    "The river flowed through",   # 60
    "They sang a happy",          # 61
    "A black cat crossed",        # 62
    "The waiter brought the",     # 63
    "She cleaned the dusty",      # 64
    "He planted a tree",          # 65
    "The snow fell gently",       # 66
    "We watched a funny",         # 67
    "The phone rang during",      # 68
    "She baked a delicious",      # 69
    "He climbed over the",        # 70
    "The stars twinkled in",      # 71
    "They ran to the",            # 72
    "A heavy rain poured",        # 73
    "The librarian sorted the",   # 74
    "I tripped on a",             # 75
    "The puppy chased its",       # 76
    "She wrote in her",           # 77
    "He pointed at the",          # 78
    "The leaves fell from",       # 79
    "We swam in the",             # 80
    "The thunder roared in",      # 81
    "They shared a warm",         # 82
    "A bright rainbow appeared",  # 83
    "The farmer fed the",         # 84
    "She sewed a new",            # 85
    "He kicked the soccer",       # 86
    "The clock ticked loudly",    # 87
    "They whispered in the",      # 88
    "A small boat sailed",        # 89
    "The sun set behind",         # 90
    "She laughed at the",         # 91
    "He carried a heavy",         # 92
    "The birds chirped in",       # 93
    "We painted the old",         # 94
    "The ice melted in",          # 95
    "She called her best",        # 96
    "He drew a picture",          # 97
    "The wind howled through",    # 98
    "They jumped into the",       # 99
    "A loud crash woke",          # 100
]

# Print each seed phrase followed by the model's three-word continuation
for sentence in test_sentences:
    print(f"\nInput: {sentence}")
    # Generation runs after the input line is printed, so any error message
    # it emits appears between the two lines.
    predicted_words = markov.generate(sentence, num_words=3)
    print(f"Prediction: {predicted_words}")

Processed data1.txt. Sample: One morning, when Gregor Samsa woke from troubled dreams, he found himself transformed in his bed into a horrible vermin.  He lay on his armour-like back, and if he lifted his head a little he could s
Processed data2.txt. Sample:  Project Gutenberg's The Adventures of Sherlock Holmes, by Arthur Conan Doyle  This eBook is for the use of anyone anywhere at no cost and with almost no restrictions whatsoever.  You may copy it, giv
Processed data3.txt. Sample: The sun was shining brightly in the clear blue sky, and a gentle breeze rustled the leaves of the tall trees. People were out enjoying the beautiful weather, some sitting in the park, others taking a 
Processed data4.txt. Sample:     Chapter 1        It is a truth universally acknowledged, that a single man in       possession of a good fortune, must be in want of a wife.        However little known the feelings or views of su
Total preprocessed text length: 1797163
Sample of combined preprocessed text: On