In [None]:
import heapq
import math
import os
import random
from collections import Counter, defaultdict

# Data cleaning
def preprocess_text(file_path):
    """Read a UTF-8 text file and flatten it onto a single line.

    Bytes that cannot be decoded as UTF-8 are dropped while reading.

    Args:
        file_path: Path of the text file to load.

    Returns:
        The file content with every newline turned into a space and every
        carriage return and byte-order mark (U+FEFF) removed.
    """
    # Single-pass equivalent of three chained one-character replacements.
    cleanup_table = {
        ord('\n'): ' ',
        ord('\r'): None,
        ord('\ufeff'): None,
    }
    with open(file_path, mode='r', encoding='utf-8', errors='ignore') as handle:
        return handle.read().translate(cleanup_table)

# Collect data from multiple files
file_paths = ["data1.txt", "data2.txt", "data3.txt", "data4.txt"]

# Gather the cleaned text of every file that exists; each piece is followed by
# one space so the last word of a file never fuses with the first word of the next.
corpus_pieces = []
for file_path in file_paths:
    if not os.path.exists(file_path):
        print(f"Warning: {file_path} not found. Skipping...")
        continue
    data = preprocess_text(file_path)
    corpus_pieces.append(data + " ")
    print(f"Processed {file_path}. Sample: {data[:100]}")

all_data = "".join(corpus_pieces)

# Report the corpus size and warn when it is large enough to threaten stability.
total_length = len(all_data)
print("Total preprocessed text length:", total_length)
if total_length > 1_000_000_000:
    print("Warning: Input data is very large. Consider reducing it for stability.")
print("Sample of combined preprocessed text:", all_data[:100])

# Markov Chain Algo
class EnhancedMarkovChain:
    """Word-level Markov chain with back-off over context orders 0..max_order.

    Training counts, for every order, how often each word follows each context
    of that many words and converts the counts into add-alpha smoothed
    probabilities. Generation runs a beam search that, at every step, uses the
    longest context seen during training and backs off to shorter ones.

    Attributes:
        max_order: Longest context length (in words) that is modelled.
        smoothing_alpha: Additive smoothing constant applied to every count.
        beam_width: Number of successors expanded per hypothesis and number of
            hypotheses kept after each step.
        transitions: transitions[k][state][word] is the raw count of `word`
            following the k-word tuple `state`.
        vocab: Set of every distinct token passed to train().
        probs: Same layout as `transitions` but holding smoothed
            probabilities, plus one '<UNK>' entry per state.
    """

    # Pseudo-token holding the smoothed probability of one unseen successor.
    # A literal '<UNK>' token in the corpus would share (and lose) this slot.
    UNK_TOKEN = '<UNK>'

    def __init__(self, max_order=3, smoothing_alpha=1.0, beam_width=3):
        """Create an untrained model; see the class docstring for the parameters."""
        self.max_order = max_order
        self.smoothing_alpha = smoothing_alpha
        self.beam_width = beam_width
        self.transitions = [defaultdict(Counter) for _ in range(max_order + 1)]
        self.vocab = set()
        self.probs = [defaultdict(dict) for _ in range(max_order + 1)]
        # Sorted snapshot of `vocab` used for random draws (see _vocab_words).
        self._vocab_list = []

    def train(self, text):
        """Count n-gram transitions in `text` and refresh the smoothed probabilities.

        The text is split on whitespace. Counts accumulate across calls, and the
        probabilities of every state seen so far are recomputed with the current
        vocabulary size.

        Args:
            text: Training corpus as a single string.

        Raises:
            MemoryError: Re-raised after printing a hint when training runs out
                of memory.
        """
        try:
            words = text.split()
            self.vocab.update(words)
            vocab_size = len(self.vocab)
            print(f"Vocabulary size: {vocab_size}")

            for order in range(self.max_order + 1):
                transitions = self.transitions[order]
                for i in range(len(words) - order):
                    # For order 0 the slice is empty, giving the () state.
                    state = tuple(words[i:i + order])
                    transitions[state][words[i + order]] += 1

            alpha = self.smoothing_alpha
            for order in range(self.max_order + 1):
                probs = self.probs[order]
                for state, counts in self.transitions[order].items():
                    denominator = sum(counts.values()) + alpha * vocab_size
                    state_probs = probs[state]
                    for next_word, count in counts.items():
                        state_probs[next_word] = (count + alpha) / denominator
                    state_probs[self.UNK_TOKEN] = alpha / denominator

            print(f"Trained with {len(self.vocab)} unique words and {sum(len(t) for t in self.transitions)} transitions.")
        except MemoryError:
            print("Error: Out of memory during training. Try reducing max_order or input size.")
            raise

    def generate(self, seed_text, num_words=3):
        """Continue `seed_text` with the most likely words found by beam search.

        Args:
            seed_text: Whitespace-separated seed; when it contains no words a
                random vocabulary word is used as the seed instead.
            num_words: Number of words to generate. Zero or a negative value
                yields an empty string.

        Returns:
            The generated words joined by single spaces (the seed itself is not
            included). On any error the message is printed and the string
            "Generation failed" is returned.
        """
        try:
            if num_words <= 0:
                return ""

            vocab_words = self._vocab_words()
            if not vocab_words:
                raise ValueError("model has no vocabulary; call train() first")

            words = seed_text.split()
            if not words:
                words = [random.choice(vocab_words)]
            seed_length = len(words)

            beam = [(0.0, words)]
            for _ in range(num_words):
                candidates = []
                for log_prob, seq in beam:
                    candidates.extend(self._expand(log_prob, seq, vocab_words))
                # Same result as sorting by descending score and slicing,
                # including the order of ties, without sorting everything.
                beam = heapq.nlargest(self.beam_width, candidates, key=lambda c: c[0])

            # Every step appends exactly one word, so everything after the
            # seed is the generated continuation.
            return " ".join(beam[0][1][seed_length:])
        except Exception as e:
            print(f"Error during generation: {str(e)}")
            return "Generation failed"

    def _vocab_words(self):
        """Return the vocabulary as a sorted list, rebuilding it when the vocab has grown."""
        # Sorting makes random draws reproducible under random.seed(); the
        # iteration order of a set of strings changes between interpreter runs.
        if len(self._vocab_list) != len(self.vocab):
            self._vocab_list = sorted(self.vocab)
        return self._vocab_list

    def _expand(self, log_prob, seq, vocab_words):
        """Return the one-word extensions of `seq` as (log_prob, new_seq) pairs."""
        context = tuple(seq[-self.max_order:]) if self.max_order > 0 else ()

        # Back off from the longest available context down to the empty one.
        for order in range(len(context), -1, -1):
            state = context[len(context) - order:]
            if state not in self.transitions[order]:
                continue
            best = heapq.nlargest(self.beam_width, self.probs[order][state].items(),
                                  key=lambda item: item[1])
            extensions = []
            for next_word, prob in best:
                if prob <= 0.0:
                    # With smoothing_alpha == 0 the '<UNK>' slot has zero
                    # probability; log(0) is undefined, so skip it.
                    continue
                if next_word == self.UNK_TOKEN:
                    next_word = random.choice(vocab_words)
                extensions.append((log_prob + math.log(prob), seq + [next_word]))
            return extensions

        # No context is known at all (not even the empty one): fall back to a
        # uniform draw, never asking for more words than the vocabulary holds.
        uniform_log_prob = math.log(1.0 / len(vocab_words))
        sample_size = min(self.beam_width, len(vocab_words))
        return [(log_prob + uniform_log_prob, seq + [word])
                for word in random.sample(vocab_words, sample_size)]

# Build and train the model. `markov` is only published once training has
# succeeded; it stays None otherwise, so the batch predictions below are skipped
# instead of failing with NameError or running on a half-trained model.
markov = None
try:
    trained_model = EnhancedMarkovChain(max_order=3, smoothing_alpha=1.0, beam_width=3)
    trained_model.train(all_data)
    markov = trained_model
    print("Enhanced Markov Chain trained!")

    test_seed = "The sun"
    print(f"Seed: {test_seed}")
    print(f"Prediction: {markov.generate(test_seed, num_words=3)}")
except Exception as e:
    print(f"Session crashed: {str(e)}")

# Seed phrases used to eyeball the quality of the three-word predictions.
test_sentences = [
    "She walked through the",     # 1
    "He decided to take",         # 2
    "He opened the door",         # 3
    "She smiled at her",          # 4
    "A strange sound echoed",     # 5
    "The bell rang at",           # 6
    "The old house was",          # 7
    "A loud noise came",          # 8
    "The teacher explained the",  # 9
    "The cat jumped onto",        # 10
    "The dog barked at",          # 11
    "She painted the walls",      # 12
    "The car stopped at",         # 13
    "He shouted across the",      # 14
    "He drove to the",            # 15
    "The moon glowed brightly",   # 16
    "A cold breeze swept",        # 17
    "He jumped into the",         # 18
    "He stared at the",           # 19
    "The fire crackled in",       # 20
    "I heard a loud",             # 21
    "The river flowed through",   # 22
    "He climbed over the",        # 23
    "A small boat sailed",        # 24
    "The sun set behind",         # 25
]

if markov is None:
    print("Skipping test sentences: the model was not trained.")
else:
    for sentence in test_sentences:
        print(f"\nInput: {sentence}")
        print(f"Prediction: {markov.generate(sentence, num_words=3)}")

Processed data1.txt. Sample: One morning, when Gregor Samsa woke from troubled dreams, he found himself transformed in his bed in
Processed data2.txt. Sample:  Project Gutenberg's The Adventures of Sherlock Holmes, by Arthur Conan Doyle  This eBook is for the
Processed data3.txt. Sample: The sun was shining brightly in the clear blue sky, and a gentle breeze rustled the leaves of the ta
Processed data4.txt. Sample:     Chapter 1        It is a truth universally acknowledged, that a single man in       possession o
Total preprocessed text length: 1797163
Sample of combined preprocessed text: One morning, when Gregor Samsa woke from troubled dreams, he found himself transformed in his bed in
Vocabulary size: 34751
Trained with 34751 unique words and 479778 transitions.
Enhanced Markov Chain trained!
Seed: The sun
Prediction: was shining with

Input: She walked through the
Prediction: dense vegetation, home

Input: He decided to take
Prediction: a train to

Input: He opened the door
Predi

In [None]:
# Interactive prompt: predict one next word for each seed the user types until
# they enter 'exit'.
try:
    while True:
        user_input = input("\nEnter a seed sentence (or type 'exit' to quit): ")
        # Compare the trimmed text so that "exit " or " Exit" also quits.
        if user_input.strip().lower() == 'exit':
            print("Exiting prediction mode. Goodbye!")
            break
        print(f"Prediction: {markov.generate(user_input, num_words=1)}")
except (KeyboardInterrupt, EOFError):
    # A kernel interrupt or a closed stdin is an ordinary way to leave the
    # prompt, not a crash.
    print("\nExiting prediction mode. Goodbye!")
except Exception as e:
    print(f"Interactive session crashed: {str(e)}")



Enter a seed sentence (or type 'exit' to quit): The river flowed through
Prediction: the

Enter a seed sentence (or type 'exit' to quit): the sun is
Prediction: for

Enter a seed sentence (or type 'exit' to quit): I am 
Prediction: not

Enter a seed sentence (or type 'exit' to quit): what are you
Prediction: doing?

Enter a seed sentence (or type 'exit' to quit): what are you
Prediction: doing?

Enter a seed sentence (or type 'exit' to quit): exit
Exiting prediction mode. Goodbye!
