# In [None]:
import random
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

# Fetch the NLTK resources this script relies on: the Punkt tokenizer
# models used by word_tokenize and the stopwords corpus.
# NLTK 3.8.2+ loads Punkt from the 'punkt_tab' package instead of the
# pickled 'punkt' one, so both are requested to work across versions.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

def read_file(filename):
    """Return the full contents of ``filename``, decoded as UTF-8 text."""
    with open(filename, mode='r', encoding='utf-8') as handle:
        contents = handle.read()
    return contents

def preprocess_text(text):
    """Tokenize ``text`` and return its lowercased, filtered words.

    The text is split with ``word_tokenize``; tokens that are not purely
    alphabetic are dropped, the rest are lowercased, and any that appear
    in NLTK's English stopword list are removed.

    Args:
        text: The raw text to process.

    Returns:
        A list of lowercase words in their original order.
    """
    # Build the stopword lookup once: calling stopwords.words() for every
    # token re-reads the word list, and a set gives O(1) membership tests.
    english_stopwords = set(stopwords.words('english'))
    lowered = (token.lower() for token in word_tokenize(text) if token.isalpha())
    return [word for word in lowered if word not in english_stopwords]

def build_markov_chain(words, chain_length):
    """Map each run of ``chain_length`` words to the words that follow it.

    Every window of ``chain_length`` consecutive words becomes a tuple key;
    its value is the list of words seen immediately after that window, in
    order of occurrence (duplicates are kept).
    """
    transitions = {}
    window_count = len(words) - chain_length
    for start in range(window_count):
        stop = start + chain_length
        state = tuple(words[start:stop])
        follower = words[stop]
        transitions.setdefault(state, []).append(follower)
    return transitions

def generate_sentence(markov_chain, start_words, num_generated):
    """Generate text by walking ``markov_chain`` starting from ``start_words``.

    Args:
        markov_chain: Mapping from a tuple of words to the list of words
            that may follow it.
        start_words: Seed words. They begin the output, and their count
            sets the width of the sliding lookup window.
        num_generated: Maximum number of words to append after the seed.

    Returns:
        The seed words followed by the generated words, joined by single
        spaces. Generation stops early as soon as the current window is
        not a key of ``markov_chain``.
    """
    window = len(start_words)
    generated_sentence = list(start_words)
    current_words = tuple(start_words)
    for _ in range(num_generated):
        if current_words not in markov_chain:
            break
        generated_sentence.append(random.choice(markov_chain[current_words]))
        # Slice from an explicit start index: ``[-window:]`` would return
        # the whole list when the window is 0 (empty start_words), which
        # made the key grow and ended generation after a single word.
        current_words = tuple(generated_sentence[len(generated_sentence) - window:])
    return ' '.join(generated_sentence)

def generate(filename: str, start_words: list[str], chain_length: int, num_generated: int) -> str:
    """Read ``filename``, build a Markov chain from it, and generate text.

    The file's text is preprocessed into words, a chain keyed on
    ``chain_length``-word tuples is built from them, and a sentence is
    generated from ``start_words`` with up to ``num_generated`` extra words.
    """
    tokens = preprocess_text(read_file(filename))
    chain = build_markov_chain(tokens, chain_length)
    sentence = generate_sentence(chain, start_words, num_generated)
    return sentence

# Example usage. Guarded so the file read and print only happen when this
# is executed directly (script or notebook cell), not when it is imported.
if __name__ == "__main__":
    filename = "your_filename.txt"  # Provide the filename
    start_words = ["start", "words"]  # Provide start words
    # The lookup window in generate_sentence is len(start_words) wide, so
    # the chain is built with keys of the same length.
    chain_length = len(start_words)
    num_generated = 50  # Number of words to generate
    generated_text = generate(filename, start_words, chain_length, num_generated)
    print(generated_text)
