In [21]:
import re
def preprocess(text):
    text = text.lower()
    text = re.sub(r'[^\w\s]', '', text) #keep words/numbers/whitespace
    return text.split()

#test preprocessing
print(preprocess("Hello! How's the weather?"))

['hello', 'hows', 'the', 'weather']


In [None]:
import re
from collections import defaultdict
from preprocess import preprocess

class NGramModel:
    def __init__(self, n=3):
        self.n = n
        self.ngrams = defaultdict(lambda: defaultdict(int))
        self.vocab = set()

    def train(self, corpus):
        #collect n-grams and vocabulary
        print(f"Training on {len(corpus)} sentences...")
        for sentence in corpus:
            tokens = preprocess(sentence)
            if not tokens:
                continue
            self.vocab.update(tokens)
            print(f"Processing sentence: {tokens}")  # Debug

            for i in range(len(tokens)-self.n+1):
                context = tuple(tokens[i:i+self.n-1])
                next_word = tokens[i+self.n-1]
                self.ngrams[context][next_word]+=1
                print(f"Added n-gram: {context} -> {next_word}")  # Debug

        #add laplace smoothing
        self._apply_smoothing()
        print(f"Vocabulary size: {len(self.vocab)}")
        print(f"Total contexts: {len(self.ngrams)}")

    def _apply_smoothing(self, alpha=0.1):
        vocab_size = len(self.vocab)
        for context in self.ngrams:
            total = sum(self.ngrams[context].values())+alpha*vocab_size
            for word in self.vocab:
                self.ngrams[context][word] = (self.ngrams[context].get(word, 0) + alpha)/total

    def prune_model(self, min_count=2):
        for context in list(self.ngrams.keys()):
            self.ngrams[context] = {
                k: v for k, v in self.ngrams[context].items() 
                if v >= min_count
            }
            if not self.ngrams[context]:
                del self.ngrams[context]


    #enhanced prediction with backoff
    def predict(self, text, top_k=3):
        tokens = preprocess(text)
        print(f"Input tokens: {tokens}")  # Debug

        # Handle context window
        context = tuple(tokens[-(self.n-1):]) if len(tokens) >= self.n-1 else tuple(tokens)
        print(f"Initial context: {context}")  # Debug


        #backoff mechanism
        while len(context)>0:
            if context in self.ngrams:
                suggestions = sorted(self.ngrams[context].items(),key=lambda x: -x[1])[:top_k]
                return [word for word, prob in suggestions]
            context = context[1:] if len(context) > 1 else tuple()
        print(f"Backing off to: {context}")  # Debug
        #remove oldest word for backoff
            
        
        #final fallback to unigrams (most common words)
# Final fallback to most common words
        common_words = sorted(self.vocab, 
                            key=lambda x: sum(m[x] for m in self.ngrams.values()),
                            reverse=True)[:top_k]
        return common_words

In [23]:
import nltk
nltk.download('punkt')
from nltk.tokenize import sent_tokenize

with open('textfile.txt', 'r', encoding='utf-8') as file:
    corpus = sent_tokenize(file.read())  # Split into proper sentences

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Shiva\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [24]:


#initialize and train model
model = NGramModel(n=3)
model.train(corpus=corpus)

#test predictions
print(model.predict("I enjoy"))

print(model.predict("science"))

print(model.predict("unknown text"))

Training on 2355 sentences...
Processing sentence: ['the', 'tragedie', 'of', 'hamlet', 'by', 'william', 'shakespeare', '1599', 'actus', 'primus']


KeyError: ('the', 'tragedie')

In [None]:
# File: interface.py
def interactive_demo(model):
    while True:
        user_input = input("\nType your text (press 'q' to quit):\n")
        if user_input.lower() == 'q':
            break
            
        predictions = model.predict(user_input)
        print(f"Suggestions: {predictions}")

# Run the demo
if __name__ == "__main__":
    interactive_demo(model)

save model


In [None]:
import pickle

# Save
with open('model.pkl', 'wb') as f:
    pickle.dump(model, f)

# Load
# with open('model.pkl', 'rb') as f:
#     loaded_model = pickle.load(f)

AttributeError: Can't pickle local object 'NGramModel.__init__.<locals>.<lambda>'