In [None]:
import numpy as np
import nltk
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag
from collections import defaultdict, Counter

# Fetch every NLTK resource this notebook relies on (tokenizer models,
# POS-tagger models, corpora and the tagset mapping). The packages are
# requested one at a time, in the order listed.
for _nltk_resource in (
    'punkt',
    'averaged_perceptron_tagger',
    'brown',
    'treebank',
    'universal_tagset',
    'punkt_tab',
    'averaged_perceptron_tagger_eng',
):
    nltk.download(_nltk_resource)

In [None]:
class HMMNextWordPredictor:
    """Next-word predictor driven by part-of-speech (POS) statistics.

    Three tables are estimated from a POS-tagged training corpus:

    * ``pos_transitions[pos][next_pos]``  - P(next_pos | pos)
    * ``pos_emissions[pos][word]``        - P(word | pos)
    * ``pos_sequence_to_next_words[(prev_pos, pos)][next_word]``
      - P(next_word | prev_pos, pos)

    Words are lowercased before they are stored. The model is trained as
    part of construction.
    """

    def __init__(self, corpus=None):
        """Build the tables and train immediately.

        Args:
            corpus: Iterable of tokenized sentences (each a sequence of
                word strings). When omitted, the 'news' category of the
                Brown corpus is used.
        """
        # Training corpus
        if corpus is None:
            corpus = nltk.corpus.brown.sents(categories=['news'])
        self.corpus = corpus

        # Dictionary to store transition probabilities between POS tags
        self.pos_transitions = defaultdict(Counter)

        # Dictionary to store emission probabilities (POS tags to words)
        self.pos_emissions = defaultdict(Counter)

        # Dictionary to store POS tag sequences and their next words
        self.pos_sequence_to_next_words = defaultdict(Counter)

        # Train the model
        self._train()

    def _train(self):
        """Train the HMM model on the corpus.

        Sentences with fewer than three tokens are skipped. Every remaining
        sentence is POS-tagged and counted, then all counts are converted
        to probabilities.
        """
        print("Training the model...")

        for sentence in self.corpus:
            # Skip short sentences
            if len(sentence) < 3:
                continue

            # Tag on the original casing; lowercasing happens when counting.
            self._update_counts(pos_tag(sentence))

        # Convert counts to probabilities
        self._normalize_probabilities()

        print("Model training complete!")

    def _update_counts(self, tagged_sentence):
        """Add one tagged sentence to the raw count tables.

        Args:
            tagged_sentence: Sequence of ``(word, pos)`` pairs.
        """
        tagged = [(word.lower(), pos) for word, pos in tagged_sentence]
        last_index = len(tagged) - 1

        for i, (word, pos) in enumerate(tagged):
            # Every token emits its word, including the final one; stopping
            # one token early would drop the sentence-final emission.
            self.pos_emissions[pos][word] += 1

            if i == last_index:
                break  # no successor, so no transition to record

            next_word, next_pos = tagged[i + 1]

            # Transition count (current POS -> next POS)
            self.pos_transitions[pos][next_pos] += 1

            # (previous POS, current POS) -> next word; needs a predecessor
            if i > 0:
                prev_pos = tagged[i - 1][1]
                self.pos_sequence_to_next_words[(prev_pos, pos)][next_word] += 1

    def _normalize_probabilities(self):
        """Convert frequency counts to probability distributions.

        Each inner counter is divided in place by its own total, so the
        values under every outer key sum to 1.
        """
        tables = (
            self.pos_transitions,
            self.pos_emissions,
            self.pos_sequence_to_next_words,
        )
        for table in tables:
            for counts in table.values():
                total = sum(counts.values())
                if not total:
                    continue  # nothing counted; avoid dividing by zero
                for key in counts:
                    counts[key] /= total

    def _backoff_scores(self, last_pos):
        """Score next words from the last POS tag alone.

        For each word the products P(next_pos | last_pos) * P(word | next_pos)
        are summed over all next tags, so a word that can carry several
        tags gets a single combined score.

        Args:
            last_pos: POS tag of the final input token.

        Returns:
            dict: word -> score; empty when ``last_pos`` was never seen.
        """
        scores = {}
        # .get() keeps lookups of unseen tags from inserting empty entries
        # into the defaultdict tables.
        for next_pos, trans_prob in self.pos_transitions.get(last_pos, {}).items():
            for word, emit_prob in self.pos_emissions.get(next_pos, {}).items():
                scores[word] = scores.get(word, 0.0) + trans_prob * emit_prob
        return scores

    def _predict_from_tags(self, second_last_pos, last_pos, n=5):
        """Rank next-word candidates given the last two POS tags.

        The (second_last_pos, last_pos) table is consulted first. If it
        yields fewer than ``n`` words, candidates scored from ``last_pos``
        alone are added for words not already present, and the merged
        list is sorted by score.

        Args:
            second_last_pos: POS tag of the second-to-last token.
            last_pos: POS tag of the last token.
            n (int): Maximum number of predictions to return.

        Returns:
            list: Up to ``n`` ``(word, score)`` tuples, highest score first,
            with each word appearing at most once.
        """
        if n <= 0:
            return []

        sequence_words = self.pos_sequence_to_next_words.get(
            (second_last_pos, last_pos)
        )
        scores = dict(sequence_words.most_common(n)) if sequence_words else {}

        if len(scores) < n:
            # Keep the sequence-model score when a word is found by both
            # methods instead of listing the word twice.
            for word, score in self._backoff_scores(last_pos).items():
                scores.setdefault(word, score)

        # sorted() is stable, so ties keep sequence-model words first.
        ranked = sorted(scores.items(), key=lambda item: item[1], reverse=True)
        return ranked[:n]

    def predict_next_word(self, text, n=5):
        """
        Predict the most likely next words given the input text.

        Args:
            text (str): Input text
            n (int): Number of predictions to return

        Returns:
            list: Top n predicted words with their probabilities, as
            ``(word, probability)`` tuples. When ``text`` has fewer than
            two tokens, a single ``(message, 0)`` tuple is returned instead.
        """
        # Tokenize and get POS tags
        tokens = word_tokenize(text)
        if len(tokens) < 2:
            return [("Need at least 2 words for prediction", 0)]

        tagged_tokens = pos_tag(tokens)

        # Only the tags of the last two tokens drive the prediction
        second_last_pos = tagged_tokens[-2][1]
        last_pos = tagged_tokens[-1][1]

        return self._predict_from_tags(second_last_pos, last_pos, n)

    def interactive_prediction(self):
        """Interactive console for next word prediction.

        Reads lines until the user types 'exit' (any casing, surrounding
        whitespace ignored), or until input ends or is interrupted.
        """
        print("Welcome to HMM-based Next Word Prediction!")
        print("Type a sentence and get predictions for the next word.")
        print("Type 'exit' to quit.")

        while True:
            try:
                text = input("\nEnter text: ")
            except (EOFError, KeyboardInterrupt):
                # Closed stdin or Ctrl-C / kernel interrupt: leave quietly
                print()
                break

            if text.strip().lower() == 'exit':
                break

            predictions = self.predict_next_word(text)

            print("\nPredicted next words:")
            for i, (word, prob) in enumerate(predictions, 1):
                print(f"{i}. {word} (probability: {prob:.4f})")


# Example usage
if __name__ == "__main__":
    # Constructing the predictor also trains it (training is triggered
    # from the constructor).
    predictor = HMMNextWordPredictor()

    # Example prediction
    test_text = "The president of the"
    predictions = predictor.predict_next_word(test_text)

    print(f"\nInput: '{test_text}'")
    print("Predicted next words:")
    for word, prob in predictions:
        print(f"- {word} (probability: {prob:.4f})")

    # Start interactive mode. An interrupt (Ctrl-C / kernel interrupt) or
    # a closed input stream ends the session instead of surfacing a
    # traceback.
    try:
        predictor.interactive_prediction()
    except (KeyboardInterrupt, EOFError):
        print("\nExiting.")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Package brown is already up-to-date!
[nltk_data] Downloading package treebank to /root/nltk_data...
[nltk_data]   Unzipping corpora/treebank.zip.
[nltk_data] Downloading package universal_tagset to /root/nltk_data...
[nltk_data]   Unzipping taggers/universal_tagset.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger_eng.zip.


Training the model...
Model training complete!

Input: 'The president of the'
Predicted next words:
- new (probability: 0.0102)
- first (probability: 0.0099)
- state (probability: 0.0091)
- year (probability: 0.0071)
- `` (probability: 0.0069)
Welcome to HMM-based Next Word Prediction!
Type a sentence and get predictions for the next word.
Type 'exit' to quit.

Enter text: how are

Predicted next words:
1. you (probability: 1.0000)
2. the (probability: 0.0734)
3. to (probability: 0.0601)
4. a (probability: 0.0243)
5. , (probability: 0.0240)

Enter text: are you

Predicted next words:
1. will (probability: 0.1163)
2. . (probability: 0.0698)
3. can (probability: 0.0698)
4. have (probability: 0.0698)
5. '' (probability: 0.0465)

Enter text: are you 

Predicted next words:
1. will (probability: 0.1163)
2. . (probability: 0.0698)
3. can (probability: 0.0698)
4. have (probability: 0.0698)
5. '' (probability: 0.0465)

Enter text: what is

Predicted next words:
1. been (probability: 0.1200)
2.

KeyboardInterrupt: Interrupted by user