# Predicting the Next Word Using an N-gram Model

In [1]:
# Import necessary libraries

import nltk
from nltk import bigrams, trigrams
from nltk.corpus import reuters
from collections import defaultdict

In [4]:
# Download necessary NLTK resources
# - 'reuters': the corpus read below through nltk.corpus.reuters.
# - 'punkt' / 'punkt_tab': tokenizer data (punkt_tab unpacks under tokenizers/);
#   presumably what nltk.word_tokenize relies on -- confirm against the installed NLTK version.
# The last call is a bare expression, so its return value becomes the cell's displayed output.

nltk.download('reuters')
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package reuters to /Users/bengj/nltk_data...
[nltk_data]   Package reuters is already up-to-date!
[nltk_data] Downloading package punkt to /Users/bengj/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /Users/bengj/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [5]:
# Tokenization
# Glue the Reuters corpus tokens into one space-separated string, then run
# NLTK's word tokenizer over that string to obtain the final token list.

corpus_text = ' '.join(reuters.words())
words = nltk.word_tokenize(corpus_text)

In [7]:
# Create trigrams
# Materialize every run of three consecutive tokens as one list entry.

tri_grams = [triple for triple in trigrams(words)]

In [8]:
# Build a trigram model
# Two-level mapping: (w1, w2) -> {w3: value}. A missing entry at either level
# is created on first access, and a fresh inner value starts at 0.

model = defaultdict(lambda: defaultdict(int))

In [10]:
# Count frequency of co-occurrence
# Shows how often a third word follows a given pair of words: for every
# trigram, bump the counter stored under its leading pair for its last word.

for first, second, third in tri_grams:
    context = (first, second)
    model[context][third] += 1

In [11]:
# Transform the counts into probabilities
# Each pair's follower table is rescaled in place so that its values sum to 1:
# every count is divided by the total count recorded for that pair.

for followers in model.values():
    pair_total = float(sum(followers.values()))
    for candidate in followers:
        followers[candidate] /= pair_total

In [12]:
# Function to predict the next word
# Takes a bigram (w1, w2) and uses the trigram model to find the most probable third word.

def predict_next_word(w1, w2):
    """Return the word with the highest stored value after the pair (w1, w2).

    Args:
        w1: First context word.
        w2: Second context word.

    Returns:
        The key of the follower table for (w1, w2) that has the largest
        value, or the string "No prediction available" when the pair has
        no entry (or an empty one) in the module-level `model`.
    """
    # Look the pair up with .get() instead of model[w1, w2]: `model` is a
    # defaultdict, so subscripting an unseen pair would insert an empty entry
    # and every failed query would permanently grow the model.
    candidates = model.get((w1, w2))
    if not candidates:
        return "No prediction available"
    # Choose the most likely word to occur next.
    return max(candidates, key=candidates.get)

In [13]:
# Ask the model which word it expects after "I am".
suggestion = predict_next_word('I', 'am')
print("Next word: ", suggestion)

Next word:  sure


In [14]:
# Ask the model which word it expects after "Let me".
suggestion = predict_next_word('Let', 'me')
print("Next word: ", suggestion)

Next word:  No prediction available


In [19]:
# Ask the model which word it expects after "in front".
suggestion = predict_next_word('in', 'front')
print("Next word: ", suggestion)

Next word:  of
