This notebook builds an N-gram language model for predicting the next word in a sentence. The model is based on the frequencies of n-grams extracted from a text corpus. For simplicity we build the model with bigrams or trigrams, but the same approach generalizes to any N.

In [1]:
# Importing necessary libraries
import nltk
from nltk import ngrams
from collections import defaultdict, Counter
import random

# Download the tokenizer data needed by nltk.sent_tokenize / nltk.word_tokenize
# (skipped by NLTK if already present).
# NLTK >= 3.8.2 loads the 'punkt_tab' resource instead of the pickled 'punkt'
# models, so both are requested to keep the notebook runnable on old and new
# releases. NOTE(review): on older NLTK the unknown 'punkt_tab' id is expected to
# be reported as a download error message rather than raised -- confirm.
nltk.download('punkt')
nltk.download('punkt_tab')

# Sample text corpus: a short three-sentence passage about machine learning.
# It can be replaced with any larger text corpus; the rest of the notebook
# only relies on `corpus` being a single string.
corpus = """
Machine learning is the study of computer algorithms that improve automatically through experience.
It is seen as a subset of artificial intelligence. Machine learning algorithms build a model based on
sample data, known as training data, in order to make predictions or decisions without being explicitly
programmed to do so.
"""

# Tokenizing the corpus into sentences and words
def tokenize_text(corpus):
    """Split ``corpus`` into sentences and tokenize each one in lowercase.

    Args:
        corpus: Raw text as a single string.

    Returns:
        A list with one entry per sentence; each entry is the list of
        lowercase word/punctuation tokens of that sentence.
    """
    tokenized_sentences = []
    for raw_sentence in nltk.sent_tokenize(corpus):
        # Lowercase before word tokenization so the model is case-insensitive.
        lowered = raw_sentence.lower()
        tokenized_sentences.append(nltk.word_tokenize(lowered))
    return tokenized_sentences

# Tokenize the sample corpus: `tokens` holds one list of lowercase tokens per sentence.
tokens = tokenize_text(corpus)

# Show the tokenized sentences.
print(tokens)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


[['machine', 'learning', 'is', 'the', 'study', 'of', 'computer', 'algorithms', 'that', 'improve', 'automatically', 'through', 'experience', '.'], ['it', 'is', 'seen', 'as', 'a', 'subset', 'of', 'artificial', 'intelligence', '.'], ['machine', 'learning', 'algorithms', 'build', 'a', 'model', 'based', 'on', 'sample', 'data', ',', 'known', 'as', 'training', 'data', ',', 'in', 'order', 'to', 'make', 'predictions', 'or', 'decisions', 'without', 'being', 'explicitly', 'programmed', 'to', 'do', 'so', '.']]


In [2]:
# Re-display the tokenized corpus (same data that the previous cell printed).
print(tokens)

[['machine', 'learning', 'is', 'the', 'study', 'of', 'computer', 'algorithms', 'that', 'improve', 'automatically', 'through', 'experience', '.'], ['it', 'is', 'seen', 'as', 'a', 'subset', 'of', 'artificial', 'intelligence', '.'], ['machine', 'learning', 'algorithms', 'build', 'a', 'model', 'based', 'on', 'sample', 'data', ',', 'known', 'as', 'training', 'data', ',', 'in', 'order', 'to', 'make', 'predictions', 'or', 'decisions', 'without', 'being', 'explicitly', 'programmed', 'to', 'do', 'so', '.']]


In [3]:
# Function to build N-gram language model (default is bigram model)
def build_ngram_model(tokens, n=2):
    """Count next-word frequencies for every (n-1)-token context in ``tokens``.

    Args:
        tokens: Iterable of tokenized sentences (each a sequence of tokens).
            N-grams are extracted per sentence, so they never span two sentences.
        n: Order of the model (2 = bigram, 3 = trigram, ...). Must be >= 1.

    Returns:
        A ``defaultdict(Counter)`` mapping each context tuple (the first
        ``n - 1`` tokens of an n-gram) to a ``Counter`` of the words that
        were observed right after that context.

    Raises:
        ValueError: If ``n`` is smaller than 1, which does not describe a
            valid N-gram order.
    """
    # Reject meaningless orders up front instead of silently returning an
    # empty or ill-defined model.
    if n < 1:
        raise ValueError(f"n must be a positive integer, got {n!r}")

    ngrams_model = defaultdict(Counter)
    for sentence in tokens:
        for ngram in ngrams(sentence, n):
            # The last token is the word to predict; everything before it is
            # the context (an empty tuple for a unigram model).
            context, next_word = ngram[:-1], ngram[-1]
            ngrams_model[context][next_word] += 1
    return ngrams_model

# Function to predict the next word using the trained N-gram model
def predict_next_word(model, context, n=2):
    """Return the most frequent continuation of ``context`` under an N-gram model.

    Args:
        model: Mapping from context tuples of length ``n - 1`` to a mapping of
            next-word counts (e.g. the result of ``build_ngram_model``).
        context: Sequence of preceding tokens; only the last ``n - 1`` tokens
            are used for the lookup.
        n: Order of the N-gram model that ``model`` was built with.

    Returns:
        The next word with the highest count, or ``None`` when the context was
        never seen or has no recorded continuations. When several words share
        the highest count, the first one encountered while iterating the
        counts is returned (behaviour of ``max``).
    """
    history_len = n - 1
    # ``context[-0:]`` would be the *whole* sequence, so the unigram case
    # (n=1) must map explicitly to the empty context ``()``.
    key = tuple(context[-history_len:]) if history_len > 0 else ()

    # ``.get`` never inserts a new entry, even when ``model`` is a defaultdict.
    possible_words = model.get(key)
    if not possible_words:
        return None  # Unseen context, or a context without any continuation
    return max(possible_words, key=possible_words.get)



In [4]:
# Train a bigram (2-gram) model: each key is a one-word context tuple and each
# value is a Counter of the words observed immediately after that word.
bigram_model = build_ngram_model(tokens, n=2)

# Inspect the raw counts.
print(bigram_model)

defaultdict(<class 'collections.Counter'>, {('machine',): Counter({'learning': 2}), ('learning',): Counter({'is': 1, 'algorithms': 1}), ('is',): Counter({'the': 1, 'seen': 1}), ('the',): Counter({'study': 1}), ('study',): Counter({'of': 1}), ('of',): Counter({'computer': 1, 'artificial': 1}), ('computer',): Counter({'algorithms': 1}), ('algorithms',): Counter({'that': 1, 'build': 1}), ('that',): Counter({'improve': 1}), ('improve',): Counter({'automatically': 1}), ('automatically',): Counter({'through': 1}), ('through',): Counter({'experience': 1}), ('experience',): Counter({'.': 1}), ('it',): Counter({'is': 1}), ('seen',): Counter({'as': 1}), ('as',): Counter({'a': 1, 'training': 1}), ('a',): Counter({'subset': 1, 'model': 1}), ('subset',): Counter({'of': 1}), ('artificial',): Counter({'intelligence': 1}), ('intelligence',): Counter({'.': 1}), ('build',): Counter({'a': 1}), ('model',): Counter({'based': 1}), ('based',): Counter({'on': 1}), ('on',): Counter({'sample': 1}), ('sample',):

In [5]:
# Test the bigram model by predicting the next word in a given context.
# With n=2 only the last context word ("learning") is used for the lookup.
# In the sample corpus "learning" is followed once by "is" and once by
# "algorithms"; the tie resolves to "is", the first continuation recorded.
context = ["machine", "learning"]
predicted_word = predict_next_word(bigram_model, context, n=2)

print(f"Context: {' '.join(context)}")
print(f"Predicted next word: {predicted_word}")



Context: machine learning
Predicted next word: is


In [6]:
# Function to predict multiple words to complete the sentence
def complete_sentence(model, seed_text, num_words=5, n=2):
    """Greedily extend ``seed_text`` with the model's most likely next words.

    Args:
        model: N-gram model passed through to ``predict_next_word``.
        seed_text: List of starting tokens; it is copied, not modified.
        num_words: Maximum number of words to append.
        n: Order of the N-gram model.

    Returns:
        A new list containing the seed tokens followed by up to ``num_words``
        predicted tokens. Generation ends early as soon as no next word can
        be predicted.
    """
    words = seed_text.copy()
    for _step in range(num_words):
        candidate = predict_next_word(model, words, n)
        if candidate is None:
            # Nothing can be predicted from here, so return what we have.
            return words
        words.append(candidate)
    return words

# Greedily extend the seed context by up to 5 words, always taking the most
# frequent next word according to the bigram model.
completed_sentence = complete_sentence(bigram_model, context, num_words=5, n=2)
print(f"Completed sentence: {' '.join(completed_sentence)}")


Completed sentence: machine learning is the study of computer
