### Importing libraries

In [None]:
# Import in this cell too: it runs before the notebook's main import cell, so
# on a fresh kernel `nltk` would otherwise be undefined here.
import nltk

# Tokenizer models needed by word_tokenize. Recent NLTK releases look up the
# 'punkt_tab' resource, older ones use 'punkt'; fetch both so the notebook
# works with either.
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [None]:
import nltk
from nltk import word_tokenize
from nltk.util import ngrams
from collections import Counter


### Corpus

In [None]:
# Built-in sample corpus, used when the reader does not type anything.
text = """
The quick brown fox jumps over the lazy dog.
This is a sample text corpus for demonstration.
It contains multiple sentences with different words.
"""
# Let the reader supply their own corpus. An empty answer falls back to the
# sample above instead of leaving every later cell with nothing to work on.
text = input("Enter the text sample (leave blank to use the built-in sample): ").strip() or text.strip()

Enter the text sampleThe quick brown fox jumps over the lazy dog. This is a sample text corpus for demonstration. It contains multiple sentences with different words.


In [None]:
# Tokenize the text into words.
# The text is lower-cased first, so "The" and "the" end up as the same token;
# punctuation marks such as "." are kept as tokens of their own (see output).
token = word_tokenize(text.lower())
print(token)

['the', 'quick', 'brown', 'fox', 'jumps', 'over', 'the', 'lazy', 'dog', '.', 'this', 'is', 'a', 'sample', 'text', 'corpus', 'for', 'demonstration', '.', 'it', 'contains', 'multiple', 'sentences', 'with', 'different', 'words', '.']


### 1. Unigrams

In [None]:
# An n-gram of order 1 is just each token wrapped in a 1-tuple.
unigram = [gram for gram in ngrams(token, 1)]
print("Unigram:", unigram)

Unigram: [('the',), ('quick',), ('brown',), ('fox',), ('jumps',), ('over',), ('the',), ('lazy',), ('dog',), ('.',), ('this',), ('is',), ('a',), ('sample',), ('text',), ('corpus',), ('for',), ('demonstration',), ('.',), ('it',), ('contains',), ('multiple',), ('sentences',), ('with',), ('different',), ('words',), ('.',)]


### 2. Bigrams

In [None]:
# Slide a window of two consecutive tokens over the corpus.
bigram = [*ngrams(token, 2)]
print("Bigrams:", bigram)

Bigrams: [('the', 'quick'), ('quick', 'brown'), ('brown', 'fox'), ('fox', 'jumps'), ('jumps', 'over'), ('over', 'the'), ('the', 'lazy'), ('lazy', 'dog'), ('dog', '.'), ('.', 'this'), ('this', 'is'), ('is', 'a'), ('a', 'sample'), ('sample', 'text'), ('text', 'corpus'), ('corpus', 'for'), ('for', 'demonstration'), ('demonstration', '.'), ('.', 'it'), ('it', 'contains'), ('contains', 'multiple'), ('multiple', 'sentences'), ('sentences', 'with'), ('with', 'different'), ('different', 'words'), ('words', '.')]


### 3. Trigrams

In [None]:
# Slide a window of three consecutive tokens over the corpus.
trigram = [*ngrams(token, 3)]
print("Trigram:", trigram)


Trigram: [('the', 'quick', 'brown'), ('quick', 'brown', 'fox'), ('brown', 'fox', 'jumps'), ('fox', 'jumps', 'over'), ('jumps', 'over', 'the'), ('over', 'the', 'lazy'), ('the', 'lazy', 'dog'), ('lazy', 'dog', '.'), ('dog', '.', 'this'), ('.', 'this', 'is'), ('this', 'is', 'a'), ('is', 'a', 'sample'), ('a', 'sample', 'text'), ('sample', 'text', 'corpus'), ('text', 'corpus', 'for'), ('corpus', 'for', 'demonstration'), ('for', 'demonstration', '.'), ('demonstration', '.', 'it'), ('.', 'it', 'contains'), ('it', 'contains', 'multiple'), ('contains', 'multiple', 'sentences'), ('multiple', 'sentences', 'with'), ('sentences', 'with', 'different'), ('with', 'different', 'words'), ('different', 'words', '.')]


### 4. Bigram Probabilities

In [None]:
# Conditional bigram probabilities:
#   P(second | first) = count(first, second) / count(first)
b_count = Counter(bigram)
total = len(bigram)  # number of bigrams; not used by the estimate below

# Count every token once up front instead of rescanning the whole token list
# with token.count() for each distinct bigram.
unigram_count = Counter(token)
b_probs = {
    bi: count / unigram_count[bi[0]]
    for bi, count in b_count.items()
}

print("\nBigram Probabilities:")
for b, prob in b_probs.items():
    print(b, ":", prob)


Bigram Probabilities:
('the', 'quick') : 0.5
('quick', 'brown') : 1.0
('brown', 'fox') : 1.0
('fox', 'jumps') : 1.0
('jumps', 'over') : 1.0
('over', 'the') : 1.0
('the', 'lazy') : 0.5
('lazy', 'dog') : 1.0
('dog', '.') : 1.0
('.', 'this') : 0.3333333333333333
('this', 'is') : 1.0
('is', 'a') : 1.0
('a', 'sample') : 1.0
('sample', 'text') : 1.0
('text', 'corpus') : 1.0
('corpus', 'for') : 1.0
('for', 'demonstration') : 1.0
('demonstration', '.') : 1.0
('.', 'it') : 0.3333333333333333
('it', 'contains') : 1.0
('contains', 'multiple') : 1.0
('multiple', 'sentences') : 1.0
('sentences', 'with') : 1.0
('with', 'different') : 1.0
('different', 'words') : 1.0
('words', '.') : 1.0


### 5. Next Word Prediction

In [None]:
def next_word(prev_word, top_n=3, probs=None):
    """Return the most probable words to follow ``prev_word``.

    Args:
        prev_word: Word to condition on. It is lower-cased before the lookup
            because the notebook lower-cases the text before building bigrams,
            so a capitalised query would otherwise never match.
        top_n: Maximum number of suggestions to return. Values <= 0 yield an
            empty list.
        probs: Optional mapping of ``(first, second)`` bigram tuples to their
            probabilities. Defaults to the notebook-level ``b_probs``.

    Returns:
        A list of at most ``top_n`` second words, highest probability first.
        Bigrams with equal probability keep the mapping's iteration order.
        The list is empty when no bigram starts with ``prev_word``.
    """
    if probs is None:
        probs = b_probs
    if top_n <= 0:
        # A negative slice bound would silently drop items from the end.
        return []

    key = prev_word.lower()
    candidates = [b for b in probs if b[0] == key]
    # list.sort is stable, so ties stay in their original order.
    candidates.sort(key=probs.__getitem__, reverse=True)
    return [b[1] for b in candidates[:top_n]]
# Ask for a context word, then show the suggested continuations for it.
prev_word = input("Enter the previous word: ").strip()
pdt_word = next_word(prev_word)
print(
    "Next word prediction for '{}': {}".format(prev_word, pdt_word)
)


Enter the previous word: the
Next word prediction for 'the': ['quick', 'lazy']
