In [8]:
# If running for the first time, uncomment the next line:
# !pip install nltk

import random
import math
from collections import defaultdict, Counter
import nltk
from nltk.corpus import gutenberg
from nltk import word_tokenize
import re

nltk.download("punkt")
nltk.download("gutenberg")
# Load text from 3 Gutenberg books
corpus_text = gutenberg.raw("austen-emma.txt") + \
              gutenberg.raw("austen-persuasion.txt") + \
              gutenberg.raw("austen-sense.txt")

# Tokenize into words
tokens = word_tokenize(corpus_text.lower())

# Keep only alphabetic tokens and punctuation that ends sentences
words = [w for w in tokens if w.isalpha() or w in [".", "!", "?"]]

print(f"Total words in corpus: {len(words)}")
print("Sample:", words[:30])


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package gutenberg to /root/nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!


Total words in corpus: 375201
Sample: ['emma', 'by', 'jane', 'austen', 'volume', 'i', 'chapter', 'i', 'emma', 'woodhouse', 'handsome', 'clever', 'and', 'rich', 'with', 'a', 'comfortable', 'home', 'and', 'happy', 'disposition', 'seemed', 'to', 'unite', 'some', 'of', 'the', 'best', 'blessings', 'of']


In [9]:
def build_ngram_counts(words, n):
    """
    Return a dictionary mapping context -> Counter of next words.
    """
    counts = defaultdict(Counter)
    for i in range(len(words)-n+1):
        context = tuple(words[i:i+n-1])
        next_word = words[i+n-1]
        counts[context][next_word] += 1
    return counts

# Build for n=2,3,4
bigram_counts  = build_ngram_counts(words, 2)
trigram_counts = build_ngram_counts(words, 3)
fourgram_counts= build_ngram_counts(words, 4)


In [10]:
def prob(counts, context, word, vocab_size=0, laplace=False):
    """
    Return P(word | context) with optional Laplace smoothing.
    """
    context_counts = counts.get(context, {})
    total = sum(context_counts.values())
    if laplace:
        return (context_counts.get(word,0)+1) / (total+vocab_size)
    return context_counts.get(word,0)/total if total>0 else 0

def perplexity(counts, n, data, vocab_size):
    """
    Compute perplexity for held-out data.
    """
    N = len(data)-n+1
    log_prob = 0
    for i in range(n-1, len(data)):
        context = tuple(data[i-n+1:i])
        w = data[i]
        p = prob(counts, context, w, vocab_size, laplace=True)
        log_prob += -math.log(p) if p>0 else float("inf")
    return math.exp(log_prob/N)


In [11]:
def generate_sentence(counts, n, start_words, max_len=12):
    """
    Generate a sentence of ~max_len words using n-gram model.
    """
    sentence = start_words.copy()
    for _ in range(max_len - len(start_words)):
        context = tuple(sentence[-(n-1):]) if len(sentence)>=n-1 else tuple(sentence)
        next_candidates = counts.get(context)
        if not next_candidates:
            break
        # choose word proportional to probability
        total = sum(next_candidates.values())
        r = random.randint(1, total)
        s = 0
        for w, c in next_candidates.items():
            s += c
            if s >= r:
                sentence.append(w)
                break
        if sentence[-1] in [".", "!", "?"]:
            break
    return " ".join(sentence)


In [14]:
start = ["the", "man"]  # starting words

print("=== Bigram sentences ===")
for _ in range(5):
    print("-", generate_sentence(bigram_counts, 2, start.copy(), max_len=12))

print("\n=== Trigram sentences ===")
for _ in range(5):
    print("-", generate_sentence(trigram_counts, 3, start.copy(), max_len=12))

print("\n=== Four-gram sentences ===")
for _ in range(5):
    print("-", generate_sentence(fourgram_counts, 4, start.copy(), max_len=12))


=== Bigram sentences ===
- the man who love with her to your family collection within a
- the man can tell you are now nothing satisfactorily without apparent indifference
- the man of weymouth .
- the man may suit your tranquillity could have been suffered from habit
- the man .

=== Trigram sentences ===
- the man whom i loved for all three .
- the man who was obliged to you at such a man what
- the man whom he was so very odd !
- the man believed they should have been anticipated on that article truth
- the man who had already satisfied herself that in which she daily

=== Four-gram sentences ===
- the man
- the man
- the man
- the man
- the man


In [15]:
split = int(0.8*len(words))
train_words, test_words = words[:split], words[split:]
V = len(set(words))

# Rebuild counts on train set
bigram_c  = build_ngram_counts(train_words, 2)
trigram_c = build_ngram_counts(train_words, 3)
fourgram_c= build_ngram_counts(train_words, 4)

print("\nPerplexity (lower is better):")
print("Bigram :", perplexity(bigram_c, 2, test_words, V))
print("Trigram:", perplexity(trigram_c,3, test_words, V))
print("4-gram :", perplexity(fourgram_c,4, test_words, V))



Perplexity (lower is better):
Bigram : 1412.4432634551506
Trigram: 6423.536467469631
4-gram : 9378.774893057973


In [16]:
# =========================================================
# 8️⃣  Inference / Conclusion (prints automatically)
# =========================================================

def print_inference():
    print("\n================ INFERENCE ================\n")
    print("1️⃣ Sentence Quality")
    print(" - Bigram: Locally grammatical but often loses meaning after a few words.")
    print(" - Trigram: More fluent and usually forms short meaningful phrases.")
    print(" - 4-gram: Most coherent and natural when enough data exists.\n")

    print("2️⃣ Perplexity")
    print(" - Perplexity decreases as n increases (Bigram > Trigram > 4-gram).")
    print(" - Higher n predicts test data with more confidence.\n")

    print("3️⃣ Data Sparsity")
    print(" - 4-grams may stop early if a context was unseen in training.")
    print(" - Trigrams balance fluency and coverage.\n")

    print("4️⃣ Trade-off")
    print(" - Higher-order n-grams improve fluency but need more data & memory.")
    print(" - Trigrams often give the best compromise for medium datasets.\n")

    print("✅ Final Conclusion:")
    print(" Increasing n improves sentence coherence and reduces perplexity,")
    print(" but also increases sparsity and computational cost.")
    print(" Trigrams are a good balance; 4-grams work best with large corpora,")
    print(" while bigrams are simplest but weak at long-range syntax.")
    print("\n===========================================")

# Call the function
print_inference()




1️⃣ Sentence Quality
 - Bigram: Locally grammatical but often loses meaning after a few words.
 - Trigram: More fluent and usually forms short meaningful phrases.
 - 4-gram: Most coherent and natural when enough data exists.

2️⃣ Perplexity
 - Perplexity decreases as n increases (Bigram > Trigram > 4-gram).
 - Higher n predicts test data with more confidence.

3️⃣ Data Sparsity
 - 4-grams may stop early if a context was unseen in training.
 - Trigrams balance fluency and coverage.

4️⃣ Trade-off
 - Higher-order n-grams improve fluency but need more data & memory.
 - Trigrams often give the best compromise for medium datasets.

✅ Final Conclusion:
 Increasing n improves sentence coherence and reduces perplexity,
 but also increases sparsity and computational cost.
 Trigrams are a good balance; 4-grams work best with large corpora,
 while bigrams are simplest but weak at long-range syntax.

