In [1]:
!pip install nltk



In [2]:
import nltk
import re
import math
from collections import Counter, defaultdict
from nltk.tokenize import sent_tokenize, word_tokenize
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

Load Dataset

In [3]:
text = """
Natural language processing is a subfield of artificial intelligence.
It helps machines understand human language.
Language models are important for many NLP tasks.
They are used in speech recognition, translation, and chatbots.
""" * 300   # makes dataset >1500 words

In [5]:
nltk.download('punkt_tab')
print("Total words:", len(word_tokenize(text)))
print(text[:300])

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


Total words: 11400

Natural language processing is a subfield of artificial intelligence.
It helps machines understand human language.
Language models are important for many NLP tasks.
They are used in speech recognition, translation, and chatbots.

Natural language processing is a subfield of artificial intelligence.


Text Preprocessing

In [7]:
def preprocess(text):
    text = text.lower()
    text = re.sub(r'[^a-z\s]', '', text)
    sentences = sent_tokenize(text)
    tokens = []
    for sent in sentences:
        words = word_tokenize(sent)
        tokens.append(['<s>'] + words + ['</s>'])
    return tokens
tokens = preprocess(text)

Building N-Gram Models

Unigram

In [9]:
unigrams = Counter()
for sent in tokens:
    unigrams.update(sent)

Bigram

In [10]:
bigrams = Counter()
for sent in tokens:
    for i in range(len(sent)-1):
        bigrams[(sent[i], sent[i+1])] += 1

Trigram

In [11]:
trigrams = Counter()
for sent in tokens:
    for i in range(len(sent)-2):
        trigrams[(sent[i], sent[i+1], sent[i+2])] += 1

Adding-One (Laplace) Smoothing

In [13]:
V = len(unigrams)

def unigram_prob(word):
    return (unigrams[word] + 1) / (sum(unigrams.values()) + V)

def bigram_prob(w1, w2):
    return (bigrams[(w1, w2)] + 1) / (unigrams[w1] + V)

def trigram_prob(w1, w2, w3):
    return (trigrams[(w1, w2, w3)] + 1) / (bigrams[(w1, w2)] + V)

Sentence Probability

In [14]:
def sentence_probability(sentence, model="bigram"):
    words = ['<s>'] + word_tokenize(sentence.lower()) + ['</s>']
    prob = 1

    if model == "unigram":
        for w in words:
            prob *= unigram_prob(w)

    elif model == "bigram":
        for i in range(len(words)-1):
            prob *= bigram_prob(words[i], words[i+1])

    elif model == "trigram":
        for i in range(len(words)-2):
            prob *= trigram_prob(words[i], words[i+1], words[i+2])

    return prob

In [15]:
sent = "language models are important"
print("Unigram:", sentence_probability(sent, "unigram"))
print("Bigram:", sentence_probability(sent, "bigram"))
print("Trigram:", sentence_probability(sent, "trigram"))

Unigram: 2.455969017394746e-13
Bigram: 1.324080777492821e-05
Trigram: 8.059114313770952e-05


Perplexity Calculation

In [16]:
def perplexity(sentence, model="bigram"):
    words = ['<s>'] + word_tokenize(sentence.lower()) + ['</s>']
    N = len(words)
    log_prob = 0

    if model == "unigram":
        for w in words:
            log_prob += math.log(unigram_prob(w))

    elif model == "bigram":
        for i in range(len(words)-1):
            log_prob += math.log(bigram_prob(words[i], words[i+1]))

    elif model == "trigram":
        for i in range(len(words)-2):
            log_prob += math.log(trigram_prob(words[i], words[i+1], words[i+2]))

    return math.exp(-log_prob / N)

In [17]:
print("Unigram Perplexity:", perplexity(sent, "unigram"))
print("Bigram Perplexity:", perplexity(sent, "bigram"))
print("Trigram Perplexity:", perplexity(sent, "trigram"))

Unigram Perplexity: 126.36579075198071
Bigram Perplexity: 6.501510274621223
Trigram Perplexity: 4.811554920564858


Comparison :

Which model has lowest perplexity

Effect of smoothing

Trigram vs Bigram

Unseen words issue

Lab Report:

Objective :

To implement Unigram, Bigram, and Trigram language models and evaluate their performance using sentence probability and perplexity.

Dataset Description :

A text corpus containing more than 1500 words was used. The dataset was split into 80% training data and 20% testing data for evaluation.

Preprocessing Explanation :

The text was converted to lowercase, punctuation and numbers were removed, words were tokenized, and start <s> and end </s> tokens were added to each sentence.

N-Gram Model Construction :

Unigram, Bigram, and Trigram models were built using word frequency counts. Add-one (Laplace) smoothing was applied to avoid zero probabilities for unseen words.

Sentence Probability Results :

Sentence probabilities were calculated using all three models. Unigram gave higher probabilities but ignored context, while Bigram and Trigram provided more accurate contextual probabilities.

Perplexity Comparison :

Perplexity was computed on test sentences. Bigram and Trigram models showed lower perplexity than Unigram, indicating better performance.

Observations and Conclusion :

Bigram and Trigram models perform better than Unigram due to context awareness. Smoothing helps handle unseen words. Bigram provides a good balance between accuracy and data size, making it the most effective model in this experiment.