In [None]:
## SNEAHI SHAH
## I064
## NLP-LAB4

In [None]:
import nltk
from nltk.corpus import gutenberg
from nltk.util import ngrams

In [None]:
# Fetch the Gutenberg corpus data; gutenberg.words('shakespeare-hamlet.txt') below reads from it.
nltk.download('gutenberg')

[nltk_data] Downloading package gutenberg to /root/nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!


True

In [None]:
# Lower-cased word tokens of Hamlet, in corpus order.
hamlet_tokens = list(map(str.lower, gutenberg.words('shakespeare-hamlet.txt')))

In [None]:
## TASK 1
# Build the unigram, bigram and trigram lists (n = 1, 2, 3) in one pass over the orders.
hamlet_unigrams, hamlet_bigrams, hamlet_trigrams = (
    list(ngrams(hamlet_tokens, order)) for order in (1, 2, 3)
)

# UNIGRAMS:
print("First 10 unigrams of Hamlet:")
print(hamlet_unigrams[:10])

# BIGRAMS:
print("\nFirst 10 bigrams of Hamlet:")
print(hamlet_bigrams[:10])

# TRIGRAMS:
print("\nFirst 10 trigrams of Hamlet:")
print(hamlet_trigrams[:10])

First 10 unigrams of Hamlet:
[('[',), ('the',), ('tragedie',), ('of',), ('hamlet',), ('by',), ('william',), ('shakespeare',), ('1599',), (']',)]

First 10 bigrams of Hamlet:
[('[', 'the'), ('the', 'tragedie'), ('tragedie', 'of'), ('of', 'hamlet'), ('hamlet', 'by'), ('by', 'william'), ('william', 'shakespeare'), ('shakespeare', '1599'), ('1599', ']'), (']', 'actus')]

First 10 trigrams of Hamlet:
[('[', 'the', 'tragedie'), ('the', 'tragedie', 'of'), ('tragedie', 'of', 'hamlet'), ('of', 'hamlet', 'by'), ('hamlet', 'by', 'william'), ('by', 'william', 'shakespeare'), ('william', 'shakespeare', '1599'), ('shakespeare', '1599', ']'), ('1599', ']', 'actus'), (']', 'actus', 'primus')]


In [None]:
## TASK 3
from collections import Counter # Import Counter

def build_model(ngrams_list):
    """Map each n-gram context to a Counter of the words that follow it.

    Args:
        ngrams_list: Iterable of n-gram tuples. The last element of each
            tuple is the "next word"; everything before it is the context.

    Returns:
        dict: ``{context_tuple: Counter({next_word: count, ...})}``.
    """
    model = {}
    for gram in ngrams_list:
        # setdefault creates the per-context Counter the first time a context is seen
        followers = model.setdefault(gram[:-1], Counter())
        followers[gram[-1]] += 1
    return model

In [None]:
def predict_next_word(model, context_tuple):
    """Return the most frequent next word observed after ``context_tuple``.

    Args:
        model: Mapping from context tuples to Counters of following words
            (the structure returned by ``build_model``).
        context_tuple: Context to look up, e.g. ``('of',)`` for a bigram model.

    Returns:
        The highest-count next word, or the fallback message string when the
        context is missing from the model or has no recorded followers.
    """
    followers = model.get(context_tuple)
    # A missing context and an empty Counter both mean "nothing to predict";
    # indexing most_common(1)[0] on an empty Counter would raise IndexError.
    if not followers:
        return "No prediction available for this context."
    # Return the most common word in the Counter for this context
    return followers.most_common(1)[0][0]

In [None]:
# Build the bigram model: each one-word context tuple maps to a Counter of the words that follow it
bigram_model = build_model(hamlet_bigrams)

In [None]:
# Demonstrate bigram prediction
bigram_context = ('of',)
predicted_word_bigram = predict_next_word(bigram_model, bigram_context)

# Emit the whole report with one print call; sep="\n" puts each part on its own line.
print(
    "\n--- Bigram Model Prediction ---",
    f"Context: {bigram_context}",
    f"Predicted next word: '{predicted_word_bigram}'",
    f"(Based on the most common word following '{bigram_context[0]}')",
    sep="\n",
)


--- Bigram Model Prediction ---
Context: ('of',)
Predicted next word: 'the'
(Based on the most common word following 'of')


In [1]:
## TASK 2
import nltk
from nltk import bigrams
from nltk.probability import ConditionalFreqDist, ConditionalProbDist, MLEProbDist

In [2]:
# Training sentences: the toy corpus from which the bigram counts are collected
sentences = [
    "This is a dog",
    "This is a cat",
    "I love my cat",
    "This is my name"
]

In [3]:
# Sentence whose bigram probability and perplexity are evaluated against the training sentences
test_sentence = "This is my cat"

In [4]:
# Fetch the 'punkt_tab' tokenizer data; presumably needed by nltk.word_tokenize (used in preprocess) — confirm for the installed NLTK version
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [6]:
def preprocess(sentence):
    """Lower-case and tokenize a sentence, wrapping it in boundary markers.

    Args:
        sentence (str): Raw input sentence.

    Returns:
        list: ``"<s>"``, then the tokens from ``nltk.word_tokenize`` applied
        to the lower-cased sentence, then ``"</s>"``.
    """
    lowered = sentence.lower()
    return ["<s>", *nltk.word_tokenize(lowered), "</s>"]

In [8]:
# Tokenize every training sentence (preprocess adds the <s> / </s> markers)
train_tokens = list(map(preprocess, sentences))

# Flatten the list of tokens for unigram counts
all_words = [token for sentence_tokens in train_tokens for token in sentence_tokens]

# Create Conditional Frequency Distribution for bigrams.
# Feeding (w1, w2) pairs to the constructor increments cfd[w1][w2] once per pair;
# bigrams are taken per sentence, so no pair spans a sentence boundary.
cfd = ConditionalFreqDist(
    pair for sentence_tokens in train_tokens for pair in bigrams(sentence_tokens)
)

In [10]:
# Create Conditional Probability Distribution using MLE:
# ConditionalProbDist is given the bigram counts (cfd) and MLEProbDist as the
# per-condition distribution factory.
cpd = ConditionalProbDist(cfd, MLEProbDist)

# Preprocess the test sentence (lower-case, tokenize, add <s> / </s> markers)
test_tokens = preprocess(test_sentence)

In [11]:
# Calculate bigram probability of the test sentence:
# multiply P(w2 | w1) over consecutive token pairs, stopping at the first zero.
prob = 1.0
for w1, w2 in bigrams(test_tokens):
    # Context word never seen in training -> no distribution to query
    if w1 not in cpd:
        prob = 0
        break
    prob_w2_given_w1 = cpd[w1].prob(w2)
    # If bigram probability is zero (not seen), overall prob is zero
    if prob_w2_given_w1 == 0:
        prob = 0
        break
    prob *= prob_w2_given_w1

print(f"Probability of the sentence '{test_sentence}' is: {prob}")

Probability of the sentence 'This is my cat' is: 0.125


In [12]:
## TASK 4
def preprocess(sentence):
    """Tokenize a lower-cased sentence and add sentence-boundary markers.

    Args:
        sentence (str): Raw input sentence.

    Returns:
        list: ``["<s>", <tokens...>, "</s>"]``.
    """
    # NOTE: this redefines the preprocess() from the Task 2 section with the same behavior.
    tokens = ["<s>"]
    tokens.extend(nltk.word_tokenize(sentence.lower()))
    tokens.append("</s>")
    return tokens

In [14]:
import math
from nltk.util import bigrams

def calculate_perplexity(sentence, cpd_model, unseen_prob=1e-10):
    """
    Calculates the perplexity of a sentence given a bigram probability model.

    The sentence probability is accumulated in log space. Multiplying the raw
    probabilities underflows to 0.0 for long sentences or many unseen bigrams,
    which would wrongly report an infinite perplexity.

    Args:
        sentence (str): The input sentence.
        cpd_model (ConditionalProbDist): The bigram probability model (ConditionalProbDist).
        unseen_prob (float): Probability substituted for a bigram whose context
            is missing from the model or whose probability is 0. Defaults to
            1e-10, the value that was previously hard-coded. This is a crude
            floor, not proper smoothing.

    Returns:
        float: The perplexity of the sentence, i.e. P(sentence) ** (-1 / n)
        where n is the number of bigrams. ``inf`` if a bigram ends up with
        probability 0 (only possible when ``unseen_prob`` is 0 or negative).
    """
    # Preprocess the sentence (add <s> and </s>)
    tokens = preprocess(sentence)
    n = len(tokens) - 1  # Number of bigrams; at least 1 because of the two markers

    # Sum of log P(w2 | w1) over all bigrams of the sentence
    log_prob = 0.0
    for w1, w2 in bigrams(tokens):
        # Look the probability up once; an unknown context counts as unseen
        bigram_prob = cpd_model[w1].prob(w2) if w1 in cpd_model else 0.0
        if bigram_prob <= 0:
            bigram_prob = unseen_prob
        if bigram_prob <= 0:
            return float('inf')  # Perplexity is infinite if probability is zero
        log_prob += math.log(bigram_prob)

    # Perplexity = exp(-(1/n) * log P(sentence))
    try:
        return math.exp(-log_prob / n)
    except OverflowError:
        # Average log-probability too negative to represent as a float
        return float('inf')

In [15]:
# Calculate and print the perplexity of the test sentence under the bigram model cpd,
# formatted to four decimal places
perplexity_score = calculate_perplexity(test_sentence, cpd)
print(f"Perplexity of the sentence is: {perplexity_score:.4f}")

Perplexity of the sentence is: 1.5157


Conclusion:
The `nltk` toolkit provides pre-built functions for tasks such as tokenizing sentences into words. `ngrams` from `nltk.util` groups consecutive words into n-word sequences (unigrams, bigrams, trigrams). `Counter` from `collections` provides an efficient way to count how often each word or word pair appears in the training data.