<a href="https://colab.research.google.com/github/2403a52029-lab/NLP_LAB-ASSIGNMENTS/blob/main/Lab8_NGram_Model_NithinPatil_2403A52029.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [14]:
import nltk # Import the Natural Language Toolkit library
import re # Import regular expression operations
import math # Import mathematical functions
import random # Import random number generation functions
from collections import defaultdict, Counter # Import defaultdict and Counter from collections module
import numpy as np # Import NumPy for numerical operations

In [15]:
# Example: Load text file from the specified path
with open("/content/corpus.txt", "r", encoding="utf-8") as f:
    text = f.read() # Read the entire content of the file into the 'text' variable

print(text[:500])  # Print the first 500 characters of the loaded text for a sample display


Natural language processing is a subfield of artificial intelligence that focuses on the interaction
between computers and human language. It enables machines to read, understand, and generate text
in a way that is meaningful. Over the years, natural language processing has evolved rapidly due to
advances in machine learning and the availability of large datasets.

Language models play a central role in natural language processing. A language model assigns
probabilities to sequences of words an


In [16]:
nltk.download('punkt_tab') # Download the 'punkt_tab' tokenizer from NLTK data
def preprocess_text(text):
    text = text.lower() # Convert all text to lowercase
    text = re.sub(r'[^a-z\s]', '', text) # Remove all characters except lowercase letters and spaces
    sentences = nltk.sent_tokenize(text) # Tokenize the text into sentences

    processed = [] # Initialize an empty list to store processed sentences
    for sent in sentences:
        words = nltk.word_tokenize(sent) # Tokenize each sentence into words
        processed.append(['<s>'] + words + ['</s>']) # Add start (<s>) and end (</s>) tokens to each sentence
    return processed # Return the list of processed sentences

sentences = preprocess_text(text) # Preprocess the loaded text and store the result in 'sentences'

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [17]:
def build_ngram(sentences, n):
    ngrams = [] # Initialize an empty list to store n-grams
    for sent in sentences:
        ngrams.extend(list(nltk.ngrams(sent, n))) # Generate n-grams for each sentence and add to the list
    return Counter(ngrams) # Return a Counter object of the generated n-grams

unigram = build_ngram(sentences, 1) # Build unigrams (single words)
bigram  = build_ngram(sentences, 2) # Build bigrams (two-word sequences)
trigram = build_ngram(sentences, 3) # Build trigrams (three-word sequences)

vocab_size = len(unigram) # Calculate the vocabulary size based on unique unigrams

In [18]:
def unigram_prob(word):
    # Calculate unigram probability with add-1 smoothing
    return (unigram[(word,)] + 1) / (sum(unigram.values()) + vocab_size)

def bigram_prob(w1, w2):
    # Calculate bigram probability with add-1 smoothing
    # The denominator is the count of the first word (w1) plus vocabulary size for smoothing
    return (bigram[(w1, w2)] + 1) / (unigram[(w1,)] + vocab_size)

def trigram_prob(w1, w2, w3):
    # Calculate trigram probability with add-1 smoothing
    # The denominator is the count of the bigram (w1, w2) plus vocabulary size for smoothing
    return (trigram[(w1, w2, w3)] + 1) / (bigram[(w1, w2)] + vocab_size)

In [19]:
def sentence_probability(sentence, model='bigram'):
    # Prepare the sentence by tokenizing and adding start/end tokens
    words = ['<s>'] + nltk.word_tokenize(sentence.lower()) + ['</s>']
    prob = 1.0 # Initialize probability to 1.0

    if model == 'unigram':
        for w in words:
            prob *= unigram_prob(w) # Multiply by unigram probability of each word

    elif model == 'bigram':
        for i in range(len(words)-1):
            # Multiply by bigram probability of (current_word, next_word)
            prob *= bigram_prob(words[i], words[i+1])

    elif model == 'trigram':
        for i in range(len(words)-2):
            # Multiply by trigram probability of (word1, word2, word3)
            prob *= trigram_prob(words[i], words[i+1], words[i+2])

    return prob # Return the total sentence probability

In [20]:
sent = "natural language processing is interesting" # Define a sample sentence
print(sentence_probability(sent, "unigram")) # Calculate and print sentence probability using unigram model
print(sentence_probability(sent, "bigram")) # Calculate and print sentence probability using bigram model
print(sentence_probability(sent, "trigram")) # Calculate and print sentence probability using trigram model

7.732967457781427e-17
1.201408259871537e-13
5.905932332588356e-12


In [21]:
def perplexity(sentence, model='bigram'):
    # Prepare the sentence by tokenizing and adding start/end tokens
    words = ['<s>'] + nltk.word_tokenize(sentence.lower()) + ['</s>']
    log_prob = 0 # Initialize log probability to 0
    N = len(words) # Get the number of words in the sentence

    if model == 'unigram':
        for w in words:
            log_prob += math.log(unigram_prob(w)) # Add log unigram probability of each word

    elif model == 'bigram':
        for i in range(len(words)-1):
            # Add log bigram probability of (current_word, next_word)
            log_prob += math.log(bigram_prob(words[i], words[i+1]))

    elif model == 'trigram':
        for i in range(len(words)-2):
            # Add log trigram probability of (word1, word2, word3)
            log_prob += math.log(trigram_prob(words[i], words[i+1], words[i+2]))

    # Calculate perplexity using the formula exp(-log_prob / N)
    return math.exp(-log_prob / N)

In [22]:
print("Unigram:", perplexity(sent, "unigram")) # Calculate and print perplexity using unigram model
print("Bigram :", perplexity(sent, "bigram")) # Calculate and print perplexity using bigram model
print("Trigram:", perplexity(sent, "trigram")) # Calculate and print perplexity using trigram model

Unigram: 200.29256628401615
Bigram : 70.10653194213862
Trigram: 40.18848619089582
