<a href="https://colab.research.google.com/github/2403a52030-sketch/NLP-LAB/blob/main/NLP_lab_8_2403a52030.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Natural Language Toolkit for tokenization and n-grams
import nltk
from nltk.util import ngrams
from nltk.corpus import stopwords

# For text cleaning and mathematical operations
import re
import math

# For counting word frequencies
from collections import Counter, defaultdict

# Download required NLTK resources
nltk.download('punkt')
nltk.download('stopwords')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [7]:
# Load dataset from the correct path
with open("/content/corpus.txt", "r", encoding="utf-8") as f:
    corpus = f.read()

# Display sample text
print(corpus[:500])



Natural language processing is a subfield of artificial intelligence that focuses on the interaction
between computers and human language. It enables machines to read, understand, and generate text
in a way that is meaningful. Over the years, natural language processing has evolved rapidly due to
advances in machine learning and the availability of large datasets.

Language models play a central role in natural language processing. A language model assigns
probabilities to sequences of words an


In [11]:
def preprocess_text(text):
    text = text.lower()                             # lowercase
    text = re.sub(r'[^a-z\s]', '', text)            # remove punctuation & numbers
    tokens = nltk.word_tokenize(text)               # tokenize
    stop_words = set(stopwords.words('english'))
    tokens = [w for w in tokens if w not in stop_words]
    return tokens

tokens = preprocess_text(corpus)

# Train-test split
split = int(0.8 * len(tokens))
train_tokens = tokens[:split]
test_tokens = tokens[split:]

print("Training tokens:", len(train_tokens))
print("Testing tokens:", len(test_tokens))


Training tokens: 392
Testing tokens: 99


In [12]:
def build_ngram_model(tokens, n):
    ngram_list = list(ngrams(tokens, n))
    return Counter(ngram_list)

unigram_counts = build_ngram_model(train_tokens, 1)
bigram_counts = build_ngram_model(train_tokens, 2)
trigram_counts = build_ngram_model(train_tokens, 3)

print("Sample Unigrams:", list(unigram_counts.items())[:5])
print("Sample Bigrams:", list(bigram_counts.items())[:5])
print("Sample Trigrams:", list(trigram_counts.items())[:5])


Sample Unigrams: [(('natural',), 6), (('language',), 19), (('processing',), 5), (('subfield',), 1), (('artificial',), 1)]
Sample Bigrams: [(('natural', 'language'), 5), (('language', 'processing'), 5), (('processing', 'subfield'), 1), (('subfield', 'artificial'), 1), (('artificial', 'intelligence'), 1)]
Sample Trigrams: [(('natural', 'language', 'processing'), 5), (('language', 'processing', 'subfield'), 1), (('processing', 'subfield', 'artificial'), 1), (('subfield', 'artificial', 'intelligence'), 1), (('artificial', 'intelligence', 'focuses'), 1)]


In [13]:
vocab_size = len(set(train_tokens))

def laplace_probability(ngram, ngram_counts, context_counts):
    return (ngram_counts[ngram] + 1) / (context_counts + vocab_size)


In [14]:
def sentence_probability(sentence, n, ngram_counts, context_counts):
    tokens = preprocess_text(sentence)
    sentence_ngrams = list(ngrams(tokens, n))
    prob = 1

    for ng in sentence_ngrams:
        prob *= laplace_probability(ng, ngram_counts, context_counts)

    return prob

sentences = [
    "natural language processing is interesting",
    "language models predict words",
    "this is a simple example",
    "data science uses python",
    "nltk is useful for nlp"
]

for s in sentences:
    print("\nSentence:", s)
    print("Unigram Prob:", sentence_probability(s, 1, unigram_counts, len(train_tokens)))
    print("Bigram Prob:", sentence_probability(s, 2, bigram_counts, len(train_tokens)))
    print("Trigram Prob:", sentence_probability(s, 3, trigram_counts, len(train_tokens)))



Sentence: natural language processing is interesting
Unigram Prob: 5.069867055196761e-09
Bigram Prob: 1.386246506235229e-07
Trigram Prob: 1.4740421182967935e-05

Sentence: language models predict words
Unigram Prob: 2.897066888683864e-08
Bigram Prob: 3.0805477916338414e-08
Trigram Prob: 2.456736863827989e-06

Sentence: this is a simple example
Unigram Prob: 4.913473727655978e-06
Bigram Prob: 0.001567398119122257
Trigram Prob: 1

Sentence: data science uses python
Unigram Prob: 4.828444814473106e-11
Bigram Prob: 3.850684739542302e-09
Trigram Prob: 2.456736863827989e-06

Sentence: nltk is useful for nlp
Unigram Prob: 3.850684739542302e-09
Bigram Prob: 2.456736863827989e-06
Trigram Prob: 0.001567398119122257


In [16]:
def perplexity(sentence, n, ngram_counts, context_counts):
    tokens = preprocess_text(sentence)
    sentence_ngrams = list(ngrams(tokens, n))
    log_prob = 0

    if not sentence_ngrams: # Handle cases where no n-grams are formed
        return float('inf') # Perplexity is undefined or infinite in this case

    for ng in sentence_ngrams:
        p = laplace_probability(ng, ngram_counts, context_counts)
        log_prob += math.log(p)

    return math.exp(-log_prob / len(sentence_ngrams))

for s in sentences:
    print("\nSentence:", s)
    print("Unigram Perplexity:", perplexity(s, 1, unigram_counts, len(train_tokens)))
    print("Bigram Perplexity:", perplexity(s, 2, bigram_counts, len(train_tokens)))
    print("Trigram Perplexity:", perplexity(s, 3, trigram_counts, len(train_tokens)))



Sentence: natural language processing is interesting
Unigram Perplexity: 118.50887003447242
Bigram Perplexity: 193.22048970448424
Trigram Perplexity: 260.4624093159446

Sentence: language models predict words
Unigram Perplexity: 76.64968411089865
Bigram Perplexity: 318.99999999999994
Trigram Perplexity: 637.9999999999999

Sentence: this is a simple example
Unigram Perplexity: 451.13412639701727
Bigram Perplexity: 637.9999999999999
Trigram Perplexity: inf

Sentence: data science uses python
Unigram Perplexity: 379.3570696858681
Bigram Perplexity: 637.9999999999999
Trigram Perplexity: 637.9999999999999

Sentence: nltk is useful for nlp
Unigram Perplexity: 637.9999999999999
Bigram Perplexity: 637.9999999999999
Trigram Perplexity: 637.9999999999999
