<a href="https://colab.research.google.com/github/Bhavana123448/NLP1/blob/main/2403A54069_Lab08.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [68]:
doc1 = "Natural language processing enables computers to understand human language."

doc2 = "Machine learning improves system performance through experience."

doc3 = "Deep learning uses neural networks for complex tasks."

doc4 = "Text preprocessing removes noise from raw data."


In [69]:
documents = [doc1, doc2, doc3, doc4]
corpus = " ".join(documents)

print("Total Documents:", len(documents))
print(corpus)


Total Documents: 4
Natural language processing enables computers to understand human language. Machine learning improves system performance through experience. Deep learning uses neural networks for complex tasks. Text preprocessing removes noise from raw data.


In [70]:
import re
from collections import defaultdict, Counter
import numpy as np
import random


In [71]:
def preprocess(text):
    text = text.lower()
    text = re.sub(r'[^a-z\s.]', '', text)

    sentences = text.split('.')
    processed = []

    for sentence in sentences:
        words = sentence.strip().split()
        if len(words) > 0:
            words = ['<s>'] + words + ['</s>']
            processed.append(words)

    return processed

sentences = preprocess(corpus)

print(sentences)

[['<s>', 'natural', 'language', 'processing', 'enables', 'computers', 'to', 'understand', 'human', 'language', '</s>'], ['<s>', 'machine', 'learning', 'improves', 'system', 'performance', 'through', 'experience', '</s>'], ['<s>', 'deep', 'learning', 'uses', 'neural', 'networks', 'for', 'complex', 'tasks', '</s>'], ['<s>', 'text', 'preprocessing', 'removes', 'noise', 'from', 'raw', 'data', '</s>']]


### Unigram Counts

In [72]:
print("Total unique words (vocab_size):", vocab_size)
print("Total words in training data: ", total_words)
print("Top 10 most common unigrams:")
for word, count in unigram_counts.most_common(10):
    print(f"  {word}: {count}")

Total unique words (vocab_size): 24
Total words in training data:  30
Top 10 most common unigrams:
  <s>: 3
  </s>: 3
  language: 2
  learning: 2
  natural: 1
  processing: 1
  enables: 1
  computers: 1
  to: 1
  understand: 1


### Bigram Counts

In [98]:
print("Sample bigram counts (e.g., for 'natural' or 'deep'):")
print("  natural:", dict(bigram_counts['natural']))
print("  deep:", dict(bigram_counts['deep']))


Sample bigram counts (e.g., for 'natural' or 'deep'):
  natural: {'language': 1}
  deep: {'learning': 1}


### Trigram Counts

In [74]:
print("Sample trigram counts (e.g., for ('s', 'natural') or ('machine', 'learning')):")
print("  <s> natural:", dict(trigram_counts['<s>']['natural']))
print("  machine learning:", dict(trigram_counts['machine']['learning']))
# You can check other pairs as well, for instance:
# print("  deep learning:", dict(trigram_counts['deep']['learning']))

Sample trigram counts (e.g., for ('s', 'natural') or ('machine', 'learning')):
  <s> natural: {'language': 1}
  machine learning: {'improves': 1}


In [75]:
split_index = int(0.8 * len(sentences))
train_data = sentences[:split_index]
test_data = sentences[split_index:]

print("Train:", train_data)
print("Test:", test_data)


Train: [['<s>', 'natural', 'language', 'processing', 'enables', 'computers', 'to', 'understand', 'human', 'language', '</s>'], ['<s>', 'machine', 'learning', 'improves', 'system', 'performance', 'through', 'experience', '</s>'], ['<s>', 'deep', 'learning', 'uses', 'neural', 'networks', 'for', 'complex', 'tasks', '</s>']]
Test: [['<s>', 'text', 'preprocessing', 'removes', 'noise', 'from', 'raw', 'data', '</s>']]


### Next Word Prediction (Trigram Model)

In [76]:
def predict_next_word_trigram(w1, w2, top_n=1):
    if w1 not in trigram_counts or w2 not in trigram_counts[w1] or not trigram_counts[w1][w2]:
        print(f"No trigram data for '({w1}, {w2})'. Falling back to bigram prediction for '{w2}'.")
        return predict_next_word_bigram(w2, top_n)

    next_word_probabilities = {}
    current_context_sum = sum(trigram_counts[w1][w2].values())

    for next_word in unigram_counts.keys():

        if next_word == '<s>' and (w1 != '<s>' or w2 != '<s>'):
            continue
        next_word_probabilities[next_word] = trigram_prob(w1, w2, next_word)

    sorted_predictions = sorted(next_word_probabilities.items(), key=lambda item: item[1], reverse=True)
    predicted_words = [word for word, prob in sorted_predictions[:top_n]]

    return predicted_words

# Example Usage:
print("Predicting next word after ('<s>', 'natural'):", predict_next_word_trigram('<s>', 'natural'))
print("Predicting next word after ('machine', 'learning'):", predict_next_word_trigram('machine', 'learning'))
print("Predicting next word after ('deep', 'learning'):", predict_next_word_trigram('deep', 'learning'))
print("Predicting next word after ('to', 'understand'):", predict_next_word_trigram('to', 'understand'))
print("Predicting next word after ('nonexistent', 'context'):", predict_next_word_trigram('nonexistent', 'context'))

Predicting next word after ('<s>', 'natural'): ['language']
Predicting next word after ('machine', 'learning'): ['improves']
Predicting next word after ('deep', 'learning'): ['uses']
Predicting next word after ('to', 'understand'): ['human']
No trigram data for '(nonexistent, context)'. Falling back to bigram prediction for 'context'.
No bigram data for 'context'. Returning random words.
Predicting next word after ('nonexistent', 'context'): ['networks']


### Next Word Prediction (Bigram Model)

In [77]:
def predict_next_word_bigram(current_word, top_n=1):
    if current_word not in bigram_counts or not bigram_counts[current_word]:
        print(f"No bigram data for '{current_word}'. Returning random words.")
        return random.choices(list(unigram_counts.keys()), k=top_n)

    next_word_probabilities = {}
    total_possible_next_words = sum(bigram_counts[current_word].values()) + vocab_size

    for next_word in bigram_counts[current_word]:
        next_word_probabilities[next_word] = bigram_prob(current_word, next_word)


    for word in unigram_counts.keys():
        if word not in next_word_probabilities and word != '<s>':
            next_word_probabilities[word] = bigram_prob(current_word, word)

    sorted_predictions = sorted(next_word_probabilities.items(), key=lambda item: item[1], reverse=True)

    predicted_words = [word for word, prob in sorted_predictions[:top_n]]

    return predicted_words


# Example Usage:
print("Predicting next word after 'natural':", predict_next_word_bigram('natural'))
print("Predicting next word after 'machine':", predict_next_word_bigram('machine'))
print("Predicting next word after 'language':", predict_next_word_bigram('language'))
print("Predicting next word after 'deep':", predict_next_word_bigram('deep'))
print("Predicting next word after 'nonexistent_word':", predict_next_word_bigram('nonexistent_word'))

Predicting next word after 'natural': ['language']
Predicting next word after 'machine': ['learning']
Predicting next word after 'language': ['processing']
Predicting next word after 'deep': ['learning']
No bigram data for 'nonexistent_word'. Returning random words.
Predicting next word after 'nonexistent_word': ['improves']


### Text Generation (Bigram Model)

In [93]:
def generate_sentence_bigram(max_length=15, k=1):
    sentence = ['<s>']
    current_word = '<s>'

    while current_word != '</s>' and len(sentence) < max_length:
        next_word = predict_next_word_bigram(current_word, k=k)[0] # Get the top predicted word, passing k
        sentence.append(next_word)
        current_word = next_word

    return " ".join(sentence)

print("Generated sentence (Bigram, k=1):", generate_sentence_bigram(k=1))
print("Generated sentence (Bigram, k=0.5):", generate_sentence_bigram(k=0.5))

Generated sentence (Bigram, k=1): <s> natural language processing enables computers to understand human language processing enables computers to understand
Generated sentence (Bigram, k=0.5): <s> natural language processing enables computers to understand human language processing enables computers to understand


### Text Generation (Trigram Model)

In [94]:
def generate_sentence_trigram(max_length=15, k=1):
    sentence = ['<s>', '<s>'] # Start with two start tokens for trigram context

    while sentence[-1] != '</s>' and len(sentence) < max_length + 1:
        w1 = sentence[-2]
        w2 = sentence[-1]

        next_word = predict_next_word_trigram(w1, w2, k=k)[0] # Get the top predicted word, passing k
        sentence.append(next_word)

    return " ".join(sentence[1:])

print("Generated sentence (Trigram, k=1):", generate_sentence_trigram(k=1))
print("Generated sentence (Trigram, k=0.5):", generate_sentence_trigram(k=0.5))

No trigram data for '(<s>, <s>)'. Falling back to bigram prediction for '<s>'.
Generated sentence (Trigram, k=1): <s> natural language processing enables computers to understand human language </s>
No trigram data for '(<s>, <s>)'. Falling back to bigram prediction for '<s>'.
Generated sentence (Trigram, k=0.5): <s> natural language processing enables computers to understand human language </s>


### Demonstrating Add-k Smoothing

In [80]:
# Example for bigram probability with k=0.5
print("Bigram Probability ('natural', 'language') with k=0.5:", bigram_prob('natural', 'language', k=0.5))
print("Bigram Probability ('nonexistent_word', 'random_word') with k=0.5:", bigram_prob('nonexistent_word', 'random_word', k=0.5))

# Example for trigram probability with k=2
print("Trigram Probability ('<s>', 'natural', 'language') with k=2:", trigram_prob('<s>', 'natural', 'language', k=2))
print("Trigram Probability ('nonexistent', 'context', 'word') with k=2:", trigram_prob('nonexistent', 'context', 'word', k=2))

Bigram Probability ('natural', 'language') with k=0.5: 0.11538461538461539
Bigram Probability ('nonexistent_word', 'random_word') with k=0.5: 0.041666666666666664
Trigram Probability ('<s>', 'natural', 'language') with k=2: 0.061224489795918366
Trigram Probability ('nonexistent', 'context', 'word') with k=2: 0.041666666666666664


### Trigram Probability Function with Laplace Smoothing (Add-one smoothing)

In [81]:
def trigram_prob(w1, w2, w3):
    return (trigram_counts[w1][w2][w3] + 1) / (sum(trigram_counts[w1][w2].values()) + vocab_size)
print("Example trigram_prob('<s>', 'natural', 'language'):", trigram_prob('<s>', 'natural', 'language'))
print("Example trigram_prob('nonexistent', 'context', 'word'):", trigram_prob('nonexistent', 'context', 'word'))

Example trigram_prob('<s>', 'natural', 'language'): 0.08
Example trigram_prob('nonexistent', 'context', 'word'): 0.041666666666666664


### Next Word Prediction (Trigram Model) using Laplace Smoothing

In [92]:
def predict_next_word_trigram(w1, w2, top_n=1, k=1):
    if w1 not in trigram_counts or w2 not in trigram_counts[w1] or not trigram_counts[w1][w2]:
        print(f"No trigram data for '({w1}, {w2})'. Falling back to bigram prediction for '{w2}'.")
        return predict_next_word_bigram(w2, top_n, k=k)

    next_word_probabilities = {}

    for next_word in unigram_counts.keys():
        if next_word == '<s>' and (w1 != '<s>' or w2 != '<s>'):
            continue
        next_word_probabilities[next_word] = trigram_prob(w1, w2, next_word, k=k)

    sorted_predictions = sorted(next_word_probabilities.items(), key=lambda item: item[1], reverse=True)
    predicted_words = [word for word, prob in sorted_predictions[:top_n]]

    return predicted_words

# Example Usage:
print("Predicting next word after ('<s>', 'natural') (k=1):", predict_next_word_trigram('<s>', 'natural', k=1))
print("Predicting next word after ('machine', 'learning') (k=0.5):", predict_next_word_trigram('machine', 'learning', k=0.5))
print("Predicting next word after ('nonexistent', 'context') (k=1):", predict_next_word_trigram('nonexistent', 'context', k=1))

Predicting next word after ('<s>', 'natural') (k=1): ['language']
Predicting next word after ('machine', 'learning') (k=0.5): ['improves']
No trigram data for '(nonexistent, context)'. Falling back to bigram prediction for 'context'.
No bigram data for 'context'. Returning random words.
Predicting next word after ('nonexistent', 'context') (k=1): ['neural']


In [83]:
def build_unigram(data):
    counts = Counter()
    for sentence in data:
        counts.update(sentence)
    return counts

unigram_counts = build_unigram(train_data)
total_words = sum(unigram_counts.values())
vocab_size = len(unigram_counts)


### Bigram Probability Function with Laplace Smoothing (Add-one smoothing)

In [84]:
def bigram_prob(w1, w2):
    return (bigram_counts[w1][w2] + 1) / (sum(bigram_counts[w1].values()) + vocab_size)

print("Example bigram_prob('natural', 'language'):", bigram_prob('natural', 'language'))
print("Example bigram_prob('nonexistent_word', 'random_word'):", bigram_prob('nonexistent_word', 'random_word'))

Example bigram_prob('natural', 'language'): 0.08
Example bigram_prob('nonexistent_word', 'random_word'): 0.041666666666666664


### Next Word Prediction (Bigram Model) using Laplace Smoothing

In [91]:
def predict_next_word_bigram(current_word, top_n=1, k=1):
    if current_word not in bigram_counts or not bigram_counts[current_word]:
        print(f"No bigram data for '{current_word}'. Returning random words.")
        return random.choices(list(unigram_counts.keys()), k=top_n)

    next_word_probabilities = {}

    for next_word in unigram_counts.keys():
        if next_word == '<s>' and current_word != '<s>':
            continue
        next_word_probabilities[next_word] = bigram_prob(current_word, next_word, k=k)

    sorted_predictions = sorted(next_word_probabilities.items(), key=lambda item: item[1], reverse=True)
    predicted_words = [word for word, prob in sorted_predictions[:top_n]]

    return predicted_words

# Example Usage:
print("Predicting next word after 'natural' (k=1):", predict_next_word_bigram('natural', k=1))
print("Predicting next word after 'machine' (k=0.5):", predict_next_word_bigram('machine', k=0.5))
print("Predicting next word after 'nonexistent_word' (k=1):", predict_next_word_bigram('nonexistent_word', k=1))

Predicting next word after 'natural' (k=1): ['language']
Predicting next word after 'machine' (k=0.5): ['learning']
No bigram data for 'nonexistent_word'. Returning random words.
Predicting next word after 'nonexistent_word' (k=1): ['natural']


In [97]:
def build_bigram(data):
    counts = defaultdict(Counter)
    for sentence in data:
        for i in range(len(sentence)-1):
            counts[sentence[i]][sentence[i+1]] += 1
    return counts

bigram_counts = build_bigram(train_data)


In [87]:
def build_trigram(data):
    counts = defaultdict(lambda: defaultdict(Counter))
    for sentence in data:
        for i in range(len(sentence)-2):
            counts[sentence[i]][sentence[i+1]][sentence[i+2]] += 1
    return counts

trigram_counts = build_trigram(train_data)


In [88]:
def unigram_prob(word, k=1):
    return (unigram_counts[word] + k) / (total_words + k * vocab_size)

def bigram_prob(w1, w2, k=1):
    return (bigram_counts[w1][w2] + k) / (sum(bigram_counts[w1].values()) + k * vocab_size)

def trigram_prob(w1, w2, w3, k=1):
    return (trigram_counts[w1][w2][w3] + k) / (sum(trigram_counts[w1][w2].values()) + k * vocab_size)

In [96]:
def sentence_probability(sentence, model):
    prob = 1

    if model == "unigram":
        for w in sentence:
            prob *= unigram_prob(w)

    elif model == "bigram":
        for i in range(len(sentence)-1):
            prob *= bigram_prob(sentence[i], sentence[i+1])

    elif model == "trigram":
        for i in range(len(sentence)-2):
            prob *= trigram_prob(sentence[i], sentence[i+1], sentence[i+2])

    return prob

# Test on test sentence
for s in test_data:
    print("Sentence:", " ".join(s))
    print("Unigram:", sentence_probability(s, "unigram"))
    print("Bigram:", sentence_probability(s, "bigram"))
    print("Trigram:", sentence_probability(s, "trigram"))


Sentence: <s> text preprocessing removes noise from raw data </s>
Unigram: 4.098039538740913e-15
Bigram: 8.075279144492282e-12
Trigram: 2.1803253690129166e-10


In [95]:
def perplexity(data, model):
    N = 0
    log_prob = 0

    for sentence in data:
        N += len(sentence)

        if model == "unigram":
            for w in sentence:
                log_prob += np.log(unigram_prob(w))

        elif model == "bigram":
            for i in range(len(sentence)-1):
                log_prob += np.log(bigram_prob(sentence[i], sentence[i+1]))

        elif model == "trigram":
            for i in range(len(sentence)-2):
                log_prob += np.log(trigram_prob(sentence[i], sentence[i+1], sentence[i+2]))

    return np.exp(-log_prob / N)

print("Unigram Perplexity:", perplexity(test_data, "unigram"))
print("Bigram Perplexity:", perplexity(test_data, "bigram"))
print("Trigram Perplexity:", perplexity(test_data, "trigram"))


Unigram Perplexity: 39.682831291441175
Bigram Perplexity: 17.08197377103495
Trigram Perplexity: 11.843979102308912
