# In [None]:
import math
from nltk import bigrams, trigrams
from nltk.tokenize import word_tokenize
from collections import Counter
import wikipedia

# Fetch the Wikipedia article whose text is used as the training corpus.
# NOTE(review): wikipedia.page() presumably performs a network request and may
# raise if the title cannot be resolved — confirm against the library docs.
page = wikipedia.page("Natural_language_processing")
text = page.content[:1000]  # Keep only the first 1000 characters (the cut may fall mid-word)

# Show the beginning of the corpus as a quick sanity check of what was fetched.
print("Wikipedia excerpt (first 300 chars):")
print(text[:300])

# ----------------------------
# BIGRAM MODEL
# ----------------------------
def bigram_probabilities(text):
    """
    Estimate conditional bigram probabilities P(w2 | w1) from raw text.

    The text is lowercased and tokenized with ``word_tokenize``. Each bigram
    count is divided by the number of times its first word occurs as a
    context, i.e. is followed by another token.

    text: the training text as a single string.
    Returns a dict mapping (w1, w2) tuples to their maximum-likelihood
    estimate. Text with fewer than two tokens yields an empty dict.
    """
    tokens = word_tokenize(text.lower())
    bigram_counts = Counter(bigrams(tokens))
    # The final token is never followed by anything, so it must not be counted
    # as a context. Using the raw unigram count would make P(. | last_token)
    # sum to less than 1 whenever that token also appears earlier in the text.
    context_counts = Counter(tokens[:-1])

    # P(w2|w1) = count(w1, w2) / count(w1 used as a context)
    return {
        (w1, w2): count / context_counts[w1]
        for (w1, w2), count in bigram_counts.items()
    }

# Train the bigram model on the Wikipedia excerpt.
wiki_bigram = bigram_probabilities(text)

def predict_next_word(bigram_probs, current_word):
    """
    Return the most probable word to follow ``current_word``.

    bigram_probs: dict mapping (w1, w2) tuples to P(w2 | w1).
    current_word: the word to continue from; compared as-is against w1.
    Returns the w2 with the highest probability among bigrams starting with
    ``current_word`` (the first such entry wins ties), or None when no bigram
    starts with it.
    """
    # Collect every continuation of current_word together with its probability.
    followers = {}
    for pair, prob in bigram_probs.items():
        if pair[0] == current_word:
            followers[pair[1]] = prob

    if followers:
        # Pick the continuation with the largest probability.
        return max(followers, key=followers.get)
    return None  # current_word never appears as the first word of a bigram

# Demo: predict the word that follows "natural" with the bigram model.
# The earlier query word ("barks") is unrelated to this article and most likely
# absent from the excerpt, so the demo printed None. "natural" is part of the
# article title and is expected to occur in the lowercased excerpt.
predicted_word = predict_next_word(wiki_bigram, "natural")
print(f"Predicted next word after 'natural' (bigram): {predicted_word}")

# ----------------------------
# TRIGRAM MODEL
# ----------------------------
def trigram_probabilities(text):
    """
    Estimate conditional trigram probabilities P(w3 | w1, w2) from raw text.

    The text is lowercased and tokenized with ``word_tokenize``. Each trigram
    count is divided by the number of times its first two words occur as a
    context, i.e. are followed by a third token.

    text: the training text as a single string.
    Returns a dict mapping (w1, w2, w3) tuples to their maximum-likelihood
    estimate. Text with fewer than three tokens yields an empty dict.
    """
    tokens = word_tokenize(text.lower())
    trigram_counts = Counter(trigrams(tokens))
    # The final bigram is never followed by a third token, so it must not be
    # counted as a context. Counting it would make P(. | w1, w2) sum to less
    # than 1 whenever that bigram also appears earlier in the text.
    context_counts = Counter(bigrams(tokens[:-1]))

    # P(w3|w1, w2) = count(w1, w2, w3) / count((w1, w2) used as a context)
    return {
        (w1, w2, w3): count / context_counts[(w1, w2)]
        for (w1, w2, w3), count in trigram_counts.items()
    }

# Train the trigram model on the same Wikipedia excerpt.
wiki_trigram = trigram_probabilities(text)

def predict_next_word_trigram(trigram_probs, context):
    """
    Return the most probable word to follow a two-word context.

    trigram_probs: dict mapping (w1, w2, w3) tuples to P(w3 | w1, w2).
    context: a tuple (word1, word2); only its first two items are read.
    Returns the w3 with the highest probability among trigrams that start
    with the context (the first such entry wins ties), or None when no
    trigram starts with it.
    """
    # Gather every third word seen after the context, with its probability.
    followers = {}
    for gram, prob in trigram_probs.items():
        if gram[0] != context[0] or gram[1] != context[1]:
            continue  # this trigram starts with a different context
        followers[gram[2]] = prob

    if followers:
        # Pick the continuation with the largest probability.
        return max(followers, key=followers.get)
    return None

# Demo: predict the word that follows the two-word context "natural language".
# The context words are lowercase because the model is built from lowercased tokens.
predicted_trigram_word = predict_next_word_trigram(wiki_trigram, ("natural", "language"))
print(f"Predicted next word after 'natural language' (trigram): {predicted_trigram_word}")

# ----------------------------
# PERPLEXITY FUNCTIONS
# ----------------------------
def perplexity_bigram(bigram_probs, sentence, smoothing=1e-6):
    """
    Compute the perplexity of ``sentence`` under a bigram model.

    bigram_probs: dict mapping (w1, w2) tuples to P(w2 | w1).
    sentence: raw sentence; it is lowercased and tokenized with
        ``word_tokenize`` before scoring.
    smoothing: probability used for any bigram missing from ``bigram_probs``
        (a fixed floor, not a renormalised smoothing scheme).
    Returns exp of the negated average log-probability over the sentence's
    bigrams; a sentence with fewer than two tokens yields 1.0.
    """
    words = word_tokenize(sentence.lower())
    pairs = list(zip(words, words[1:]))
    if not pairs:
        # No bigram to score: the accumulated log-probability is 0, exp(0) == 1.
        return 1.0

    # Accumulate log-probabilities left to right over consecutive word pairs.
    total_log_prob = 0
    for pair in pairs:
        total_log_prob += math.log(bigram_probs.get(pair, smoothing))

    return math.exp(-(total_log_prob / len(pairs)))

def perplexity_trigram(trigram_probs, sentence, smoothing=1e-6):
    """
    Compute the perplexity of ``sentence`` under a trigram model.

    trigram_probs: dict mapping (w1, w2, w3) tuples to P(w3 | w1, w2).
    sentence: raw sentence; it is lowercased and tokenized with
        ``word_tokenize`` before scoring.
    smoothing: probability used for any trigram missing from
        ``trigram_probs`` (a fixed floor, not a renormalised smoothing scheme).
    Returns exp of the negated average log-probability over the sentence's
    trigrams; a sentence with fewer than three tokens yields 1.0.
    """
    words = word_tokenize(sentence.lower())

    # Running total of log-probabilities and the number of trigrams scored.
    total_log_prob = 0
    scored = 0
    for gram in zip(words, words[1:], words[2:]):
        total_log_prob += math.log(trigram_probs.get(gram, smoothing))
        scored += 1

    if scored == 0:
        # Nothing was scored, so the total is still 0 and exp(-0) == 1.0.
        return math.exp(-total_log_prob)
    return math.exp(-(total_log_prob / scored))

# ----------------------------
# TEST SENTENCES
# ----------------------------
# Evaluate both models on a sentence chosen to resemble the NLP domain: its
# vocabulary and structure are likely (not guaranteed) to appear in the training corpus.
# N-grams missing from a model are scored with the perplexity functions'
# default `smoothing` floor (1e-6), since no explicit value is passed here.
test_sentence = "Natural language processing enables computers to analyze and understand human language."
bigram_perplexity = perplexity_bigram(wiki_bigram, test_sentence)
trigram_perplexity = perplexity_trigram(wiki_trigram, test_sentence)

# Report the perplexity of each model on the test sentence.
print("\nPerplexity on updated test sentence:")
print(f"Bigram Perplexity: {bigram_perplexity}")
print(f"Trigram Perplexity: {trigram_perplexity}")
