In [2]:
from collections import Counter

In [3]:

def unigram_prob(word, corpus):
    tokens = [word for sentence in corpus for word in sentence.split()]
    unigram_counts = Counter(tokens)
    N = len(tokens)
    return unigram_counts[word] / N

corpus = [
    "the quick brown fox jumps over the lazy dog"
]

# Calculate P("the")
prob_the = unigram_prob("quick", corpus)
print(f"P('quick') = {prob_the:.2f}")

P('quick') = 0.11


In [4]:
def extract_bigrams(sentence):
    words = sentence.split()
    return list(zip(words[:-1], words[1:]))

sentence = "<s> I will be doing my project soon </s>"
bigrams = extract_bigrams(sentence)
print(bigrams)

[('<s>', 'I'), ('I', 'will'), ('will', 'be'), ('be', 'doing'), ('doing', 'my'), ('my', 'project'), ('project', 'soon'), ('soon', '</s>')]


In [5]:
def conditional_prob_bigram(w2, w1, bigrams, unigram_counts, laplace=True):
    bigram = (w1, w2)
    V = len(unigram_counts)  # Vocabulary size
    count_bigram = bigrams.get(bigram, 0)
    count_w1 = unigram_counts.get(w1, 0)
    
    if laplace:
        return (count_bigram + 1) / (count_w1 + V)
    else:
        return count_bigram / count_w1 if count_w1 != 0 else 0

all_tokens = [token for sentence in corpus for token in sentence.split()]

unigram_counts = Counter(all_tokens)
bigrams_corpus = [extract_bigrams(sentence) for sentence in corpus]
bigram_counts = Counter(b for sentence_bigrams in bigrams_corpus for b in sentence_bigrams)

# word = input() //Tried multiple things here, seems to be working as expected
prob_cat_given_the = conditional_prob_bigram("only", "is", bigram_counts, unigram_counts)
print(f"P('project'|'the') = {prob_cat_given_the:.2f}") 

P('project'|'the') = 0.12


In [6]:
def predict_next_word(w1, bigrams, unigram_counts):
    candidates = [bigram[1] for bigram in bigrams if bigram[0] == w1]
    if not candidates:
        return None
    return max(candidates, key=lambda x: bigrams.get((w1, x), 0))

next_word = predict_next_word("lazy", bigram_counts, unigram_counts)
print(f"Next word after 'the': {next_word}")

Next word after 'the': dog


In [7]:
def predict_sentence(initial_phrase, bigram_counts, unigram_counts, max_length=10):
    words = initial_phrase.split()
    while len(words) < max_length:
        last_word = words[-1]
        next_word = predict_next_word(last_word, bigram_counts, unigram_counts)
        if not next_word or next_word == "</s>":
            break
        words.append(next_word)
    return " ".join(words)

initial_phrase = "brown fox"
generated_sentence = predict_sentence(initial_phrase, bigram_counts, unigram_counts)
print(generated_sentence)

brown fox jumps over the quick brown fox jumps over


In [8]:
def extract_trigrams(sentence):
    words = sentence.split()
    return list(zip(words[:-2], words[1:-1], words[2:]))

In [9]:
pip install nltk

Collecting nltk
  Using cached nltk-3.9.1-py3-none-any.whl (1.5 MB)
Collecting regex>=2021.8.3
  Using cached regex-2024.11.6-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (781 kB)
Installing collected packages: regex, nltk
Successfully installed nltk-3.9.1 regex-2024.11.6
Note: you may need to restart the kernel to use updated packages.


In [10]:
import nltk
nltk.download("brown")
nltk.download("webtext")
nltk.download("reuters")
nltk.download('punkt_tab')
from nltk.corpus import brown, webtext, reuters


[nltk_data] Downloading package brown to /home/jovyan/nltk_data...
[nltk_data]   Package brown is already up-to-date!
[nltk_data] Downloading package webtext to /home/jovyan/nltk_data...
[nltk_data]   Package webtext is already up-to-date!
[nltk_data] Downloading package reuters to /home/jovyan/nltk_data...
[nltk_data]   Package reuters is already up-to-date!
[nltk_data] Downloading package punkt_tab to /home/jovyan/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [11]:
brown_corpus = brown.sents()
brown_corpus = [" ".join(sentence) for sentence in brown_corpus]
brown_corpus = ["<s> " + sentence + " </s>" for sentence in brown_corpus][:5000]
webtext_corpus = webtext.sents()
webtext_corpus = [" ".join(sentence) for sentence in webtext_corpus]
webtext_corpus = ["<s> " + sentence + " </s>" for sentence in webtext_corpus][:5000]
reuters_corpus = reuters.sents()
reuters_corpus = [" ".join(sentence) for sentence in reuters_corpus]
reuters_corpus = ["<s> " + sentence + " </s>" for sentence in reuters_corpus][:5000]

In [18]:
import math

#I made this generixc function which handles any ngram so that I don't have t write bi or trigram funtion seperately.
def extract_ngrams(sentence, n=2):
    tokens = sentence.split()
    return list(zip(*[tokens[i:] for i in range(n)]))

def build_ngram_model(corpus, n=2):
    ngram_counts = Counter()
    for sentence in corpus:
        ngram_counts.update(extract_ngrams(sentence, n))
    return ngram_counts


bigrams_brown = build_ngram_model(brown_corpus, 2)
trigrams_brown = build_ngram_model(brown_corpus, 3)
bigrams_webtext = build_ngram_model(webtext_corpus, 2)
trigrams_webtext = build_ngram_model(webtext_corpus, 3)
bigrams_reuters = build_ngram_model(reuters_corpus, 2)
trigrams_reuters = build_ngram_model(reuters_corpus, 3)


def conditional_prob(w_given, context, ngram_counts, prefix_counts, vocab_size, laplace=True):
    count_ngram = ngram_counts.get(context + (w_given,), 0)
    count_prefix = prefix_counts.get(context, 0)
    
    if laplace:
        return (count_ngram + 1) / (count_prefix + vocab_size)
    return count_ngram / count_prefix if count_prefix != 0 else 0


def calculate_perplexity(name, sentence, ngram_counts, prefix_counts, vocab_size, n=2):
    """Calculate perplexity for a given sentence without using logarithms."""
    tokens = sentence.split()
    num_tokens = len(tokens)
    prob_product = 1.0 
    
    for i in range(num_tokens):
        context = tuple(tokens[max(0, i - n + 1):i])

        prob = conditional_prob(tokens[i], context, ngram_counts, prefix_counts, vocab_size)
        
        # Avoiding multiplication by zero by adding a small smoothing factor
        prob_product *= (prob + 1e-10)

    # Compute perplexity directly using n-th root
    perplexity = prob_product ** (-1 / num_tokens)
    
    return perplexity


if __name__ == "__main__":
    brown_unigrams = Counter(token for sent in brown_corpus for token in sent.split())
    webtext_unigrams = Counter(token for sent in webtext_corpus for token in sent.split())
    
    test_sentences = brown_corpus[10:15]    

    bg_ppl = [calculate_perplexity("brownBG", sent, bigrams_brown, brown_unigrams, 
                                  len(brown_unigrams), 2) for sent in test_sentences]
    avg_bg_ppl = sum(bg_ppl)/len(bg_ppl)
    

    tg_ppl = [calculate_perplexity("brownTG",sent, trigrams_brown, bigrams_brown,
                                  len(brown_unigrams), 3) for sent in test_sentences]
    avg_tg_ppl = sum(tg_ppl)/len(tg_ppl)
    
    print("\nQuestion 1 Results:")
    print(f"Brown Bigram Avg Perplexity: {avg_bg_ppl:.2f}")
    print(f"Brown Trigram Avg Perplexity: {avg_tg_ppl:.2f}")
    print("Expected Trigram perplexity to be lower, but it might be higher due to data sparsity")
    

    
    reuters_unigrams = Counter(token for sent in brown_corpus for token in sent.split())    
    test_sentences_reuters = reuters_corpus[30:55]    

    bg_ppl_reuters = [calculate_perplexity("webtextBG", sent, bigrams_webtext, webtext_unigrams, 
                                  len(webtext_unigrams), 2) for sent in test_sentences_reuters]
    avg_bg_ppl_reuters = sum(bg_ppl_reuters)/len(bg_ppl_reuters)
    

    tg_ppl_reuters = [calculate_perplexity("webtextTG",sent, trigrams_webtext, bigrams_webtext,
                                  len(webtext_unigrams), 3) for sent in test_sentences_reuters]
    avg_tg_ppl_reuters = sum(tg_ppl_reuters)/len(tg_ppl_reuters)
    

    print("\nQuestion 2 Results:")
    print(f"Webtext Bigram Avg PPL on Reuters: {avg_bg_ppl_reuters:.2f}")
    print(f"Webtext Trigram Avg PPL on Reuters: {avg_tg_ppl_reuters:.2f}")
    
    
    
    bg_ppl_reuters_b = [calculate_perplexity("brownBG", sent, bigrams_brown, brown_unigrams, 
                                  len(brown_unigrams), 2) for sent in test_sentences_reuters]
    avg_bg_ppl_reuters_b = sum(bg_ppl_reuters_b)/len(bg_ppl_reuters_b)
    

    tg_ppl_reuters_b = [calculate_perplexity("brownTG",sent, trigrams_brown, bigrams_brown,
                                  len(brown_unigrams), 3) for sent in test_sentences_reuters]
    avg_tg_ppl_reuters_b = sum(tg_ppl_reuters_b)/len(tg_ppl_reuters_b)
    

    print("\nQuestion 2 Results:")
    print(f"Brown Bigram Avg PPL on Reuters: {avg_bg_ppl_reuters_b:.2f}")
    print(f"Brown Trigram Avg PPL on Reuters: {avg_tg_ppl_reuters_b:.2f}")


    
    print("\nExample Sentence Generation (Brown Trigrams):")
    print(predict_sentence("justice", trigrams_brown, bigrams_brown,
                          len(brown_unigrams)))
    
    print("\nExample Sentence Generation (Brown Bigrams):")
    print(predict_sentence("A man", bigrams_brown, brown_unigrams,
                          len(brown_unigrams)))


Question 1 Results:
Brown Bigram Avg Perplexity: 2236.17
Brown Trigram Avg Perplexity: 6643.72
Expected Trigram perplexity to be lower, but it might be higher due to data sparsity

Question 2 Results:
Webtext Bigram Avg PPL on Reuters: 5978.34
Webtext Trigram Avg PPL on Reuters: 11170.29

Question 2 Results:
Brown Bigram Avg PPL on Reuters: 6529.01
Brown Trigram Avg PPL on Reuters: 13575.60

Example Sentence Generation (Brown Trigrams):
justice of Atlanta's recent primary election produced `` no evidence '' that any irregularities took place .

Example Sentence Generation (Brown Bigrams):
A man , and the first time .


**Questions**

1. What do you expect the difference between the Brown bigram and trigram models to look like? Which model
will provide you with more coherent text? How will the perplexity of each compare? You should test your
predictor and perplexity function using the brown_bigrams and brown_trigrams to confirm your expectations. For perplexity, an average over 2-5 sentences from the Brown corpus should be fine, but make sure you
use the same sentences both times. If something you did not expect occurs, explain what happened and why you
believe it happened.

**Answer:** As per the theoratical study and the the concepts which I understood in class, trigrams should generate more relatable and readable sentences given a corpus. The trigram model should give us the more coherent text.
I have tested my predictor fucntion and perplexity function using the brown_bigrams and brown_trigrams (5 random sentences taken from the corpus and kept same for both, trigrams and bigrams model) from the brown text corpus. I had expected that my model should have less perplexity for the trigrams model but the result said otherwise. I could see the bigram perplexity was lower than the trigram, which was unexpected.
In a way I could make sense of it as, the corpus (5 lines) are too low for the trigrams model to given proper perplexity but having three set of words appear in 5 sentences has a low probability in this small samle space. As perplexity is inversely proportional to the Probability of the occuring words, the perplexity increased for trigrams.

2. When testing our bigram models on the Reuters data, do you think a model trained on Brown or Webtext will
perform best? Pick any 25 sentences from the Reuters corpus and calculate the average perplexity using each
of your bigram datasets. Compare the results of each and provide explanation as to why you believe that one
performed better than the other.

**Answer:** When testing bigram models trained on the Brown and Webtext corpora on Reuters sentences, the Webtext model performed better, yielding a lower perplexity score on average. This indicates that Webtext's structure and vocabulary align more closely with Reuters' modern financial news style, whereas the Brown corpus contains older, more formal language that does not generalize as well.
Comparing the performance results:

Webtext contains more modern terms, making it more suited to Reuters’ language which in general led to overlapping vocabulary.

Financial and news-related text structures align better with Webtext than Brown.

The Brown corpus includes legal, literary, and outdated news styles, which do not suit well to Reuters' financial news domain.

3. When predicting the next word in a sentence, what do you believe would happen if we increased the number of
sentences in our training data?

**Answer:** Increasing the number of sentences in the training data would improve next-word prediction by providing better probability estimates and reducing perplexity of the trigram and bigram models both. With more data, the model learns a wider range of word combinations, making predictions more accurate and reducing the chances of encountering unknown sequences. It will also help capture rare words and domain-specific terms, leading to a more balanced and natural output. 

# References Used for this assignment

1. https://web.stanford.edu/~jurafsky/slp3/3.pdf
2. Lectrue Slides
3. https://training.continuumlabs.ai/data/datasets/what-is-perplexity