## Implementing an N-gram model using the Berkeley Restaurant Project (BeRP) dataset

In [1]:
import re
import pandas as pd
import numpy as np
from collections import defaultdict, Counter

### Reading the dataset from a file and transforming it into a list of lines

In [2]:
corpus_file = 'berp_transcript.txt'
corpus = []
try:
    with open(corpus_file, 'r', encoding='utf-8') as file:
        # One entry per non-blank line, with surrounding whitespace removed
        corpus = [line.strip() for line in file if line.strip()]
except FileNotFoundError:
    print(f"Error: File '{corpus_file}' not found. Please ensure it is in the same directory as this notebook.")
    # Re-raise rather than calling exit(): inside a notebook, exit() asks the
    # kernel to shut down, whereas an exception just stops this cell (and a
    # "Run All") with a visible traceback.
    raise

In [3]:
# Peek at the first raw transcript line to see the input format before preprocessing
corpus[0]

"33_1_0001 okay let's see i want to go to a thai restaurant . [uh] with less than ten dollars per person"

Implementing a simple bigram model

In [4]:

# Preprocess the corpus
def preprocess_sentence(sentence):
    """
    Turn one raw transcript line into a list of bigram-model tokens.

    Steps, in order:
      1. drop the first whitespace-delimited field (the utterance id, e.g. "33_1_0001");
      2. delete the filler tokens "[uh]" / "[mm]" and anything in angle brackets;
      3. lowercase;
      4. replace every character that is neither a word character nor whitespace
         with a space (so "let's" -> "let", "s" and "na-" -> "na"; underscores are
         word characters, so "a__m" stays a single token);
      5. split on whitespace and wrap the result in '<s>' ... '</s>'.

    Args:
        sentence (str): One raw line of the transcript.

    Returns:
        list: Tokens, starting with '<s>' and ending with '</s>'.
    """
    # Deletions happen before lowercasing, so the filler patterns are case-sensitive
    for removal_pattern in (r'^\S+\s+', r'\[uh\]|\[mm\]', r'<[^>]+>'):
        sentence = re.sub(removal_pattern, '', sentence)

    lowered = sentence.lower()
    # Punctuation becomes a space, then whitespace runs collapse to one space
    spaced = re.sub(r'[^\w\s]', ' ', lowered)
    collapsed = re.sub(r'\s+', ' ', spaced)

    return ['<s>', *collapsed.strip().split(), '</s>']

In [5]:
# Tokenize every transcript line, then print the first result as a sanity check
processed_corpus = list(map(preprocess_sentence, corpus))
print(processed_corpus[0])

['<s>', 'okay', 'let', 's', 'see', 'i', 'want', 'to', 'go', 'to', 'a', 'thai', 'restaurant', 'with', 'less', 'than', 'ten', 'dollars', 'per', 'person', '</s>']


In [6]:
# Vocabulary = every distinct token in the processed corpus (boundary markers
# included), sorted so table rows/columns always come out in the same order
vocabulary = sorted({token for sentence in processed_corpus for token in sentence})

In [7]:
# Vocabulary size (number of distinct tokens, including <s> and </s>)
len(vocabulary)

1653

In [8]:
# Tally every adjacent token pair (w1, w2) across the processed corpus
bigram_counts = defaultdict(int)
for sentence in processed_corpus:
    # Pair each token with its successor
    for w1, w2 in zip(sentence, sentence[1:]):
        bigram_counts[(w1, w2)] += 1

In [9]:
# Show the raw (w1, w2) -> count mapping; note this renders the entire dictionary
bigram_counts

defaultdict(int,
            {('<s>', 'okay'): 136,
             ('okay', 'let'): 20,
             ('let', 's'): 269,
             ('s', 'see'): 13,
             ('see', 'i'): 5,
             ('i', 'want'): 911,
             ('want', 'to'): 682,
             ('to', 'go'): 369,
             ('go', 'to'): 171,
             ('to', 'a'): 80,
             ('a', 'thai'): 24,
             ('thai', 'restaurant'): 36,
             ('restaurant', 'with'): 16,
             ('with', 'less'): 1,
             ('less', 'than'): 177,
             ('than', 'ten'): 99,
             ('ten', 'dollars'): 156,
             ('dollars', 'per'): 30,
             ('per', 'person'): 24,
             ('person', '</s>'): 26,
             ('<s>', 'i'): 2791,
             ('i', 'like'): 81,
             ('like', 'to'): 1182,
             ('to', 'eat'): 758,
             ('eat', 'at'): 41,
             ('at', 'lunch'): 7,
             ('lunch', 'time'): 8,
             ('time', 'so'): 2,
             ('so', 'that'): 

In [10]:
# Dense bigram count table as nested dicts: matrix[w1][w2] = C(w1 w2).
# Every cell starts at zero, then the observed counts are filled in.
matrix = {w1: dict.fromkeys(vocabulary, 0) for w1 in vocabulary}
for (w1, w2), count in bigram_counts.items():
    matrix[w1][w2] = count

In [11]:
# Counts of each word following the end-of-sentence marker
matrix['</s>']

{'</s>': 0,
 '<s>': 0,
 '_i': 0,
 'a': 0,
 'a__m': 0,
 'able': 0,
 'abou': 0,
 'about': 0,
 'accommodate': 0,
 'account': 0,
 'ace': 0,
 'actual': 0,
 'actually': 0,
 'addison': 0,
 'addition': 0,
 'additional': 0,
 'address': 0,
 'ady': 0,
 'affordable': 0,
 'africa': 0,
 'african': 0,
 'after': 0,
 'afternoon': 0,
 'afterwards': 0,
 'agai': 0,
 'again': 0,
 'age': 0,
 'ah': 0,
 'ahead': 0,
 'albany': 0,
 'alcohol': 0,
 'alfredo': 0,
 'all': 0,
 'allow': 0,
 'allows': 0,
 'allston': 0,
 'alre': 0,
 'already': 0,
 'alright': 0,
 'also': 0,
 'although': 0,
 'always': 0,
 'am': 0,
 'amaru': 0,
 'american': 0,
 'amoun': 0,
 'amount': 0,
 'an': 0,
 'and': 0,
 'andean': 0,
 'ann': 0,
 'annex': 0,
 'anniversary': 0,
 'another': 0,
 'answer': 0,
 'antarctican': 0,
 'any': 0,
 'anybody': 0,
 'anymore': 0,
 'anyplace': 0,
 'anything': 0,
 'anytime': 0,
 'anyways': 0,
 'anywhere': 0,
 'appear': 0,
 'apple': 0,
 'appreciate': 0,
 'appropriate': 0,
 'approximately': 0,
 'arab': 0,
 'arabian': 0,
 

In [12]:
# Convert to pandas DataFrame for display. `matrix` is keyed by the first word,
# so after the transpose the rows are w1 and the columns are w2.
df = pd.DataFrame(matrix).T
# NOTE(review): no explicit column reordering happens here; the columns are
# expected to follow `vocabulary` order already, since every inner dict of
# `matrix` was built from it — confirm if the order matters.

# Display the table
print("Bigram Counts Table (All Words in BeRP Subset):")
df

Bigram Counts Table (All Words in BeRP Subset):


Unnamed: 0,</s>,<s>,_i,a,a__m,able,abou,about,accommodate,account,...,yorkshire,yoshi,you,your,yourself,yuck,yuppie,zachary,zucchini,zza
</s>,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
<s>,3,0,0,34,0,0,1,51,0,0,...,0,0,17,0,0,0,0,0,0,0
_i,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
a,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
a__m,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
yuck,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
yuppie,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
zachary,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
zucchini,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [13]:
# Write the full bigram count table to a CSV file in the working directory
df.to_csv('bigram_counts_all_words.csv')

In [66]:
# NOTE(review): this rebuilds `matrix` with the same code as the earlier
# "Create the bigram counts matrix" cell; the result differs only if
# `bigram_counts` or `vocabulary` changed in between. Candidate for removal.
matrix = {w1: {w2: 0 for w2 in vocabulary} for w1 in vocabulary}
for (w1, w2), count in bigram_counts.items():
    matrix[w1][w2] = count
    

In [15]:
# Flatten the tokenized sentences into a single token stream.
# `processed_corpus` already holds preprocess_sentence(line) for every line of
# `corpus`, so it is reused here instead of preprocessing the raw corpus a
# second time; the resulting list is the same.
all_tokens = [token for sentence_tokens in processed_corpus for token in sentence_tokens]



In [16]:
# Inspect the flattened token stream; note this renders the entire list
all_tokens

['<s>',
 'okay',
 'let',
 's',
 'see',
 'i',
 'want',
 'to',
 'go',
 'to',
 'a',
 'thai',
 'restaurant',
 'with',
 'less',
 'than',
 'ten',
 'dollars',
 'per',
 'person',
 '</s>',
 '<s>',
 'i',
 'like',
 'to',
 'eat',
 'at',
 'lunch',
 'time',
 'so',
 'that',
 'would',
 'be',
 'eleven',
 'a__m',
 'to',
 'one',
 'p__m',
 '</s>',
 '<s>',
 'i',
 'don',
 't',
 'want',
 'to',
 'walk',
 'for',
 'more',
 'than',
 'five',
 'minutes',
 '</s>',
 '<s>',
 'tell',
 'me',
 'more',
 'about',
 'the',
 'na',
 'nakapan',
 'restaurant',
 'on',
 'martin',
 'luther',
 'king',
 '</s>',
 '<s>',
 'i',
 'like',
 'to',
 'go',
 'to',
 'a',
 'hamburger',
 'restaurant',
 '</s>',
 '<s>',
 'let',
 's',
 'start',
 'again',
 '</s>',
 '<s>',
 'i',
 'like',
 'to',
 'get',
 'a',
 'hamburger',
 'at',
 'an',
 'american',
 'restaurant',
 '</s>',
 '<s>',
 'i',
 'd',
 'like',
 'to',
 'eat',
 'dinner',
 'and',
 'i',
 'don',
 't',
 'mind',
 'walking',
 'for',
 'half',
 'an',
 'hour',
 '</s>',
 '<s>',
 'i',
 'don',
 't',
 'want'

In [17]:
# Count unigrams: Counter tallies how often each token (boundary markers
# included) occurs in the flattened token stream
unigram_counts = Counter(all_tokens)
unigram_counts

Counter({'<s>': 8566,
         '</s>': 8566,
         'i': 3844,
         'to': 2717,
         'like': 1523,
         'food': 1245,
         'about': 1154,
         'the': 1120,
         'a': 1073,
         'want': 1036,
         'me': 928,
         's': 834,
         'eat': 829,
         'would': 799,
         'restaurant': 754,
         'have': 743,
         'dollars': 707,
         'for': 673,
         'd': 667,
         'on': 632,
         'you': 569,
         'start': 549,
         'over': 523,
         'more': 510,
         'go': 500,
         'of': 486,
         'dinner': 474,
         'is': 459,
         'tell': 457,
         'can': 456,
         't': 455,
         'restaurants': 452,
         'some': 421,
         'any': 405,
         'it': 400,
         'than': 398,
         'lunch': 392,
         'in': 387,
         'be': 381,
         'and': 342,
         'm': 341,
         'ten': 334,
         'what': 321,
         'spend': 311,
         'do': 308,
         'information': 

In [18]:
# One-column ("Count") table of unigram counts, with one row per token sorted by token
counts_as_row = pd.DataFrame([unigram_counts], index=['Count'])
all_counts_df = counts_as_row.T.sort_index()

In [19]:
# Display the unigram count table
all_counts_df

Unnamed: 0,Count
</s>,8566
<s>,8566
_i,1
a,1073
a__m,2
...,...
yuck,1
yuppie,1
zachary,9
zucchini,2


In [20]:
# Maximum-likelihood bigram estimate for a sentence starting with "okay":
# P(okay | <s>) = count(<s> okay) / count(<s>)
count_okay_given_s = bigram_counts[('<s>', 'okay')]

print("Count (okay/<s>)", count_okay_given_s)
print("Count (<s>)", unigram_counts['<s>'])
okay_appears_at_start = count_okay_given_s/ unigram_counts['<s>']

print("P(okay/<s>)", okay_appears_at_start)

Count (okay/<s>) 136
Count (<s>) 8566
P(okay/<s>) 0.015876721923885128


### Predicting next word

In [21]:
# Keep only the observed bigrams whose first word is 'to'
input_bigrams = {
    (first, second): count
    for (first, second), count in bigram_counts.items()
    if first == 'to'
}
input_bigrams

{('to', 'go'): 369,
 ('to', 'a'): 80,
 ('to', 'eat'): 758,
 ('to', 'one'): 6,
 ('to', 'walk'): 29,
 ('to', 'get'): 40,
 ('to', 'spend'): 233,
 ('to', 'drive'): 12,
 ('to', 'pay'): 32,
 ('to', 'um'): 5,
 ('to', 'have'): 358,
 ('to', 'i__c__s__i'): 22,
 ('to', 'be'): 65,
 ('to', 'find'): 35,
 ('to', 'vegetarian'): 2,
 ('to', 'try'): 18,
 ('to', 'see'): 30,
 ('to', 'travel'): 93,
 ('to', '</s>'): 6,
 ('to', 'sixteen'): 1,
 ('to', 'some'): 3,
 ('to', 'ten'): 32,
 ('to', 'twenty'): 23,
 ('to', 'do'): 2,
 ('to', 'fifteen'): 14,
 ('to', 'dinner'): 14,
 ('to', 'the'): 58,
 ('to', 'ea'): 1,
 ('to', 'friday'): 1,
 ('to', 'know'): 100,
 ('to', 'here'): 2,
 ('to', 'take'): 16,
 ('to', 'twelve'): 5,
 ('to', 'that'): 1,
 ('to', 'thirty'): 9,
 ('to', 'chinese'): 3,
 ('to', 'three'): 1,
 ('to', 'lunch'): 6,
 ('to', 'ic'): 1,
 ('to', 'pizza'): 1,
 ('to', 'an'): 28,
 ('to', 'icksee'): 24,
 ('to', 'just'): 1,
 ('to', 'shell'): 1,
 ('to', 'about'): 2,
 ('to', 'kind'): 1,
 ('to', 'my'): 1,
 ('to', 'this'):

In [22]:
# Total occurrences of 'to' — the denominator for P(w | to)
unigram_counts['to']

2717

In [23]:
# Pair each 'to' bigram with (count, probability), where
# probability = bigram count / count of the bigram's first word
prob = [
    (bigram, (count, count / unigram_counts[bigram[0]]))
    for bigram, count in input_bigrams.items()
]

# Highest count first; bigrams with equal counts keep their original relative order
sorted_prob = sorted(prob, key=lambda entry: entry[1][0], reverse=True)

sorted_prob


[(('to', 'eat'), (758, 0.2789841737210158)),
 (('to', 'go'), (369, 0.13581155686418844)),
 (('to', 'have'), (358, 0.13176297386823702)),
 (('to', 'spend'), (233, 0.08575634891424365)),
 (('to', 'know'), (100, 0.0368052999631947)),
 (('to', 'travel'), (93, 0.03422892896577107)),
 (('to', 'a'), (80, 0.02944423997055576)),
 (('to', 'be'), (65, 0.023923444976076555)),
 (('to', 'the'), (58, 0.021347073978652927)),
 (('to', 'get'), (40, 0.01472211998527788)),
 (('to', 'find'), (35, 0.012881854987118146)),
 (('to', 'pay'), (32, 0.011777695988222304)),
 (('to', 'ten'), (32, 0.011777695988222304)),
 (('to', 'see'), (30, 0.01104158998895841)),
 (('to', 'walk'), (29, 0.010673536989326464)),
 (('to', 'an'), (28, 0.010305483989694516)),
 (('to', 'icksee'), (24, 0.008833271991166729)),
 (('to', 'twenty'), (23, 0.008465218991534781)),
 (('to', 'i__c__s__i'), (22, 0.008097165991902834)),
 (('to', 'try'), (18, 0.006624953993375046)),
 (('to', 'take'), (16, 0.005888847994111152)),
 (('to', 'fifteen'), (

In [24]:
# Second word of each of the four top-ranked bigrams
most_likely_words = [bigram[1] for bigram, _stats in sorted_prob[:4]]
most_likely_words

['eat', 'go', 'have', 'spend']

### Putting the bigram model together

In [25]:
def predict_next_words(input_word, bigram_counts, unigram_counts, vocabulary_size, top_n=4):
    """
    Predicts the most likely next words based on bigram probabilities.

    Candidates are scored with add-one (Laplace) smoothing:
        P(w | input_word) = (C(input_word w) + 1) / (C(input_word) + vocabulary_size)
    The denominator is the same for every candidate, so the resulting order is
    the same as ordering by raw bigram count. Candidates with equal scores keep
    the order in which their bigrams appear in ``bigram_counts``.

    Args:
        input_word (str): The word to find predictions for
        bigram_counts (dict): Dictionary with bigram tuples as keys and counts as values
        unigram_counts (dict): Dictionary with words as keys and counts as values
        vocabulary_size (int): Number of distinct word types (V in the formula above)
        top_n (int): Number of top predictions to return (default: 4).
            Negative values are treated as 0; None returns every candidate.

    Returns:
        list: Up to ``top_n`` next words, most probable first. Empty when no
        bigram in ``bigram_counts`` starts with ``input_word``.

    Example:
        >>> counts = {('to', 'eat'): 5, ('to', 'go'): 3, ('to', 'a'): 1}
        >>> predict_next_words('to', counts, {'to': 9}, vocabulary_size=4, top_n=2)
        ['eat', 'go']
    """
    # Step 1: Collect (next_word, count) for every bigram that starts with the input word
    followers = [
        (bigram[1], count)
        for bigram, count in bigram_counts.items()
        if bigram[0] == input_word
    ]

    # Return empty list if no bigrams found for the input word
    if not followers:
        return []

    # Step 2: Laplace add-one smoothing; the denominator does not depend on the
    # candidate, so it is computed once
    denominator = unigram_counts[input_word] + vocabulary_size
    scored = [(word, (count + 1) / denominator) for word, count in followers]

    # Step 3: Most probable first (stable sort keeps insertion order for ties)
    scored.sort(key=lambda item: item[1], reverse=True)

    # Step 4: A negative top_n would otherwise slice items off the END of the list
    if top_n is not None and top_n < 0:
        top_n = 0
    return [word for word, _probability in scored[:top_n]]


In [26]:
# Example: the default top 4 predicted continuations of 'eat'
predict_next_words('eat', bigram_counts, unigram_counts, vocabulary_size=len(vocabulary))

['on', 'some', 'lunch', 'dinner']

In [27]:
def tokenize_input(text):
    """
    Tokenizes user input by splitting on spaces, commas and other punctuation.

    The rule matches how preprocess_sentence tokenizes the training corpus:
    the text is lowercased and every character that is neither a word
    character nor whitespace is replaced by a space before splitting. So
    "don't" becomes ['don', 't'] (the form the model was trained on) rather
    than 'dont', which never occurs in the training tokens.

    Args:
        text (str): Raw user input

    Returns:
        list: List of cleaned, lowercase tokens (empty if nothing remains)
    """
    # Commas are punctuation too, so they need no separate handling
    normalized = re.sub(r'[^\w\s]', ' ', text.lower())

    # split() with no argument collapses whitespace runs and drops empty strings
    return normalized.split()

In [28]:
def interactive_word_predictor(bigram_counts, unigram_counts, vocabulary_size, top_n=4):
    """
    Run a console loop that suggests next words for whatever the user types.

    Each entered line is tokenized with tokenize_input and its final token is
    passed to predict_next_words. The loop ends when the user enters 'quit',
    'exit' or 'q' (case-insensitive); blank input just re-prompts.

    Args:
        bigram_counts (dict): Dictionary with bigram tuples as keys and counts as values
        unigram_counts (dict): Dictionary with words as keys and counts as values
        vocabulary_size (int): Passed through to predict_next_words
        top_n (int): Number of top predictions to show (default: 4)
    """
    stop_commands = ('quit', 'exit', 'q')

    for banner_line in (
        "=== INTERACTIVE WORD PREDICTOR ===",
        "Enter text (words separated by spaces or commas)",
        "Press Enter to get predictions for the next word",
        "Type 'quit' or 'exit' to stop\n",
    ):
        print(banner_line)

    while True:
        user_input = input("Enter your text: ").strip()

        # Leave the loop on any stop command
        if user_input.lower() in stop_commands:
            print("Goodbye!")
            return

        # Nothing typed: ask again
        if not user_input:
            print("Please enter some text.\n")
            continue

        tokens = tokenize_input(user_input)
        if not tokens:
            print("No valid tokens found. Please try again.\n")
            continue

        # Only the final token serves as context for the bigram lookup
        last_word = tokens[-1]
        print(f"Input tokens: {tokens}")
        print(f"Predicting next word after: '{last_word}'")

        predictions = predict_next_words(last_word, bigram_counts, unigram_counts, vocabulary_size, top_n)
        outcome = (
            f"Most likely next words: {predictions}"
            if predictions
            else f"No predictions found for '{last_word}' (word not in training data)"
        )
        print(outcome)
        print("-" * 50)

In [67]:
# Start the interactive prompt; this cell waits on input() until 'quit', 'exit' or 'q' is entered
interactive_word_predictor(bigram_counts, unigram_counts, vocabulary_size=len(vocabulary))

=== INTERACTIVE WORD PREDICTOR ===
Enter text (words separated by spaces or commas)
Press Enter to get predictions for the next word
Type 'quit' or 'exit' to stop

Input tokens: ['indian']
Predicting next word after: 'indian'
Most likely next words: ['food', 'restaurants', 'restaurant', '</s>']
--------------------------------------------------
Input tokens: ['indian']
Predicting next word after: 'indian'
Most likely next words: ['food', 'restaurants', 'restaurant', '</s>']
--------------------------------------------------
Please enter some text.

Goodbye!


### Bigram Model Sentence Sampling

In [61]:
# Compute add-k smoothed bigram probabilities with k = 0.01:
#   P(w2 | w1) = (C(w1 w2) + k) / (C(w1) + k * V)
# (Laplace / add-one smoothing is the special case k = 1.)
smoothing_mass = 0.01 * len(vocabulary)  # k * V, identical for every context word
bigram_probs = defaultdict(dict)
for w1 in vocabulary:
    denominator = unigram_counts[w1] + smoothing_mass
    for w2 in vocabulary:
        # Use .get() rather than indexing: `bigram_counts` is a defaultdict, so
        # indexing would insert a zero-count entry for every unseen (w1, w2)
        # pair and those entries would then show up as candidates in
        # predict_next_words.
        bigram_probs[w1][w2] = (bigram_counts.get((w1, w2), 0) + 0.01) / denominator

In [62]:
# Smoothed next-word probabilities for the context '</s>'
bigram_probs['</s>']

{'</s>': 1.165157593390294e-06,
 '<s>': 1.165157593390294e-06,
 '_i': 1.165157593390294e-06,
 'a': 1.165157593390294e-06,
 'a__m': 1.165157593390294e-06,
 'able': 1.165157593390294e-06,
 'abou': 1.165157593390294e-06,
 'about': 1.165157593390294e-06,
 'accommodate': 1.165157593390294e-06,
 'account': 1.165157593390294e-06,
 'ace': 1.165157593390294e-06,
 'actual': 1.165157593390294e-06,
 'actually': 1.165157593390294e-06,
 'addison': 1.165157593390294e-06,
 'addition': 1.165157593390294e-06,
 'additional': 1.165157593390294e-06,
 'address': 1.165157593390294e-06,
 'ady': 1.165157593390294e-06,
 'affordable': 1.165157593390294e-06,
 'africa': 1.165157593390294e-06,
 'african': 1.165157593390294e-06,
 'after': 1.165157593390294e-06,
 'afternoon': 1.165157593390294e-06,
 'afterwards': 1.165157593390294e-06,
 'agai': 1.165157593390294e-06,
 'again': 1.165157593390294e-06,
 'age': 1.165157593390294e-06,
 'ah': 1.165157593390294e-06,
 'ahead': 1.165157593390294e-06,
 'albany': 1.165157593390

In [63]:
# Function to sample a sentence
def sample_sentence():
    """
    Draw one random sentence from the smoothed bigram model.

    Starting from '<s>', the next word is repeatedly sampled from
    bigram_probs[previous_word] over the whole vocabulary until '</s>' is drawn.

    Returns:
        list: The sampled tokens, beginning with '<s>' and ending with '</s>'.
    """
    words = ['<s>']
    next_word = None
    while next_word != '</s>':
        # Probabilities are listed in `vocabulary` order, matching the choice list
        distribution = [bigram_probs[words[-1]][candidate] for candidate in vocabulary]
        next_word = np.random.choice(vocabulary, p=distribution)
        words.append(next_word)
    return words

In [65]:
# Draw five sentences from the bigram model and print them
print("Sampled Sentences from Bigram Model (Laplace Smoothed):")
boundary_markers = ['<s>', '</s>']
for i in range(5):
    sentence = sample_sentence()
    # Skip the leading <s> and the closing </s>; smoothing can also emit a
    # boundary marker mid-sentence, so those are filtered out as well
    inner_words = [word for word in sentence[1:-1] if word not in boundary_markers]
    sentence_text = ' '.join(inner_words)
    print(f"Sentence {i+1}: {sentence_text}")

Sampled Sentences from Bigram Model (Laplace Smoothed):
Sentence 1: no more about oriental food
Sentence 2: maybe a chinese food
Sentence 3: show me more expensive restaurants
Sentence 4: howbout a car
Sentence 5: no more than ten dollars


### Trigram Model Sampling

In [34]:
# Preprocess trigram sentence
def preprocess_trigram_sentence(sentence):
    """
    Tokenize one raw transcript line for the trigram model.

    The cleaning rules are exactly those of preprocess_sentence (identifier and
    filler removal, lowercasing, punctuation to spaces), so this delegates to it
    instead of keeping a second copy of the pipeline. The only difference is
    the padding: a trigram needs two tokens of left context, so the sentence
    starts with two '<s>' markers.

    Args:
        sentence (str): One raw line of the transcript.

    Returns:
        list: ['<s>', '<s>', token_1, ..., token_n, '</s>']
    """
    # preprocess_sentence already returns ['<s>', ..., '</s>']; prepend one more '<s>'
    return ['<s>'] + preprocess_sentence(sentence)

In [35]:
# Tokenize every transcript line with the trigram padding (two leading <s>)
processed_trigram_corpus = list(map(preprocess_trigram_sentence, corpus))

In [36]:
# Trigram vocabulary: all distinct tokens (including <s> and </s>), sorted for consistency
vocabulary_trigram = sorted(
    {token for sentence in processed_trigram_corpus for token in sentence}
)
V = len(vocabulary_trigram)  # Vocabulary size

In [42]:
# Count unigrams, bigrams, and trigrams over the double-<s>-padded corpus.
# NOTE(review): this rebinds `unigram_counts` and `bigram_counts`, which the
# bigram section earlier in the notebook also defines. The values here include
# the extra <s> padding, so earlier bigram cells re-run after this one will see
# different counts.
unigram_counts = Counter()
bigram_counts = defaultdict(int)
trigram_counts = defaultdict(int)
total_tokens = 0
for sentence in processed_trigram_corpus:
    unigram_counts.update(sentence)
    total_tokens += len(sentence)
    # Adjacent pairs
    for w1, w2 in zip(sentence, sentence[1:]):
        bigram_counts[(w1, w2)] += 1
    # Adjacent triples
    for w1, w2, w3 in zip(sentence, sentence[1:], sentence[2:]):
        trigram_counts[(w1, w2, w3)] += 1

In [None]:
# trigram_counts

defaultdict(int,
            {('<s>', '<s>', 'okay'): 136,
             ('<s>', 'okay', 'let'): 17,
             ('okay', 'let', 's'): 20,
             ('let', 's', 'see'): 13,
             ('s', 'see', 'i'): 4,
             ('see', 'i', 'want'): 2,
             ('i', 'want', 'to'): 584,
             ('want', 'to', 'go'): 138,
             ('to', 'go', 'to'): 147,
             ('go', 'to', 'a'): 72,
             ('to', 'a', 'thai'): 5,
             ('a', 'thai', 'restaurant'): 21,
             ('thai', 'restaurant', 'with'): 1,
             ('restaurant', 'with', 'less'): 1,
             ('with', 'less', 'than'): 1,
             ('less', 'than', 'ten'): 48,
             ('than', 'ten', 'dollars'): 46,
             ('ten', 'dollars', 'per'): 2,
             ('dollars', 'per', 'person'): 23,
             ('per', 'person', '</s>'): 21,
             ('<s>', '<s>', 'i'): 2791,
             ('<s>', 'i', 'like'): 66,
             ('i', 'like', 'to'): 50,
             ('like', 'to', 'eat'): 46

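Memory-expensive approach — do not run: materializing a probability for every (w1, w2, w3) combination needs V³ entries (roughly 4.5 billion for the 1,653-word vocabulary), so the cell below is left commented out and probabilities are computed on the fly instead.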

In [38]:
# Compute Laplace-smoothed trigram probabilities
# trigram_probs = defaultdict(lambda: defaultdict(dict))
# for w1 in vocabulary_trigram:
#     for w2 in vocabulary_trigram:
#         for w3 in vocabulary_trigram:
#             # Laplace smoothing: P(w3 | w1, w2) = (C(w1 w2 w3) + 1) / (C(w1 w2) + V)
#             trigram_probs[w1][w2][w3] = (trigram_counts[(w1, w2, w3)] + 1) / (bigram_counts[(w1, w2)] + V)

In [39]:
# Interpolation parameters
# Weights for mixing the trigram, bigram and unigram estimates; the three values add up to 1
lambda_3 = 0.6  # Weight for trigram
lambda_2 = 0.3  # Weight for bigram
lambda_1 = 0.1  # Weight for unigram
k = 0.01  # Add-k smoothing parameter (smaller than Laplace's k=1)


In [50]:
# Compute probabilities on-the-fly during sampling
def get_trigram_probability(w3, w1, w2, interpolate=False):
    """
    Add-k smoothed probability of ``w3`` following the context ``(w1, w2)``.

    Reads the module-level tables ``trigram_counts``, ``bigram_counts`` and
    ``unigram_counts``, plus ``total_tokens``, ``k`` and ``V``. Lookups use
    ``.get(..., 0)`` so that querying an unseen n-gram does not insert a new
    key into the defaultdict count tables.

    Args:
        w3 (str): Candidate next word. Note the argument order: the predicted
            word comes first, followed by its two context words.
        w1 (str): First context word.
        w2 (str): Second context word (immediately precedes ``w3``).
        interpolate (bool): If False (default), return only the trigram
            estimate. If True, return the linear interpolation
            lambda_3 * trigram + lambda_2 * bigram + lambda_1 * unigram.

    Returns:
        float: The (optionally interpolated) smoothed probability.
    """
    # Trigram: P(w3 | w1, w2) = (C(w1 w2 w3) + k) / (C(w1 w2) + k*V)
    trigram_prob = (trigram_counts.get((w1, w2, w3), 0) + k) / (bigram_counts.get((w1, w2), 0) + k * V)
    if not interpolate:
        # The lower-order estimates are only needed for interpolation
        return trigram_prob

    # Bigram: P(w3 | w2) = (C(w2 w3) + k) / (C(w2) + k*V)
    bigram_prob = (bigram_counts.get((w2, w3), 0) + k) / (unigram_counts.get(w2, 0) + k * V)
    # Unigram: P(w3) = (C(w3) + k) / (N + k*V)
    unigram_prob = (unigram_counts.get(w3, 0) + k) / (total_tokens + k * V)
    # Interpolated probability
    return lambda_3 * trigram_prob + lambda_2 * bigram_prob + lambda_1 * unigram_prob

In [None]:
# Function to sample a trigram sentence
def sample_trigram_sentence(max_length=None):
    """
    Draw one random sentence from the trigram model.

    Starting from the context ('<s>', '<s>'), the next word is sampled from
    get_trigram_probability over the whole trigram vocabulary until '</s>' is
    drawn.

    Args:
        max_length (int | None): Optional cap on the number of sampled words.
            When the cap is reached, '</s>' is appended and sampling stops.
            None (default) means no cap. Because an unseen context gives a
            near-uniform distribution, uncapped sentences can become very long.

    Returns:
        list: The sampled tokens, beginning with '<s>', '<s>' and ending with '</s>'.
    """
    sentence = ['<s>', '<s>']
    while True:
        # The two start markers do not count towards the cap
        if max_length is not None and len(sentence) - 2 >= max_length:
            sentence.append('</s>')
            break
        w1, w2 = sentence[-2], sentence[-1]
        # Compute probabilities for all possible next words
        probs = np.array([get_trigram_probability(w3, w1, w2) for w3 in vocabulary_trigram])
        # Normalize so the values sum to 1, as np.random.choice requires
        probs = probs / np.sum(probs)
        # Sample next word
        next_word = np.random.choice(vocabulary_trigram, p=probs)
        sentence.append(next_word)
        if next_word == '</s>':
            break
    return sentence


In [51]:
# Draw five sentences from the trigram model and print them
print("Sampled Sentences from Trigram Model (Interpolated, Add-k Smoothing):")
boundary_markers = ['<s>', '</s>']
for i in range(5):
    sentence = sample_trigram_sentence()
    # Skip the two leading <s> and the closing </s>; also filter any boundary
    # marker that was sampled mid-sentence
    kept_words = [word for word in sentence[2:-1] if word not in boundary_markers]
    sentence_text = ' '.join(kept_words)
    print(f"Trigram Sentence {i+1}: {sentence_text}")

Sampled Sentences from Trigram Model (Interpolated, Add-k Smoothing):


Trigram Sentence 1: i don t care what case c chance ter disappointed easier deli spicy wouldn four eating madras thanks pickled answer brazil listen delis renaissance wouldn states horrible particularly gross coffee expand th i__c__s limit four viva horrible like lower suggest ll waitresses maximally spend print hungarian bad rose lunch thousand sine heike aurants walk jones sleazy wh eighteen japanese china appropriate dancing zachary meters mar san__francisco californian alright de persons surrounding sizzler restauran tell nt extend expensive chow drink european mall tes out beer croissants yet delhi goodbye budget kay beer screwed ple hills hello tour want alre found about c__s__i ye i__c__ same cerrito thirteen brazil house thought point interested cancel weekdays _i paz thing noticing bastille dressy prosperous unimportant locations wedding sugar ch coupla anybody breakfast fanny taiwan hearst displaying lox carl feature father thirteen house easter afterwards jamaican days meeti

In [56]:
# Function to sample a bigram sentence (for comparison)
def sample_bigram_sentence():
    """
    Draw one random sentence from the add-k smoothed bigram model (for comparison).

    Uses the module-level ``bigram_counts``, ``unigram_counts``, ``k``, ``V`` and
    ``vocabulary_trigram``. Starting from '<s>', the next word is sampled with
        P(w2 | w1) = (C(w1 w2) + k) / (C(w1) + k*V)
    until '</s>' is drawn.

    Returns:
        list: The sampled tokens, beginning with '<s>' and ending with '</s>'.
    """
    sentence = ['<s>']
    while True:
        w1 = sentence[-1]
        # The denominator depends only on the context word, so compute it once per step
        denominator = unigram_counts[w1] + k * V
        # .get() avoids inserting a zero entry into the defaultdict for every unseen pair
        probs = np.array([(bigram_counts.get((w1, w2), 0) + k) / denominator for w2 in vocabulary_trigram])
        # Normalize so the values sum to 1, as np.random.choice requires
        probs = probs / np.sum(probs)
        next_word = np.random.choice(vocabulary_trigram, p=probs)
        sentence.append(next_word)
        if next_word == '</s>':
            break
    return sentence

In [57]:
# Draw five sentences from the add-k bigram sampler and print them
print("\nSampled Sentences from Bigram Model (Add-k Smoothing):")
boundary_markers = ['<s>', '</s>']
for i in range(5):
    sentence = sample_bigram_sentence()
    # Skip the leading <s> and the closing </s>; also filter any boundary
    # marker that was sampled mid-sentence
    kept_words = [word for word in sentence[1:-1] if word not in boundary_markers]
    sentence_text = ' '.join(kept_words)
    print(f"Bigram Sentence {i+1}: {sentence_text}")


Sampled Sentences from Bigram Model (Add-k Smoothing):
Bigram Sentence 1: start over
Bigram Sentence 2: just go to eat on a sandwich
Bigram Sentence 3: could be wednesday
Bigram Sentence 4: start over noise cheval
Bigram Sentence 5: show me more than seven shouldn green peru louder hot serving exican university away
