# Probabilistic Language Models

N-gram for Next-Word Prediction: this is a very simple implementation that uses n-gram frequencies to predict the next word.

* Step 1: Generate n-grams using CountVectorizer
* Step 2: Create a dictionary of n-gram frequencies
* Step 3: Predict the next word based on the highest frequency n-gram

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from collections import defaultdict

In [None]:
# Sample corpus
#
# Every sentence is wrapped in explicit boundary tokens:
#   sos: start of sentence
#   eos: end of sentence
corpus = [
    f"sos {sentence} eos"
    for sentence in (
        "the quick brown fox jumps over the lazy dog",
        "the quick brown fox is very quick",
        "the quick brown fox jumps",
        "the lazy dog jumps over the quick fox",
    )
]

In [None]:
# Configurable n-gram size
n = 3  # Adjust n here for different n-gram sizes (e.g., 2 for bigrams, 3 for trigrams)

# Step 1: Generate n-grams using CountVectorizer.
# ngram_range=(n, n) restricts the vocabulary to n-grams of exactly size n.
vectorizer = CountVectorizer(ngram_range=(n, n))
X = vectorizer.fit_transform(corpus)
n_grams = vectorizer.get_feature_names_out()

# Show the extracted n-grams followed by the per-sentence count matrix.
print("*" * 10, n_grams, X.toarray(), sep="\n")

# Step 2: Create a dictionary of n-gram frequencies.
# Summing the count matrix over axis 0 (the sentences) gives one corpus-wide
# total per n-gram, in the same order as n_grams.
n_gram_freq = defaultdict(int, zip(n_grams, X.toarray().sum(axis=0)))

print("*" * 10, n_gram_freq, sep="\n")
    
# Step 3: Predict the next word based on the highest frequency n-gram
def predict_next_word(prefix, n_gram_freq, n):
    """Predict the word most likely to follow ``prefix`` using n-gram counts.

    The last ``n - 1`` words of ``prefix`` form the context. Every n-gram in
    ``n_gram_freq`` whose first ``n - 1`` words equal that context is a
    candidate, and the final word of the most frequent candidate is returned.
    Ties are broken in favour of the candidate that appears first in
    ``n_gram_freq``.

    Args:
        prefix: Text to continue. It is lower-cased and split on whitespace,
            so letter case and leading/trailing/repeated spaces are ignored.
            If it holds more than ``n - 1`` words, only the last ``n - 1``
            are used.
        n_gram_freq: Mapping from space-separated n-gram strings to their
            frequency counts.
        n: Size of the n-grams stored in ``n_gram_freq``; must be >= 1.

    Returns:
        The predicted next word, or ``None`` when no n-gram starts with the
        context.

    Raises:
        ValueError: If ``n`` is less than 1, or ``prefix`` contains fewer
            than ``n - 1`` words.
    """
    if n < 1:
        raise ValueError(f"n must be at least 1, got {n}")

    # Convert prefix to lowercase; split() already discards extra whitespace.
    prefix_words = prefix.lower().split()

    # Ensure the context has exactly n-1 words.
    context_size = n - 1
    if len(prefix_words) < context_size:
        raise ValueError(
            f"prefix must contain at least {context_size} word(s) for n={n}, "
            f"got {len(prefix_words)}"
        )
    # Slice from an explicit start index: prefix_words[-0:] would return the
    # whole list instead of an empty context when context_size is 0.
    context = prefix_words[len(prefix_words) - context_size:]

    # Find all n-grams that start with the given context. Comparing word lists
    # (not raw strings) prevents "the quick" from matching "the quickly ...".
    # Iterating .items() also avoids inserting keys into a defaultdict.
    candidates = {}
    for ngram, count in n_gram_freq.items():
        words = ngram.split()
        if len(words) == n and words[:-1] == context:
            candidates[ngram] = count

    # Print candidates for debugging.
    print(f"Candidates: {candidates}")

    # If no candidates are found, return None.
    if not candidates:
        return None

    # Find the most frequent n-gram and extract the next word.
    best_ngram = max(candidates, key=candidates.get)
    return best_ngram.split()[-1]

# Example prediction
# For trigrams (n=3), the prefix should be two words. The padding spaces are
# harmless because predict_next_word strips the prefix before splitting it.
prefix = " the quick "
next_word = predict_next_word(prefix=prefix, n_gram_freq=n_gram_freq, n=n)
print(
    f"Predicted next word after '{prefix}': {next_word}"
)

In [None]:
# Example prediction for a longer sequence: repeatedly predict the next word
# and slide the prefix window forward until "eos" is produced.
c = 0  # round counter; the cap of 50 stops a cyclic n-gram chain from looping forever

prefix = "the quick"
while prefix.split()[-1] != "eos" and c < 50:
    c += 1
    print(f"Round '{c}'")
    next_word = predict_next_word(prefix, n_gram_freq, n)
    print(f"Predicted next word after '{prefix}': {next_word}")
    if next_word is None:
        # No n-gram continues this prefix, so generation cannot proceed.
        break
    # Slide the window: drop the oldest word and append the prediction, which
    # keeps the prefix length constant.
    prefix = " ".join(prefix.split()[1:] + [next_word]) + " "
    print(prefix)