Bigram & Trigram Language Model

Import and Download Corpus

In [1]:
import nltk
nltk.download('brown')
nltk.download('punkt')

from nltk.corpus import brown
from collections import Counter


[nltk_data] Downloading package brown to
[nltk_data]     C:\Users\PC\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\brown.zip.
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\PC\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Load Words + Create N-grams

In [2]:
# Load words from Brown corpus
words = brown.words()

# Unigram counts
unigram_counts = Counter(words)

# Bigram list and counts
bigrams = list(nltk.bigrams(words))
bigram_counts = Counter(bigrams)

# Trigram list and counts
trigrams = list(nltk.trigrams(words))
trigram_counts = Counter(trigrams)


Bigram Probability Function

In [3]:
def bigram_prob(w1, w2):
    """
    P(w2 | w1) = count(w1, w2) / count(w1)
    """
    if unigram_counts[w1] == 0:
        return 0
    return bigram_counts[(w1, w2)] / unigram_counts[w1]


Trigram Probability

In [4]:
def trigram_prob(w1, w2, w3):
    """
    P(w3 | w1, w2) = count(w1, w2, w3) / count(w1, w2)
    """
    if bigram_counts[(w1, w2)] == 0:
        return 0
    return trigram_counts[(w1, w2, w3)] / bigram_counts[(w1, w2)]


Test the Model

In [5]:
print("Bigram P('in' | 'the'):", bigram_prob("in", "the"))
print("Bigram P('the' | 'in'):", bigram_prob("in", "the"))

print("Trigram P('of' | 'one', 'the'):", trigram_prob("one", "of", "the"))


Bigram P('in' | 'the'): 0.28388615888615887
Bigram P('the' | 'in'): 0.28388615888615887
Trigram P('of' | 'one', 'the'): 0.5379426644182125


Bigram Text Generator

In [6]:
def generate_bigram_text(start_word, length=15):
    word = start_word
    output = [word]

    for _ in range(length):
        # find possible next words
        candidates = [(w2, c) for ((w1, w2), c) in bigram_counts.items() if w1 == word]

        if not candidates:      # no continuation found
            break

        # choose the most common next word
        next_word = max(candidates, key=lambda x: x[1])[0]

        output.append(next_word)
        word = next_word

    return " ".join(output)


Test Bigram Generator

In [7]:
print(generate_bigram_text("the"))


the same time , and the same time , and the same time , and the


Trigram Text Generator

In [8]:
def generate_trigram_text(w1, w2, length=15):
    output = [w1, w2]

    for _ in range(length):
        # find candidates for (w1, w2) -> w3
        candidates = [(w3, c) for ((x1, x2, w3), c) in trigram_counts.items()
                      if x1 == w1 and x2 == w2]

        if not candidates:
            break
        
        # choose most common next word
        next_word = max(candidates, key=lambda x: x[1])[0]
        
        output.append(next_word)
        w1, w2 = w2, next_word

    return " ".join(output)


Test Trigram Generator

In [9]:
print(generate_trigram_text("the", "united"))


the united people of the United States , and the other hand , the first time in


Bigram Text Generator

Install & Import

In [10]:
import nltk
from collections import Counter

nltk.download('punkt')


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\PC\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

Your Custom Training Text

In [11]:
text = """
I want to learn NLP.
I want to learn Python.
I want to eat momo.
I want to go home.
I want to watch movies.
I want to study data science.
"""


Tokenize

In [12]:
tokens = nltk.word_tokenize(text)
tokens


['I',
 'want',
 'to',
 'learn',
 'NLP',
 '.',
 'I',
 'want',
 'to',
 'learn',
 'Python',
 '.',
 'I',
 'want',
 'to',
 'eat',
 'momo',
 '.',
 'I',
 'want',
 'to',
 'go',
 'home',
 '.',
 'I',
 'want',
 'to',
 'watch',
 'movies',
 '.',
 'I',
 'want',
 'to',
 'study',
 'data',
 'science',
 '.']

Create Bigram Counts

In [14]:
bigrams = list(nltk.bigrams(tokens))
bigram_counts = Counter(bigrams)
unigram_counts = Counter(tokens)


Predict Next Word Function

In [15]:
def predict_next_word(word):
    candidates = [(w2, c) for (w1, w2), c in bigram_counts.items() if w1 == word]

    if not candidates:
        return None

    # choose word with max count
    next_word = max(candidates, key=lambda x: x[1])[0]
    return next_word


Build Sentence from Input

In [16]:
def bigram_sentence(start_words, length=3):
    words = start_words.split()
    last_word = words[-1]

    for _ in range(length):
        next_word = predict_next_word(last_word)
        if not next_word:
            break
        words.append(next_word)
        last_word = next_word

    return " ".join(words)


Test Your Bigram Generator

In [17]:
print(bigram_sentence("I want"))


I want to learn NLP
