In [1]:
import nltk
from nltk.util import ngrams
from nltk.tokenize import word_tokenize
from nltk.lm.preprocessing import padded_everygram_pipeline
from nltk.lm import MLE, Laplace, KneserNeyInterpolated

In [2]:
# Fetch the Punkt tokenizer data packages into the local nltk_data directory.
# NOTE(review): presumably required by word_tokenize (imported above); which of
# the two packages is actually loaded likely depends on the installed NLTK
# version — confirm before dropping either call.
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [3]:
# Training corpus: four short Vietnamese sentences, one string per sentence.
corpus = [
    "Tôi thích học máy và xử lý ngôn ngữ tự nhiên",
    "Ngôn ngữ tự nhiên là một lĩnh vực thú vị",
    "Chúng ta có thể sử dụng mô hình n-gram để xử lý văn bản",
    "Xử lý ngôn ngữ tự nhiên rất quan trọng trong AI"
]

# Preprocess: lowercase each sentence, then tokenize it with NLTK's
# word_tokenize. The result has one tokenized entry per sentence, in corpus order.
tokenized_text = [word_tokenize(sent.lower()) for sent in corpus]

In [4]:
n = 3  # n-gram order (3 = trigram); passed to the pipeline and to every NLTK model below

In [5]:
# 1. Maximum Likelihood Estimation
# Build the padded everygram training data and the vocabulary stream for
# order n from the tokenized corpus, then fit an order-n MLE model on them.
train_data, padded_vocab = padded_everygram_pipeline(n, tokenized_text)
model_mle = MLE(n)
model_mle.fit(train_data, padded_vocab)

In [6]:
# 2. Smoothing with Laplace
# The pipeline is rebuilt here instead of reusing the train_data/padded_vocab
# from the previous cell — presumably because they are one-shot iterators that
# fit() consumes (NOTE(review): confirm against the NLTK documentation).
train_data, padded_vocab = padded_everygram_pipeline(n, tokenized_text)
model_laplace = Laplace(n)
model_laplace.fit(train_data, padded_vocab)

In [7]:
# 3. Interpolated Kneser-Ney smoothing (discounting + interpolation with
#    lower-order models), using NLTK's KneserNeyInterpolated with its defaults.
# The pipeline is rebuilt here instead of reusing the earlier
# train_data/padded_vocab — presumably because they are one-shot iterators
# that fit() consumes (NOTE(review): confirm against the NLTK documentation).
train_data, padded_vocab = padded_everygram_pipeline(n, tokenized_text)
model_kn = KneserNeyInterpolated(n)
model_kn.fit(train_data, padded_vocab)

In [9]:
# Predict the next token for "xử lý"
context = ['xử', 'lý']

# generate() draws a random sample from each model's distribution, so a fixed
# seed is passed to make the printed tokens identical on every
# Restart & Run All. The same seed is used for all three models so that any
# difference in output comes from the model, not from the random draw.
SEED = 42

print("MLE:", model_mle.generate(1, text_seed=context, random_seed=SEED))
print("Laplace:", model_laplace.generate(1, text_seed=context, random_seed=SEED))
print("Kneser-Ney:", model_kn.generate(1, text_seed=context, random_seed=SEED))

# Compute the probability P(w|"xử lý") under the Kneser-Ney model.
# `context` is reused so the scored history cannot drift from the one used
# for generation above.
print("\nP('ngôn' | 'xử lý'):", model_kn.score('ngôn', context))

MLE: văn
Laplace: ngôn
Kneser-Ney: văn

P('ngôn' | 'xử lý'): 0.6636434108527132


# Without NLTK

In [10]:
def tokenize_sentences(corpus):
    """Lowercase every sentence and split it on whitespace.

    Args:
        corpus: Iterable of sentence strings.

    Returns:
        A list with one list of lowercase tokens per input sentence, in the
        same order as the input.
    """
    token_lists = []
    for raw_sentence in corpus:
        lowered = raw_sentence.lower()
        token_lists.append(lowered.split())
    return token_lists

def generate_ngrams(tokens, n):
    """Return the padded n-grams of a single tokenized sentence.

    The sentence is prefixed with ``n - 1`` ``'<s>'`` start markers and
    suffixed with one ``'</s>'`` end marker, so every real token (and the end
    marker) appears as the last element of exactly one n-gram.

    Args:
        tokens: Iterable of tokens for one sentence (list, tuple, generator…).
            A plain string is rejected, because iterating it would silently
            produce character n-grams.
        n: Order of the n-grams; must be at least 1.

    Returns:
        A list of ``n``-tuples, in sentence order.

    Raises:
        ValueError: If ``n`` is smaller than 1 (the slicing below would
            otherwise yield a list of empty tuples instead of failing).
        TypeError: If ``tokens`` is a single string.
    """
    if n < 1:
        raise ValueError(f"n must be at least 1, got {n!r}")
    if isinstance(tokens, str):
        raise TypeError("tokens must be an iterable of tokens, not a single string")
    # list(tokens) lets tuples and one-shot iterators be concatenated with the
    # padding lists; for a list input it is just a shallow copy.
    padded = ['<s>'] * (n - 1) + list(tokens) + ['</s>']
    return [tuple(padded[i:i + n]) for i in range(len(padded) - n + 1)]

In [11]:
from collections import defaultdict, Counter

def count_ngrams(tokenized_sentences, n):
    """Count how often each word follows each (n-1)-token context.

    Args:
        tokenized_sentences: Iterable of token lists, one per sentence.
        n: Order of the n-grams to count.

    Returns:
        A ``defaultdict`` mapping each context tuple (the first ``n - 1``
        items of an n-gram) to a ``Counter`` of the words that follow it.
        For ``n == 1`` the only context is the empty tuple.
    """
    counts_by_context = defaultdict(Counter)
    for tokens in tokenized_sentences:
        for gram in generate_ngrams(tokens, n):
            # Everything but the last item is the history; the last item is
            # the word being counted.
            counts_by_context[gram[:-1]][gram[-1]] += 1
    return counts_by_context

In [12]:
# Tokenize the corpus with the plain-Python tokenizer, then build the
# context -> next-word count tables for orders 1, 2 and 3.
tokenized = tokenize_sentences(corpus)

unigram, bigram, trigram = (
    count_ngrams(tokenized, order) for order in (1, 2, 3)
)