In [39]:
from collections import defaultdict, Counter

corpus = [
    "我喜欢吃苹果",
    "我喜欢吃香蕉",
    "她喜欢吃葡萄",
    "他不喜欢吃⾹蕉",
    "他喜欢吃苹果",
    "她喜欢吃草莓"
]
n = 3 # n-gram

# 分词函数
def tokenize(text):
    return [ch for ch in text]

# 计算n-grams计数
def count_ngrams(corpus, n):
    ngrams_count = defaultdict(Counter)
    for text in corpus:
        tokens = tokenize(text)
        for i in range(len(tokens)-n+1):
            ngram = tuple(tokens[i:i+n])
            prefix = ngram[:-1]
            # print(prefix)
            token = ngram[-1]
            ngrams_count[prefix][token] += 1
    return ngrams_count


ngram_counts = count_ngrams(corpus, n)
print(f"{n}-gram词频：")
for prefix, counts in ngram_counts.items():
    print("{}: {}".format(prefix, dict(counts)))


3-gram词频：
('我', '喜'): {'欢': 2}
('喜', '欢'): {'吃': 6}
('欢', '吃'): {'苹': 2, '香': 1, '葡': 1, '⾹': 1, '草': 1}
('吃', '苹'): {'果': 2}
('吃', '香'): {'蕉': 1}
('她', '喜'): {'欢': 2}
('吃', '葡'): {'萄': 1}
('他', '不'): {'喜': 1}
('不', '喜'): {'欢': 1}
('吃', '⾹'): {'蕉': 1}
('他', '喜'): {'欢': 1}
('吃', '草'): {'莓': 1}


In [40]:
def ngram_probabilities(ngram_counts):
    ngram_probs = defaultdict(Counter)
    for prefix, tokens_count in ngram_counts.items():
        total_count = sum(tokens_count.values())
        for token, count in tokens_count.items():
            ngram_probs[prefix][token] = count / total_count
    return ngram_probs

ngram_probs = ngram_probabilities(ngram_counts)
print(f"{n}-gram 出现概率: ")
for prefix, probs in ngram_probs.items():
    print(f"{prefix}: {dict(probs)}")

3-gram 出现概率: 
('我', '喜'): {'欢': 1.0}
('喜', '欢'): {'吃': 1.0}
('欢', '吃'): {'苹': 0.3333333333333333, '香': 0.16666666666666666, '葡': 0.16666666666666666, '⾹': 0.16666666666666666, '草': 0.16666666666666666}
('吃', '苹'): {'果': 1.0}
('吃', '香'): {'蕉': 1.0}
('她', '喜'): {'欢': 1.0}
('吃', '葡'): {'萄': 1.0}
('他', '不'): {'喜': 1.0}
('不', '喜'): {'欢': 1.0}
('吃', '⾹'): {'蕉': 1.0}
('他', '喜'): {'欢': 1.0}
('吃', '草'): {'莓': 1.0}


In [41]:
def gen_next_token(prefix, ngram_probs):
    if not prefix in ngram_probs:
        print(f"{prefix} not found in ngram_probs")
        return None
    next_token_probs = ngram_probs[prefix]
    next_token = max(next_token_probs, key=next_token_probs.get) # 查找字典最大值对应的键
    return next_token

def generate_text(prefix, ngram_probs, n, length=6):
    tokens = list(prefix)
    for _ in range(length-len(prefix)):
        # 获取当前前缀的下一个词
        cur_tokens = tuple(tokens[-(n-1):])
        print("当前token: {}".format("".join(cur_tokens)))
        next_token = gen_next_token(cur_tokens, ngram_probs)
        if not next_token:
            break
        print("下一个token:{}".format(next_token))
        tokens.append(next_token)
    return "".join(tokens)

In [42]:
text = "我不喜欢"
print("输入:", text)
text = generate_text(text, ngram_probs, n, length=10)
print("结果:", text)


输入: 我不喜欢
当前token: 喜欢
下一个token:吃
当前token: 欢吃
下一个token:苹
当前token: 吃苹
下一个token:果
当前token: 苹果
('苹', '果') not found in ngram_probs
结果: 我不喜欢吃苹果
