Create a corpus

In [10]:
# Toy training corpus: six short Chinese sentences, one string per sentence.
corpus = [
    "我喜欢吃苹果",
    "我喜欢吃香蕉",
    "她喜欢吃葡萄",
    "他不喜欢吃香蕉",
    "他喜欢吃苹果",
    "她喜欢吃草莓",
]

Define tokenizer

In [11]:
def tokenizer(text):
    """Split ``text`` into a list of its individual characters.

    This is character-level tokenization: every element yielded by iterating
    over ``text`` becomes one token, in the original order.

    Args:
        text: The string to tokenize.

    Returns:
        A new list with one entry per character of ``text``.
    """
    return list(text)

# Print the token list produced by `tokenizer` for every sentence in the corpus.
for c in corpus:
    print(tokenizer(c))

['我', '喜', '欢', '吃', '苹', '果']
['我', '喜', '欢', '吃', '香', '蕉']
['她', '喜', '欢', '吃', '葡', '萄']
['他', '不', '喜', '欢', '吃', '香', '蕉']
['他', '喜', '欢', '吃', '苹', '果']
['她', '喜', '欢', '吃', '草', '莓']


In [12]:
# Compute token frequencies (n-gram counts)
from collections import defaultdict, Counter
def count_ngrams(corpus, n):
    """Count how often each token follows each (n-1)-token prefix.

    Args:
        corpus: Iterable of texts; each text is split with ``tokenizer``.
        n: Order of the n-gram (1 = unigram, 2 = bigram, ...). Must be >= 1.

    Returns:
        A ``defaultdict(Counter)`` mapping every prefix (a tuple of ``n - 1``
        tokens; the empty tuple when ``n == 1``) to a ``Counter`` of the tokens
        observed immediately after that prefix. Texts with fewer than ``n``
        tokens contribute nothing.

    Raises:
        ValueError: If ``n`` is smaller than 1.
    """
    if n < 1:
        # Without this guard n == 0 slices empty n-grams and fails with an
        # opaque IndexError, and negative n counts meaningless slices.
        raise ValueError("n must be >= 1, got {!r}".format(n))

    counts = defaultdict(Counter)
    for text in corpus:
        tokens = tokenizer(text)
        # Slide a window of n tokens over the text; the range is empty when
        # the text is shorter than n.
        for i in range(len(tokens) - n + 1):
            *prefix, token = tokens[i:i + n]
            counts[tuple(prefix)][token] += 1
    return counts

# Build the bigram (n=2) counts for the corpus, then print each one-token
# prefix together with the Counter of tokens observed right after it.
bigram_counts = count_ngrams(corpus, 2)
for prefix, counter in bigram_counts.items():
    print(prefix, counter)

('我',) Counter({'喜': 2})
('喜',) Counter({'欢': 6})
('欢',) Counter({'吃': 6})
('吃',) Counter({'苹': 2, '香': 2, '葡': 1, '草': 1})
('苹',) Counter({'果': 2})
('香',) Counter({'蕉': 2})
('她',) Counter({'喜': 2})
('葡',) Counter({'萄': 1})
('他',) Counter({'不': 1, '喜': 1})
('不',) Counter({'喜': 1})
('草',) Counter({'莓': 1})


In [13]:
# Compute probabilities
def ngrams_probability(ngrams_counts):
    """Normalize n-gram counts into per-prefix relative frequencies.

    For every prefix, each follower count is divided by the sum of all
    follower counts recorded for that prefix.

    Args:
        ngrams_counts: Mapping of prefix -> mapping of token -> count.

    Returns:
        A ``defaultdict(dict)`` mapping prefix -> ``{token: probability}``.
        A prefix without any recorded follower gets no entry.
    """
    table = defaultdict(dict)
    for context, tally in ngrams_counts.items():
        denominator = sum(tally.values())
        distribution = {tok: seen / denominator for tok, seen in tally.items()}
        if distribution:
            # Only store non-empty distributions so that prefixes without
            # followers stay absent from the result.
            table[context] = distribution
    return table
# Convert the bigram counts into per-prefix probability tables and print
# each prefix with its {next token: probability} mapping.
bigram_probability = ngrams_probability(bigram_counts)
for prefix, prob in bigram_probability.items():
    print(prefix, prob)

('我',) {'喜': 1.0}
('喜',) {'欢': 1.0}
('欢',) {'吃': 1.0}
('吃',) {'苹': 0.3333333333333333, '香': 0.3333333333333333, '葡': 0.16666666666666666, '草': 0.16666666666666666}
('苹',) {'果': 1.0}
('香',) {'蕉': 1.0}
('她',) {'喜': 1.0}
('葡',) {'萄': 1.0}
('他',) {'不': 0.5, '喜': 0.5}
('不',) {'喜': 1.0}
('草',) {'莓': 1.0}


In [14]:
def generate_text_token(prefix, ngrams_probability):
    """Pick the most probable next token for ``prefix`` (greedy choice).

    Args:
        prefix: Tuple of context tokens used as the lookup key.
        ngrams_probability: Mapping of prefix -> ``{token: probability}``.

    Returns:
        The token with the highest probability for ``prefix``; on ties the
        token that comes first in the mapping's iteration order wins.
        Returns ``""`` when the prefix is unknown or has no candidate tokens.
    """
    # Membership test first: indexing a defaultdict with an unseen prefix
    # would silently insert an empty entry into the table.
    if prefix not in ngrams_probability:
        return ""
    next_token_probs = ngrams_probability[prefix]
    if not next_token_probs:
        # An empty distribution (e.g. left behind by an earlier defaultdict
        # lookup) offers no continuation; max() would raise ValueError on it.
        return ""
    return max(next_token_probs, key=next_token_probs.get)

def generate_text(prefix, ngrams_probability, n, max_length=10):
    """Greedily extend ``prefix`` one token at a time with an n-gram model.

    Args:
        prefix: Seed sequence of tokens (for example a string of characters).
        ngrams_probability: Mapping of context tuple -> ``{token: probability}``.
        n: Order of the model; the last ``n - 1`` tokens form the context
            used to look up the next token.
        max_length: Upper bound on the total number of tokens in the result,
            seed included. A seed that is already at least this long is
            returned unchanged.

    Returns:
        The seed followed by the generated tokens, joined into one string.
        Generation stops early as soon as ``generate_text_token`` returns an
        empty token for the current context.
    """
    text = list(prefix)
    context_size = n - 1
    for _ in range(max_length - len(text)):
        # text[-0:] is the whole list, not an empty one, so the unigram case
        # (context_size == 0) needs an explicit empty context.
        context = tuple(text[-context_size:]) if context_size > 0 else ()
        token = generate_text_token(context, ngrams_probability)
        if not token:
            break
        text.append(token)
    return "".join(text)

# Continue the seed "我不喜欢" with the bigram model (n=2), using the default
# max_length, and print the resulting string.
generated_text = generate_text("我不喜欢", bigram_probability, 2)
print(generated_text)

我不喜欢吃苹果
