# Statistical Language Models (Markov LM)



In [3]:
import copy
from typing import Optional

import pymongo

from utils.markov_lm import MarkovLM

In [4]:
# Open a client with pymongo's default connection settings, then select
# the 'cousine' database and its 'foodcom' recipe collection.
client = pymongo.MongoClient()
db = client['cousine']
recipes = db['foodcom']

In [5]:
def create_corpus(query: Optional[dict] = None, numdocs: int = 3000):
    """Collect recipe step sentences from the ``recipes`` collection.

    Args:
        query: Filter passed to ``recipes.find``. ``None`` (the default) is
            treated as ``{}``, i.e. no filtering.
        numdocs: Maximum number of recipe documents to read; forwarded to
            the cursor's ``limit``.
            NOTE(review): pymongo documents ``limit(0)`` as "no limit" --
            confirm before passing 0.

    Returns:
        A flat list containing every entry of each matched recipe's
        ``steps`` field, in cursor order. Recipes with a missing or empty
        ``steps`` field contribute nothing.
    """
    # Build the filter per call instead of sharing one mutable default dict.
    mongo_filter = {} if query is None else query
    corpus = []
    for recipe in recipes.find(mongo_filter).limit(numdocs):
        # `.get` keeps a single malformed document from aborting the build.
        corpus.extend(recipe.get('steps') or [])
    return corpus

# Build the general corpus: the empty filter matches every recipe, and at
# most `numdocs` recipe documents are read.
numdocs = 3000
corpus = create_corpus(query={}, numdocs=numdocs)
# The size counts step sentences, not recipes (each recipe adds all its steps).
print(f"Corpus size: {len(corpus)}")
# Preview the first four sentences as a sanity check.
for text in corpus[:4]:
    print(text)

Corpus size: 20805
To prepare ravioli, place mushrooms in food processor; pulse 10 times or until finely chopped.
Heat oil and butter in a large nonstick skillet over medium-high heat. Add shallots and garlic, and sauté for 2 minutes.
Add mushrooms and 1/8 teaspoon salt; cook 5 minutes or until moisture evaporates, stirring occasionally.
Working with 1 wonton wrapper at a time (cover remaining wrappers with a damp towel to keep them from drying), spoon about 2 teaspoons mushroom mixture into center of each wrapper.


In [6]:
# Tokenizer identifier handed to every MarkovLM below via `tokenizer_model`.
# NOTE(review): presumably a Hugging Face model name -- confirm in utils.markov_lm.
tokenizer = "bert-base-uncased"

## 1. Text Generation using a Markov Language Model

In [7]:
# `tokenizer` is already defined in the cell just above; it is not
# re-assigned here so there is a single place to change it.
brlm = MarkovLM(k=2, tokenizer_model=tokenizer)  # Bi-grams
frlm = MarkovLM(k=4, tokenizer_model=tokenizer)  # Four-grams

In [8]:
# Fit both models on the same general corpus, bigram model first.
for model in (brlm, frlm):
    model.train(corpus=corpus)

100%|██████████| 20805/20805 [00:07<00:00, 2721.07it/s]
100%|██████████| 20805/20805 [00:09<00:00, 2170.45it/s]


In [9]:
# Draw one sequence from each model. Removing " ##" glues sub-word pieces
# back onto the preceding token (presumably WordPiece continuation markers
# from the BERT tokenizer -- confirm in utils.markov_lm).
for label, model in (("2gram: ", brlm), ("4gram: ", frlm)):
    tokens = model.generate()
    print(label, " ".join(tokens).replace(" ##", ""))

2gram:  [#S] for a richer characteristic ! [#E]
4gram:  [#S] [#S] [#S] dip chicken cutlets in batches , grind the cacao nibs , pumpkin seeds , sesame seeds , flaxseed , chia seed , oregano & amp ; parmesan cheese . [#E]


## 2. Text Classification using a Markov Language Model

In [7]:
# One filter per cuisine, matched against each recipe's `search_terms` field.
italian_q = {'search_terms': 'italian'}
chinese_q = {'search_terms': 'chinese'}
# Same per-cuisine cap on recipe documents as for the general corpus.
numdocs = 3000
italian_corpus = create_corpus(query=italian_q, numdocs=numdocs)
chinese_corpus = create_corpus(query=chinese_q, numdocs=numdocs)
# Report sentence counts; the two corpora are not guaranteed to be equal in size.
print(f"Italian: {len(italian_corpus)}")
print(f"Chinese: {len(chinese_corpus)}")

Italian: 31048
Chinese: 24071


In [8]:
# One language model per cuisine, both with k=4 and the same tokenizer so
# that the scores they produce are comparable.
it = MarkovLM(k=4, tokenizer_model=tokenizer)
ch = MarkovLM(k=4, tokenizer_model=tokenizer)

In [9]:
# Train each cuisine model on its own corpus only.
for model, sentences in ((it, italian_corpus), (ch, chinese_corpus)):
    model.train(corpus=sentences)

100%|██████████| 31048/31048 [00:14<00:00, 2177.98it/s]
100%|██████████| 24071/24071 [00:12<00:00, 1995.97it/s]


In [12]:
# Pick one probe sentence per cuisine.
# NOTE(review): both sentences come from the corpora the models were trained
# on, so this is an in-sample check; a held-out sentence would be a fairer test.
italian_sentence = italian_corpus[6]
chinese_sentence = chinese_corpus[6]

# Score each sentence under both models; the model that returns the larger
# (less negative) value is taken as the predicted cuisine.
print(f"Italian sentence: {italian_sentence}")
print(f"Italian: {it.log_prob(italian_sentence)}")
print(f"Chinese: {ch.log_prob(italian_sentence)}")
print("========")
# NOTE(review): in the recorded output the Chinese sentence scores higher
# under the Italian model, i.e. this example is misclassified.
print(f"Chinese sentence: {chinese_sentence}")
print(f"Italian: {it.log_prob(chinese_sentence)}")
print(f"Chinese: {ch.log_prob(chinese_sentence)}")

Italian sentence: To prepare sauce, combine milk and flour in a small saucepan over medium-low heat; stir with a whisk.
Italian: -35.87231088425833
Chinese: -39.72849731483897
Chinese sentence: Cut bell pepper into thin strips and fry until the texture is soft.  Add scallions about 5 minutes after the bell pepper.
Italian: -39.1592348306556
Chinese: -39.93073368561632


## 3. Combining Languages

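We can create a model that generates Italian-Chinese fusion recipe steps by taking a copy of the trained Italian model and training it further on the Chinese corpus.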

In [15]:
# We simply train on both corpora: start from a deep copy of the trained
# Italian model (so `it` itself is left untouched) and continue training
# the copy on the Chinese corpus.
# NOTE(review): assumes MarkovLM.train adds to what the model has already
# learned rather than resetting it -- confirm in utils.markov_lm.

mix = copy.deepcopy(it)
mix.train(corpus=chinese_corpus)

100%|██████████| 24071/24071 [00:11<00:00, 2018.02it/s]


In [19]:
# Draw one sequence from the combined model; removing " ##" re-attaches
# sub-word pieces to the preceding token, as in the earlier generation cell.
print("Mix: ", " ".join(mix.generate()).replace(" ##", ""))

Mix:  [#S] [#S] [#S] place the noodles in a separate sauce pan , combine the plums . cut the dough into 4 and transfer 2 of the green onions side ways into thin strips and fry until the eggs have been dropped , stir in the shiitake mushrooms until soft and caramelized . [#E]
