# Statistical Language Models



In [2]:
import pymongo
import nltk
import pandas as pd
import numpy as np
from transformers import BertTokenizer
from collections import defaultdict
from tqdm import tqdm
from typing import List
from nltk.tokenize import sent_tokenize

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
class MarkovLM:
    """Markov (n-gram) language model over word-piece tokens.

    For every context of ``k - 1`` tokens the model stores how often each
    token followed it (``k_index``), plus a table of single-token counts
    (``unigram``) that is used when a context was never observed.

    Both tables are ``defaultdict``s whose default value is 1: an entry that
    was observed ``n`` times holds ``n + 1``, and a continuation that was
    never observed after a known context is scored with count 1.
    """

    def __init__(self, k: int = 2, tokenizer_model: str = "dbmdz/bert-base-italian-uncased"):
        """Create an untrained model.

        Args:
            k (int): Order of the n-grams; each token is conditioned on the
                ``k - 1`` tokens before it.
            tokenizer_model (str): Name passed to
                ``BertTokenizer.from_pretrained``.

        Raises:
            ValueError: If ``k`` is smaller than 1.
        """
        if k < 1:
            raise ValueError(f"k must be >= 1, got {k}")
        self.k = k
        self.unigram = defaultdict(lambda: 1)
        self.k_index = defaultdict(lambda: defaultdict(lambda: 1))
        # Not read by any method of this class; kept so existing attribute
        # access keeps working.
        self.U = float('inf')
        self.tokenizer = BertTokenizer.from_pretrained(tokenizer_model)
        self.start_symbol = "[#S]"
        self.end_symbol = "[#E]"

    def _ngrams(self, tokens):
        """Return the k-grams of ``tokens``, padded with the start/end symbols."""
        return nltk.ngrams(tokens, n=self.k, pad_left=True,
                           pad_right=True,
                           left_pad_symbol=self.start_symbol,
                           right_pad_symbol=self.end_symbol)

    @staticmethod
    def _sample(counts):
        """Draw one key of ``counts`` with probability proportional to its count."""
        candidates = list(counts)
        weights = np.fromiter(counts.values(), dtype=float, count=len(candidates))
        # Sampling a position (rather than the keys themselves) returns the
        # stored key object unchanged, whatever its type.
        position = np.random.choice(len(candidates), p=weights / weights.sum())
        return candidates[position]

    def train(self, corpus: List[str]):
        """Fill the n-gram and unigram indexes from a corpus.

        Documents the tokenizer rejects with a ``TypeError`` (for instance
        non-string entries) are skipped.

        Args:
            corpus (List[str]): List of textual documents
        """
        for document in tqdm(corpus):
            try:
                tokens = self.tokenizer.tokenize(document)
            except TypeError:
                # Best effort: ignore entries that are not text.
                continue
            for keys in self._ngrams(tokens):
                self.k_index[keys[:-1]][keys[-1]] += 1
            # Count every token exactly once per occurrence. Counting inside
            # the n-gram loop would add each token k times and would also put
            # the start symbol into the unigram table.
            for token in tokens:
                self.unigram[token] += 1
            # One end symbol per document lets unigram sampling terminate.
            self.unigram[self.end_symbol] += 1

    def pickup(self, prefix: tuple = None):
        """Sample the next token.

        Args:
            prefix (tuple): Context of ``k - 1`` tokens. ``None`` samples from
                the unigram table instead. A context that was never observed
                also falls back to the unigram table.

        Returns:
            The sampled token.

        Raises:
            AssertionError: If ``prefix`` does not have ``k - 1`` elements.
            ValueError: If the unigram table is needed but empty.
        """
        if prefix is None:
            if not self.unigram:
                raise ValueError("cannot sample: the unigram table is empty")
            return self._sample(self.unigram)
        assert len(prefix) == self.k - 1, \
            f"prefix must contain {self.k - 1} tokens, got {len(prefix)}"
        # .get() avoids inserting an empty entry for unseen contexts.
        continuations = self.k_index.get(prefix)
        if not continuations:
            return self.pickup()
        return self._sample(continuations)

    def generate(self, prefix: tuple = None, unigram: bool = False, max_len: int = 2000):
        """Generate a token sequence.

        Args:
            prefix (tuple): Initial context of ``k - 1`` tokens; defaults to
                ``k - 1`` start symbols.
            unigram (bool): If True every token is drawn from the unigram
                table and the context is ignored.
            max_len (int): Maximum number of tokens to sample.

        Returns:
            list: The prefix followed by the sampled tokens. Sampling stops
            after the end symbol, which is included in the result.
        """
        text = []
        if prefix is None:
            prefix = tuple([self.start_symbol] * (self.k - 1))
        text.extend(prefix)
        context_size = self.k - 1
        for _ in range(max_len):
            if unigram:
                token = self.pickup()
            else:
                token = self.pickup(prefix=prefix)
            text.append(token)
            if token == self.end_symbol:
                break
            # Slice from an explicit start index: text[-0:] would be the
            # whole list, which breaks the k == 1 case.
            prefix = tuple(text[len(text) - context_size:])
        return text

    def _ngram_log_prob(self, prefix: tuple, next_word) -> float:
        """Return the log-probability of ``next_word`` after ``prefix``.

        The model is only read, never modified, so repeated calls give the
        same value.
        """
        continuations = self.k_index.get(prefix)
        if not continuations:
            # NOTE(review): an unseen context contributes 0, i.e. it is
            # skipped rather than penalised. Kept from the original scoring.
            return 0.0
        total = sum(continuations.values())
        # Unseen continuations are scored with the tables' default count of 1.
        count = continuations.get(next_word, 1)
        return np.log(count / total)

    def log_prob(self, text: str):
        """Return the log-probability of ``text`` under the model.

        Args:
            text (str): Text to score; it is tokenized and padded the same
                way as the training documents.

        Returns:
            The sum of the per-token log-probabilities.
        """
        tokens = self.tokenizer.tokenize(text)
        return sum(self._ngram_log_prob(keys[:-1], keys[-1])
                   for keys in self._ngrams(tokens))

    @staticmethod
    def read_txt(file_path: str):
        """Read a text file and return its content.

        Args:
            file_path (str): Path of the file, read as UTF-8.

        Returns:
            str: The whole content of the file.
        """
        with open(file_path, 'r', encoding='utf-8') as infile:
            text = infile.read()
        return text

In [4]:
# Open the 'cousine' database with pymongo's default connection settings and
# keep a handle on its 'foodcom' collection, which holds the recipes.
db = pymongo.MongoClient().get_database('cousine')
recipes = db.get_collection('foodcom')

In [5]:
def create_corpus(query: dict = None, numdocs: int = 3000, collection=None):
    """Collect the preparation steps of the recipes matching a query.

    Args:
        query (dict): MongoDB filter passed to ``find``. ``None`` (the
            default) matches every document.
        numdocs (int): Maximum number of recipe documents to read. The
            returned list is usually longer, because every recipe contributes
            all the entries of its ``steps`` field.
        collection: Object exposing ``find(query).limit(n)``. Defaults to the
            module-level ``recipes`` collection.

    Returns:
        list: The steps of the selected recipes, in cursor order. Recipes
        without a ``steps`` field (or with an empty one) contribute nothing.
    """
    # A fresh dict per call instead of a shared mutable default argument.
    if query is None:
        query = {}
    if collection is None:
        collection = recipes
    corpus = []
    for recipe in collection.find(query).limit(numdocs):
        corpus.extend(recipe.get('steps') or [])
    return corpus

# Build a sample corpus from the first `numdocs` recipes, without any filter.
# `numdocs` limits the number of recipe documents, not the number of corpus
# entries: each recipe adds every element of its 'steps' list.
numdocs = 3000
corpus = create_corpus(query={}, numdocs=numdocs)
print(f"Corpus size: {len(corpus)}")
# Show the first four entries as a quick sanity check.
for text in corpus[:4]:
    print(text)

Corpus size: 20805
To prepare ravioli, place mushrooms in food processor; pulse 10 times or until finely chopped.
Heat oil and butter in a large nonstick skillet over medium-high heat. Add shallots and garlic, and sauté for 2 minutes.
Add mushrooms and 1/8 teaspoon salt; cook 5 minutes or until moisture evaporates, stirring occasionally.
Working with 1 wonton wrapper at a time (cover remaining wrappers with a damp towel to keep them from drying), spoon about 2 teaspoons mushroom mixture into center of each wrapper.


In [6]:
# Tokenizer name handed to MarkovLM below; it replaces the class default,
# which is an Italian model.
tokenizer = "bert-base-uncased"

In [7]:
# One corpus per cuisine: the same document limit, a different value of the
# 'search_terms' filter.
numdocs = 3000
italian_q, chinese_q = (
    {'search_terms': cuisine} for cuisine in ('italian', 'chinese')
)
italian_corpus, chinese_corpus = (
    create_corpus(query=cuisine_query, numdocs=numdocs)
    for cuisine_query in (italian_q, chinese_q)
)
print(f"Italian: {len(italian_corpus)}")
print(f"Chinese: {len(chinese_corpus)}")

Italian: 31048
Chinese: 24071


In [8]:
# Two independent 4-gram models that share the same tokenizer name: `it` for
# the Italian corpus, `ch` for the Chinese one.
it, ch = (MarkovLM(k=4, tokenizer_model=tokenizer) for _ in range(2))

In [9]:
# Fit each model on its own corpus; train() updates the model's count tables
# in place and shows a progress bar per corpus.
it.train(corpus=italian_corpus)
ch.train(corpus=chinese_corpus)

100%|██████████| 31048/31048 [00:14<00:00, 2177.98it/s]
100%|██████████| 24071/24071 [00:12<00:00, 1995.97it/s]


In [12]:
# Pick one sentence from each corpus. Both corpora were used for training
# above, so these sentences have been seen by the matching model.
italian_sentence = italian_corpus[6]
chinese_sentence = chinese_corpus[6]

# Score each sentence under both models. log_prob returns a sum of
# log-probabilities, so the value closer to zero marks the model that
# assigns the sentence a higher probability.
print(f"Italian sentence: {italian_sentence}")
print(f"Italian: {it.log_prob(italian_sentence)}")
print(f"Chinese: {ch.log_prob(italian_sentence)}")
print("========")
print(f"Chinese sentence: {chinese_sentence}")
print(f"Italian: {it.log_prob(chinese_sentence)}")
print(f"Chinese: {ch.log_prob(chinese_sentence)}")

Italian sentence: To prepare sauce, combine milk and flour in a small saucepan over medium-low heat; stir with a whisk.
Italian: -35.87231088425833
Chinese: -39.72849731483897
Chinese sentence: Cut bell pepper into thin strips and fry until the texture is soft.  Add scallions about 5 minutes after the bell pepper.
Italian: -39.1592348306556
Chinese: -39.93073368561632
