In [19]:
import tqdm
import re
import numpy as np
from collections import Counter
from itertools import product
import random
import pickle
import urllib
import gzip

In [20]:
class LanguageModel():
    """Interpolated n-gram language model with add-one smoothing.

    N-grams are stored as space-joined strings in a dict of tables keyed
    'n0', 'n1', ..., 'n<order>'.  'n0' holds the total token count under the
    empty-string key; 'n<k>' maps each k-gram to its frequency.

    Args:
        order: highest n-gram order used by the model.
        lambdas: interpolation weights; lambdas[i] weights the order-(i+1) score.
        counts: pre-computed count tables, None (call `fit` later), or the
            string 'default' to load 'default.pkl.gz' from the working directory.
    """

    def __init__(self, order, lambdas, counts=None):
        self.order = order
        self.lambdas = lambdas
        if counts == 'default':
            print('... please wait ... loading model ...')
            # NOTE(security): pickle can execute arbitrary code while loading;
            # only load a model file that comes from a trusted source.
            # The context manager closes the gzip handle once loading is done.
            with gzip.open('default.pkl.gz') as model_file:
                self.counts = pickle.load(model_file)
        else:
            self.counts = counts

    def product(self, nums):
        "Multiply the numbers together.  (Like `sum`, but with multiplication.)"
        result = 1
        for x in nums:
            result *= x
        return result

    def _history(self, context):
        """Return the last `order - 1` tokens of `context`, space-joined.

        For a unigram model (order 1) there is no history, so the result is ''.
        """
        tokens = context.split()
        keep = self.order - 1
        # tokens[-0:] would be the whole list, so order 1 is handled explicitly.
        return ' '.join(tokens[-keep:]) if keep > 0 else ''

    def get_ngrams(self, tokens, n):
        """Return every full n-gram of `tokens` as a space-joined string.

        Only windows containing exactly `n` tokens are produced; a token list
        shorter than `n` yields an empty list.
        """
        return [' '.join(tokens[i:i+n]) for i in range(len(tokens) - n + 1)]

    def get_counts(self, corpus, order):
        """Build the count tables 'n0'..'n<order>' from a list of tokens."""
        counts = {'n' + str(i): Counter(self.get_ngrams(corpus, n=i))
                  for i in range(1, order+1)}
        # 'n0' is the denominator for unigram probabilities (empty context).
        counts['n0'] = {'': len(corpus)}
        return counts

    def get_prob(self, counts, word, context=''):
        """Add-one smoothed probability of `word` following `context`.

        The n-gram order is inferred from the number of tokens in `context`.
        The denominator adds the number of distinct n-grams of that order
        (`len(counts['n<order>'])`).  Internal helper; `context_prob` is the
        entry point that trims the context to the model order.
        """
        ctx_tokens = context.split()
        order = len(ctx_tokens) + 1
        # Re-join so stray whitespace in `context` cannot break the key lookup.
        history = ' '.join(ctx_tokens)
        ngram = ' '.join(ctx_tokens + [word])
        ngram_counts = counts['n' + str(order)]
        history_counts = counts['n' + str(order - 1)]
        # .get works for both Counter and plain dict tables (unseen -> 0).
        return (ngram_counts.get(ngram, 0) + 1) / \
               (history_counts.get(history, 0) + len(ngram_counts))

    def get_logprob(self, counts, word, context=''):
        """Natural logarithm of `get_prob` for the same arguments."""
        return np.log(self.get_prob(counts, word, context))

    def get_following(self, counts, context):
        """List the words observed after `context`, most frequent first.

        Returns a list of (word, ngram_count, smoothed_probability) tuples.
        The whole count table of the relevant order is scanned, so this is
        linear in the number of distinct n-grams.  An empty context lists
        all unigrams.
        """
        ctx_tokens = context.split()
        order = len(ctx_tokens) + 1
        history = ' '.join(ctx_tokens)
        # Plain prefix test instead of a regex built from the context, so
        # characters such as '+' or '(' in the context cannot break matching.
        prefix = history + ' ' if ctx_tokens else ''
        matches = []
        for ngram, count in counts['n' + str(order)].items():
            if not ngram.startswith(prefix):
                continue
            toks = ngram.split()
            if len(toks) == order and toks[:-1] == ctx_tokens:
                matches.append((toks[-1], count,
                                self.get_prob(counts, toks[-1], history)))
        return sorted(matches, key=lambda item: item[1], reverse=True)

    def get_string_probs(self, counts, string, order, log=True):
        """Per-token (log-)probabilities of `string` under a single order.

        Each token is conditioned on up to `order - 1` preceding tokens;
        tokens near the start of the string use the shorter context available.
        """
        prob_fun = self.get_logprob if log else self.get_prob
        tokens = string.split()
        probs = []
        for i in range(len(tokens)):
            start = max(0, i - order + 1)
            context = ' '.join(tokens[start:i])
            probs.append(prob_fun(counts, word=tokens[i], context=context))
        return probs

    def interpolate(self, counts, string, order, log=True, lambdas='default'):
        """Score `string` by mixing the per-order scores with the lambda weights.

        With log=False each token gets sum_i(lambda_i * p_i) and the token
        scores are multiplied.  With log=True each token gets
        sum_i(lambda_i * log p_i) and the token scores are added.
        NOTE: the log mode is therefore a weighted sum of logs, not the log
        of the log=False result.
        """
        # isinstance guard keeps the comparison safe for array-like weights.
        use_default = isinstance(lambdas, str) and lambdas == 'default'
        lmbd = self.lambdas if use_default else lambdas
        aggregate = sum if log else self.product
        probs = [self.get_string_probs(counts, string, order=i, log=log)
                 for i in range(1, order+1)]
        probs_interpolated = []
        for tup in zip(*probs):
            prob_token = 0
            for i in range(len(tup)):
                prob_token += tup[i] * lmbd[i]
            probs_interpolated.append(prob_token)
        return aggregate(probs_interpolated)

    def fit(self, corpus):
        """Compute and store the count tables from a list of tokens."""
        self.counts = self.get_counts(corpus, self.order)

    def prob(self, string, log=False):
        """Interpolated score of a whitespace-tokenised string (see `interpolate`)."""
        return self.interpolate(self.counts, string, self.order, log=log)

    def context_prob(self, word, context='', log=False):
        """(Log-)probability of `word` given the last `order - 1` tokens of `context`."""
        prob_fun = self.get_logprob if log else self.get_prob
        return prob_fun(self.counts, word, self._history(context))

    def following(self, context):
        """Words seen after the last `order - 1` tokens of `context` (see `get_following`)."""
        return self.get_following(self.counts, self._history(context))

In [21]:
class Candidator():
    """Produces spelling-correction candidates restricted to a known word list.

    Args:
        dictionary: iterable of accepted word forms (stored as a set).
        abc: alphabet used when generating replacements and insertions.
        check_all: when True every word gets edit candidates; when False
            words already present in the dictionary are kept as they are.
    """

    def __init__(self, dictionary, abc='йцукенгшщзхъфывапролджэячсмитьбю', check_all=True):
        self.abc = abc
        self.dictionary = set(dictionary)
        self.check_all = check_all

    def edits(self, word):
        """Return dictionary words one edit away from `word`, plus `word` itself.

        Edits are single deletions, adjacent transpositions, replacements and
        insertions (after Norvig's spelling corrector); each generated string
        is kept only if it is in the dictionary.
        """
        alphabet = self.abc
        known = self.dictionary
        deletes, transposes, replaces, inserts = [], [], [], []
        for cut in range(len(word) + 1):
            head, tail = word[:cut], word[cut:]
            if tail:
                deletes.append(head + tail[1:])
                replaces.extend(head + ch + tail[1:] for ch in alphabet)
            if len(tail) > 1:
                transposes.append(head + tail[1] + tail[0] + tail[2:])
            inserts.extend(head + ch + tail for ch in alphabet)
        accepted = [edit
                    for group in (deletes, transposes, replaces, inserts)
                    for edit in group
                    if edit in known]
        # The input word is always a candidate, whether or not it is known.
        accepted.append(word)
        return set(accepted)

    def get_sentences(self, text):
        """Lowercase `text` and split it on the literal separator '. '."""
        lowered = text.lower()
        return lowered.split('. ')

    def word_candidates(self, sent):
        """Return one candidate set per whitespace-separated word of `sent`."""
        words = sent.lower().split()
        if self.check_all:
            return [self.edits(w) for w in words]
        per_word = []
        for w in words:
            # Known words are trusted and not expanded in this mode.
            per_word.append({w} if w in self.dictionary else self.edits(w))
        return per_word

    def sent_candidates(self, sent):
        """Return every combination of per-word candidates as a list of tuples."""
        return list(product(*self.word_candidates(sent)))

    def candidates(self, sent):
        """Return every candidate sentence as a space-joined string."""
        combos = self.sent_candidates(sent)
        return [' '.join(combo) for combo in combos]

In [38]:
class Ranker():
    """Bundles a language model with a candidate generator.

    Attributes:
        lm: the language model passed as `lang_model`.
        c: the candidate generator passed as `candidator`.
    """

    def __init__(self, lang_model, candidator):
        self.lm, self.c = lang_model, candidator

In [17]:
def cleanse(s, rgxp = r'[\W\da-z]'):
    """Lowercase `s`, blank out unwanted characters and collapse space runs.

    Every character matched by `rgxp` is replaced with a space; by default
    that is any non-word character, digit or Latin letter a-z.  Runs of
    spaces are then squeezed to one.  Leading or trailing spaces are not
    stripped.

    Args:
        s: input text.
        rgxp: regular expression for the characters to blank out.  The
            default is a raw string so `\\W` and `\\d` are not treated as
            (invalid) string escapes.

    Returns:
        The cleaned, lowercased string.
    """
    return re.sub(' +', ' ', re.sub(rgxp, ' ', s.lower()))

In [9]:
%%time
#open corpora
with open('corpora.txt', encoding='utf-8') as f:
    tokens1 = cleanse(f.read().lower()).split()

Wall time: 6.11 s


In [28]:
%%time
#hagen dictionary
f = open('hagen-orf.txt', encoding='utf-8').readlines()
d2 = [w for w in [re.findall('^ *(.+) |', l)[0].split(' | ')[0] for l in f] if len(w)>0]
print(len(d2))

770034
Wall time: 9.55 s


In [23]:
%%time
#ruscorpora tokens
f = open('ruscorp_tokens', encoding='utf-8').read()
d3 = [w for w in f.split('\n') if w]
print(len(d3))

2164509
Wall time: 1.39 s


In [24]:
%%time
#initialize model
# Candidate generator backed by the ruscorpora token list (d3);
# check_all=True generates edits for every word, including known ones.
c = Candidator(d3, check_all=True)
# Order-3 model; the lambdas weight the order-1, order-2 and order-3 scores
# in that order.  counts='default' loads the counts from 'default.pkl.gz'.
lm = LanguageModel(order=3, lambdas = [0.01, 0.09, 0.9], counts='default')

...please wait...loading model...
Wall time: 9.03 s


In [25]:
# Log-mode score of a one-token string under the interpolated model.
lm.prob('есть', log=True)

-6.5972435059483105

In [27]:
%%time
# Generate every candidate rewrite of the phrase, score each one with the
# language model in log mode, and sort with the highest score first.
cand_list = c.candidates('школа жлословия')
cand_probs = sorted([(cand, lm.prob(cand, log=True)) for cand in cand_list], 
                    key=lambda x:x[1], reverse=True)

Wall time: 15.5 ms


In [28]:
# Show the ten highest-scoring candidates.
cand_probs[:10]

[('школу злословия', -23.129950530430012),
 ('школы злословия', -24.093610290439713),
 ('школы жлословия', -24.118459356937592),
 ('школе злословия', -24.282921020469818),
 ('школе жлословия', -24.307770086967697),
 ('школу жлословия', -24.527231014436584),
 ('школа злословия', -24.962734768770041),
 ('школа жлословия', -24.98758383526792),
 ('школ злословия', -25.536953000391911),
 ('школ жлословия', -25.561802066889793)]

In [147]:
%%time
# Rebuild the Candidator from d3 to time its construction
# (the dictionary is converted to a set in __init__).
c = Candidator(d3, check_all=True)

Wall time: 586 ms


In [29]:
%%time
# Same ranking procedure for a three-word phrase: generate candidates,
# score each in log mode, sort with the highest score first.
cand_list = c.candidates('пошел на жработу')
cand_probs = sorted([(cand, lm.prob(cand, log=True)) for cand in cand_list], 
                    key=lambda x:x[1], reverse=True)

Wall time: 1.13 s


In [30]:
# Show the ten highest-scoring candidates.
cand_probs[:10]

[('пошел на работу', -26.619422179179061),
 ('пошел на жработу', -30.606198045060964),
 ('пошел за работу', -31.238755316214494),
 ('пошел за жработу', -31.959942075696514),
 ('пошел а работу', -32.316039600637453),
 ('прошел на работу', -32.578627674202906),
 ('пошел а жработу', -32.686962533289517),
 ('пошел не работу', -32.738626591133951),
 ('пошел не жработу', -33.06357521764707),
 ('пошел но работу', -33.344892891710543)]

In [51]:
# Inspect the per-word candidate sets for correctly spelled and misspelled words.
c.word_candidates('россия ржссия федерация фодерация')

[{'россея',
  'росси',
  'россиз',
  'россии',
  'россий',
  'россиу',
  'россию',
  'россия',
  'россиян',
  'россиях',
  'росссия',
  'росция',
  'руссия'},
 {'ржссия', 'россия', 'руссия'},
 {'федераци',
  'федерации',
  'федераций',
  'федерацию',
  'федерация',
  'федерациям',
  'федерациях'},
 {'модерация', 'федерация', 'фодерация'}]

In [52]:
%%time
# Generate candidates for the phrase, score each in log mode,
# and sort with the highest score first.
cand_list = c.candidates('в ржссию')
cand_probs = sorted([(cand, lm.prob(cand, log=True)) for cand in cand_list], 
                    key=lambda x:x[1], reverse=True)

Wall time: 15.2 ms


In [53]:
# Show the ten highest-scoring candidates.
cand_probs[:10]

[('в россию', -11.028868680610294),
 ('и россию', -14.202185246660072),
 ('а россию', -17.462553327810316),
 ('я россию', -18.52639598157851),
 ('и руссию', -19.104189647565324),
 ('и ржссию', -19.118052591176525),
 ('в руссию', -19.156969810616122),
 ('в ржссию', -19.17083275422732),
 ('с россию', -19.633803812151996),
 ('с руссию', -19.709362863088106)]

In [54]:
%%time
# Generate candidates for the phrase, score each in log mode,
# and sort with the highest score first.
cand_list = c.candidates('вкучный обет')
cand_probs = sorted([(cand, lm.prob(cand, log=True)) for cand in cand_list], 
                    key=lambda x:x[1], reverse=True)

Wall time: 10.8 ms


In [55]:
# Show the ten highest-scoring candidates.
cand_probs[:10]

[('вкусный обед', -24.30227677610586),
 ('вкусный обе', -27.310351774023435),
 ('вкусный одет', -27.326745531639681),
 ('вкусный орет', -27.331101799724927),
 ('вкусный обер', -27.338192005022091),
 ('вкусный обет', -27.342407949922471),
 ('вкусный бет', -27.356517819659572),
 ('вкусный обут', -27.357207748374442),
 ('вкусный облет', -27.35757142481615),
 ('вкусный обеты', -27.358341035227511)]

In [56]:
# Look up one entry of the 'n1' count table of the loaded model
# (presumably the unigram frequency, as in get_counts — the table comes from the pickle).
lm.counts['n1']['без']

120757

In [57]:
# Log-mode score of the first phrase, for comparison with the next cell.
lm.prob('вкусный обед', log=True)

-24.30227677610586

In [58]:
# Log-mode score of the second phrase, which differs in its last word.
lm.prob('вкусный обет', log=True)

-27.342407949922471

In [59]:
# List (word, count, smoothed probability) for words seen after the context,
# ordered by count, highest first.
lm.following('вкусный')

[('и', 30, 1.174672247496622e-05),
 ('обед', 20, 7.957457160460987e-06),
 ('запах', 19, 7.578530629010464e-06),
 ('чай', 9, 3.789265314505232e-06),
 ('ужин', 9, 3.789265314505232e-06),
 ('кофе', 9, 3.789265314505232e-06),
 ('суп', 8, 3.4103387830547085e-06),
 ('как', 5, 2.2735591887031393e-06),
 ('а', 5, 2.2735591887031393e-06),
 ('торт', 4, 1.894632657252616e-06),
 ('пирог', 4, 1.894632657252616e-06),
 ('борщ', 3, 1.5157061258020928e-06),
 ('в', 3, 1.5157061258020928e-06),
 ('он', 3, 1.5157061258020928e-06),
 ('дым', 3, 1.5157061258020928e-06),
 ('хлеб', 3, 1.5157061258020928e-06),
 ('если', 3, 1.5157061258020928e-06),
 ('что', 3, 1.5157061258020928e-06),
 ('но', 3, 1.5157061258020928e-06)]