In [None]:
"""
写了课件里的几个平滑算法：加一法，good-turing,Katz回退法，绝对减值，线性减值
加一法，good-turing,绝对减值，线性减值适用2gram。3gram时容易出现历史信息即前两个词的序列不存在概率为零的情况
Katz回退法可以3gram，回退用递归写的，3gram时算的比较慢。

没有考虑遇到未登录词的情况。
未登录词写成UNK后怎么加入训练集怎么计算概率？

训练集train1.txt是截的corpus.txt的一小部分.
"""

In [148]:
import re

from nltk import WordNetLemmatizer, pos_tag, word_tokenize
from collections import Counter
from nltk.corpus import wordnet

#处理句子，加n-1个"<BOS> "和" <EOS>"
def chuli(s, n): 

    # Convert to lowercases
    s = s.lower()
    # Replace all none alphanumeric characters with spaces
    s = re.sub(r'[^a-zA-Z0-9\s]', ' ', s)
    s = s.strip() #去掉换行符
    for i in range(n-1):  #加n-1个"<BOS> "和" <EOS>"
        s = "<BOS> " + s + " <EOS>"
    return s

#将句子切成n元词组
def ngram_generator(s, n): 
    # Break sentence in the token, remove empty tokens
    token = [token for token in s.split(" ") if token != ""]

    # Stemming and Lemmatizing
    lemmatizer = WordNetLemmatizer()
    tagged = pos_tag(token)
    token = []
    for word, tag in tagged:
        wntag = get_wordnet_pos(tag)
        if wntag is None:  # not supply tag in case of None
            lemma = lemmatizer.lemmatize(word)
            token.append(lemma)
        else:
            lemma = lemmatizer.lemmatize(word, pos=wntag)
            token.append(lemma)

    # Use the zip function to help us generate n-grams
    # Concatentate the tokens into ngrams and return
    ngrams = zip(*[token[i:] for i in range(n)])
    return ngrams

def get_wordnet_pos(tag):
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADV
    else:
        return None

#读取文档，将n元词组存入字典
def diction(file, n, k): #参数k表示为每个句子增加（k-1)个<BOS>,<EOS>
    with open(file, 'r') as f:
        diction = Counter()
        for line in f:
            if line.strip() == "":
                continue
            line = chuli(line, k)
            sentence = ngram_generator(line, n)
            diction += Counter(sentence)
    return diction

#生成词组的频率数组fre，fre[i]储存i元词组的频率字典, 0<=i<=n
def frequency(file,n):
    fre = []
    for i in range(1,n+1):
        word = diction(file, i, n)
        fre.append(word)
    num = sum(fre[0].values())
    fre = [{("num",):num}] + fre       
    return fre

#对一个词word,计算未平滑的概率
def unsmooth(word, n, frequency):
    count = frequency[n].get(word, 0)
    pre_word = word[:-1] if len(word[:-1]) >= 1 else ("num",)
    p = count / frequency[n-1][pre_word]
    return p
        
#对一个词word,计算加一法平滑的概率
def jiayifa(word, n, frequency):
    count = frequency[n].get(word, 0)
    N = len(frequency[1]) - (2 if n > 1 else 0)
    pre_word = word[:-1] if len(word[:-1]) >= 1 else ("num",)        
    p = (count + 1) / (frequency[n-1][pre_word] + N)
    return p

#对一个词word,计算good-turing平滑的概率
def good_turing(word, n, frequency):
    if n == 1:
        return unsmooth(word, 1, frequency)       
        
    pre_word = word[:-1] if len(word[:-1]) >= 1 else ("num",)
    num = frequency[n-1][pre_word]
    R_n = {}
    for key, value in frequency[n].items():
        if key[:-1] == word[:-1]:
            R_n[value] = R_n.get(value, 0) + 1                     
    R_n[0] = len(frequency[1]) - sum(R_n.values())
    RtoRxing = {}
    r_max = max(R_n.keys())
    r = 0
    while(r<r_max):
        if r in R_n:
            rj = r + 1
            while(rj not in R_n):
                rj += 1
            RtoRxing[r] = R_n[rj] * rj / R_n[r]
            r = rj
        else:
            r += 1
    RtoRxing[r_max] = r_max
    N = 0
    for key in R_n:
        N += R_n[key] * RtoRxing[key]
    rk = frequency[n].get(word, 0)
    p = RtoRxing[rk] / N
    return p

#对一个词word,计算Katz平滑的概率， Katz里面含递归，算的慢。
#Katz是回退法可以算3gram，可以处理3gram中前两个词排列未出现的情况。
def Katz(word, n, frequency):
    if n == 1:
        return unsmooth(word, 1, frequency) 
    if word in frequency[n]:
        return good_turing(word, n, frequency)
    else:
        beta = 0
        Na = 0
        wordin = []
        for word2 in frequency[n]:
            if word2[:-1] == word[:-1]:
                wordin.append(word2[-1])
                beta += good_turing(word2, n, frequency)
        for word3 in frequency[1]:
            if word3 not in wordin:
                pre_word = word[1:-1] + word3
                Na += Katz(pre_word, n - 1, frequency)
                              
        alpha = (1- beta) / Na
        return alpha * Katz(word[1:], n - 1, frequency)

#对一个词word,计算绝对减值法平滑的概率，b为自由参数0<b<=1
def absolute_discouting(word, n, frequency, b=0.4): #b为自由参数0<b<=1
    if n == 1:
        return unsmooth(word, 1, frequency)       
    
    N = frequency[n-1][word[:-1]]
    num = 0
    for key in frequency[n].keys():
        if key[:-1] == word[:-1]:
            num += 1
    if word in frequency[n]:
        p = (frequency[n][word] - b) / N
    else:
        p = b * num / (len(frequency[1]) - num) / N        
    return p

#对一个词word,计算线性减值法平滑的概率
def linear_discouting(word, n, frequency):
    if n == 1:
        return unsmooth(word, 1, frequency)       
    
    N = frequency[n-1][word[:-1]]
    num = 0
    n1 = 0
    for key in frequency[n].keys():
        if key[:-1] == word[:-1]:
            num += 1
            if frequency[n][key] == 1:
                n1 += 1
    alpha = n1 / N
    if word in frequency[n]:
        p = frequency[n][word] * (1 - alpha) / N
    else:
        p = alpha / (len(frequency[1]) - num)       
    return p
    
#计算句子s的概率    
def calP123(s, n,frequency, smooth):
    s = chuli(s, n)
    t = ngram_generator(s, n)
    P = 1
    sentence = []
    for word in t:
        pk = smooth(word, n, frequency)
        P *= pk
        sentence.append([word,pk])
    print(str(smooth))
    print(sentence)
    print(s, "概率是", P)
    return P    
        
#计算句子s的概率， 加了try,词组在smooth频率数组中不存在时，概率为0          
def calP(s, n,frequency, smooth):
    s = chuli(s, n)
    t = ngram_generator(s, n)
    P = 1
    sentence = []
    for word in t:
        try:
            pk = smooth(word, n, frequency)
        except:
            pk = 0
        P *= pk
        sentence.append([word,pk])
    print(str(smooth))
    print(sentence)
    print(s,ss, "概率是", P)
    return P


In [149]:
#测试

file = 'train_LM.txt'
s = "in a hospital  a student I am ."
fre = frequency(file,2)


p3 = calP123(s, 2, fre,good_turing) # good_turing
p = calP123(s, 2, fre, Katz)#Katz回退法，里面有递归
p2 = calP123(s, 2, fre, absolute_discouting)#绝对减值
p2 = calP123(s, 2, fre, linear_discouting)#线性减值


<function good_turing at 0x000002361A700BF8>
[[('<BOS>', 'in'), 0.025641025641025644], [('in', 'a'), 0.5], [('a', 'hospital'), 0.16666666666666666], [('hospital', 'a'), 0.03333333333333333], [('a', 'student'), 0.16666666666666666], [('student', 'i'), 0.03333333333333333], [('i', 'be'), 0.5], [('be', '<EOS>'), 0.025641025641025644]]
<BOS> in a hospital  a student i am <EOS> 概率是 5.07301077101647e-09
<function Katz at 0x000002361A7001E0>
[[('<BOS>', 'in'), 0.01190476190476191], [('in', 'a'), 0.5], [('a', 'hospital'), 0.16666666666666666], [('hospital', 'a'), 0.05357142857142859], [('a', 'student'), 0.16666666666666666], [('student', 'i'), 0.035714285714285726], [('i', 'be'), 0.5], [('be', '<EOS>'), 0.04761904761904764]]
<BOS> in a hospital  a student i am <EOS> 概率是 7.532066114427642e-09
<function absolute_discouting at 0x000002361A6A0E18>
[[('<BOS>', 'in'), 0.02307692307692308], [('in', 'a'), 0.6], [('a', 'hospital'), 0.19999999999999998], [('hospital', 'a'), 0.02666666666666667], [('a', 

In [150]:
#测试

file = 'train1.txt'
fre = frequency(file,2)

s = " such as deep neural networks learning architectures"

p = calP123(s, 2,fre, unsmooth) #未平滑
p = calP123(s, 2,fre, jiayifa) #加一法
p = calP123(s, 2,fre, good_turing) # good_turing
p = calP123(s, 2, fre, absolute_discouting) #绝对减值
p = calP123(s, 2, fre, linear_discouting) #线性减值
p = calP123(s, 2,fre, Katz)   #Katz回退法，里面有递归

<function unsmooth at 0x000002360F823488>
[[('<BOS>', 'such'), 0.0], [('such', 'a'), 0.6666666666666666], [('a', 'deep'), 0.015873015873015872], [('deep', 'neural'), 0.07142857142857142], [('neural', 'network'), 0.7142857142857143], [('network', 'learn'), 0.0], [('learn', 'architecture'), 0.0], [('architecture', '<EOS>'), 0.0]]
<BOS> such as deep neural networks learning architectures <EOS> 概率是 0.0
<function jiayifa at 0x000002360F750D08>
[[('<BOS>', 'such'), 0.0004363001745200698], [('such', 'a'), 0.003371868978805395], [('a', 'deep'), 0.0017730496453900709], [('deep', 'neural'), 0.0009610764055742432], [('neural', 'network'), 0.0028929604628736743], [('network', 'learn'), 0.0004816955684007707], [('learn', 'architecture'), 0.000481000481000481], [('architecture', '<EOS>'), 0.00048355899419729207]]
<BOS> such as deep neural networks learning architectures <EOS> 概率是 8.125404951193938e-25
<function good_turing at 0x000002361A700BF8>
[[('<BOS>', 'such'), 0.00015469008436559217], [('such'

In [146]:
file = 'train1.txt'
fre = frequency(file,3)
s = " such as deep neural networks learning architectures"
p = calP123(s, 3,fre, Katz)#Katz回退法，里面有递归，3gram

<function Katz at 0x000002361A6A0E18>
[[('<BOS>', '<BOS>', 'such'), 4.376425738714748e-05], [('<BOS>', 'such', 'a'), 0.4038569706709463], [('such', 'a', 'deep'), 0.08333333333333333], [('a', 'deep', 'neural'), 0.16666666666666666], [('deep', 'neural', 'network'), 0.5], [('neural', 'network', 'learn'), 0.00046240307320193644], [('network', 'learn', 'architecture'), 0.012509374979259162], [('learn', 'architecture', '<EOS>'), 0.01493131594664462], [('architecture', '<EOS>', '<EOS>'), 0.5153814602132937]]
<BOS> <BOS> such as deep neural networks learning architectures <EOS> <EOS> 概率是 5.4634610910075036e-15
