# Packages

In [1]:
%load_ext autoreload
%autoreload 2

In [10]:
from pprint import pprint, pformat
from utils import read, split, build_vocabulary, extract_sentences
from PCFG import PCFG
from OOV import OOV
from Word2Vec import Word2Vec
from Ngram import Automaton
from main import main
import nltk

import time
import numpy as np
import progressbar

from PYEVALB import scorer
from PYEVALB import parser

# Data

In [3]:
# Unpack the 13 values returned by main(): the three corpus splits, then a
# (sentences, POS) pair for train / val / test, then the vocabulary and the
# w2v, ngram and oov objects. The order below must match main()'s return order.
[corpus_train, 
 corpus_val, 
 corpus_test, 
 sentences_train, 
 POS_train, 
 sentences_val, 
 POS_val, 
 sentences_test, 
 POS_test,
vocabulary,
w2v,
ngram,
oov] = main()

Train corpus length : 2790 (90%)
Valid corpus length : 0 (0%)
Test corpus length : 309 (Last 10%) 

Train vocabulary size:  9633
100004 (100004, 64)
2177/9633 words from train set not in polyglot embedding
9633 (9633, 64)


In [4]:
# Fit the project's PCFG on the training corpus; `pcfg` is the grammar object
# passed to P_CYK and build_tree in the cells below.
pcfg = PCFG()
pcfg.fit(corpus_train)
# Scratch (disabled): build a POS list with "SENT" moved to the front.
#allPOS = list(set(pcfg.POS).union(pcfg.pcfg_.keys()))
#print(len(allPOS))
#allPOS.pop(allPOS.index("SENT"))
#print(len(allPOS))
#allPOS = ["SENT"] + allPOS

In [5]:
def P_CYK(sentence,pcfg):
    """Fill a probabilistic CYK (Viterbi) chart for ``sentence``.

    Chart layout: ``P[length - 1, start, v]`` is the best probability found
    for the non-terminal of index ``v`` over the ``length`` words that begin
    at word ``start``. ``back`` uses the same layout and stores how that best
    value was obtained.

    Parameters
    ----------
    sentence : sequence of str
        Tokens to parse. Each token must be a key of ``pcfg.lexicon_``;
        a ``KeyError`` is raised otherwise.
    pcfg : object
        Must expose ``non_terminals``, ``pos2index``, ``lexicon_``
        (word -> {tag: probability}) and ``pcfg_._productions`` whose items
        carry ``_lhs``, ``_rhs`` and ``_ProbabilisticMixIn__prob``.

    Returns
    -------
    P : numpy.ndarray of float, shape (n, n, r)
    back : numpy.ndarray of object, shape (n, n, r)
        ``None`` for cells that were never derived or that come straight from
        the lexicon, ``(0, child, None)`` for a unary rule and
        ``(p, left, right)`` for a binary rule whose left child covers the
        first ``p`` words of the span.
    """
    r = len(pcfg.non_terminals)
    n = len(sentence)
    P = np.zeros((n,n,r))
    back = np.empty((n,n,r),dtype=object)
    if n == 0:
        return P,back

    pos2index = pcfg.pos2index
    productions = pcfg.pcfg_._productions

    # Length-1 spans: seed the chart with the lexical probabilities.
    for s,word in enumerate(sentence):
        tags = pcfg.lexicon_[word]
        for pos in tags:
            P[0,s,pos2index[pos]] = tags[pos]

    # Unary rules are applied on single words only, in grammar order and in
    # one pass (a chain A -> B -> C is only followed if B -> C comes first).
    unary_rules = [
        (pos2index[prod._lhs], pos2index[prod._rhs[0]], prod._ProbabilisticMixIn__prob)
        for prod in productions
        if len(prod._rhs) == 1
    ]
    for s in range(n):
        for v1,v2,prob in unary_rules:
            child_prob = P[0,s,v2]
            prob_transition = child_prob * prob
            if child_prob > 0 and prob_transition > P[0,s,v1]:
                P[0,s,v1] = prob_transition
                back[0,s,v1] = (0,v2,None)

    if n < 2:
        return P,back

    # Resolve the binary rules to integer indices once instead of once per
    # chart cell: the triple loop below runs O(n^3) times.
    binary_rules = [
        (pos2index[prod._lhs], pos2index[prod._rhs[0]], pos2index[prod._rhs[1]],
         prod._ProbabilisticMixIn__prob)
        for prod in productions
        if len(prod._rhs) == 2
    ]

    for l in range(2,n+1): #length of span
        for s in range(n-l+1): #start of span (0-based)
            # A split leaves p >= 1 words on the left and l-p >= 1 on the
            # right. p = 0 is excluded: it would index row -1 of the chart,
            # i.e. wrap around to the full-sentence row.
            for p in range(1,l):
                left = P[p-1,s]
                right = P[l-p-1,s+p]
                cell = P[l-1,s] # view: writing to it updates P
                for a,b,c,prob_prod in binary_rules:
                    left_prob = left[b]
                    right_prob = right[c]
                    if left_prob > 0 and right_prob > 0:
                        prob_splitting = prob_prod * left_prob * right_prob
                        if cell[a] < prob_splitting:
                            cell[a] = prob_splitting
                            back[l-1,s,a] = (p,b,c)

    return P,back

def build_tree(backp,sentence,pcfg,length=-1,start=0,pos=0): 
    """Rebuild the bracketed string stored under ``backp[length, start, pos]``.

    ``backp`` is the back-pointer chart returned by ``P_CYK``; ``length`` is
    the chart row (span length minus one). The default ``length=-1`` selects
    the last row, i.e. the whole sentence, and ``pos=0`` the first
    non-terminal of ``pcfg.non_terminals``.

    Returns the word ``sentence[start]`` when the cell holds no back-pointer,
    ``"(TAG word)"`` for a unary back-pointer, and otherwise the two bracketed
    children concatenated (the label of ``pos`` itself is added by the caller).
    """
    entry = backp[length,start,pos]
    if entry is None:
        return sentence[start]

    split,left,right = entry
    if right is None and left is not None:
        # unary rule recorded on a single word
        return "(" + pcfg.non_terminals[left]._symbol + " " + sentence[start] + ")"

    def bracket(child,child_row,child_start):
        label = pcfg.non_terminals[child]._symbol
        inner = build_tree(backp,sentence,pcfg,child_row,child_start,child)
        return "(" + label + " " + inner + ")"

    left_part = bracket(left,split-1,start)
    # `length - split` is the row of the right child. At the root (length=-1)
    # it is negative and relies on numpy's negative indexing.
    right_part = bracket(right,length-split,start+split)
    return left_part + right_part

def correct_string(string):
    """Wrap ``string`` in an empty top bracket and a ``SENT`` node.

    No separator is inserted between ``SENT`` and ``string``.
    """
    opening, closing = '( (SENT', '))'
    return opening + string + closing

def un_chomsky(bracket_string):
    """Undo the Chomsky-normal-form transform on a predicted bracketing.

    The string is wrapped by ``correct_string``, parsed into an nltk tree,
    transformed in place by ``un_chomsky_normal_form`` and returned as the
    tree's ``pformat()`` output with the newlines removed.
    """
    wrapped = correct_string(bracket_string)
    parsed = nltk.tree.Tree.fromstring(wrapped)
    nltk.treetransforms.un_chomsky_normal_form(parsed)
    return parsed.pformat().replace('\n','')

# Tune hyperparameters

In [6]:
# Keep only the test sentences with fewer than 15 items, together with the
# entry of corpus_test at the same index (used below as the gold tree).
# `sentences` and `true_trees` therefore stay index-aligned.
true_trees = []
sentences = []
for i,s in enumerate(sentences_test):
    if len(s)<15:
        true_trees.append(corpus_test[i])
        sentences.append(s)

In [8]:
# Run every kept sentence through oov.get_best_sentence and time the whole
# pass. new_sentences[k] corresponds to sentences[k].
# NOTE(review): presumably get_best_sentence replaces out-of-vocabulary words
# by in-vocabulary ones so that the lexicon lookup in P_CYK succeeds —
# confirm in OOV.py.
new_sentences = []
tic = time.time()
for l in progressbar.progressbar(range(len(sentences))):
    new_sentences.append(oov.get_best_sentence(sentences[l]))
print(time.time()-tic)

100% (132 of 132) |######################| Elapsed Time: 0:02:16 Time:  0:02:16


136.64666748046875


In [30]:
# Parse each sentence and time the whole pass. The chart is computed on the
# processed tokens (new_sentences) but the tree is rebuilt with the original
# tokens (sentences), so the predicted bracketing carries the input words.
# NOTE(review): P, back and S stay bound to the last sentence after the loop;
# another cell displays S.
pred_trees = []
tic = time.time()
for l in progressbar.progressbar(range(len(sentences))):
    P,back  = P_CYK(new_sentences[l], pcfg) 
    S = un_chomsky(build_tree(back,sentences[l],pcfg))
    pred_trees.append(S)
print(time.time()-tic)

100% (132 of 132) |######################| Elapsed Time: 0:03:52 Time:  0:03:52


232.93611574172974


In [37]:
def get_not_parsed(s):
    """Tell whether ``s`` contains exactly four parenthesis characters.

    Both "(" and ")" are counted. The scoring cell uses a True result to
    skip a predicted tree.
    """
    paren_total = sum(1 for ch in s if ch in ("(", ")"))
    return paren_total == 4

In [48]:
from PYEVALB import parser as evalbparser

# Collect one tagging-accuracy value per sentence that was parsed.
# Sentences flagged by get_not_parsed are skipped, so the mean of `scorel`
# is computed over parsed sentences only, not over the whole test subset.
scorel = []
for i in range(len(true_trees)):
    if get_not_parsed(pred_trees[i]):
        continue
    gold = true_trees[i]
    test = pred_trees[i]
    # [1:-1] drops the first and last character of each bracket string
    # (presumably the empty outer bracket — confirm against the corpus format).
    gold_tree = evalbparser.create_from_bracket_string(gold[1:-1])
    test_tree = evalbparser.create_from_bracket_string(test[1:-1])
    score=scorer.Scorer()
    result = score.score_trees(gold_tree, test_tree)
    # NOTE(review): `tag_accracy` is spelled as the executed cell uses it;
    # presumably this is the attribute name in PYEVALB — check before "fixing" it.
    scorel.append(result.tag_accracy)

In [49]:
np.mean(scorel)

0.8502234726724525

In [23]:
S

'( (SENT (PONCT -) (NP (NPP Gérard) (NPP Longuet))))'

In [None]:
# NOTE(review): `prod` and `true` are not defined in any visible cell; this
# cell only runs if they are still bound in the kernel. Presumably `prod` is
# a list of production lists (one per predicted tree) and `true` a list of
# gold bracket strings — confirm before relying on this cell.

# POS tags of the predicted trees: left-hand sides of the lexical rules.
pred_POS = []
for s in prod:
    pred_POS.append([rule.lhs() for rule in s if rule.is_lexical()])

# POS tags of the gold trees, extracted the same way.
true_POS = []
for s in true:
    t = nltk.tree.Tree.fromstring(s)
    p = t.productions()
    true_POS.append([rule.lhs() for rule in p if rule.is_lexical()])

c  = 0
score = []
for p,t in zip(pred_POS,true_POS):
    if len(p)==0:
        # no prediction for this sentence: counted as fully wrong
        c+=1 
        score.append(0)
    elif len(p)!=len(t):
        # tag sequences cannot be aligned; scored 0 as before
        print("alert")
        score.append(0.0)
    else:
        # Per-token accuracy. `t==p` on two lists is a single boolean
        # (whole-sequence equality), so compare the tags pairwise instead.
        score.append(np.mean([gold_tag==pred_tag for gold_tag,pred_tag in zip(t,p)]))
print(np.mean(score))

# number of sentences without any predicted tag
c = sum(1 for pred in pred_POS if len(pred)==0)
c

In [None]:
# pcfg + lexicon
#pcfg = PCFG()
#pcfg.fit_pcfg(train)
#pcfg.fit_lexicon(train)
#pcfg.fit_lexicon(train)
#pprint(pcfg.lexicon_)
#pcfg.lexicon_['autres']
#get_POS('malfaiteurs', pcfg)
#get_POS("anticoagulants", pcfg)

In [None]:
# test sentence
#sentence = sTrain[6]
#sentence[0] = 'Ammélioration' # mistake
#sentence[0] = "portiques" # polyglot but not in train
#sentence[-1] = "pprout" # polyglot but not in train
#print(sentence)
#nltk.tree.Tree.fromstring(train[6], remove_empty_top_bracketing=1)

In [None]:
#n = 97
#print(sTest[n])
#nltk.tree.Tree.fromstring(test[n], remove_empty_top_bracketing=1)

In [None]:
# embeddings
#path_emb = "polyglot-fr.pkl"
#w2v = Word2Vec(path_emb)
#print(len(w2v.words),w2v.embeddings.shape)
#w2v.extract_subset(vocabTrain)
#print(len(w2v.words),w2v.embeddings.shape)
#print(w2v.most_similar_embeddings("hôtel"))
#print(w2v.most_similar_levenshtein("hôtel", k=10))
#print(w2v.most_similar_levenshtein("hôtel",k=10, damerau=True))
#w = 'malfaiteurs'
#print(w2v.most_similar_embeddings(w))
#print(w2v.most_similar_levenshtein(w, k=10))
#print(w2v.most_similar_levenshtein(w,k=10, damerau=True))

In [None]:
#get_POS('malfaiteurs', pcfg, w2v)

In [None]:
#bigram = Automaton(N=2)
#bigram.fit(sTrain)
#print(bigram)
#bigram.graph["*a"].keys()
#bigram = Automaton(N=3)
#bigram.fit([["banane","baobab"]])
#print(len(bigram.vocab_))
#bigram.vocab_
#print(bigram)
#for w in bigram.vocab_:
#    if "$$" in w:
#        print(w)
        
#for w in bigram.vocab_:
#    if "*" in w:
#        print(w)

# Scoring

In [None]:
from PYEVALB import parser as evalbparser

In [None]:
# Build a PYEVALB tree from one corpus entry with its first two characters
# and its last character removed.
# NOTE(review): `corpus` is not defined in any visible cell — presumably one
# of corpus_train / corpus_val / corpus_test; confirm before running.
true_bracket = corpus[3][2:-1]
gold_tree = evalbparser.create_from_bracket_string(true_bracket)
#test_tree = evalbparser.create_from_bracket_string(proposed_bracket)

In [None]:
nltk.tree.Tree.fromstring(true_bracket)

In [None]:
test_bracket = "(SENT (ADV Ensuite) (PONCT ,) (VN (V fut) (VPP installée)) (NP (DET une) (ADJ autre)) (NC forge) (PP (P à) (NP (DET la) (NPP Vacquerie))) (PONCT ,) (PP (P à) (NP (DET l') (NC emplacement) (ADV aujourd'_hui) (PP (P de) (NP (NPP Cora))))) (PONCT .))"
import nltk
nltk.tree.Tree.fromstring(test_bracket)

In [None]:
test_tree = evalbparser.create_from_bracket_string(test_bracket)

In [None]:
gold_tree.poss

In [None]:
test_tree.poss

In [None]:
[i==j for i,j in zip(test_tree.poss,gold_tree.poss)]

In [None]:
score=scorer.Scorer()
result = score.score_trees(gold_tree, test_tree)
result

In [None]:
from PYEVALB import scorer
from PYEVALB import parser

gold = '(IP (NP (PN 这里)) (VP (ADVP (AD 便)) (VP (VV 产生) (IP (NP (QP (CD 一) (CLP (M 个))) (DNP (NP (JJ 结构性)) (DEG 的)) (NP (NN 盲点))) (PU ：) (IP (VP (VV 臭味相投) (PU ，) (VV 物以类聚)))))) (PU 。))'
test = '(IP (IP (NP (PN 这里)) (VP (ADVP (AD 便)) (VP (VV 产生) (NP (QP (CD 一) (CLP (M 个))) (DNP (ADJP (JJ 结构性)) (DEG 的)) (NP (NN 盲点)))))) (PU ：) (IP (NP (NN 臭味相投)) (PU ，) (VP (VV 物以类聚))) (PU 。))'

gold_tree = parser.create_from_bracket_string(gold)
test_tree = parser.create_from_bracket_string(test)

score=scorer.Scorer()
result = score.score_trees(gold_tree, test_tree)
result

In [None]:
class Sentence():
    """One bracketed parse string and its conversion to a CNF nltk tree."""

    def __init__(self,sentence):
        self.sentence = sentence
        # get_tree() reads `clean_sentence`, which was never assigned and made
        # every call fail with AttributeError. No cleaning step is visible in
        # this notebook, so the raw string is used as-is.
        self.clean_sentence = sentence

    def __repr__(self):
        # __repr__ must return a string; printing and returning None raises
        # TypeError as soon as the object is displayed.
        return "Sentence({!r})".format(self.sentence)

    def get_tree(self):
        """Build the CNF tree from the bracketed sentence.

        The string is parsed with the empty top bracket removed, binarised
        with ``chomsky_normal_form`` and its unary chains (POS level
        included) collapsed with ``collapse_unary``.
        """
        tree = nltk.tree.Tree.fromstring(self.clean_sentence,remove_empty_top_bracketing=True)
        nltk.treetransforms.chomsky_normal_form(tree)
        nltk.treetransforms.collapse_unary(tree,collapsePOS=True)

        return tree

In [None]:
class Grammar():
    """PCFG and lexicon estimated from a list of bracketed sentences.

    Attributes set by ``build``:
      pcfg          -- grammar induced by nltk from the non-lexical productions
      lexicon       -- word -> {tag: relative frequency}
      productions   -- every non-lexical production seen so far
      non_terminals -- distinct symbols of the grammar, start symbol first
      pos2index     -- symbol -> position in ``non_terminals``
    """

    def __init__(self):
        self.pcfg = None
        self.lexicon = {}
        self.productions = []
        self.non_terminals = []
        self.pos2index = {}

    def update(self,sentence):
        """Add the productions of one bracketed sentence to the counts."""
        tree = Sentence(sentence).get_tree()
        for prod in tree.productions():
            if prod.is_lexical():
                # words are the keys of the lexicon, tags the inner keys
                word = prod.rhs()[0]
                tag_counts = self.lexicon.setdefault(word,{})
                tag_counts[prod.lhs()] = tag_counts.get(prod.lhs(),0) + 1

            if prod.is_nonlexical():
                self.productions.append(prod)

    def normalize_lexicon(self):
        """Turn the per-word tag counts into relative frequencies (in place)."""
        if not self.lexicon:
            print('Lexicon empty')
            return None

        for word,counts in self.lexicon.items():
            total = sum(counts.values())
            self.lexicon[word] = {tag:count/total for tag,count in counts.items()}

    @staticmethod
    def _index_non_terminals(productions,start):
        """Return (symbols, symbol -> index) with ``start`` at index 0.

        Each symbol appears exactly once, in order of first occurrence, so the
        list length is the number of distinct symbols.
        """
        ordered = [start]
        seen = {start}
        for prod in productions:
            for symbol in (prod.lhs(),) + tuple(prod.rhs()):
                if symbol not in seen:
                    seen.add(symbol)
                    ordered.append(symbol)
        return ordered,{symbol:i for i,symbol in enumerate(ordered)}

    def build(self,list_of_sentences):
        """Estimate the grammar from ``list_of_sentences`` (e.g. the training set)."""
        for s in list_of_sentences:
            self.update(s)

        # Nonterminal / induce_pcfg are reached through the nltk module the
        # notebook already imports; the bare names were never imported.
        start = nltk.grammar.Nonterminal('SENT')
        self.pcfg = nltk.grammar.induce_pcfg(start,self.productions)
        # NOTE(review): the result of this call is discarded. If
        # chomsky_normal_form returns a new grammar rather than modifying
        # self.pcfg (to confirm in the nltk docs), this line has no effect;
        # the trees are already binarised in Sentence.get_tree.
        self.pcfg.chomsky_normal_form(flexible = False)

        #normalize lexicon
        self.normalize_lexicon()

        # Distinct symbols only: one entry per occurrence would inflate the
        # chart dimension and leave pos2index pointing at the last duplicate.
        self.non_terminals,self.pos2index = self._index_non_terminals(
            self.pcfg.productions(),start)

In [None]:
def P_CYK(sentence,pcfg, allPOS):
    """Probabilistic CYK (Viterbi) chart for a grammar stored as nested dicts.

    NOTE: an earlier cell of this notebook defines a two-argument
    ``P_CYK(sentence, pcfg)``; running this cell rebinds the name to this
    three-argument variant.

    Chart layout: ``P[length - 1, start, v]`` is the best probability found
    for symbol ``allPOS[v]`` over the ``length`` words beginning at word
    ``start``; ``back`` has the same layout.

    Parameters
    ----------
    sentence : sequence of str
        Tokens to parse; each must be a key of ``pcfg.lexicon_``
        (``KeyError`` otherwise).
    pcfg : object
        ``pcfg.lexicon_`` maps word -> {tag: probability} and ``pcfg.pcfg_``
        maps lhs -> {(rhs_left, rhs_right): probability}.
    allPOS : sequence
        Every symbol used by the lexicon and the rules; its order defines the
        third axis of the chart.

    Returns
    -------
    P : numpy.ndarray of float, shape (n, n, len(allPOS))
    back : numpy.ndarray of object, shape (n, n, len(allPOS))
        ``(p, left, right)`` for a derived cell, where the left child covers
        the first ``p`` words of the span; ``None`` elsewhere.
    """
    lexicon = pcfg.lexicon_
    pos2index = {p:i for i,p in enumerate(allPOS)}
    r = len(allPOS)
    n = len(sentence)

    P = np.zeros((n,n,r))
    back = np.empty((n,n,r),dtype=object)

    # Length-1 spans: seed the chart with the lexical probabilities.
    for s,word in enumerate(sentence):
        tags = lexicon[word]
        for pos in tags:
            P[0,s,pos2index[pos]] = tags[pos]

    # Unary rules are not applied here (the original notes that this grammar
    # has none).
    if n < 2:
        return P,back

    # Resolve the rules to integer indices once instead of once per chart
    # cell. Right-hand sides that are not binary are skipped, as in the
    # two-argument version.
    binary_rules = [
        (pos2index[lhs], pos2index[rhs[0]], pos2index[rhs[1]], prob_prod)
        for lhs,expansions in pcfg.pcfg_.items()
        for rhs,prob_prod in expansions.items()
        if len(rhs) == 2
    ]

    for l in range(2,n+1): #length of span
        for s in range(n-l+1): #start of span (0-based)
            # A split leaves p >= 1 words on the left and l-p >= 1 on the
            # right. p = 0 is excluded: it would index row -1 of the chart,
            # i.e. wrap around to the full-sentence row.
            for p in range(1,l):
                left = P[p-1,s]
                right = P[l-p-1,s+p]
                cell = P[l-1,s] # view: writing to it updates P
                for a,b,c,prob_prod in binary_rules:
                    left_prob = left[b]
                    right_prob = right[c]
                    if left_prob > 0 and right_prob > 0:
                        prob_splitting = prob_prod * left_prob * right_prob
                        if cell[a] < prob_splitting:
                            cell[a] = prob_splitting
                            back[l-1,s,a] = (p,b,c)

    return P,back

def build_tree(backp,sentence,grammar,length=-1,start=0,pos=0,is_splitted = False): 
    """Rebuild the bracketed string stored under ``backp[length, start, pos]``.

    ``grammar`` is indexed by symbol position and must yield the label
    string. ``sentence`` is split on whitespace on entry unless
    ``is_splitted`` is True; the recursive calls always pass the already
    split list.

    Returns the word ``sentence[start]`` when the cell holds no back-pointer,
    ``"(TAG word)"`` for a unary back-pointer, and otherwise the two bracketed
    children concatenated.
    """
    words = sentence if is_splitted else sentence.split()
    entry = backp[length,start,pos]
    if entry is None:
        return words[start]

    split,left,right = entry
    if right is None and left is not None:
        return "(" + grammar[left] + " " + words[start] + ")"

    pieces = []
    # `length - split` is the row of the right child. At the root (length=-1)
    # it is negative and relies on numpy's negative indexing.
    for child,child_row,child_start in ((left,split-1,start),(right,length-split,start+split)):
        label = grammar[child]
        inner = build_tree(backp,words,grammar,child_row,child_start,child,is_splitted = True)
        pieces.append("(" + label + " " + inner + ")")
    return pieces[0] + pieces[1]

def correct_string(string):
    """Return ``string`` enclosed in an empty top bracket and a ``SENT`` node.

    Nothing is inserted between ``SENT`` and ``string``.
    """
    return ''.join(('( (SENT', string, '))'))

def un_chomsky(bracket_string):
    """Convert a CNF bracketing back to its un-binarised one-line form.

    Steps: wrap with ``correct_string``, parse with nltk, apply
    ``un_chomsky_normal_form`` to the tree in place, then return
    ``pformat()`` with the newlines removed.
    """
    tree = nltk.tree.Tree.fromstring(correct_string(bracket_string))
    nltk.treetransforms.un_chomsky_normal_form(tree)
    one_line = tree.pformat().replace('\n','')
    return one_line