In [1]:
class Sentence():
    """Wrap one bracketed sentence and expose its binarised parse tree."""

    def __init__(self,sentence):
        # Raw bracketed string exactly as supplied by the caller.
        self.sentence = sentence

    def __repr__(self):
        # __repr__ has to *return* a string; printing and returning None
        # makes repr()/interactive display raise TypeError.
        return "{}({!r})".format(type(self).__name__, self.sentence)

    def get_tree(self):
        """Build the CNF tree from the nested sentence.

        The bracketed string is parsed with NLTK, binarised in place with
        ``chomsky_normal_form`` and its unary chains (POS level included)
        are merged with ``collapse_unary``.

        Returns:
            The transformed ``nltk`` tree.
        """
        # `clean_sentence` is not assigned anywhere in this class; use it when
        # something else has set it on the instance, otherwise fall back to
        # the raw sentence instead of failing with AttributeError.
        # NOTE(review): confirm whether a cleaning step is expected here.
        bracketed = getattr(self, "clean_sentence", self.sentence)
        tree = nltk.tree.Tree.fromstring(bracketed,remove_empty_top_bracketing=True)
        nltk.treetransforms.chomsky_normal_form(tree)
        nltk.treetransforms.collapse_unary(tree,collapsePOS=True)

        return tree

In [2]:
class Grammar():
    """PCFG and probabilistic lexicon induced from bracketed sentences.

    Typical use is a single call to :meth:`build` with the training
    sentences; afterwards ``pcfg``, ``lexicon``, ``non_terminals`` and
    ``pos2index`` are populated.
    """

    def __init__(self):
        # Grammar returned by induce_pcfg(); set by build().
        self.pcfg = None
        # word -> {left-hand-side symbol: count}; the counts become relative
        # frequencies once normalize_lexicon() has run.
        self.lexicon = {}
        # Non-lexical productions accumulated by update().
        self.productions = []
        # index -> grammar symbol (start symbol first); filled by build().
        self.non_terminals = []
        # grammar symbol -> index into non_terminals; filled by build().
        self.pos2index = {}

    def update(self,sentence):
        """Add the productions of one bracketed sentence to the model.

        Lexical productions increment the per-word tag counts in
        ``lexicon``; every other production is appended to ``productions``.

        NOTE: calling this after normalize_lexicon() mixes raw counts with
        already-normalised values for words seen before.
        """
        tree = Sentence(sentence).get_tree()
        for prod in tree.productions():
            if prod.is_lexical():
                # Words are the keys of the lexicon.
                word = prod.rhs()[0]
                tag_counts = self.lexicon.setdefault(word, {})
                tag_counts[prod.lhs()] = tag_counts.get(prod.lhs(), 0) + 1
            else:
                # Non-lexical rule: kept for PCFG induction.
                self.productions.append(prod)

    def normalize_lexicon(self):
        """Divide each word's tag values by their sum, in place.

        Prints ``'Lexicon empty'`` and returns None when there is nothing
        to normalise.
        """
        if not self.lexicon:
            print('Lexicon empty')
            return None

        # Only values of existing keys are replaced, so iterating over
        # items() while assigning is safe.
        for word, tag_counts in self.lexicon.items():
            total = sum(tag_counts.values())
            self.lexicon[word] = {tag: count/total for tag, count in tag_counts.items()}

    def build(self,list_of_sentences):
        """Train the grammar; list_of_sentences is e.g. the training set.

        Feeds every sentence to update(), induces the PCFG with start
        symbol ``SENT``, normalises the lexicon and rebuilds the symbol
        index (``non_terminals`` / ``pos2index``).
        """
        for s in list_of_sentences:
            self.update(s)

        start = Nonterminal('SENT')
        self.pcfg = induce_pcfg(start,self.productions)
        # The return value is discarded, as before.
        # NOTE(review): NLTK documents this method as returning a grammar
        # rather than modifying in place -- confirm the intent.
        self.pcfg.chomsky_normal_form(flexible = False)

        # normalize lexicon
        self.normalize_lexicon()

        # Index every symbol of the grammar exactly once, start symbol first.
        # Previously each right-hand-side *occurrence* was appended (so the
        # list was full of duplicates and the CYK chart far larger than
        # needed), the start symbol was compared against the string 'SENT'
        # instead of the Nonterminal, and symbols appearing only on a
        # left-hand side were never indexed.
        self.non_terminals = []
        self.pos2index = {}
        symbols = [start]
        for prod in self.pcfg.productions():
            symbols.append(prod.lhs())
            symbols.extend(prod.rhs())
        for symbol in symbols:
            if symbol not in self.pos2index:
                self.pos2index[symbol] = len(self.non_terminals)
                self.non_terminals.append(symbol)

In [None]:
def P_CYK(sentence,grammar):
    """Fill a probabilistic CYK chart for a whitespace-tokenised sentence.

    Args:
        sentence: string; tokens are obtained with ``str.split()``.
        grammar: object providing ``non_terminals``, ``pos2index``,
            ``lexicon`` (word -> {symbol: probability}) and ``pcfg`` (its
            ``productions()`` expose ``lhs()``, ``rhs()`` and ``prob()``).

    Returns:
        ``(P, back)``, both of shape ``(n, n, r)`` with ``n`` the number of
        tokens and ``r = len(grammar.non_terminals)``.
        ``P[l-1, s, a]`` is the best probability found for symbol ``a``
        over the ``l`` tokens starting at token ``s``.
        ``back`` holds, for the same cell, ``None``, ``(0, child, None)``
        for a unary rule, or ``(split, left, right)`` for a binary rule
        whose left child covers ``split`` tokens.

    Raises:
        KeyError: if a token is missing from ``grammar.lexicon`` or a
            symbol is missing from ``grammar.pos2index``.
    """
    words = sentence.split()
    n = len(words)
    r = len(grammar.non_terminals)
    index_of = grammar.pos2index
    P = np.zeros((n,n,r))
    # Object array (initialised to None) so that cells can hold tuples.
    back = np.empty((n,n,r),dtype=object)
    if n == 0:
        return P,back

    # Length-1 spans: seed from the lexicon.
    for s,word in enumerate(words):
        for pos,prob in grammar.lexicon[word].items():
            P[0,s,index_of[pos]] = prob

    productions = grammar.pcfg.productions()

    # Unary rules, resolved once to (parent, child, probability).
    unary_rules = [
        (index_of[prod.lhs()], index_of[prod.rhs()[0]], prod.prob())
        for prod in productions if len(prod.rhs()) == 1
    ]
    # Single pass over the length-1 spans, rules taken in grammar order.
    for s in range(n):
        for parent,child,rule_prob in unary_rules:
            child_prob = P[0,s,child]
            candidate = child_prob * rule_prob
            if child_prob > 0 and candidate > P[0,s,parent]:
                P[0,s,parent] = candidate
                back[0,s,parent] = (0,child,None)

    if n < 2:
        return P,back

    # Binary rules, resolved once instead of inside the innermost loop.
    binary_rules = [
        (index_of[prod.lhs()], index_of[prod.rhs()[0]],
         index_of[prod.rhs()[1]], prod.prob())
        for prod in productions if len(prod.rhs()) == 2
    ]

    for span in range(2,n+1): # length of span
        for start in range(n-span+1): # first token of span
            cell = P[span-1,start] # view: writes go straight into P
            # The left part covers 1..span-1 tokens.  The former split of 0
            # read P[-1, ...] through negative-index wraparound and could
            # never describe a valid split.
            for split in range(1,span):
                left = P[split-1,start]
                right = P[span-split-1,start+split]
                for parent,b,c,rule_prob in binary_rules:
                    left_prob = left[b]
                    right_prob = right[c]
                    candidate = rule_prob * left_prob * right_prob
                    if left_prob > 0 and right_prob > 0 and cell[parent] < candidate:
                        cell[parent] = candidate
                        back[span-1,start,parent] = (split,b,c)

    return P,back

def build_tree(backp,sentence,grammar,length=-1,start=0,pos=0,is_splitted = False): 
    """Turn CYK back-pointers into a bracketed string, recursively.

    Args:
        backp: back-pointer chart indexed as ``backp[length, start, pos]``.
        sentence: the sentence as a string, or as a token list when
            ``is_splitted`` is true.
        grammar: object whose ``non_terminals[i]._symbol`` gives the label
            of symbol index ``i``.
        length: first-axis index of the cell to expand; the default ``-1``
            selects the last row of the chart.
        start: index of the first token covered by the cell.
        pos: symbol index of the cell.
        is_splitted: set on recursive calls, where ``sentence`` is already
            a token list.

    Returns:
        The token at ``start`` when the cell has no back-pointer,
        ``"(LABEL token)"`` for a unary back-pointer, otherwise the two
        bracketed children concatenated. The label of the cell itself is
        not included.
    """
    words = sentence if is_splitted else sentence.split()
    entry = backp[length,start,pos]
    if entry is None:
        # No back-pointer stored for this cell: emit the token itself.
        return words[start]

    split,left,right = entry
    symbols = grammar.non_terminals
    if right is None and left is not None:
        # Unary back-pointer: one labelled token.
        return "".join(["(", symbols[left]._symbol, " ", words[start], ")"])

    # Binary back-pointer: the left child covers `split` tokens from
    # `start`, the right child covers the remainder of the span.
    left_label = symbols[left]._symbol
    left_part = build_tree(backp,words,grammar,split-1,start,left,is_splitted = True)
    right_label = symbols[right]._symbol
    right_part = build_tree(backp,words,grammar,length-split,start+split,right,is_splitted = True)
    return "".join([
        "(", left_label, " ", left_part, ")",
        "(", right_label, " ", right_part, ")",
    ])

def correct_string(string):
    """Wrap a bracketed fragment in the ``( (SENT ...))`` root brackets.

    A fragment that starts with ``(`` is appended directly after the label,
    exactly as before. Any other non-empty fragment (e.g. a bare token) is
    separated from the label by a space; without it the token fused with
    the label (``(SENTword)``).
    """
    separator = '' if not string or string.startswith('(') else ' '
    return '( (SENT' + separator + string + '))'

def un_chomsky(bracket_string):
    """Undo the CNF transformation on a bracketed parse.

    The fragment is wrapped by ``correct_string``, parsed into an NLTK
    tree, transformed in place with ``un_chomsky_normal_form`` and
    returned as its ``pformat()`` text with every newline removed.
    """
    rooted = correct_string(bracket_string)
    parsed = nltk.tree.Tree.fromstring(rooted)
    nltk.treetransforms.un_chomsky_normal_form(parsed)
    # Only newlines are dropped; other whitespace from pformat() is kept.
    return "".join(parsed.pformat().split('\n'))

In [None]:
def correct_string(string):
    """Wrap a bracketed fragment in the ``( (SENT ...))`` root brackets.

    A fragment that starts with ``(`` is appended directly after the label,
    exactly as before. Any other non-empty fragment (e.g. a bare token) is
    separated from the label by a space; without it the token fused with
    the label (``(SENTword)``).
    """
    separator = '' if not string or string.startswith('(') else ' '
    return '( (SENT' + separator + string + '))'

def un_chomsky(bracket_string):
    """Undo the CNF transformation on a bracketed parse.

    The fragment is wrapped by ``correct_string``, parsed into an NLTK
    tree, transformed in place with ``un_chomsky_normal_form`` and
    returned as its ``pformat()`` text with every newline removed.
    """
    rooted = correct_string(bracket_string)
    parsed = nltk.tree.Tree.fromstring(rooted)
    nltk.treetransforms.un_chomsky_normal_form(parsed)
    # Only newlines are dropped; other whitespace from pformat() is kept.
    return "".join(parsed.pformat().split('\n'))

# Fill the CYK chart for `new_sentence` with the grammar object bound to `pcfg`.
# NOTE(review): `new_sentence`, `sentence` and `pcfg` are defined outside this
# part of the file. P_CYK and build_tree read `lexicon`, `pos2index` and
# `non_terminals` from their grammar argument, so `pcfg` presumably holds a
# Grammar instance rather than a bare NLTK grammar -- confirm.
P,back = P_CYK(new_sentence,pcfg)
# Rebuild the bracketed parse from the back-pointers and undo the CNF transform.
# NOTE(review): the chart is computed from `new_sentence` but the tree is
# rebuilt with the tokens of `sentence`; the two need the same token count --
# confirm this difference is intentional.
parsed_sentence = un_chomsky(build_tree(back,sentence,pcfg))

In [None]:
def evaluate(sentence,reference):
    """Score two bracketed parses and return the tag accuracy.

    Both strings lose their first and last character before being parsed
    with ``parser.create_from_bracket_string``. The tree built from
    ``sentence`` is passed to the scorer as the gold tree and the one built
    from ``reference`` as the test tree.
    NOTE(review): those roles look swapped relative to the parameter names
    -- confirm against callers.

    Returns:
        The ``tag_accracy`` attribute of the scorer result.
    """
    gold_tree, test_tree = (
        parser.create_from_bracket_string(bracketed[1:-1])
        for bracketed in (sentence, reference)
    )
    outcome = Scorer().score_trees(gold_tree, test_tree)
    # The attribute name is kept exactly as the original wrote it.
    # NOTE(review): confirm the spelling against the scorer library before
    # changing it.
    return outcome.tag_accracy