In [1]:
import numpy as np
import nltk
import torch
import sklearn
import re
import pyparsing
import copy

In [13]:
# Reading the provided file while removing the functional labels

file_name = "sequoia-corpus+fct.mrg_strict"

data = []
with open("./" + file_name, "r") as f:
    for sentence in f:
        sentence = sentence.strip()
        #sentence = filter_labels(sentence) ,can be done later and more easily   
        data.append(sentence)

In [14]:
data = data[1:2]

In [15]:
print(data[0])

( (SENT (NP-SUJ (DET Cette) (NC exposition)) (VN (CLO-A_OBJ nous) (V apprend)) (Ssub-OBJ (CS que) (PP-MOD (P dès) (NP (DET le) (ADJ XIIe) (NC siècle))) (PONCT ,) (PP-MOD (P à) (NP (NPP Dammarie-sur-Saulx))) (PONCT ,) (PP-MOD (P entre) (NP (ADJ autres) (NC sites))) (PONCT ,) (NP-SUJ (DET une) (NC industrie) (AP (ADJ métallurgique))) (VN (V existait))) (PONCT .)))


In [16]:
def parse(expr):
    def _helper(s):
        items = []
        word = []
        for item in s:
            
            if word and item == ')': # Terminal token (aka. natural word)
                word = ''.join(word).lower()
                items.append(word)
                word = []
            
            elif word and (item == '(' or item == ' '): # Tags
                word = ''.join(word)
                items.append(word)
                word = []
                
            if item == '(':
                result, closeparen = _helper(s)
                if not closeparen:
                    raise ValueError("bad expression -- unbalanced parentheses")
                items.append(result)
            
            elif item == ')':
                return items, True
            
            elif item != ' ':
                word.append(item)
        
        return items, False
    return _helper(iter(expr))[0][0][0]

In [17]:
def untag(expr):
    def _helper(s):
        items = []
        word = []
        for item in s:
            
            if word and item == ')':
                word = ''.join(word)
                items.append(word)
                word = []
            
            elif word and (item == '(' or item == ' '):
                word = []
                
            if item == '(':
                result, closeparen = _helper(s)
                if not closeparen:
                    raise ValueError("bad expression -- unbalanced parentheses")
                for elmnt in result:
                    items.append(elmnt)
                    
            elif item == ')':
                return items, True
            
            elif item != ' ':
                word.append(item)
        
        return items, False
    return _helper(iter(expr))[0]

In [18]:
def get_untagged_sentences(sentences):
    """Creates file and variable containing untagged sentences"""
    
    result = []
    with open("sentences.txt", "w") as f:
        for s in sentences:
            parentheses_stack = []
            tags_stack = []
            s = untag(s)
            s = ' '.join(s)
            f.write(s)
            result.append(s)
            
    return result

In [19]:
untagged_sentences = get_untagged_sentences(data)

In [20]:
root_tags = set()

In [21]:
def get_tags(s, d, tag_set):
    """Adds the current node's parent/child relation count to the probability dictionary"""
    tag_name = s[0].split("-")[0] # Remove any hiphen on tags
    if len(s) == 2 and isinstance(s[1], str):
        return
    root_tags.add(tag_name)
    childs = s[1:]
    child_tags = None
    
    for c in childs:
        child_tag_name = c[0].split("-")[0]
        tag_set.add(child_tag_name)
        if child_tags is None:
            child_tags =  child_tag_name
        else:
            child_tags = ','.join([child_tags, child_tag_name])
        get_tags(c, d, tag_set)
    d[tag_name] = d.get(tag_name, {})
    d[tag_name][child_tags] = d[tag_name].get(child_tags, 0) + 1
    

In [22]:
def build_pcfg(sentences, tag_set):
    """Builds the PCFG using the tagged sentences and populates a tag_set"""

    prob_dict = dict()

    for s in sentences:
        parentheses_stack = []
        tags_stack = []
        s = parse(s)
        get_tags(s, prob_dict, tag_set)

    for k1, d in prob_dict.items():
        tag_set.add(k1)
        tot_count = max(sum(d.values()), 1)
        for k2, v in d.items():
            d[k2] = v/tot_count

    return prob_dict

In [23]:
tag_set = set()

pcfg = build_pcfg(data, tag_set)
print(len(pcfg))
print(len(root_tags))
print(len(tag_set))
print(tag_set)

6
6
15
{'NP', 'CS', 'V', 'NC', 'ADJ', 'AP', 'NPP', 'PP', 'P', 'VN', 'Ssub', 'PONCT', 'CLO', 'SENT', 'DET'}


In [25]:
pcfg

{'NP': {'DET,NC': 0.2,
  'DET,ADJ,NC': 0.2,
  'NPP': 0.2,
  'ADJ,NC': 0.2,
  'DET,NC,AP': 0.2},
 'VN': {'CLO,V': 0.5, 'V': 0.5},
 'PP': {'P,NP': 1.0},
 'AP': {'ADJ': 1.0},
 'Ssub': {'CS,PP,PONCT,PP,PONCT,PP,PONCT,NP,VN': 1.0},
 'SENT': {'NP,VN,Ssub,PONCT': 1.0}}

In [26]:
def get_word_counts(s, d):
    """Adds the word count to the lexicon dictionary"""
    tag_name = s[0].split("-")[0] # Do not remove hyphens
    if len(s) == 2 and isinstance(s[1], str):
        word = s[1].lower()
        d[word] = d.get(word, {})
        d[word][tag_name] = d[word].get(tag_name, 0) + 1
        return
    childs = s[1:]
    for c in childs:
        get_word_counts(c, d)

In [27]:
def build_prob_lexicon(sentences, tag_set):
    """Builds a probabilistic lexicon using the tagged sentences"""
    
    lexicon_dict = dict()
    
    for s in sentences:
        parentheses_stack = []
        tags_stack = []
        s = parse(s)
        get_word_counts(s, lexicon_dict)
    
    for _, d in lexicon_dict.items():
        tot_count = max(sum(d.values()), 1)
        for k2, v in d.items():
            tag_set.add(k2)
            d[k2] = v/tot_count
    
    return lexicon_dict

In [28]:
lexicon = build_prob_lexicon(data, tag_set)
print(lexicon)

{'cette': {'DET': 1.0}, 'exposition': {'NC': 1.0}, 'nous': {'CLO': 1.0}, 'apprend': {'V': 1.0}, 'que': {'CS': 1.0}, 'dès': {'P': 1.0}, 'le': {'DET': 1.0}, 'xiie': {'ADJ': 1.0}, 'siècle': {'NC': 1.0}, ',': {'PONCT': 1.0}, 'à': {'P': 1.0}, 'dammarie-sur-saulx': {'NPP': 1.0}, 'entre': {'P': 1.0}, 'autres': {'ADJ': 1.0}, 'sites': {'NC': 1.0}, 'une': {'DET': 1.0}, 'industrie': {'NC': 1.0}, 'métallurgique': {'ADJ': 1.0}, 'existait': {'V': 1.0}, '.': {'PONCT': 1.0}}


In [29]:
def get_tag_counts(s, d):
    """Adds the tag count to the tag dictionary"""
    tag_name = s[0] # Do not remove hyphens
    if len(s) == 2 and isinstance(s[1], str):
        d[tag_name] = d.get(tag_name, 0) + 1 
        return
    childs = s[1:]
    for c in childs:
        get_tag_counts(c, d)

In [30]:
def build_tag_frequencies(sentences):
    """Builds a tag frequency list"""
    
    tag_dict = dict()
    
    for s in sentences:
        parentheses_stack = []
        tags_stack = []
        s = parse(s)
        get_tag_counts(s, tag_dict)
        
    tot_count = max(sum(tag_dict.values()), 1)
    for k1, v in tag_dict.items():
        tag_dict[k1] = v/tot_count
    
    return tag_dict

In [31]:
tag_freq = build_tag_frequencies(data)
print(tag_freq)

{'DET': 0.13636363636363635, 'NC': 0.18181818181818182, 'CLO-A_OBJ': 0.045454545454545456, 'V': 0.09090909090909091, 'CS': 0.045454545454545456, 'P': 0.13636363636363635, 'ADJ': 0.13636363636363635, 'PONCT': 0.18181818181818182, 'NPP': 0.045454545454545456}


In [32]:
def remove_unit(cfg, A, B, tag_set):
    """Replaces the Unit rules by Multiple Rules by chaining"""
    
    d = cfg[A]
    copy_d = copy.deepcopy(cfg[A])
    freq = copy_d[B]
    copy_lexicon = copy.deepcopy(lexicon)
    
    # Create new rules for telescoping
    if B not in cfg: # If preterminal tag
        if A != "SENT":
            new_key = A + "&" + B
            tag_set.add(new_key)
            for word, tags in copy_lexicon.items():
                for tag, prob in tags.items():
                    if tag == B:
                        lexicon[word][new_key] = prob * freq
            
            for k1, prob in copy_d.items():
                keys = k1.split(',')
                if len(keys) == 2 and A == keys[1]:
                    keys = ','.join([keys[0],new_key])
                    cfg[k1].update({keys : prob * freq})
    
    else: # If not preterminal tag
        for tags, prob in cfg[B].items():
            if len(tags) == 2:
                cfg[A][tags] = prob * freq 
        
    del d[B]
    return True

In [33]:
def remove_multi(cfg, A, B, idx, count, tag_set):
    """Replaces the Multiple Rules with chains of Double or Unit Rules"""
    tags = B.split(',')
    n = len(tags)
    d = cfg[A]
    
    new_tag = A + str(count)
    count += 1
    new_key = ','.join([tags[0], new_tag])
    d[new_key] = copy.deepcopy(d[B])
    del d[B]
    
    for i in range(1, n-2): 
        tag_set.add(new_tag)
        current_key = new_tag
        new_tag = A + str(count)
        count += 1
        cfg[current_key] =  {','.join([tags[i], new_tag]): d[new_key]}
    
    tag_set.add(new_tag)
    cfg[new_tag] = {tags[-1] : d[new_key]}
    
    return count

In [34]:
def chomskyfy(context_free_grammar, tag_set):
    """Binarisation of a CFG. Starts with removing Unit and then Multi Rules"""
    cfg = copy.deepcopy(context_free_grammar)
      
    count = 0
    copy_cfg = copy.deepcopy(cfg) # Avoid Dictionnary size change during iteration
    for k1, d1 in copy_cfg.items():
        copy_d1 = copy.deepcopy(d1) # Avoid Dictionnary size change during iteration
        for i, (k2, prob) in enumerate(copy_d1.items()):
            tags = k2.split(',')
            if len(tags) > 2:
                count = remove_multi(cfg, k1, k2, i, count, tag_set)
    
    return cfg

    copy_cfg = copy.deepcopy(cfg) # Avoid Dictionnary size change during iteration
    for k1, d1 in copy_cfg.items():
        copy_d1 = copy.deepcopy(d1) # Avoid Dictionnary size change during iteration
        for k2, prob in copy_d1.items():
            tags = k2.split(',')
            if len(tags) == 1 and tags[0] != k1: #Avoid loops 
                # Check that we have unit rules that we can remove
                remove_unit(cfg, k1, tags[0], tag_set)
    
    return cfg

In [36]:
binary_pcfg = chomskyfy(pcfg, tag_set)
binary_pcfg

{'NP': {'DET,NC': 0.2,
  'NPP': 0.2,
  'ADJ,NC': 0.2,
  'DET,NP0': 0.2,
  'DET,NP2': 0.2},
 'VN': {'CLO,V': 0.5, 'V': 0.5},
 'PP': {'P,NP': 1.0},
 'AP': {'ADJ': 1.0},
 'Ssub': {'CS,Ssub4': 1.0},
 'SENT': {'NP,SENT12': 1.0},
 'NP0': {'ADJ,NP1': 0.2},
 'NP1': {'NC': 0.2},
 'NP2': {'NC,NP3': 0.2},
 'NP3': {'AP': 0.2},
 'Ssub4': {'PP,Ssub5': 1.0},
 'Ssub5': {'PONCT,Ssub6': 1.0},
 'Ssub6': {'PP,Ssub7': 1.0},
 'Ssub7': {'PONCT,Ssub8': 1.0},
 'Ssub8': {'PP,Ssub9': 1.0},
 'Ssub9': {'PONCT,Ssub10': 1.0},
 'Ssub10': {'NP,Ssub11': 1.0},
 'Ssub11': {'VN': 1.0},
 'SENT12': {'VN,SENT13': 1.0},
 'SENT13': {'Ssub,SENT14': 1.0},
 'SENT14': {'PONCT': 1.0}}

In [102]:
### CODE FROM THE POLYGLOT TUTORIAL ###

from operator import itemgetter
import re
import pickle

words, embeddings = pickle.load(open('polyglot-fr.pkl', 'rb'), encoding='latin1')

# Map words to indices and vice versa
intersection_words = list()
intersection_embeddings = list()

for i, w in enumerate(words):
    if w in lexicon:
        intersection_words.append(w)
        
word_id = {w:i for (i, w) in enumerate(words)}
id_word = dict(enumerate(embeddings))

intersection_embeddings = np.array(intersection_embeddings)
intersection_words = np.array(intersection_words)


# Normalize digits by replacing them with #
DIGITS = re.compile("[0-9]", re.UNICODE)


def case_normalizer(word, dictionary):
    """ In case the word is not available in the vocabulary,
     we can try multiple case normalizing procedure.
     We consider the best substitute to be the one with the lowest index,
     which is equivalent to the most frequent alternative."""
    w = word
    lower = (dictionary.get(w.lower(), 1e12), w.lower())
    upper = (dictionary.get(w.upper(), 1e12), w.upper())
    title = (dictionary.get(w.title(), 1e12), w.title())
    results = [lower, upper, title]
    results.sort()
    index, w = results[0]
    if index != 1e12:
        return w
    return word


def normalize(word, word_id):
    """ Find the closest alternative in case the word is OOV."""
    if not word in word_id:
        word = DIGITS.sub("#", word)
    if not word in word_id:
        word = case_normalizer(word, word_id)

    if not word in word_id:
        return None
    return word


def l2_nearest(embeddings, words, word_index, k):
    """Sorts words according to their Euclidean distance.
       To use cosine distance, embeddings has to be normalized so that their l2 norm is 1."""

    e1 = embeddings[word_index]
    distances = []
    for w2 in words:
        e2 = embeddings[word_id[w2]]
        distances.append(e1.dot(e2) / (np.linalg.norm(e1) * np.linalg.norm(e2)))
    sorted_distances = sorted(enumerate(distances), key=itemgetter(1))
    return zip(*sorted_distances[:k])


def knn(word, words, embeddings, word_id, id_word, k=5):
    word = normalize(word, word_id)
    if not word:
        print("OOV word")
        return None, None
    word_index = word_id[word]
    indices, distances = l2_nearest(embeddings, words, word_index, k)
    neighbors = [id_word[idx] for idx in indices]
    
    return neighbors, distances

In [103]:
def levenstein_dist(w1, w2):
    """Computes the levenstein distance between w1 and w2"""
    
    m, n  = len(w1),len(w2)
    
    result = np.zeros((m, n))
    
    for i in range(m):
        result[i, 0] = i
    for i in range(n):
        result[0, i] = i
        
    for i, c1 in enumerate(w1):
        for j, c2 in enumerate(w2):
            if c1 == c2:
                result[i, j] = min(result[i-1, j] + 1,
                                   result[i, j-1] + 1,
                                   result[i-1, j-1],
                                  )
            else:
                result[i, j] = min(result[i-1, j] + 1,
                                   result[i, j-1] + 1,
                                   result[i-1, j-1] + 1,
                                  )
    return result[m-1, n-1]

In [104]:
def handle_oov(word, k=10):
    
    lev_dist = []
    for w2 in intersection_words:
        lev_dist.append(levenstein_dist(word, w2))
    min_indices = np.argpartition(lev_dist, k)[:k]
    close_words = intersection_words[min_indices]
    
    neighbors, distances = knn(word, close_words, embeddings, word_id, id_word)
    
    if neighbors is None:
        return None
    
    return close_words[np.argmin(distances)]

In [105]:
tag_list = np.array(list(tag_set))
tag_to_idx = {tag: int(idx) for (idx, tag) in enumerate(tag_list)}

In [137]:
def row_info(prob_matrix, m, n, k=25):
    print("ROW ",m)
    for i in range(n - m):
        print(tag_list[np.argsort(prob_matrix[m, i])[-k:]])
        print(prob_matrix[m, i, np.argsort(prob_matrix[m, i])[-k:]])
        print()
    print()

In [138]:
def compute_cyk(words, use_oov=False):
    """Computes the CYK matrix for a sentence"""
    
    n = len(words)
    
    cyk_matrix = np.zeros((n, n, len(tag_list), 3), dtype="int16") # Backtracking Matrix
    prob_matrix = np.zeros((n, n, len(tag_list)))
    
    # Unit words handling
    for i, w in enumerate(words):
        token_to_tag = w

        if not w in lexicon:
            if use_oov: 
                print(w + " is an OOV")
            token_to_tag = handle_oov(w)
            if use_oov:
                if token_to_tag is None:
                    print("No closest token found\n")
                else:
                    print("Closest token found :", token_to_tag, "\n")
        
        if token_to_tag is None:
            for tag, prob in tag_freq.items():
                idx = tag_to_idx.get(tag)
                if not idx is None:  # avoid the case where tag appearing in lexicon but not in grammar rules
                    prob_matrix[0, i, idx] = prob
            
        else:
            max_tag, max_prob = -1, -1
            for tag, prob in lexicon[token_to_tag].items():
                idx = tag_to_idx.get(tag)
                if not idx is None:
                    prob_matrix[0, i , idx] = prob
        
        for parent_tag, children in binary_pcfg.items():
            parent_tag_idx = tag_to_idx[parent_tag]
            for child_tag, prob in children.items():
                child_tag = child_tag.split(',')
                if len(child_tag) > 1:
                    continue
                child_tag_idx = tag_to_idx[child_tag[0]]
                if prob_matrix[0, i , child_tag_idx] > 0:
                    prob_matrix[0, i, parent_tag_idx] = prob
    
    row_info(prob_matrix, 0,  n)
    
    # Strings of length 2 and more 
    for l in range(1, n):
        for s in range(n - l):
            for parent_tag, children in binary_pcfg.items():
                for children_tags, prob in children.items():
                    children_tags = children_tags.split(',')
                    if len(children_tags) < 2:
                        continue
                    parent_tag_idx = tag_to_idx[parent_tag]
                    left_tag_idx = tag_to_idx[children_tags[0]]
                    right_tag_idx = tag_to_idx[children_tags[1]]
                    prob_prod = prob
                    for partition in range(l):
                        prob_left = prob_matrix[partition, s, left_tag_idx]
                        prob_right = prob_matrix[l - partition - 1, s + partition + 1, right_tag_idx]
                    
                        prob_split = prob_prod  * prob_left * prob_right
                        if prob_split > prob_matrix[l, s, parent_tag_idx]:
                            prob_matrix[l, s, parent_tag_idx] = prob_split
                            cyk_matrix[l, s, parent_tag_idx]  = [partition, left_tag_idx, right_tag_idx]
        
        row_info(prob_matrix, l,  n)

    result = cyk_matrix[n-1, 0, tag_to_idx['SENT']]
    print(result[1:])
    print(tag_list[int(result[1])])
    print(tag_list[int(result[2])])
    for i in reversed(range(n)):
        tags = []
        for j in range(n - i):
            idx = np.argmax(prob_matrix[i, j])
            tags.append(tag_list[idx])
        print(tags)
        
    return cyk_matrix, prob_matrix

In [139]:
def parse_cyk_output(cyk, root_idx, words):
    return parse_substring(0, len(words) - 1, root_idx, words, cyk)

In [140]:
def parse_substring(s, l, idx_root_tag, sentence, cyk):
        # parse substring beginning at index s of sentence, of length l+1, and tagged as idx_root_tag

        if l == 0:
            return sentence[s]

        else:  # split enabling to reach max_proba_derivation[s,l,idx_root_tag]
            cut = int(cyk[l, s, idx_root_tag, 0])
            idx_left_tag = int(cyk[l, s, idx_root_tag, 1])
            idx_right_tag = int(cyk[l, s, idx_root_tag, 2])

            left_tag = tag_list[idx_left_tag]
            right_tag = tag_list[idx_right_tag]

            return [[left_tag, parse_substring(s, cut, idx_left_tag, sentence, cyk)],
                    [right_tag, parse_substring(s + cut + 1, l - cut - 1, idx_right_tag, sentence, cyk)]]

In [141]:
def format_parsing(parsing):    
    if isinstance(parsing, str):
        return parsing
    
    else:
        result = ""
        for p in parsing:
            tag = p[0]
            to_parse = p[1]
            parsed = format_parsing(to_parse)
            result += "(" + tag + " " + parsed + ")" + " "
        result = result[:-1]
        return result

In [142]:
def remove_artificial_tokens(s):
    """Remove added tags and de-telescope tags"""
    
    s = re.sub(r'\(([A-Za-z\+\&]+[1-9]+)(\s)(?=\()' , '', s) # Remove numbered Tags (artificial tags)
    s = re.sub(r' +' , ' ', s) 
    parenthesis = 0
    i = 0
    while i < len(s):
        if s[i] == '(':
            parenthesis += 1
        elif s[i] == ')':
            parenthesis -= 1
        if parenthesis < 0:
            s = ''.join([s[:i-1], s[i:]])
            parenthesis ++ 1
        else:
            i += 1
        
    count = 0
    for i in range(2, len(s)):
        if i < len(s)-1 and s[i] == '&' :
            s = s[:i] + ' (' + s[i+1:]
            count += 1
        elif s[i] == ')' and count > 0:
            s = s[:i+1] + count*')' + s[i+1:]
            count = 0
    return s

In [143]:
def get_parsed_output(sentence, use_oov=False):
    
    words = sentence.lower().split(' ')
    n = len(words)
    
    if n > 1:
        cyk, probs = compute_cyk(words, use_oov=use_oov)
        root_idx = tag_to_idx['SENT']
        if probs[n-1, 0, root_idx] == 0:
            return None
        parsed_output = parse_cyk_output(cyk, root_idx, words)
    
    else:
        word = words[0]
        token = word
        if not token in lexicon:
            token = handle_oov(word)
        if token is None:
            max(tag_freq, key= lambda x: x[1])
        else:
            tag = max(lexicon[token])
        parsed_output = "(" + tag + " " + token + ")"
        
    parsed_output = format_parsing(parsed_output)
    parsed_output = remove_artificial_tokens(parsed_output)
            
    return "( (SENT " + parsed_output + "))"

In [144]:
def print_eg(idx):
    print(untagged_sentences[idx])
    print(get_parsed_output(untagged_sentences[idx], use_oov=True))

In [145]:
def print_entry(entry):
    if isinstance(entry, str):
        print(binary_pcfg[entry])
    else:
        for e in entry:
            print(binary_pcfg[e])

In [146]:
print_eg(6)

Amélioration de la sécurité
ROW  0
['NP773&NC' 'NP773' 'NP470&NC' 'NP54' 'NP54&NC' 'NP929' 'NP929&NC' 'NP338'
 'NP202' 'NP338&NC' 'NP202&NC' 'NP497&NC' 'NP497' 'NP506' 'NP506&NC'
 'NP35&NC' 'NP35' 'NP211&NC' 'NP211' 'NP453' 'NP453&NC' 'NP1' 'NP1&NC'
 'NP&NC' 'NC']
[3.89429764e-04 3.89429764e-04 3.89429764e-04 4.45062587e-04
 4.45062587e-04 5.56328234e-04 5.56328234e-04 7.78859527e-04
 7.78859527e-04 7.78859527e-04 7.78859527e-04 1.27955494e-03
 1.27955494e-03 2.33657858e-03 2.33657858e-03 2.67037552e-03
 2.67037552e-03 2.94853964e-03 2.94853964e-03 5.50764951e-03
 5.50764951e-03 1.35744089e-02 1.35744089e-02 1.44033380e-01
 1.00000000e+00]

['NP3424' 'SENT8003' 'SENT8144' 'Ssub11464' 'NP1299' 'NP3382' 'SENT8376'
 'SENT4391' 'NP3368' 'SENT8403' 'NP1412' 'NP3203' 'Ssub12000' 'NP1716'
 'Srel12936' 'NP1399' 'SENT6735' 'SENT9457' 'Ssub11860' 'SENT5193&PONCT'
 'NP&DET' 'P+D' 'PP&P' 'DET' 'P']
[0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
 0.00000000e+00 0.00000000e+00 0.000000