In [1]:
import numpy as np
import nltk
import torch
import sklearn
import re
import pyparsing
import copy


In [2]:
# Reading the provided file while removing the functional labels

file_name = "sequoia-corpus+fct.mrg_strict"

data = []
with open("./" + file_name, "r") as f:
    for sentence in f:
        sentence = sentence.strip()
        #sentence = filter_labels(sentence) ,can be done later and more easily   
        data.append(sentence)

In [3]:
print(data[0])

( (SENT (NP (NPP Gutenberg))))


In [4]:
def parse(expr):
    def _helper(s):
        items = []
        word = []
        for item in s:
            
            if word and (item == ' ' or item == ')' or item == '('):
                word = ''.join(word)
                items.append(word)
                word = []
                
            if item == '(':
                result, closeparen = _helper(s)
                if not closeparen:
                    raise ValueError("bad expression -- unbalanced parentheses")
                items.append(result)
            
            elif item == ')':
                return items, True
            
            elif item != ' ':
                word.append(item)
        
        return items, False
    return _helper(iter(expr))[0][0][0]

In [5]:
def untag(expr):
    def _helper(s):
        items = []
        word = []
        for item in s:
            
            if word and item == ')':
                word = ''.join(word)
                items.append(word)
                word = []
            
            elif word and (item == '(' or item == ' '):
                word = []
                
            if item == '(':
                result, closeparen = _helper(s)
                if not closeparen:
                    raise ValueError("bad expression -- unbalanced parentheses")
                for elmnt in result:
                    items.append(elmnt)
            
            elif item == ')':
                return items, True
            
            elif item != ' ':
                word.append(item)
        
        return items, False
    return _helper(iter(expr))[0]

In [6]:
def get_untagged_sentences(sentences):
    """Creates file and variable containing untagged sentences"""
    
    result = []
    with open("sentences.txt", "w") as f:
        for s in sentences:
            parentheses_stack = []
            tags_stack = []
            s = untag(s)
            s = ' '.join(s)
            f.write(s)
            result.append(s)
            
    return result

In [7]:
untagged_sentences = get_untagged_sentences(data)
print(untagged_sentences)

['Gutenberg', 'Cette exposition nous apprend que dès le XIIe siècle , à Dammarie-sur-Saulx , entre autres sites , une industrie métallurgique existait .', "à_peu_près au même moment que Gutenberg inventait l' imprimerie , Gillet Bonnemire créait en 1450 la première forge à Saint-Dizier , à l' actuel emplacement du CHS .", "Ensuite , fut installée une autre forge à la Vacquerie , à l' emplacement aujourd'_hui de Cora .", "En 1953 , les hauts fourneaux et fonderies de Cousances virent le jour , puis Jean Baudesson , maire échevin de Saint-Dizier , autorisé par lettres patentes d' Henri IV , installa à Marnaval - qui signifiait val ou vallée de la Marne ou_bien en aval de la Marne - , une forge qui connut son apogée au XIXe siècle .", 'Tout_au_long_des années , de nouveaux sites furent créés , notamment à Haironville , puis plus_tard à Ancerville .', 'Amélioration de la sécurité', "Le maire a invité les membres du conseil à élaborer le programme d' amélioration de la voirie communale et d

In [8]:
def get_tags(s, d):
    """Adds the current node's parent/child relation count to the probability dictionary"""
    tag_name = s[0].split("-")[0] # Remove any hiphen on tags
    if len(s) == 2 and isinstance(s[1], str):
        return
    childs = s[1:]
    child_tags = None
    
    for c in childs:
        if len(c) == 2 and isinstance(c[1], str):
            child_tag_name = c[0]
        else:
            child_tag_name = c[0].split("-")[0]
            if child_tag_name == 'CLS':
                print(c)
        if child_tags is None:
            child_tags =  child_tag_name
        else:
            child_tags = ','.join([child_tags, child_tag_name])
        get_tags(c, d)
    d[tag_name] = d.get(tag_name, {})
    d[tag_name][child_tags] = d[tag_name].get(child_tags, 0) + 1
    

In [9]:
def build_pcfg(sentences, tag_set):
    """Builds the PCFG using the tagged sentences and populates a tag_set"""
    
    prob_dict = dict()
    
    for s in sentences:
        parentheses_stack = []
        tags_stack = []
        s = parse(s)
        get_tags(s, prob_dict)
    
    for k1, d in prob_dict.items():
        tag_set.add(k1)
        tot_count = max(sum(d.values()), 1)
        for k2, v in d.items():
            tag_set.add(k2)
            d[k2] = v/tot_count
    
    return prob_dict

In [10]:
tag_set = set()

pcfg = build_pcfg(data, tag_set)
print(pcfg)

{'NP': {'NPP': 0.061752433936022255, 'DET,NC': 0.1521557719054242, 'DET,ADJ,NC': 0.013574408901251738, 'ADJ,NC': 0.006787204450625869, 'DET,NC,AP': 0.040723226703755215, 'ADJ,NC,Ssub': 5.5632823365785815e-05, 'NPP,NPP': 0.021307371349095966, 'NC': 0.14403337969401947, 'DET,ADJ,NC,PP': 0.005285118219749653, 'DET,NPP': 0.016689847009735744, 'DET,NC,ADV,PP': 0.0003894297635605007, 'NC,PP': 0.054687065368567454, 'DET,ADJ,NC,COORD': 0.0003894297635605007, 'NC,NC,PP': 0.00033379694019471486, 'NPP,AP': 0.001780250347705146, 'NC,AP,PP': 0.007510431154381085, 'NPP,NPP,PONCT,NP,PONCT,VPpart': 0.00022253129346314326, 'PROREL': 0.02909596662030598, 'NC,COORD,PP,COORD': 5.5632823365785815e-05, 'NPP,PONCT,Srel,PONCT': 0.00011126564673157163, 'DET,NC,Srel': 0.0082336578581363, 'DET,NC,PP': 0.10019471488178025, 'DET,NC,NC': 0.002670375521557719, 'DET,NC,PP,PP': 0.010403337969401948, 'DET,NPP,PP': 0.0022809457579972183, 'DET,ADV,ADJ,VPinf': 0.00011126564673157163, 'DET,NC,NP': 0.0013908205841446453, 'N

In [11]:
def get_word_counts(s, d):
    """Adds the word count to the lexicon dictionary"""
    tag_name = s[0] # Do not remove hyphens
    if len(s) == 2 and isinstance(s[1], str):
        word = s[1].lower()
        d[word] = d.get(word, {})
        d[word][tag_name] = d[word].get(tag_name, 0) + 1
        return
    childs = s[1:]
    for c in childs:
        get_word_counts(c, d)

In [12]:
def build_prob_lexicon(sentences, tag_set):
    """Builds a probabilistic lexicon using the tagged sentences"""
    
    lexicon_dict = dict()
    
    for s in sentences:
        parentheses_stack = []
        tags_stack = []
        s = parse(s)
        get_word_counts(s, lexicon_dict)
    
    for _, d in lexicon_dict.items():
        tot_count = max(sum(d.values()), 1)
        for k2, v in d.items():
            tag_set.add(k2)
            d[k2] = v/tot_count
    
    return lexicon_dict

In [13]:
lexicon = build_prob_lexicon(data, tag_set)
print(lexicon)

{'gutenberg': {'NPP': 1.0}, 'cette': {'DET': 1.0}, 'exposition': {'NC': 1.0}, 'nous': {'CLO-A_OBJ': 0.07734806629834254, 'CLS-SUJ': 0.856353591160221, 'CLR': 0.04419889502762431, 'PRO': 0.011049723756906077, 'CLO-OBJ': 0.0055248618784530384, 'CLR-A_OBJ': 0.0055248618784530384}, 'apprend': {'V': 1.0}, 'que': {'CS': 0.7675, 'PROREL': 0.1725, 'PROREL-OBJ': 0.0025, 'ADV': 0.045, 'PROWH': 0.0125}, 'dès': {'P': 1.0}, 'le': {'DET': 0.9710610932475884, 'CLO-OBJ': 0.02652733118971061, 'CLO-ATS': 0.001607717041800643, 'NPP': 0.0008038585209003215}, 'xiie': {'ADJ': 1.0}, 'siècle': {'NC': 1.0}, ',': {'PONCT': 1.0}, 'à': {'P': 1.0}, 'dammarie-sur-saulx': {'NPP': 1.0}, 'entre': {'P': 1.0}, 'autres': {'ADJ': 0.9777777777777777, 'PRO': 0.022222222222222223}, 'sites': {'NC': 1.0}, 'une': {'DET': 0.9883381924198251, 'PRO': 0.011661807580174927}, 'industrie': {'NC': 1.0}, 'métallurgique': {'ADJ': 1.0}, 'existait': {'V': 1.0}, '.': {'PONCT': 1.0}, 'à_peu_près': {'ADV': 1.0}, 'au': {'P+D': 1.0}, 'même': {'

In [14]:
def get_tag_counts(s, d):
    """Adds the tag count to the tag dictionary"""
    tag_name = s[0] # Do not remove hyphens
    if len(s) == 2 and isinstance(s[1], str):
        d[tag_name] = d.get(tag_name, 0) + 1 
        return
    childs = s[1:]
    for c in childs:
        get_tag_counts(c, d)

In [15]:
def build_tag_frequencies(sentences):
    """Builds a tag frequency list"""
    
    tag_dict = dict()
    
    for s in sentences:
        parentheses_stack = []
        tags_stack = []
        s = parse(s)
        get_tag_counts(s, tag_dict)
        
    tot_count = max(sum(tag_dict.values()), 1)
    for k1, v in tag_dict.items():
        tag_dict[k1] = v/tot_count
    
    return tag_dict

In [16]:
tag_freq = build_tag_frequencies(data)
print(tag_freq)

{'NPP': 0.05283570512246785, 'DET': 0.13707151167994272, 'NC': 0.22370894119752976, 'CLO-A_OBJ': 0.001238103762045407, 'V': 0.05587875533279633, 'CS': 0.010516423520988097, 'P': 0.13135833407917896, 'ADJ': 0.06951281362809153, 'PONCT': 0.11744085444076494, 'ADV': 0.0369193591694263, 'P+D': 0.027297950416181868, 'VPP': 0.03736686655329813, 'CC': 0.02470240758972523, 'PROREL': 0.008741310898296488, 'VINF': 0.02091351173961037, 'CLS-SUJ': 0.01725886810465706, 'VPR': 0.0055043408216235564, 'CLO-OBJ': 0.0017303618843044243, 'ADVWH': 0.0004325904710761061, 'ADVWH-MOD': 5.9667651182911185e-05, 'CLR': 0.0036248098093618546, 'CLO-MOD': 0.00013425221516155016, 'PREF': 0.0011336853724753125, 'ADV-P_OBJ': 0.0003132551687102837, 'CLO-DE_OBJ': 0.0003580059070974671, 'PROREL-OBJ': 1.4916912795727796e-05, 'PRO': 0.006339687938184313, 'CLR-OBJ': 0.0004176735582803783, 'VS': 0.001312688326024046, 'CLR-AFF.DEMSUJ': 0.0008353471165607566, 'I': 4.475073838718339e-05, 'ADV-MOD': 0.00013425221516155016, 'CLO

In [17]:
def remove_unit(cfg, A, d, k, tag, tag_set):
    """Replaces the Unit rules by Multiple Rules by chaining"""
    B = tag
    new_entry = cfg.get(B)
    if new_entry is None:
        return False
    cfg[A].update(copy.deepcopy(new_entry))
    del cfg[B]
    del d[k]
    return True

In [18]:
def remove_multi(cfg, A, k, idx, tag_set):
    """Replaces the Multiple Rules with chains of Double or Unit Rules"""
    tags = k.split(',')
    n = len(tags)
    d = cfg[A]
    
    new_tag = A + str(idx) + str(0)
    new_key = ','.join([tags[0], new_tag])
    d[new_key] = copy.deepcopy(d[k])
    del d[k]
    
    for i in range(1, n-1): 
        tag_set.add(new_tag)
        current_key = new_tag
        new_tag = A + str(idx) + str(i)
        cfg[current_key] =  {','.join([tags[i], new_tag]): 1.0}
    
    tag_set.add(new_tag)
    cfg[new_tag] = {tags[-1] : 1.0}

In [19]:
def chomskyfy(context_free_grammar, tag_set):
    """Binarisation of a CFG. Starts with removing Unit and then Multi Rules"""
    cfg = copy.deepcopy(context_free_grammar)
    
    unit_exist = True
    while unit_exist:
        unit_exist = False
        copy_cfg = copy.deepcopy(cfg) # Avoid Dictionnary size change during iteration
        for k1, d1 in copy_cfg.items():
            copy_d1 = copy.deepcopy(d1) # Avoid Dictionnary size change during iteration
            for k2, prob in copy_d1.items():
                tags = k2.split(',')[1:]
                if len(tags) == 1:
                    # Check that we have unit rules that we can remove
                    unit_exist = unit_exist or remove_unit(cfg, k1, d1, k2, tags[0], tag_set)
    multi_exist = True
    while multi_exist:
        multi_exist = False
        copy_cfg = copy.deepcopy(cfg) # Avoid Dictionnary size change during iteration
        for k1, d1 in copy_cfg.items():
            copy_d1 = copy.deepcopy(d1) # Avoid Dictionnary size change during iteration
            for i, (k2, prob) in enumerate(copy_d1.items()):
                tags = k2.split(',')[1:]
                if len(tags) > 2:
                    multi_exist = True 
                    remove_multi(cfg, k1, k2, i, tag_set)
    return cfg

In [20]:
binary_pcfg = chomskyfy(pcfg, tag_set)
print(binary_pcfg)

{'SENT': {'NP': 0.11519845111326234, 'VN,Ssub,PONCT': 0.012262020006453695, 'ADVWH,NP,PONCT': 0.00032268473701193933, 'NP,PONCT': 0.04033559212649242, 'PP,PONCT': 0.002258793159083575, 'NP,VN,NP': 0.001379310344827586, 'NP,PONCT,Sint': 0.0006453694740238787, 'VN,NP,PONCT': 0.010648596321393998, 'VN,VPinf,PONCT': 0.015166182639561149, 'AP,PONCT': 0.0012907389480477573, 'COORD,PONCT': 0.011293965795417877, 'NP,PP': 0.0006453694740238787, 'ADV,PONCT': 0.00032268473701193933, 'NP,PP,PONCT': 0.0016134236850596966, 'VN,PP,PONCT': 0.005808325266214908, 'VPinf,PONCT': 0.006131010003226848, 'COORD': 0.0012907389480477573, 'Ssub,PONCT': 0.002258793159083575, 'PP,PONCT,NP': 0.00032268473701193933, 'NP,VN,PONCT': 0.006776379477250726, 'NP,Ssub,PONCT': 0.00032268473701193933, 'NP,VPinf': 0.00032268473701193933, 'NP,VN,ADV': 0.00032268473701193933, 'NP,PONCT,PP': 0.00032268473701193933, 'PONCT': 0.00032268473701193933, 'NP,PONCT,NP': 0.003872216844143272, 'NP,COORD,PONCT': 0.00032268473701193933, 'V

In [21]:
### CODE FROM THE POLYGLOT TUTORIAL ###

from operator import itemgetter
import re
import pickle

words, embeddings = pickle.load(open('polyglot-fr.pkl', 'rb'), encoding='latin1')

# Map words to indices and vice versa
lexicon_words = lexicon.keys()
intersection_words = list()
intersection_embeddings = list()

for i, w in enumerate(words):
    if w in lexicon:
        intersection_words.append(w)
        
word_id = {w:i for (i, w) in enumerate(words)}
id_word = dict(enumerate(embeddings))

intersection_embeddings = np.array(intersection_embeddings)
intersection_words = np.array(intersection_words)


# Normalize digits by replacing them with #
DIGITS = re.compile("[0-9]", re.UNICODE)


def case_normalizer(word, dictionary):
    """ In case the word is not available in the vocabulary,
     we can try multiple case normalizing procedure.
     We consider the best substitute to be the one with the lowest index,
     which is equivalent to the most frequent alternative."""
    w = word
    lower = (dictionary.get(w.lower(), 1e12), w.lower())
    upper = (dictionary.get(w.upper(), 1e12), w.upper())
    title = (dictionary.get(w.title(), 1e12), w.title())
    results = [lower, upper, title]
    results.sort()
    index, w = results[0]
    if index != 1e12:
        return w
    return word


def normalize(word, word_id):
    """ Find the closest alternative in case the word is OOV."""
    if not word in word_id:
        word = DIGITS.sub("#", word)
    if not word in word_id:
        word = case_normalizer(word, word_id)

    if not word in word_id:
        return None
    return word


def l2_nearest(embeddings, words, word_index, k):
    """Sorts words according to their Euclidean distance.
       To use cosine distance, embeddings has to be normalized so that their l2 norm is 1."""

    e1 = embeddings[word_index]
    distances = []
    for w2 in words:
        e2 = embeddings[word_id[w2]]
        distances.append(e1.dot(e2) / (np.linalg.norm(e1) * np.linalg.norm(e2)))
    sorted_distances = sorted(enumerate(distances), key=itemgetter(1))
    return zip(*sorted_distances[:k])


def knn(word, words, embeddings, word_id, id_word, k=5):
    word = normalize(word, word_id)
    if not word:
        print("OOV word")
        return None, None
    word_index = word_id[word]
    indices, distances = l2_nearest(embeddings, words, word_index, k)
    neighbors = [id_word[idx] for idx in indices]
    
    return neighbors, distances

In [22]:
def levenstein_dist(w1, w2):
    """Computes the levenstein distance between w1 and w2"""
    
    m, n  = len(w1),len(w2)
    
    result = np.zeros((m, n))
    
    for i in range(m):
        result[i, 0] = i
    for i in range(n):
        result[0, i] = i
        
    for i, c1 in enumerate(w1):
        for j, c2 in enumerate(w2):
            if c1 == c2:
                result[i, j] = min(result[i-1, j] + 1,
                                   result[i, j-1] + 1,
                                   result[i-1, j-1],
                                  )
            else:
                result[i, j] = min(result[i-1, j] + 1,
                                   result[i, j-1] + 1,
                                   result[i-1, j-1] + 1,
                                  )
    return result[m-1, n-1]

In [23]:
def handle_oov(word, k=10):
    
    lev_dist = []
    for w2 in intersection_words:
        lev_dist.append(levenstein_dist(word, w2))
    min_indices = np.argpartition(lev_dist, k)[:k]
    close_words = intersection_words[min_indices]
    
    neighbors, distances = knn(word, close_words, embeddings, word_id, id_word)
    
    if neighbors is None:
        return None
    
    return close_words[np.argmin(distances)]

In [24]:
tag_list = list(tag_set)
tag_to_idx = {tag: idx for (idx, tag) in enumerate(tag_list)}

In [30]:
def compute_cyk(sentence, use_oov=False):
    """Computes the CYK matrix for a sentence"""
    
    sentence = sentence.lower().split(' ')
    
    n = len(sentence)
    
    cyk_matrix = np.zeros((n, n, len(tag_list), 3)) #3 is for the parent to 2 child relation
    prob_matrix = np.zeros((n, n, len(tag_list)))
    
    # Unit words handling
    for i, w in enumerate(sentence):
        token_to_tag = w

        if not w in intersection_words:
            if use_oov: 
                print(w + " is an OOV")
            token_to_tag = handle_oov(w)
            if use_oov:
                if token_to_tag is None:
                    print("No closest token found\n")
                else:
                    print("Closest token found :", token_to_tag, "\n")
        
        if token_to_tag is None:
            for token, prob in tag_freq.items():
                idx = tag_to_idx.get(tag)
                if not tag is None:  # avoid the case where tag appearing in lexicon but not in grammar rules
                    prob_matrix[0, i, idx] = prob
            
        else:
            max_tag, max_prob = -1, -1
            for tag, prob in lexicon[token_to_tag].items():
                idx = tag_to_idx.get(tag)
                if not idx is None:
                    prob_matrix[0, i , idx] = prob
        
    
    # Strings of length 2 and more 
    for l in range(1, n):
        print(l)
        for start in range(n - l):
            for parent_tag in binary_pcfg.keys():
                for row in range(start, start + l):
                    for k1 in binary_pcfg[parent_tag].keys():
                        left_child_tag = k1.split(',')[0]
                        
                        left_idx = tag_to_idx[left_child_tag]
                        parent_idx = tag_to_idx[parent_tag]
                        prob_left = prob_matrix[row - start ,start, left_idx]
                        
                        if prob_left > prob_matrix[l, start, parent_idx]:
                            
                            for k2, prob in binary_pcfg[parent_tag].items():
                                if left_child_tag != k2.split(',')[0]:
                                    continue #Skip non-possible grammar  
            
                                if len(k2.split(',')) == 2:
                                    right_child_tag = k2.split(',')[1]
                                    right_idx = tag_to_idx[right_child_tag]
                                    prob_right = prob_matrix[l - (row - start) - 1, row + 1, right_idx]

                                    full_prob = prob * prob_left * prob_right
                                
                                else:
                                    full_prob = prob_left * prob 
                                
                                #print(full_prob, prob_matrix[l, start, parent_idx])
                        
                                if full_prob > prob_matrix[l, start, parent_idx]:
                                    prob_matrix[l, start, parent_idx] = full_prob
                                    cyk_matrix[l, start, parent_idx, 0] = parent_idx
                                    cyk_matrix[l, start, parent_idx, 1] = left_idx
                                    if len(k2.split(',')) == 2:
                                        cyk_matrix[l, start, parent_idx, 2] = right_idx
    print(tag_to_idx['SENT'])
    print(cyk_matrix.shape)
    result = cyk_matrix[n-1, 0, tag_to_idx['SENT']]
    print(prob_matrix[n-1, 0, tag_to_idx['SENT']])
    print(result)
    print(tag_list[int(result[0])])
    print(tag_list[int(result[1])])
    print(tag_list[int(result[2])])
    
    

In [31]:
compute_cyk(untagged_sentences[14], use_oov=True)

1
2
3
6917
(4, 4, 10890, 3)
0.001379310344827586
[6917. 1694.    0.]
SENT
ADVWH
CC,VN,VPinf,PP


In [29]:
print(untagged_sentences[14])

Pourquoi ce thème ?
