In [1]:
import codecs
import datetime
import itertools
import pickle
import string

import nltk
import numpy as np
import Stemmer
from nltk.corpus import wordnet as wn
from nltk.corpus import wordnet_ic

In [2]:
def load_MSRP_corpus(path, ftype):
    """Load the sentence pairs of one split of the MSRP corpus.

    Every data line of a corpus file has the tab-separated format
       class   #1_ID   #2_ID   #1_String   #2_String
    The first line of the file is a header and is skipped.

    Args:
        path: Directory that contains the corpus files. A trailing '/' or
            backslash is optional; an empty string means the current directory.
        ftype: Split to load, either "train" or "test".

    Returns:
        A list with one entry per data line. Each entry is the list of the
        tab-separated fields of that line (all strings); the line terminator
        is removed from the last field and blank lines are ignored.

    Raises:
        ValueError: If ftype is neither "train" nor "test".
    """
    filenames = {"train": "msr_paraphrase_train.txt",
                 "test": "msr_paraphrase_test.txt"}
    if ftype not in filenames:
        raise ValueError('ftype must be "train" or "test", got {!r}'.format(ftype))

    # Only add a separator when the directory does not already end with one
    # (an empty directory needs none either).
    separator = "" if (not path or path.endswith(("/", "\\"))) else "/"
    fpath = path + separator + filenames[ftype]

    pairs = []
    with codecs.open(fpath, 'r', encoding='utf-8') as fid:
        next(fid, None)  # skip the header line (tolerates an empty file)
        for line in fid:
            # codecs.open does no newline translation, so strip both forms
            line = line.rstrip('\r\n')
            if line:
                pairs.append(line.split('\t'))
    return pairs

In [3]:
def word_tokenizer(text, stemmer, flat_output=True):
    """Lower-case a text, split it into sentences and annotate every token.

    Args:
        text: The raw text to process.
        stemmer: Object exposing stemWord(token); used to build the stem field.
        flat_output: When True, the tokens of all sentences are returned as a
            single list; when False, one list per sentence is returned.

    Returns:
        Tuples of the form (token, lemma, stem, tag), where tag is the tag
        given by nltk.pos_tag. The lemma comes from the WordNet lemmatizer
        when the first two characters of the tag map to a WordNet POS;
        otherwise the token itself is used as its lemma.
    """
    # Sentence splitter, word tokenizer and lemmatizer
    sentence_splitter = nltk.data.load('tokenizers/punkt/english.pickle')
    tokenizer = nltk.TreebankWordTokenizer()
    lemmatizer = nltk.stem.WordNetLemmatizer()
    # Treebank tag prefix -> WordNet POS
    treebank_to_wordnet = {"NN": wn.NOUN, "VB": wn.VERB, "JJ": wn.ADJ, "RB": wn.ADV}

    sentences = []
    for sentence in sentence_splitter.tokenize(text.lower()):
        annotated = []
        for word, treebank_tag in nltk.pos_tag(tokenizer.tokenize(sentence)):
            wordnet_pos = treebank_to_wordnet.get(treebank_tag[:2])
            # Tokens without a WordNet POS keep their surface form as lemma
            lemma = lemmatizer.lemmatize(word, pos=wordnet_pos) if wordnet_pos else word
            annotated.append((word, lemma, stemmer.stemWord(word), treebank_tag))
        sentences.append(annotated)

    if not flat_output:
        return sentences
    return list(itertools.chain.from_iterable(sentences))

#### Parsing training instances
We parse both sentences in each instance of the MSRP training dataset into tuples of the form (token, lemma, stem, POS_tag). When the flag **flat_output** is false, the tokens are additionally grouped by sentence.

tags_mapping maps Treebank POS tags to WordNet POS tags.

In [4]:
# English stemmer from the Stemmer module; word_tokenizer calls its
# stemWord() on every token to fill the stem field.
stemmer = Stemmer.Stemmer('english')

In [5]:
# Parse every sentence of the training split and record the labelled pairs.
#   parsed_texts: sentence id (int) -> list of (token, lemma, stem, tag)
#   train_pairs:  list of (id1, id2)
#   train_y:      list of int class labels, aligned with train_pairs
raw_data = load_MSRP_corpus(path="D:/Corpus/MSPC", ftype="train")

parsed_texts = {}
train_pairs = []
train_y = []
for _class, id1, id2, text1, text2 in raw_data:
    # parsed_texts is keyed by int, so convert the ids BEFORE the membership
    # test; testing the raw string ids never matched and every repeated
    # sentence was tokenized again.
    id1, id2 = int(id1), int(id2)
    if id1 not in parsed_texts:
        parsed_texts[id1] = word_tokenizer(text1, stemmer=stemmer, flat_output=True)
    if id2 not in parsed_texts:
        parsed_texts[id2] = word_tokenizer(text2, stemmer=stemmer, flat_output=True)
    train_pairs.append((id1, id2))
    train_y.append(int(_class))

#### Parsing test instances
We parse both sentences in each instance of the MSRP test dataset into tuples of the form (token, lemma, stem, POS_tag). When the flag **flat_output** is false, the tokens are additionally grouped by sentence.

In [6]:
# Parse every sentence of the test split into the same parsed_texts dictionary
# used for the training split, and record the labelled pairs.
#   test_pairs: list of (id1, id2)
#   test_y:     list of int class labels, aligned with test_pairs
raw_data = load_MSRP_corpus(path="D:/Corpus/MSPC", ftype="test")

test_pairs = []
test_y = []
for _class, id1, id2, text1, text2 in raw_data:
    # parsed_texts is keyed by int, so convert the ids BEFORE the membership
    # test; testing the raw string ids never matched and every repeated
    # sentence was tokenized again.
    id1, id2 = int(id1), int(id2)
    if id1 not in parsed_texts:
        parsed_texts[id1] = word_tokenizer(text1, stemmer=stemmer, flat_output=True)
    if id2 not in parsed_texts:
        parsed_texts[id2] = word_tokenizer(text2, stemmer=stemmer, flat_output=True)
    test_pairs.append((id1, id2))
    test_y.append(int(_class))

#### Generating vocabulary
The vocabulary is generated taking into account the lemma and wordnet tag (if possible).

Here, we can perform several experiments like taking the lemma and regular POS tag, only taking the tokens, etc.

**Notes:**
- The code depends on the format used in parsed_texts, assuming **flat_output=True**

In [7]:
# Treebank tag prefix -> WordNet POS. Tags without a WordNet equivalent are
# kept unchanged in the lemma+pos vocabulary.
tags_mapping = {"NN": wn.NOUN, "VB": wn.VERB, "JJ": wn.ADJ, "RB": wn.ADV}

# Every (token, lemma, stem, tag) entry of every parsed document, as one list
all_entries = list(itertools.chain.from_iterable(parsed_texts.values()))

# One frequency distribution per feature type
vocab_tokens = nltk.FreqDist(entry[0] for entry in all_entries)
vocab_lemmas = nltk.FreqDist(entry[1] for entry in all_entries)
vocab_stems = nltk.FreqDist(entry[2] for entry in all_entries)
vocab_lemmapos = nltk.FreqDist((entry[1], tags_mapping.get(entry[3][:2], entry[3]))
                               for entry in all_entries)

for vocabulary, description in ((vocab_tokens, "tokens in the vocabulary of tokens"),
                                (vocab_lemmas, "tokens in the vocabulary of lemmas"),
                                (vocab_stems, "tokens in the vocabulary of stems"),
                                (vocab_lemmapos, "tokens in the vocabulary of lemmas+pos")):
    print(len(vocabulary), description)

18003 tokens in the vocabulary of tokens
15224 tokens in the vocabulary of lemmas
13801 tokens in the vocabulary of stems
18168 tokens in the vocabulary of lemmas+pos


In [8]:
# For every vocabulary: a list giving the entry at each index, and the inverse
# dictionary giving the index of each entry.
index2token = list(vocab_tokens)
token2index = {token: position for position, token in enumerate(index2token)}

index2lemma = list(vocab_lemmas)
lemma2index = {lemma: position for position, lemma in enumerate(index2lemma)}

index2stem = list(vocab_stems)
stem2index = {stem: position for position, stem in enumerate(index2stem)}

index2lemmapos = list(vocab_lemmapos)
lemmapos2index = {lemmapos: position for position, lemmapos in enumerate(index2lemmapos)}

#### Serializing all MSRPC and needed data structures
We serialize the Microsoft Research Paraphrase Corpus with the format:

    [parsed_texts, train_pairs, train_y, test_pairs, test_y]
   
The goal of serializing all of these is to make experiments with the parsed corpus used to compute the WordNet similarity measures.

<parsed_texts> contains all the documents from the MSRPC tokenized. It is a dictionary where the keys are the ids of the documents and the values are lists of tuples of the form (token, lemma, stem, POS tag).

We also serialize the vocabularies for different features (tokens, lemmas, stems, and lemmas+pos) and the data structures needed to convert from indexes to the given feature and vice versa.

    [vocab, token_to_index, index_to_token]

In [100]:
# Today's date as YYYYMMDD; used as a suffix of the serialized file names.
date_obj = datetime.date.today()
date_str = date_obj.strftime("%Y%m%d")

In [101]:
# Serialize the parsed corpus together with the train/test pairs and labels.
# The context manager guarantees the file is flushed and closed; passing a bare
# open(...) to pickle.dump left the handle open.
with open("msrpc_parsed_"+date_str+".pickle", "wb") as fout:
    pickle.dump([parsed_texts, train_pairs, train_y, test_pairs, test_y],
                fout,
                protocol=pickle.HIGHEST_PROTOCOL)

In [102]:
# Serialize, for every feature type, [vocabulary, entry->index, index->entry].
# Each file is written inside a context manager so that it is flushed and
# closed; passing a bare open(...) to pickle.dump left the handles open.
for file_prefix, payload in (
        ("tokens_data_", [vocab_tokens, token2index, index2token]),
        ("lemmas_data_", [vocab_lemmas, lemma2index, index2lemma]),
        ("stems_data_", [vocab_stems, stem2index, index2stem]),
        ("lemmapos_data_", [vocab_lemmapos, lemmapos2index, index2lemmapos])):
    with open(file_prefix+date_str+".pickle", "wb") as fout:
        pickle.dump(payload, fout, protocol=pickle.HIGHEST_PROTOCOL)

#### Computing some Information Content (IC)

In [9]:
# Information-content (IC) dictionaries loaded through NLTK's wordnet_ic reader.
# They are passed as the `ic` argument of the Resnik, Jiang-Conrath and Lin measures.
brown_ic = wordnet_ic.ic('ic-brown.dat')
semcor_ic = wordnet_ic.ic('ic-semcor.dat')
bnc_ic_2000 = wordnet_ic.ic('ic-bnc.dat')
# 'ic-bnc-2007.dat' is the file name that the generate_ic_file call in this
# notebook writes into the wordnet_ic corpus directory.
bnc_ic_2007 = wordnet_ic.ic('ic-bnc-2007.dat')

**Note:** This code is not in use!

***Warning*** Computing BNC Information Content (IC) takes hours!!!

There is a version of the IC for the BNC corpus. However, it was computed using the 2000 version of the BNC. The corpus we are processing here is the 2007 version. The similarity measures obtained with these two ICs are different.

# Reader over the BNC XML files found under `root` whose ids match `fileids`.
reader_bnc = nltk.corpus.reader.BNCCorpusReader(root='D:/Corpus/2554/2554/download/Texts/', fileids=r'[A-K]/\w*/\w*\.xml')
# IC counts computed from the BNC reader; the result is handed to generate_ic_file.
# NOTE(review): the positional arguments are presumably weight_senses_equally=False
# and smoothing=0.0 -- confirm against the installed NLTK version.
bnc_ic = wn.ic(reader_bnc, False, 0.0)

def is_root(synset_x):
    """Return True when the first root hypernym of synset_x is synset_x itself."""
    first_root = synset_x.root_hypernyms()[0]
    return bool(first_root == synset_x)

def generate_ic_file(IC, output_filename):
    """Write the information-content counts in IC to output_filename.

    IC is expected to be a dict of the form
    {'v':defaultdict, 'n':defaultdict, 'a':defaultdict, 'r':defaultdict}
    whose inner dictionaries map a synset offset to its count.

    The file starts with a WordNet version line, followed by one line per
    verb and noun offset: "<offset><pos> <count>", with " ROOT" appended when
    the synset is a root. The entry stored under offset 0 is not written.
    Prints "Done" when finished.
    """
    with codecs.open(output_filename, 'w', encoding='utf-8') as out:
        # Hash code of WordNet 3.0
        out.write("wnver::eOS9lXC6GvMWznF1wkZofDdtbBU"+"\n")

        # Only verbs and nouns are stored because those are the only POS tags
        # supported by the wordnet.ic() function
        for pos in ('v', 'n'):
            for offset, count in IC[pos].items():
                if offset == 0:
                    continue
                synset = wn.of2ss(of="{:08d}".format(offset)+pos)
                line_end = " ROOT\n" if is_root(synset) else "\n"
                out.write(str(offset)+pos+" "+str(count)+line_end)
    print("Done")

# Write the BNC-derived IC counts into the wordnet_ic directory of nltk_data.
# NOTE(review): the destination is an absolute, user-specific path -- adjust it
# to the local nltk_data location before running.
generate_ic_file(bnc_ic, "C:/Users/MiguelAngel/AppData/Roaming/nltk_data/corpora/wordnet_ic/ic-bnc-2007.dat")

#### Examples applying WordNet similarity measures and using different IC

In [10]:
# All noun synsets of the lemma "car"; the first two are compared in the
# similarity examples of this section.
syns = wn.synsets(lemma="car", pos='n')
print(syns)

[Synset('car.n.01'), Synset('car.n.02'), Synset('car.n.03'), Synset('car.n.04'), Synset('cable_car.n.01')]


In [11]:
# Display the first noun synset returned for "car".
syns[0]

Synset('car.n.01')

In [12]:
def nan_default(result):
    """Return result unchanged when it is truthy; otherwise return 0.0.

    Used to turn the None that some WordNet measures return (no connecting
    path found) into a numeric score.
    """
    return result or 0.0

In [13]:
# One output row per measure: a 40-character label followed by the score
# rounded to two decimals. A None score is shown as 0.00 via nan_default.
row_format = "{:40} {:.2f}"
first_sense, second_sense = syns[0], syns[1]

# Measures that only rely on the WordNet taxonomy
for label, measure in (("Path Similarity:", wn.path_similarity),
                       ("Leacock-Chodorow Similarity:", wn.lch_similarity),
                       ("Wu-Palmer Similarity:", wn.wup_similarity)):
    print(row_format.format(label, nan_default(measure(first_sense, second_sense))))

# Measures that additionally take an information-content dictionary
for label, measure, ic in (
        ("Resnik Similarity (brown_ic):", wn.res_similarity, brown_ic),
        ("Resnik Similarity (semcor_ic):", wn.res_similarity, semcor_ic),
        ("Resnik Similarity (bnc_ic):", wn.res_similarity, bnc_ic_2007),
        ("Jiang-Conrath Similarity (brown_ic):", wn.jcn_similarity, brown_ic),
        ("Jiang-Conrath Similarity (semcor_ic):", wn.jcn_similarity, semcor_ic),
        ("Jiang-Conrath Similarity (bnc_ic):", wn.jcn_similarity, bnc_ic_2007),
        ("Lin Similarity (brown_ic):", wn.lin_similarity, brown_ic),
        ("Lin Similarity (semcor_ic):", wn.lin_similarity, semcor_ic),
        ("Lin Similarity (bnc_ic):", wn.lin_similarity, bnc_ic_2007)):
    print(row_format.format(label, nan_default(measure(first_sense, second_sense, ic))))

Path Similarity:                         0.20
Leacock-Chodorow Similarity:             2.03
Wu-Palmer Similarity:                    0.73
Resnik Similarity (brown_ic):            6.45
Resnik Similarity (semcor_ic):           6.31
Resnik Similarity (bnc_ic):              6.39
Jiang-Conrath Similarity (brown_ic):     0.36
Jiang-Conrath Similarity (semcor_ic):    0.27
Jiang-Conrath Similarity (bnc_ic):       0.36
Lin Similarity (brown_ic):               0.82
Lin Similarity (semcor_ic):              0.77
Lin Similarity (bnc_ic):                 0.82


### Computing all the similarities needed in the MSRPC
We only care about the following POS tags: NN, VB, RB, JJ. Note: The WordNet authors divide the adjectives into two types: ('a' - Adjective) and ('s' - Adjective Satellite)

The dictionary **tags_mapping** maps Treebank POS to WordNet POS.

We compute the similarities between every word in phrase1 and every word in phrase2 that has the same POS tag. Also, for each pair of words we compute the similarity between all of the words' synsets. During testing we can decide whether to use the first synset or the maximum similarity over all synsets.

In [93]:
def wordnet_similarity(s1, s2, metric="Path", ic=None):
    """Compute one WordNet similarity measure between two synsets.

    Args:
        s1, s2: The WordNet synsets to compare.
        metric: Name of the measure, matched case-insensitively. One of
            "path", "lch", "wup", "res", "jcn" or "lin".
        ic: Information-content dictionary handed to the res, jcn and lin
            measures; it is not used by path, lch and wup.

    Returns:
        The similarity score. 0.0 is returned when the synsets have different
        POS tags, when an IC-based measure (res, jcn, lin) is requested for a
        POS other than verb or noun, and when path/lch/wup give no score
        (None). For "lin", equal synsets always score 1.0.

    Raises:
        ValueError: If metric is not one of the supported names.
    """
    # Normalise the name so that the default value "Path" selects the path
    # measure; previously it matched no branch at all.
    name = metric.lower() if isinstance(metric, str) else metric

    # Taxonomy-based measures. The underlying functions return None when no
    # connecting path exists between the synsets; nan_default turns that into 0.0.
    # path and wup are listed with a 0-1 range in this notebook's notes;
    # lch is not bounded by 1 and needs normalization.
    if name in ("path", "lch", "wup"):
        if s1.pos() != s2.pos():
            return 0.0
        if name == "path":
            score = wn.path_similarity(s1, s2)
        elif name == "lch":
            score = wn.lch_similarity(s1, s2)
        else:
            score = wn.wup_similarity(s1, s2)
        return nan_default(score)

    # Resnik and Jiang-Conrath: not bounded by 1, normalization needed.
    # Only verbs and nouns are compared because those are the only keys in IC.
    if name in ("res", "jcn"):
        if s1.pos() == s2.pos() and s1.pos() in ('v', 'n'):
            if name == "res":
                return wn.res_similarity(s1, s2, ic)
            return wn.jcn_similarity(s1, s2, ic)
        return 0.0

    # Lin: only verbs and nouns are compared because those are the only keys in IC.
    if name == "lin":
        try:
            if s1 == s2:
                return 1.0
            if s1.pos() == s2.pos() and s1.pos() in ('v', 'n'):
                return wn.lin_similarity(s1, s2, ic)
            return 0.0
        except Exception:
            # Show the offending synsets, then let the error propagate
            print("Synsets causing the exception")
            print(s1, s2)
            raise

    # A bare `raise` here had no active exception to re-raise; report the
    # actual problem instead.
    raise ValueError("No available metric selected: {!r}".format(metric))

In [94]:
def compute_msrpc_wordnet_sims_tokens(pairs, vocab, token2index, tags_mapping,
                                      sims_first_synset, sims_all_synsets, metric, ic,
                                      texts=None):
    """Compute the WordNet <metric> similarity for all the pairs of tokens needed in the MSRPCorpus
    using only the given tokens to extract the wordnet synsets.

    Every token of the first text of a pair is compared with every token of
    the second text, over all combinations of their synsets.

    Args:
        pairs: Iterable of (id1, id2) text-id pairs.
        vocab: Not used; kept so the signature matches the sibling functions.
        token2index: Dict mapping a token to its vocabulary index.
        tags_mapping: Not used; kept so the signature matches the sibling functions.
        sims_first_synset: Dict updated in place, sim_key -> first non-zero
            similarity found while iterating over the synset combinations.
        sims_all_synsets: Dict updated in place, sim_key -> largest similarity
            over all synset combinations. Also used as the "already computed" cache.
        metric: Measure name, forwarded to wordnet_similarity.
        ic: Information-content dictionary, forwarded to wordnet_similarity.
        texts: Optional dict, text id -> list of (token, lemma, stem, tag).
            Defaults to the notebook-level parsed_texts.

    Returns:
        None. Results are written into the two dictionaries; two counters are printed.

    Raises:
        KeyError: If a token is missing from token2index.
    """
    if texts is None:
        texts = parsed_texts

    count_first = 0
    count_max = 0

    for id1, id2 in pairs:
        for token1, lemma1, stem1, tag1 in texts[id1]:
            token1_id = token2index.get(token1)
            if token1_id is None:
                raise KeyError("token1 not in vocabulary: {!r}".format(token1))
            syns_tk1 = wn.synsets(lemma=token1)

            for token2, lemma2, stem2, tag2 in texts[id2]:
                token2_id = token2index.get(token2)
                if token2_id is None:
                    raise KeyError("token2 not in vocabulary: {!r}".format(token2))

                # *****************
                # *** IMPORTANT *** The key is always (larger index, smaller index);
                # the same rule must be applied when retrieving similarity scores.
                # *****************
                sim_key = (token1_id, token2_id) if token1_id > token2_id else (token2_id, token1_id)

                # Skip identical tokens and pairs already computed, before
                # paying for the synset lookup of the second token.
                if token1_id == token2_id or sim_key in sims_all_synsets:
                    continue
                if not syns_tk1:
                    continue
                syns_tk2 = wn.synsets(lemma=token2)
                if not syns_tk2:
                    continue

                # Keep only the non-zero similarities
                all_sims = []
                for s1 in syns_tk1:
                    for s2 in syns_tk2:
                        res = wordnet_similarity(s1, s2, metric=metric, ic=ic)
                        if res != 0.0:
                            all_sims.append(res)

                if all_sims:
                    # NOTE(review): this is the first NON-ZERO similarity, which is
                    # not necessarily the one between the first synset of each
                    # token -- confirm that this is the intended "first synset" score.
                    sims_first_synset[sim_key] = all_sims[0]
                    sims_all_synsets[sim_key] = max(all_sims)
                    count_max += len(syns_tk1)*len(syns_tk2)
                    count_first += 1

    print(count_first, "new token pairs computed.")
    print(count_max, "Wordnet similarity computed.")
    return None

In [95]:
def compute_msrpc_wordnet_sims_lemmas(pairs, vocab, lemma2index, tags_mapping,
                                      sims_first_synset, sims_all_synsets, metric, ic,
                                      texts=None):
    """Compute the WordNet <metric> similarity for all the pairs of tokens needed in the MSRPCorpus
    using only the lemmas (NO POS tags) of the given tokens to extract the wordnet synsets.

    Every lemma of the first text of a pair is compared with every lemma of
    the second text, over all combinations of their synsets.

    Args:
        pairs: Iterable of (id1, id2) text-id pairs.
        vocab: Not used; kept so the signature matches the sibling functions.
        lemma2index: Dict mapping a lemma to its vocabulary index.
        tags_mapping: Not used; kept so the signature matches the sibling functions.
        sims_first_synset: Dict updated in place, sim_key -> first non-zero
            similarity found while iterating over the synset combinations.
        sims_all_synsets: Dict updated in place, sim_key -> largest similarity
            over all synset combinations. Also used as the "already computed" cache.
        metric: Measure name, forwarded to wordnet_similarity.
        ic: Information-content dictionary, forwarded to wordnet_similarity.
        texts: Optional dict, text id -> list of (token, lemma, stem, tag).
            Defaults to the notebook-level parsed_texts.

    Returns:
        None. Results are written into the two dictionaries; two counters are printed.

    Raises:
        KeyError: If a lemma is missing from lemma2index.
    """
    if texts is None:
        texts = parsed_texts

    count_first = 0
    count_max = 0

    for id1, id2 in pairs:
        for token1, lemma1, stem1, tag1 in texts[id1]:
            token1_id = lemma2index.get(lemma1)
            if token1_id is None:
                raise KeyError("lemma1 not in vocabulary: {!r}".format(lemma1))
            syns_tk1 = wn.synsets(lemma=lemma1)

            for token2, lemma2, stem2, tag2 in texts[id2]:
                token2_id = lemma2index.get(lemma2)
                if token2_id is None:
                    raise KeyError("lemma2 not in vocabulary: {!r}".format(lemma2))

                # *****************
                # *** IMPORTANT *** The key is always (larger index, smaller index);
                # the same rule must be applied when retrieving similarity scores.
                # *****************
                sim_key = (token1_id, token2_id) if token1_id > token2_id else (token2_id, token1_id)

                # Skip identical lemmas and pairs already computed, before
                # paying for the synset lookup of the second lemma.
                if token1_id == token2_id or sim_key in sims_all_synsets:
                    continue
                if not syns_tk1:
                    continue
                syns_tk2 = wn.synsets(lemma=lemma2)
                if not syns_tk2:
                    continue

                # Keep only the non-zero similarities
                all_sims = []
                for s1 in syns_tk1:
                    for s2 in syns_tk2:
                        res = wordnet_similarity(s1, s2, metric=metric, ic=ic)
                        if res != 0.0:
                            all_sims.append(res)

                if all_sims:
                    # NOTE(review): this is the first NON-ZERO similarity, which is
                    # not necessarily the one between the first synset of each
                    # lemma -- confirm that this is the intended "first synset" score.
                    sims_first_synset[sim_key] = all_sims[0]
                    sims_all_synsets[sim_key] = max(all_sims)
                    count_max += len(syns_tk1)*len(syns_tk2)
                    count_first += 1

    print(count_first, "new token pairs computed.")
    print(count_max, "Wordnet similarity computed.")
    return None

In [96]:
def compute_msrpc_wordnet_sims_lemmaspos(pairs, vocab, lemmapos2index, tags_mapping,
                                          sims_first_synset, sims_all_synsets, metric, ic,
                                          texts=None):
    """Compute the WordNet <metric> similarity for all the pairs of tokens needed in the MSRPCorpus
    using the lemmas and POS tags of the given tokens to extract the wordnet synsets.

    Only tokens whose tag maps to a WordNet POS are considered, and a token of
    the first text is only compared with tokens of the second text that map
    to the same WordNet POS.

    Args:
        pairs: Iterable of (id1, id2) text-id pairs.
        vocab: Not used; kept so the signature matches the sibling functions.
        lemmapos2index: Dict mapping (lemma, WordNet POS) to its vocabulary index.
        tags_mapping: Dict mapping the first two characters of a tag to a WordNet POS.
        sims_first_synset: Dict updated in place, sim_key -> first non-zero
            similarity found while iterating over the synset combinations.
        sims_all_synsets: Dict updated in place, sim_key -> largest similarity
            over all synset combinations. Also used as the "already computed" cache.
        metric: Measure name, forwarded to wordnet_similarity.
        ic: Information-content dictionary, forwarded to wordnet_similarity.
        texts: Optional dict, text id -> list of (token, lemma, stem, tag).
            Defaults to the notebook-level parsed_texts.

    Returns:
        None. Results are written into the two dictionaries; two counters are printed.

    Raises:
        KeyError: If a (lemma, POS) entry is missing from lemmapos2index.
    """
    if texts is None:
        texts = parsed_texts

    count_first = 0
    count_max = 0

    for id1, id2 in pairs:
        for token1, lemma1, stem1, tag1 in texts[id1]:
            # Only look at tags that have a WordNet POS
            wn_tag1 = tags_mapping.get(tag1[:2])
            if not wn_tag1:
                continue
            token1_id = lemmapos2index.get((lemma1, wn_tag1))
            if token1_id is None:
                raise KeyError("lemma1 not in vocabulary: {!r}".format((lemma1, wn_tag1)))
            syns_tk1 = wn.synsets(lemma=lemma1, pos=wn_tag1)

            for token2, lemma2, stem2, tag2 in texts[id2]:
                # Only compare tokens that share the same WordNet POS
                wn_tag2 = tags_mapping.get(tag2[:2])
                if not wn_tag2 or wn_tag1 != wn_tag2:
                    continue
                token2_id = lemmapos2index.get((lemma2, wn_tag2))
                if token2_id is None:
                    raise KeyError("lemma2 not in vocabulary: {!r}".format((lemma2, wn_tag2)))

                # *****************
                # *** IMPORTANT *** The key is always (larger index, smaller index);
                # the same rule must be applied when retrieving similarity scores.
                # *****************
                sim_key = (token1_id, token2_id) if token1_id > token2_id else (token2_id, token1_id)

                # Skip identical entries and pairs already computed, before
                # paying for the synset lookup of the second lemma.
                if token1_id == token2_id or sim_key in sims_all_synsets:
                    continue
                if not syns_tk1:
                    continue
                syns_tk2 = wn.synsets(lemma=lemma2, pos=wn_tag2)
                if not syns_tk2:
                    continue

                # Keep only the non-zero similarities
                all_sims = []
                for s1 in syns_tk1:
                    for s2 in syns_tk2:
                        res = wordnet_similarity(s1, s2, metric=metric, ic=ic)
                        if res != 0.0:
                            all_sims.append(res)

                if all_sims:
                    # NOTE(review): this is the first NON-ZERO similarity, which is
                    # not necessarily the one between the first synset of each
                    # lemma -- confirm that this is the intended "first synset" score.
                    sims_first_synset[sim_key] = all_sims[0]
                    sims_all_synsets[sim_key] = max(all_sims)
                    count_max += len(syns_tk1)*len(syns_tk2)
                    count_first += 1

    print(count_first, "new token pairs computed.")
    print(count_max, "Wordnet similarity computed.")
    return None

### WordNet similarity
path, lch, wup, res, jcn, lin

In [99]:
# Compute and serialize the similarities of every IC-based measure, for every
# IC source and every feature type (tokens, lemmas, lemmas+pos).
# Output files: <metric>_<first|all>_<feature>_<ic_name>_<date>.pickle

# Loop-invariant values, computed once so all files of a run share the same date
tags_mapping = {"NN":wn.NOUN, "VB":wn.VERB, "JJ":wn.ADJ, "RB":wn.ADV}
date_obj = datetime.date.today()
date_str = date_obj.strftime("%Y%m%d")

# (banner printed, file-name part, compute function, vocabulary, entry->index dict)
feature_variants = [
    ("Using only tokens", "tokens", compute_msrpc_wordnet_sims_tokens, vocab_tokens, token2index),
    ("Using only lemmas", "lemmas", compute_msrpc_wordnet_sims_lemmas, vocab_lemmas, lemma2index),
    ("Using lemmas+pos", "lemmapos", compute_msrpc_wordnet_sims_lemmaspos, vocab_lemmapos, lemmapos2index),
]

for ic_data, ic_name in zip([bnc_ic_2000, semcor_ic, brown_ic], ['bnc_ic_2000', 'semcor_ic', 'brown_ic']):
    for metric in ['lin', 'jcn', 'res']:#['lin', 'jcn', 'res', 'path', 'lch', 'wup']:
        print("METRIC:", metric)
        for banner, feature_name, compute_sims, feature_vocab, feature_index in feature_variants:
            print(banner)
            print("Training dataset")
            # Both splits fill the same two dictionaries
            sims_first_synset = {}
            sims_all_synsets = {}
            compute_sims(train_pairs, feature_vocab, feature_index, tags_mapping,
                         sims_first_synset, sims_all_synsets, metric, ic_data)
            print("Test dataset")
            compute_sims(test_pairs, feature_vocab, feature_index, tags_mapping,
                         sims_first_synset, sims_all_synsets, metric, ic_data)
            print("Total pairs:", len(sims_first_synset))

            # Context managers make sure every file is flushed and closed
            for synset_mode, sims in (("first", sims_first_synset), ("all", sims_all_synsets)):
                out_name = metric+"_"+synset_mode+"_"+feature_name+"_"+ic_name+"_"+date_str+".pickle"
                with open(out_name, "wb") as fout:
                    pickle.dump(sims, fout, protocol=pickle.HIGHEST_PROTOCOL)
        print("\n")

METRIC: lin
Using only tokens
Training dataset
177413 new token pairs computed.
13452394 Wordnet similarity computed.
Test dataset
55911 new token pairs computed.
4225212 Wordnet similarity computed.
Total pairs: 233324
Using only lemmas
Training dataset
174284 new token pairs computed.
12462095 Wordnet similarity computed.
Test dataset
52244 new token pairs computed.
3635155 Wordnet similarity computed.
Total pairs: 226528
Using lemmas+pos
Training dataset
64372 new token pairs computed.
2129263 Wordnet similarity computed.
Test dataset
21304 new token pairs computed.
640119 Wordnet similarity computed.
Total pairs: 85676


METRIC: jcn
Using only tokens
Training dataset
227892 new token pairs computed.
15621875 Wordnet similarity computed.
Test dataset
73320 new token pairs computed.
4928741 Wordnet similarity computed.
Total pairs: 301212
Using only lemmas
Training dataset
215651 new token pairs computed.
13761833 Wordnet similarity computed.
Test dataset
65973 new token pairs comput

In [83]:
# First noun synset of "hit"; the cells that follow score it against itself
# with each IC-based measure.
s = wn.synsets("hit", "n")[0]

In [84]:
import nltk.corpus.reader.wordnet as wn2

In [92]:
# NOTE(review): _lcs_ic is a private NLTK helper; it appears to return the IC
# of both synsets and of their least common subsumer -- confirm against the
# installed NLTK source.
wn2._lcs_ic(s, s,bnc_ic_2007)

(8.907087158260376, 8.907087158260376, 8.907087158260376)

In [87]:
# Jiang-Conrath similarity of a synset with itself; in the recorded run it
# equals the value displayed for wn2._INF.
wn.jcn_similarity(s, s, bnc_ic_2007)

1e+300

In [88]:
# Resnik similarity of a synset with itself; in the recorded run it equals the
# values returned by _lcs_ic for the same synset.
wn.res_similarity(s, s, bnc_ic_2007)

8.907087158260376

In [90]:
# Lin similarity of a synset with itself (1.0 in the recorded run).
wn.lin_similarity(s, s, bnc_ic_2007)

1.0

In [91]:
# Private constant of NLTK's wordnet module, displayed to compare it with the
# Jiang-Conrath self-similarity shown in this section.
wn2._INF

1e+300

##### Serializing the WordNet similarity scores

In [None]:
# Serialize the similarity dictionaries currently held in sims_first_synset and
# sims_all_synsets.
# NOTE(review): these two dictionaries and `metric` are whatever the last
# iteration of the computation loop left behind -- confirm this is intended.
# `date_str` is the date suffix defined in this notebook; the previous name
# `date` was never defined. Context managers close the files after writing.
with open(metric+"_first_"+date_str+".pickle", "wb") as fout:
    pickle.dump(sims_first_synset, fout, protocol=pickle.HIGHEST_PROTOCOL)
with open(metric+"_all_"+date_str+".pickle", "wb") as fout:
    pickle.dump(sims_all_synsets, fout, protocol=pickle.HIGHEST_PROTOCOL)

#### Computing the WordNet similarity scores of the vocabulary terms
We compute these scores in order to normalize the similarity measures that do not have a range between 0 and 1.

path, lch, wup, res, jcn, lin

In [None]:
# For every measure, compute the similarity of each vocabulary entry with
# itself (maximum over its synsets) in order to inspect the value range of the
# measure, and serialize the result as vocab_<metric>_<date>.pickle.
#
# The entries are (lemma, tag) tuples, so the lemma+pos structures built in
# this notebook are used (vocab_lemmapos, lemmapos2index, index2lemmapos)
# together with date_str; the names previously referenced here (vocab,
# token_to_index, index_to_token, date) are not defined anywhere in the notebook.
wn_tags = set(['v','n','r','a'])

for metric in ['path', 'lch', 'wup', 'res', 'jcn', 'lin']:
    print(metric)
    vocab_sim = {}

    # The tag is already a WordNet tag when it is one of wn_tags
    for lemma, tag in vocab_lemmapos:
        idx = lemmapos2index[(lemma, tag)]
        if tag in wn_tags:
            all_sims = []
            for synset in wn.synsets(lemma=lemma, pos=tag):
                try:
                    score = wordnet_similarity(synset, synset, metric=metric, ic=bnc_ic_2000)
                    if score > 0.0:
                        all_sims.append(score)
                except ZeroDivisionError:
                    # Deliberately ignored: this synset contributes no score
                    pass
            if len(all_sims) > 0:
                vocab_sim[idx] = max(all_sims)

    print("Vocab size:", len(vocab_lemmapos))

    for pos in ['n','v','r','a']:
        print(pos)
        non_zero = [x for idx, x in vocab_sim.items() if x > 0.0 and index2lemmapos[idx][1]==pos]

        print("Sims computed:", len(vocab_sim))

        print("Max:", max(non_zero, default=-1))
        print("Min", min(non_zero, default=-1))
        print("Mean:", -1 if len(non_zero) == 0 else sum(non_zero)/len(non_zero))

    # Context manager makes sure the file is flushed and closed
    with open("vocab_"+metric+"_"+date_str+".pickle", "wb") as fout:
        pickle.dump(vocab_sim, fout, protocol=pickle.HIGHEST_PROTOCOL)
    print()

WordNet similarity ranges:
- path: 0.0 - 1.0
- lch:
    - nouns: 0.0 - 3.6375861597263857
    - verbs: 0.0 - 3.258096538021482
- wup: 0.0 - 1.0
- res: 0.0 - 1e+300    **This number (inf) is too big. Need to check the maximum value in the similarity file**
- jcn: 0.0 - 1e+300    **This number (inf) is too big. Need to check the maximum value in the similarity file**
- lin: 0.0 - 1.0

**Need to check that all the similarities computed fall within these ranges**

**Questions**

- Shall we generate an index for the words or not? **Yes. Done!**
- Shall we remove the tokens composed by only punctuation symbols? **Yes. Done!**
- Shall we replace the numbers by a unique identifier?
- What is the range of each WordNet similarity measure? **Done for our specific corpus!**
- How to serialize the Information Content from British National Corpus (BNC)? **I implemented it manually. Done!**

**Tasks**

- Define the similarities needed in both datasets. **Done!**
- Run example for all WordNet similarity measures. **Done!**
- Serialize all the similarities needed in both, training and test dataset for each of the WordNet similarities. Try with first synset and with max of all synsets. **Done!**
    - Save the values without normalization. We need to know the ranges of the WordNet metrics or the maximum value to normalize. **Done for MSRPC!**
- When serializing the WordNet similarity values, we also need to serialize the vocabulary, token_to_index, and index_to_token data structures. **Done**

**Notes**
- The WordNet metrics that need normalization, because their values are not in the 0-1 range, won't be easily normalized because there is no score for the similarity of a lemma with itself other than Inf.
    - An idea for normalizing is to compute the similarity of a large number of words with themselves and pick the highest value.

#### Computing IDF over the BNC Corpus

In [None]:
def compute_idf(corpus):
    """Compute the inverse document frequency (IDF) of every word of a corpus.

    Each file id of the corpus is treated as one document and words are
    lower-cased before counting. For a word appearing in df of the N
    documents, the IDF is log(N / df) (natural logarithm).

    Args:
        corpus: Object exposing fileids() and words(fileid).

    Returns:
        Dict mapping every lower-cased word to its IDF. An empty corpus
        yields an empty dict.
    """
    fileids = corpus.fileids()

    # Document frequency: number of documents in which each word appears
    df = {}
    for fid in fileids:
        for word in {token.lower() for token in corpus.words(fid)}:
            df[word] = df.get(word, 0) + 1

    N = len(fileids)
    return {word: np.log(N/count) for word, count in df.items()}

In [None]:
# Alternative corpus for a quick run:
#corpus = nltk.corpus.brown

# BNC 2007 XML version.
# The dot before "xml" is escaped so that it only matches a literal '.',
# consistent with the fileids pattern used for the BNC IC computation.
corpus = nltk.corpus.reader.BNCCorpusReader(root='D:/Corpus/2554/2554/download/Texts/',
                                            fileids=r'[A-K]/\w*/\w*\.xml')
idf = compute_idf(corpus)
len(idf)

In [None]:
# Serialize the IDF dictionary; the context manager makes sure the file is
# flushed and closed.
with open("idf_20170810.pickle", "wb") as fout:
    pickle.dump(idf, fout, protocol=pickle.HIGHEST_PROTOCOL)