In [2]:
import pickle
import math
import nltk
from nltk.corpus import wordnet as wn
import numpy as np
import string
import itertools
import datetime
import sys
import re
import os
import Stemmer

# Functions

### Similarity Metrics

#### Softcosine similarity function
$softcosine(a,b)=\frac{ \sum_{i}^{N}{ \sum_{j}^{N}{s_{ij} a_i b_j} } }{ \sqrt{ \sum_{i}^{N}{ \sum_{j}^{N}{s_{ij} a_i a_j} } } \sqrt{ \sum_{i}^{N}{ \sum_{j}^{N}{s_{ij} b_i b_j} } } }$

In [3]:
def soft_euclidean_norm(vect, ft_id2wn_id, term_sims):
    """Return the soft Euclidean norm of a sparse vector.

    vect        -- mapping ``feature_id -> weight``.
    ft_id2wn_id -- mapping ``feature_id -> iterable of WordNet-getter ids``.
    term_sims   -- mapping ``(larger_id, smaller_id) -> similarity``; pairs
                   that are absent contribute nothing.

    Identical features have similarity 1; for two different features the
    largest similarity found among their WordNet-getter ids is used.
    """
    total = 0.0
    for id_a, weight_a in vect.items():
        for id_b, weight_b in vect.items():
            # Note: (a_i, a_j) and (a_j, a_i) are both visited on purpose,
            # as required by the double sum of the formula.
            if id_a == id_b:
                total += weight_a * weight_b
                continue
            # Keys of term_sims always store the larger id first.
            candidates = [
                term_sims[key]
                for key in (
                    (wn_a, wn_b) if wn_a > wn_b else (wn_b, wn_a)
                    for wn_a in ft_id2wn_id[id_a]
                    for wn_b in ft_id2wn_id[id_b]
                )
                if key in term_sims
            ]
            if candidates:
                total += max(candidates) * weight_a * weight_b
    return math.sqrt(total)

In [4]:
def softcosine(vect1, vect2, ft_id2wn_id, term_sims):
    """Return the soft cosine similarity between two sparse vectors.

    vect1, vect2 -- mappings ``feature_id -> weight``.
    ft_id2wn_id  -- mapping ``feature_id -> iterable of WordNet-getter ids``.
    term_sims    -- mapping ``(larger_id, smaller_id) -> similarity``.

    Returns 0.0 when either soft norm is zero.
    """
    denominator = (soft_euclidean_norm(vect1, ft_id2wn_id, term_sims)
                   * soft_euclidean_norm(vect2, ft_id2wn_id, term_sims))
    if denominator == 0.0:
        return 0.0

    numerator = 0.0
    for id_a, weight_a in vect1.items():
        for id_b, weight_b in vect2.items():
            if id_a == id_b:
                # Identical features are fully similar.
                numerator += weight_a * weight_b
                continue
            # Keys of term_sims always store the larger id first.
            candidates = [
                term_sims[key]
                for key in (
                    (wn_a, wn_b) if wn_a > wn_b else (wn_b, wn_a)
                    for wn_a in ft_id2wn_id[id_a]
                    for wn_b in ft_id2wn_id[id_b]
                )
                if key in term_sims
            ]
            if candidates:
                numerator += max(candidates) * weight_a * weight_b
    return numerator / denominator

#### Cosine similarity function
$cosine(a,b)=\frac{a \cdot b}{|a| |b|}=\frac{\sum_{i}^{N}{a_i b_i}}{\sqrt{ \sum_{i}^{N}{a_i^2} } \sqrt{ \sum_{i}^{N}{b_i^2} }}$

In [5]:
def euclidean_norm(vect):
    """Return the Euclidean (L2) norm of a sparse vector ``feature_id -> weight``."""
    squared_total = 0.0
    for weight in vect.values():
        squared_total += weight ** 2
    return math.sqrt(squared_total)

In [6]:
def cosine(vect1, vect2):
    """Return the cosine similarity between two sparse vectors.

    Both arguments map ``feature_id -> weight``. Returns 0.0 when either
    vector has zero norm.
    """
    denominator = euclidean_norm(vect1) * euclidean_norm(vect2)
    if denominator == 0.0:
        return 0.0

    numerator = 0.0
    # Only features present in both vectors contribute to the dot product.
    for token_id, weight in vect1.items():
        if token_id in vect2:
            numerator += weight * vect2[token_id]
    return numerator / denominator

#### Stevenson similarity metric
Fernando, S. and Stevenson, M. (2008). **A semantic similarity approach to paraphrase detection**, *Computational Linguistics UK (CLUK 2008) 11th Annual Research Colloquium*

$stevenson\_sim(a,b)=\frac{a \cdot W \cdot b}{|a| |b|}=\frac{ \sum_{i}^{N}{ \sum_{j}^{N}{W_{ij} a_i b_j} } }{\sqrt{ \sum_{i}^{N}{a_i^2} } \sqrt{ \sum_{i}^{N}{b_i^2} }}$

In [7]:
def stevenson_sim(vect1, vect2, ft_id2wn_id, term_sims):
    """Return the Fernando & Stevenson similarity between two sparse vectors.

    The numerator is the similarity-weighted dot product ``a . W . b`` and
    the denominator is the product of the plain Euclidean norms.

    vect1, vect2 -- mappings ``feature_id -> weight``.
    ft_id2wn_id  -- mapping ``feature_id -> iterable of WordNet-getter ids``.
    term_sims    -- mapping ``(larger_id, smaller_id) -> similarity``.

    Returns 0.0 when either vector has zero norm.
    """
    denominator = euclidean_norm(vect1) * euclidean_norm(vect2)
    if denominator == 0.0:
        return 0.0

    numerator = 0.0
    for id_a, weight_a in vect1.items():
        for id_b, weight_b in vect2.items():
            if id_a == id_b:
                # Identical features are fully similar.
                numerator += weight_a * weight_b
                continue
            # Keys of term_sims always store the larger id first.
            candidates = [
                term_sims[key]
                for key in (
                    (wn_a, wn_b) if wn_a > wn_b else (wn_b, wn_a)
                    for wn_a in ft_id2wn_id[id_a]
                    for wn_b in ft_id2wn_id[id_b]
                )
                if key in term_sims
            ]
            if candidates:
                numerator += max(candidates) * weight_a * weight_b
    return numerator / denominator

#### Mihalcea's similarity method

In [8]:
def _mihalcea_directed_score(source_vect, target_vect, ft_id2wn_id, ft_id2token_id,
                             term_sims, idf, idf_median_value):
    """Return the IDF-weighted average of the best match of each source feature in the target."""
    weighted_sum = 0.0
    idf_sum = 0.0
    for source_id in source_vect:
        best_sim = 0.0
        for target_id in target_vect:
            if source_id == target_id:
                # An exact match is the best possible match: stop searching.
                best_sim = 1.0
                break
            for wn_source in ft_id2wn_id[source_id]:
                for wn_target in ft_id2wn_id[target_id]:
                    # Keys of term_sims always store the larger id first.
                    sim_key = ((wn_source, wn_target) if wn_source > wn_target
                               else (wn_target, wn_source))
                    if sim_key in term_sims:
                        sval = term_sims[sim_key]
                        if sval > best_sim:
                            best_sim = sval
        # A feature may map to several token ids; use the most informative one.
        idf_val = max(idf.get(x, idf_median_value) for x in ft_id2token_id[source_id])
        weighted_sum += best_sim * idf_val
        idf_sum += idf_val
    if idf_sum == 0.0:
        # Empty vector or all-zero IDF weights: no evidence of similarity.
        return 0.0
    return weighted_sum / idf_sum


def mihalcea_sim(vect1, vect2, ft_id2wn_id, ft_id2token_id, term_sims, idf, idf_median_value):
    """Return Mihalcea's text-to-text similarity between two sparse vectors.

    For each feature of one text the most similar feature of the other text is
    found, the best similarities are averaged with IDF weights, and the two
    directional scores are averaged.

    vect1, vect2     -- mappings keyed by feature id (only the keys are used).
    ft_id2wn_id      -- mapping ``feature_id -> iterable of WordNet-getter ids``.
    ft_id2token_id   -- mapping ``feature_id -> iterable of ids used as idf keys``.
    term_sims        -- mapping ``(larger_id, smaller_id) -> similarity``.
    idf              -- mapping ``id -> idf weight``.
    idf_median_value -- idf weight used for ids missing from ``idf``.

    A direction whose IDF weights sum to zero (empty vector, or all weights 0)
    contributes 0.0 instead of raising ZeroDivisionError, consistent with the
    other metrics that return 0.0 for a zero denominator.
    """
    score_1_to_2 = _mihalcea_directed_score(vect1, vect2, ft_id2wn_id, ft_id2token_id,
                                            term_sims, idf, idf_median_value)
    score_2_to_1 = _mihalcea_directed_score(vect2, vect1, ft_id2wn_id, ft_id2token_id,
                                            term_sims, idf, idf_median_value)
    return 1/2 * (score_1_to_2 + score_2_to_1)

### Function to compute similarity metrics for all pairs in MSRPCorpus
Training instances

**TODO**
- Implement the tf-idf and log-entropy measures.
    - Compute the idf and entropy over some of the existing corpora
- Implement Mihalcea's method

In [9]:
def compute_sims(pairs, vectors, feature, ft_id2wn_id, ft_id2token_id, metric="cosine",
                 weight_scheme="tf", term_sims=None, idf=None, idf_median_value=0.0):
    """Compute one similarity value per text pair.

    pairs            -- iterable of ``(text_id1, text_id2)``.
    vectors          -- mapping ``text_id -> sequence of index tuples``.
    feature          -- position inside each index tuple of the feature to use.
    ft_id2wn_id      -- mapping passed through to the WordNet-based metrics.
    ft_id2token_id   -- mapping passed through to ``mihalcea_sim``.
    metric           -- "cosine", "softcosine", "stevenson" or "mihalcea".
    weight_scheme    -- "tf" (term frequency) or "binary".
    term_sims        -- term similarity mapping (defaults to an empty one).
    idf              -- idf mapping for "mihalcea" (defaults to an empty one).
    idf_median_value -- idf weight for ids missing from ``idf``.

    Returns the list of similarities, in the order of ``pairs``.
    Raises ValueError for an unknown ``weight_scheme`` or ``metric``.
    """
    # None instead of {} as default avoids sharing one mutable dict across calls.
    if term_sims is None:
        term_sims = {}
    if idf is None:
        idf = {}

    res = []
    for text_id1, text_id2 in pairs:
        # Vectorization
        if weight_scheme == "tf":
            v1 = nltk.FreqDist([x[feature] for x in vectors[text_id1]])
            v2 = nltk.FreqDist([x[feature] for x in vectors[text_id2]])
        elif weight_scheme == "binary":
            v1 = dict.fromkeys(set([x[feature] for x in vectors[text_id1]]), 1)
            v2 = dict.fromkeys(set([x[feature] for x in vectors[text_id2]]), 1)
        else:
            raise ValueError("Unrecognized weighting scheme: {!r}".format(weight_scheme))

        # Similarity computation
        if metric == "softcosine":
            res.append(softcosine(v1, v2, ft_id2wn_id, term_sims))
        elif metric == "cosine":
            res.append(cosine(v1, v2))
        elif metric == "stevenson":
            res.append(stevenson_sim(v1, v2, ft_id2wn_id, term_sims))
        elif metric == "mihalcea":
            res.append(mihalcea_sim(v1, v2, ft_id2wn_id, ft_id2token_id, term_sims, idf, idf_median_value))
        else:
            raise ValueError("Unimplemented similarity metric: {!r}".format(metric))
    return res

#### Optimizing the similarity threshold

In [10]:
def compute_scores(pair_sims, y, thresholds):
    """Score thresholded similarities against the gold labels.

    pair_sims  -- similarity value of each pair.
    y          -- gold labels, aligned with ``pair_sims``.
    thresholds -- a single threshold, or an iterable of thresholds.

    A pair is predicted positive (1) when its similarity is >= the threshold.
    Returns ``(threshold, accuracy, f_measure, precision, recall)`` for a
    single threshold, or a list of such tuples for an iterable.
    """
    def _score_at(cutoff):
        predictions = [1 if val >= cutoff else 0 for val in pair_sims]
        return (cutoff,) + evaluate(predictions, y)

    if not hasattr(thresholds, '__iter__'):
        return _score_at(thresholds)
    return [_score_at(th) for th in thresholds]

### Preprocessing

#### Check if a token is made purely of punctuation symbols

In [11]:
def is_punct_token(token, puncts):
    """Return True when every character of ``token`` is in ``puncts`` (also True for an empty token)."""
    return all(char in puncts for char in token)

### Evaluation metrics
accuracy, f_measure, precision, recall

In [12]:
def evaluate(result, truth):
    """Compare system predictions with the gold standard.

    result -- predicted labels (1 means positive, anything else negative).
    truth  -- gold labels, aligned with ``result``.

    Returns ``(accuracy, f_measure, precision, recall)``; every measure whose
    denominator is zero is reported as 0.
    """
    # Confusion-matrix counts keyed by (gold is positive, predicted is positive).
    outcomes = {(True, True): 0, (True, False): 0,
                (False, True): 0, (False, False): 0}
    for predicted, gold in zip(result, truth):
        outcomes[(bool(gold == 1), bool(predicted == 1))] += 1

    tp = outcomes[(True, True)]
    fn = outcomes[(True, False)]
    fp = outcomes[(False, True)]
    tn = outcomes[(False, False)]

    total = tp + tn + fp + fn
    accuracy = (tp + tn) / total if total != 0 else 0.0
    precision = tp / (tp + fp) if tp + fp != 0 else 0.0
    recall = tp / (tp + fn) if tp + fn != 0 else 0.0
    if precision + recall == 0:
        f_measure = 0
    else:
        f_measure = 2*precision*recall/(precision+recall)
    return accuracy, f_measure, precision, recall

### Normalization strategy for each WordNet metric

In [13]:
def normalize(term_sims, metric):
    """Return a copy of ``term_sims`` rescaled according to the WordNet metric.

    term_sims -- mapping ``(id1, id2) -> similarity``; it is never modified.
    metric    -- "path", "wup", "lin" (plain copy, no rescaling), "lch"
                 (min-max scaling) or "res", "jcn" (min-max scaling over the
                 finite values; WordNet "infinite" values map to 1.0).

    Degenerate inputs are handled explicitly: an empty mapping yields an empty
    mapping, and when all finite values are equal they map to 0.0 instead of
    producing NaN through a 0/0 division.

    Raises ValueError for an unrecognized metric.
    """
    if metric in ("path", "wup", "lin"):
        # No need for normalization: just copy the similarity matrix.
        return dict(term_sims)
    if metric not in ("lch", "res", "jcn"):
        raise ValueError("Unrecognized metric: {!r}".format(metric))

    # WordNet reports "infinite" res/jcn similarity as 1e300. Any value at or
    # above this cutoff is treated as infinite and excluded from the min/max.
    inf_cutoff = 10000 if metric in ("res", "jcn") else None

    def _is_infinite(val):
        return inf_cutoff is not None and val >= inf_cutoff

    finite = np.array([val for val in term_sims.values() if not _is_infinite(val)])
    if finite.size == 0:
        # Nothing to scale: either no entries at all, or only infinite ones.
        return {key: 1.0 for key in term_sims}

    _min = np.min(finite)
    span = np.max(finite) - _min

    res = {}
    for key, val in term_sims.items():
        if _is_infinite(val):
            res[key] = 1.0
        elif span == 0:
            res[key] = 0.0
        else:
            res[key] = (val - _min) / span
    return res

# MAIN

#### Loading the parsed MSRPCorpus

In [14]:
# Load the parsed MSRP corpus: the parsed texts plus the train/test pairs and
# their gold labels. The file is closed as soon as it has been read.
# NOTE: pickle.load can execute arbitrary code; only load trusted files.
with open("msrpc_parsed_20170821.pickle", 'rb') as pickle_file:
    [parsed_texts,
     train_pairs,
     train_y,
     test_pairs,
     test_y] = pickle.load(pickle_file)

In [15]:
# Load, for every feature type, its vocabulary and the two lookup tables
# (item -> index and index -> item). Each file is closed right after reading.
# NOTE: pickle.load can execute arbitrary code; only load trusted files.
with open("tokens_data_20170821.pickle", 'rb') as pickle_file:
    [vocab_tokens, token2index, index2token] = pickle.load(pickle_file)
with open("lemmas_data_20170821.pickle", 'rb') as pickle_file:
    [vocab_lemmas, lemma2index, index2lemma] = pickle.load(pickle_file)
with open("stems_data_20170821.pickle", 'rb') as pickle_file:
    [vocab_stems, stem2index, index2stem] = pickle.load(pickle_file)
with open("lemmapos_data_20170821.pickle", 'rb') as pickle_file:
    [vocab_lemmapos, lemmapos2index, index2lemmapos] = pickle.load(pickle_file)

#### Loading WordNet similarity "matrix"
It is a dictionary (token1_id, token2_id): sim_value, where token1_id > token2_id

In [16]:
# Every WordNet similarity matrix lives in "<variable name>_20170821.pickle",
# where the variable name is
#     <metric>_<synset strategy>_<synset source>            (path, lch, wup)
#     <metric>_<synset strategy>_<synset source>_<ic data>  (res, jcn, lin)
# e.g. path_all_tokens, lch_first_lemmapos, res_all_lemmas_semcor_ic.
# The matrices are bound to module-level names because the experiment loop
# looks them up by their constructed name.
# NOTE: pickle.load can execute arbitrary code; only load trusted files.

def _load_similarity_pickle(name):
    """Load ``<name>_20170821.pickle``, closing the file after reading it."""
    with open(name + "_20170821.pickle", 'rb') as pickle_file:
        return pickle.load(pickle_file)


_synset_strategies = ("all", "first")
_synset_sources = ("tokens", "lemmas", "lemmapos")
_ic_corpora = ("bnc_ic_2007", "bnc_ic_2000", "semcor_ic", "brown_ic")

# PATH, LCH and WUP similarities (no information content needed)
for _metric in ("path", "lch", "wup"):
    for _strategy in _synset_strategies:
        for _source in _synset_sources:
            _name = "_".join((_metric, _strategy, _source))
            globals()[_name] = _load_similarity_pickle(_name)

# RES, JCN and LIN similarities (one matrix per information content corpus)
for _metric in ("res", "jcn", "lin"):
    for _strategy in _synset_strategies:
        for _source in _synset_sources:
            for _ic in _ic_corpora:
                _name = "_".join((_metric, _strategy, _source, _ic))
                globals()[_name] = _load_similarity_pickle(_name)

In [17]:
# IDF weights used by Mihalcea's method. The file is closed after reading.
# NOTE: pickle.load can execute arbitrary code; only load trusted files.
with open("idf_bnc2007_20170810.pickle", 'rb') as pickle_file:
    idf = pickle.load(pickle_file)

Possible Configurations TODO list
- Remove stopwords **Implemented!**
- Remove punctuation **Implemented!**
- Use binary or term frequency **Implemented!**
- Different kind of term_sims normalizations **Implemented!**
- Different WordNet term_sims **Implemented!**
- Softcosine or stevenson metric **Implemented!**
- Set as 0 all term similarity values below a given threshold **Implemented!**

#### Parameters

In [18]:
# Candidate values for every experiment parameter. The configuration grid is
# built from these lists.

# Pre-processing switches.
remove_stopwords_opt = [False, True]
remove_punctuation_opt = [False, True]
# WordNet term-term similarity metrics.
termsim_metric_opt = ["path", "lch", "wup", "res", "jcn", "lin"]
# Whether the term similarity matrix is normalized before use.
normalization_opt = [True, False]
# Minimum term-term similarity; lower values are discarded.
termsim_threshold_opt = [0.0, 0.25, 0.5, 0.75]#np.linspace(0.0, 1.0, 21) # Every 5% # np.linspace(0.5, 1.0, 10)
# Synset selection strategy: all synsets of a word, or only the first one.
synsets_taken_opt = ["all", "first"]
# Feature weighting scheme of the text vectors.
wscheme_opt = ["tf", "binary"]
# Text-to-text similarity methods.
sim_metric_opt = ["mihalcea", "softcosine", "stevenson"]

# This order should be kept given that it is used later
# (the position of a feature in features_opt is used as its position inside
# the index tuples of the text vectors).
features_opt = ["token", "lemma", "stem", "lemmapos"]
# Features that can be used to look up synsets (no "stem").
synset_getter_opt = ["token", "lemma", "lemmapos"]
# Information content data sets, needed by the res, jcn and lin metrics.
ic_data_opt = ["bnc_ic_2007", "bnc_ic_2000", "semcor_ic", "brown_ic"]

In [19]:
# Sanity check of the pluralisation rule used to build variable names:
# an 's' is appended unless the name already ends in 's'.
for getter_name in synset_getter_opt:
    plural_suffix = '' if getter_name[-1] == 's' else 's'
    print("vocab_" + getter_name + plural_suffix)

vocab_tokens
vocab_lemmas
vocab_lemmapos


# Running experiments with different combinations of parameters

#### Generating configurations
Generating the configurations trying to avoid any unnecessary repetition

configurations = list(itertools.product(remove_stopwords_opt,
                                        remove_punctuation_opt,
                                        termsim_metric_opt,
                                        normalization_opt,
                                        #termsim_threshold_opt,
                                        synsets_taken_opt,
                                        wscheme_opt,
                                        sim_metric_opt))

In [20]:
# Each group lists the term similarity metrics that share the same needs:
# (metrics, information content choices, normalization choices).
# [None] means that no information content data is needed.
_config_groups = [
    (["path", "wup"], [None], [False]),        # No normalization, No ic_data
    (["lin"], ic_data_opt, [False]),           # No normalization, Yes ic_data
    (["lch"], [None], [True]),                 # Yes normalization, No ic_data
    (["jcn", "res"], ic_data_opt, [True]),     # Yes normalization, Yes ic_data
]

configurations = []
for _metrics, _ic_choices, _normalization_choices in _config_groups:
    configurations.extend(
        itertools.product(remove_stopwords_opt,
                          [True],  # punctuation is always removed
                          _metrics,
                          features_opt,
                          synset_getter_opt,
                          _ic_choices,
                          _normalization_choices,
                          termsim_threshold_opt,
                          synsets_taken_opt,
                          wscheme_opt,
                          sim_metric_opt))

#### Converting from string to indexes given certain pre-processing options
Preprocessing options:
- Removing stopwords
- Removing punctuation

In [21]:
# Map the first two characters of a POS tag to the WordNet POS constants;
# tags without a mapping are kept unchanged.
tags_mapping = {"NN":wn.NOUN, "VB":wn.VERB, "JJ":wn.ADJ, "RB":wn.ADV}
stopwords = set(nltk.corpus.stopwords.words("english"))
puncts = set(string.punctuation)

# One dictionary per pre-processing option, each mapping
# text id -> list of (token_id, lemma_id, stem_id, lemmapos_id) tuples.
vectors_no_prep = {}    # nothing removed
vectors_no_sw = {}      # stopwords removed
vectors_no_punct = {}   # punctuation removed
vectors_prep = {}       # stopwords and punctuation removed

for idx in parsed_texts:
    # Convert every parsed token once, keeping the surface token for filtering.
    indexed_tokens = [
        (token, (token2index[token],
                 lemma2index[lemma],
                 stem2index[stem],
                 lemmapos2index[(lemma, tags_mapping.get(tag[:2], tag))]))
        for token, lemma, stem, tag in parsed_texts[idx]
    ]

    vectors_no_prep[idx] = [ids for _, ids in indexed_tokens]

    vectors_no_sw[idx] = [ids for token, ids in indexed_tokens
                          if token not in stopwords]

    vectors_no_punct[idx] = [ids for token, ids in indexed_tokens
                             if not is_punct_token(token, puncts)]

    vectors_prep[idx] = [ids for token, ids in indexed_tokens
                         if token not in stopwords
                         and not is_punct_token(token, puncts)]

#### Mapping feature indexes to wordnet getter indexes
For the particular case of the MSRP Corpus

In [22]:
# Position of each field inside the (token_id, lemma_id, stem_id, lemmapos_id)
# tuples stored in the text vectors. Looking fields up by position replaces the
# previous per-token eval() of a constructed variable name.
_field_position = {"token": 0, "lemma": 1, "stem": 2, "lemmapos": 3}

# mappings[(feature, syngetter)] maps every feature id to the set of
# synset-getter ids it co-occurs with in the corpus.
mappings = {}
for feature in features_opt:
    ft_pos = _field_position[feature]
    for syngetter in synset_getter_opt:
        wn_pos = _field_position[syngetter]
        comb_dict = {}
        for idx in vectors_no_prep:
            for ids in vectors_no_prep[idx]:
                comb_dict.setdefault(ids[ft_pos], set()).add(ids[wn_pos])
        mappings[(feature, syngetter)] = comb_dict
        

#### Experiments with given configurations in the Training Corpus

In [23]:
# Median of all known IDF weights; it is passed to the similarity computation
# as the weight of terms that have no IDF entry.
_idf_weights = list(idf.values())
idf_median_value = np.median(_idf_weights)
idf_median_value

8.30622521603216

In [24]:
# Run every configuration on the training pairs and store, per configuration,
# the score rows (threshold, accuracy, f_measure, precision, recall) sorted by
# decreasing accuracy.
all_scores = {}
count = 0
total_configurations = len(configurations)

# Pre-processed vectors keyed by (remove_stopwords, remove_punctuation).
vectors_by_prep = {
    (True, True): vectors_prep,
    (True, False): vectors_no_sw,
    (False, True): vectors_no_punct,
    (False, False): vectors_no_prep,
}

# Decision thresholds evaluated for every configuration.
score_thresholds = np.linspace(0.05, 1.0, 20)

# Consecutive configurations only differ in the weighting scheme and the text
# similarity method, so the normalized and thresholded term similarity matrix
# is rebuilt only when one of the parameters it depends on changes.
cached_termsim_key = None
cached_term_sims = None

print("Progress ...")
for configuration in configurations:
    (remove_stopwords,
     remove_punctuation,
     termsim_metric,
     feature,
     syngetter,
     ic_data,
     normalization,
     termsim_th,
     synsets_taken,
     wscheme,
     sim_metric) = configuration

    # Choosing the appropriate pre-processed vectors
    vectors = vectors_by_prep[(remove_stopwords, remove_punctuation)]

    # Choosing, normalizing and thresholding the term similarity matrix
    termsim_key = (termsim_metric, synsets_taken, syngetter,
                   ic_data, normalization, termsim_th)
    if termsim_key != cached_termsim_key:
        # Matrix variables are named <metric>_<strategy>_<plural getter>[_<ic data>]
        termsim_name = "_".join([termsim_metric,
                                 synsets_taken,
                                 syngetter + ('' if syngetter[-1] == 's' else 's')])
        if ic_data:
            termsim_name += "_" + ic_data
        term_sims = globals()[termsim_name]

        if normalization:
            term_sims = normalize(term_sims, termsim_metric)

        # Dropping the term similarities below termsim_th
        cached_term_sims = {key: value for key, value in term_sims.items()
                            if value >= termsim_th}
        cached_termsim_key = termsim_key
    term_sims = cached_term_sims

    # Computing pair of texts similarities
    sims = compute_sims(train_pairs,
                        vectors,
                        features_opt.index(feature),
                        mappings[(feature, syngetter)],
                        mappings[(feature, 'token')],
                        weight_scheme=wscheme,
                        metric=sim_metric,
                        term_sims=term_sims,
                        idf=idf,
                        idf_median_value=idf_median_value
                       )

    # Computing scores; rows are sorted by decreasing accuracy (column 1)
    scores = np.array(compute_scores(sims, train_y, score_thresholds))
    all_scores[configuration] = scores[scores[:,1].argsort()[::-1]]

    # Progress bar: 40 characters wide, redrawn in place
    sys.stdout.write('\r')
    count += 1
    i = count * 40.0 / total_configurations
    sys.stdout.write("[{:40}] {:.0f}%   {}/{}".format('='*int(i), int(2.5*i), count, total_configurations))
    sys.stdout.flush()

Progress ...
[==                                      ] 5%   964/17280

KeyboardInterrupt: 

##### Saving the results to disk

In [25]:
# Compact YYYYMMDD stamp of today's date, used to tag the results file name.
date_obj = datetime.date.today()
date_str = date_obj.strftime("%Y%m%d")
date_str

'20180504'

In [26]:
# Persist the scores collected by the grid search on the training pairs.
# The context manager guarantees the file handle is flushed and closed, even
# if pickling fails part-way (the bare open() call previously leaked it).
with open("results_"+date_str+"_simple_normalization.pickle", "wb") as results_file:
    pickle.dump(all_scores, results_file, protocol=pickle.HIGHEST_PROTOCOL)

#### Extracting best results

Configuration format (order of the fields in each configuration tuple, with their possible values):
1. Removing stopwords flag  :  [True, False]

2. Removing punctuation flag  :  [True, False]

3. Wordnet similarity metrics  :  ["path", "lch", "wup", "res", "jcn", "lin"]

4. Features extracted to compute similarity  :  ["token", "lemma", "stem", "lemmapos"]

5. Features used to extract synsets  :  ["token", "lemma", "stem", "lemmapos"]

6. Information Content used in some WordNet metrics  :  ["bnc_ic_2007", "bnc_ic_2000", "semcor_ic", "brown_ic"]

7. Normalization flag : [True, False]

8. Term-Term similarity minimum threshold  :  [0.0, 0.25, 0.5, 0.75]

9. Synsets selection strategy (all-vs-all, first)  :  ["all", "first"]

10. Features weighting scheme  :  ["tf", "binary"]

11. Text similarity method  :  ["mihalcea", "softcosine", "stevenson"]

In [24]:
# Each value of all_scores is a score table whose rows are laid out as
# (threshold, accuracy, f_measure, precision, recall); the grid-search loop
# stores the rows sorted by accuracy, best first.
# For every configuration take that first row, then return the entry
# (conf, threshold, accuracy, f_measure) with the largest f_measure.
max(
    (
        (conf, best_row[0], best_row[1], best_row[2])
        for conf, best_row in ((conf, res[0]) for conf, res in all_scores.items())
    ),
    key=lambda entry: entry[3],
)

((False,
  True,
  'res',
  'stem',
  'lemmapos',
  'semcor_ic',
  True,
  0.0,
  'first',
  'tf',
  'mihalcea'),
 0.54999999999999993,
 0.73307163886162907,
 0.82779360557138326)

In [28]:
# Number of configurations that currently have an entry in all_scores.
len(all_scores)

17280

In [34]:
# Show the whole score table stored for one specific configuration tuple
# (field order as listed under "Configuration format").
all_scores[(False, True, 'jcn', 'lemma', 'lemma', 'semcor_ic', True, 0.0, 'all', 'binary', 'stevenson')]

array([[ 0.55      ,  0.72988224,  0.82227603,  0.73997676,  0.92517254],
       [ 0.6       ,  0.72669284,  0.80963773,  0.76444014,  0.8605158 ],
       [ 0.5       ,  0.72105005,  0.82434729,  0.7172043 ,  0.96912459],
       [ 0.65      ,  0.71393523,  0.7869152 ,  0.79183523,  0.78205594],
       [ 0.45      ,  0.70387635,  0.81781132,  0.69963843,  0.98401744],
       [ 0.7       ,  0.69357213,  0.75418225,  0.82302405,  0.69596803],
       [ 0.4       ,  0.68719333,  0.81085892,  0.68530592,  0.9927352 ],
       [ 0.35      ,  0.68204122,  0.80924345,  0.68027716,  0.99854704],
       [ 0.3       ,  0.67590775,  0.80644689,  0.67583497,  0.99963676],
       [ 0.15      ,  0.67566241,  0.80638547,  0.67558282,  1.        ],
       [ 0.2       ,  0.67566241,  0.80638547,  0.67558282,  1.        ],
       [ 0.25      ,  0.67566241,  0.80638547,  0.67558282,  1.        ],
       [ 0.1       ,  0.67541708,  0.80626739,  0.67541708,  1.        ],
       [ 0.05      ,  0.67541708,  0.8

In [27]:
# Show the whole score table stored for one specific configuration tuple
# (field order as listed under "Configuration format").
all_scores[(True, True, 'jcn', 'lemmapos', 'lemmapos', 'bnc_ic_2000', True, 0.0, 'all', 'tf', 'mihalcea')]

array([[ 0.5       ,  0.71344455,  0.80946166,  0.73467575,  0.90119869],
       [ 0.45      ,  0.70878312,  0.8132767 ,  0.7172586 ,  0.93897566],
       [ 0.55      ,  0.70632974,  0.79222357,  0.75864362,  0.82891391],
       [ 0.4       ,  0.70019627,  0.8136627 ,  0.70118265,  0.96912459],
       [ 0.35      ,  0.69381747,  0.81261261,  0.69260302,  0.98292772],
       [ 0.6       ,  0.69332679,  0.76851852,  0.78390631,  0.75372321],
       [ 0.3       ,  0.6844946 ,  0.80919881,  0.68397291,  0.99055576],
       [ 0.1       ,  0.67566241,  0.80638547,  0.67558282,  1.        ],
       [ 0.15      ,  0.67541708,  0.80615385,  0.67558939,  0.99927352],
       [ 0.25      ,  0.67541708,  0.80558413,  0.67645607,  0.99564112],
       [ 0.05      ,  0.67541708,  0.80626739,  0.67541708,  1.        ],
       [ 0.2       ,  0.67443572,  0.80539669,  0.67535662,  0.99745732],
       [ 0.65      ,  0.67100098,  0.72826748,  0.82355637,  0.65274246],
       [ 0.7       ,  0.63076546,  0.6

#### Serializing all results

### Evaluating best configuration for each WordNet metric on the test set

Best-k configurations

In [None]:
# For every WordNet metric, print the ten configurations with the highest
# accuracy on the first row of their score table.
#
# conf   = (0-remove_stopwords, 1-remove_punctuation, 2-termsim_metric, 3-feature,
#           4-syngetter, 5-ic_data, 6-normalization, 7-termsim_th, 8-synsets_taken,
#           9-wscheme, 10-sim_metric)
# scores = rows of (0-threshold, 1-accuracy, 2-f_measure, 3-precision, 4-recall)
for metric in ["path", "lch", "wup", "res", "jcn", "lin"]:
    print(metric)
    # First score row of each configuration that uses this metric,
    # ordered by decreasing accuracy (position 1 of the row).
    best_per_conf = sorted(
        ((conf, scores[0, :])
         for conf, scores in all_scores.items()
         if conf[2] == metric),
        key=lambda tup: tup[1][1],  # tup = (conf, first score row)
        reverse=True,
    )
    for conf, score in best_per_conf[:10]:
        print(conf)
        print(score, "\n")
    print("\n\n")

In [None]:
# First row of the score table belonging to the first entry of all_scores.
list(all_scores.values())[0][0, :]

In [None]:
# Candidate configuration tuples.
# NOTE(review): each tuple here has 8 fields, whereas the configuration keys of
# all_scores have 11 fields — confirm whether this list is still in use.
# NOTE(review): the second and third entries are identical — confirm intent.
options = [
           (False, True, 'jcn', True, 0.0, 'all', 'binary', 'softcosine'),
           (False, True, 'jcn', True, 0.0, 'all', 'binary', 'stevenson'),
           (False, True, 'jcn', True, 0.0, 'all', 'binary', 'stevenson')
          ]

# Loading best configurations from training

In [30]:
# Load the best configurations found on the training set.
# The context manager closes the file handle as soon as unpickling finishes
# (the bare open() call previously leaked it).
# NOTE: pickle can execute arbitrary code while loading; only load trusted files.
with open("test_configurations.pickle", "rb") as configurations_file:
    test_configurations = pickle.load(configurations_file)

# Evaluating best configurations

In [31]:
# Evaluate every loaded configuration on the test pairs.
# test_scores maps a configuration tuple to its score table, with the rows
# ordered by decreasing value of column 1 (accuracy).
test_scores = {}
count = 0
total_configurations = len(test_configurations)
print("Progress ...")
for configuration in test_configurations:
    (remove_stopwords, remove_punctuation, termsim_metric, feature, syngetter, ic_data,
     normalization, termsim_th, synsets_taken, wscheme, sim_metric) = configuration

    # Converting from word vectors to index vectors from the vocabulary.
    # The variable names are built from the feature name, adding a trailing
    # 's' when the name does not already end with one.
    # NOTE(review): eval() resolves notebook variables by name and the names come
    # from the unpickled configurations — only run this on trusted files.
    syngetter_plural = syngetter + ('' if syngetter[-1] == 's' else 's')
    vocab_syngetter = eval("vocab_" + syngetter_plural)
    syngetter2index = eval(syngetter + "2index")
    index2syngetter = eval("index2" + syngetter)

    feature_plural = feature + ('' if feature[-1] == 's' else 's')
    vocab_feature = eval("vocab_" + feature_plural)
    feature2index = eval(feature + "2index")
    index2feature = eval("index2" + feature)

    # Choosing the appropriate preprocessed vectors
    if remove_stopwords:
        vectors = vectors_prep if remove_punctuation else vectors_no_sw
    else:
        vectors = vectors_no_punct if remove_punctuation else vectors_no_prep

    # Choosing the term similarity matrix; its variable name is
    # <termsim_metric>_<synsets_taken>_<syngetter plural>[_<ic_data>]
    term_sims_name = "_".join((termsim_metric, synsets_taken, syngetter_plural))
    if ic_data:
        term_sims_name = term_sims_name + "_" + ic_data
    term_sims = eval(term_sims_name)

    # Normalizing the term similarity matrix
    if normalization:
        term_sims = normalize(term_sims, termsim_metric)

    # Dropping the term similarities below termsim_th
    term_sims = {key: value for key, value in term_sims.items() if value >= termsim_th}

    # Computing pair of texts similarities
    sims = compute_sims(test_pairs, vectors, features_opt.index(feature),
                        mappings[(feature, syngetter)], mappings[(feature, 'token')],
                        weight_scheme=wscheme, metric=sim_metric, term_sims=term_sims,
                        idf=idf, idf_median_value=idf_median_value)

    # Computing scores for the thresholds 0.05, 0.10, ..., 1.0
    scores = np.array(compute_scores(sims, test_y, np.linspace(0.05, 1.0, 20)))

    # Storing the score rows ordered by decreasing accuracy (column 1)
    descending = scores[:, 1].argsort()[::-1]
    test_scores[configuration] = scores[descending]

    # Redrawing the 40-character progress bar in place
    count += 1
    i = count * 40.0 / total_configurations
    sys.stdout.write('\r')
    sys.stdout.write("[{:40}] {:.0f}%   {}/{}".format('='*int(i), int(2.5*i), count, total_configurations))
    sys.stdout.flush()

Progress ...

# Storing test results

In [33]:
# Write the test-set score tables to the Results folder; the file is closed
# when the with-block exits.
# NOTE(review): the date in the file name is hard-coded, unlike the
# date_str-based name used for the training results — confirm this is intended.
with open("./Results/test_results_20180504.pickle", "wb") as results_file:
    pickle.dump(test_scores, results_file)