In [1]:
import csv
import os
import numpy as np
import en_core_web_lg
import wordfreq

from embedding_vectorizer import EmbeddingVectorizer

In [2]:
# Directory from which load_sts_benchmark() reads sts-train.csv, sts-dev.csv and sts-test.csv.
STS_BENCHMARK_DIR = './data/stsbenchmark/'
# Space-delimited "<word> <count>" file consumed by load_word_frequencies().
WORD_FREQUENCIES_FILE = './data/enwiki_vocab_min200.txt'

In [3]:
def load_sts_benchmark(directory):
    """Load the train, dev and test splits of the STS benchmark.

    Reads ``sts-train.csv``, ``sts-dev.csv`` and ``sts-test.csv`` from
    ``directory``. Every file is parsed as tab-separated text where
    (0-based) column 4 is the similarity score and columns 5 and 6 are the
    two sentences. Additional trailing columns are ignored and completely
    empty lines are skipped.

    Args:
        directory: Path of the folder containing the three split files.

    Returns:
        A 3-tuple ``(train, dev, test)``; each element is a list of
        ``(sent1, sent2, score)`` tuples with ``score`` converted to float.

    Raises:
        OSError: If one of the split files cannot be opened.
        IndexError: If a non-empty row has fewer than seven columns.
        ValueError: If a score field cannot be parsed as a float.
    """
    def load_dataset(directory, file):
        dataset = []
        # Explicit UTF-8 so decoding does not depend on the platform's default
        # encoding; newline='' is what the csv module expects from its input.
        with open(os.path.join(directory, file), 'r',
                  encoding='utf-8', newline='') as tsvfile:
            # QUOTE_NONE: the sentences are raw text, so quote-like characters
            # (", |) must be kept literally instead of opening a quoted field
            # that would swallow the following tabs and newlines.
            reader = csv.reader(tsvfile, delimiter='\t', quoting=csv.QUOTE_NONE)
            for row in reader:
                if not row:
                    # Blank line (e.g. a trailing newline at end of file).
                    continue
                dataset.append((row[5], row[6], float(row[4])))
        return dataset

    return tuple(load_dataset(directory, 'sts-{}.csv'.format(split))
                 for split in ('train', 'dev', 'test'))

def load_word_frequencies(path):
    """Load a word-count file and convert the counts to relative frequencies.

    Every non-empty line is split on single spaces; the first field is the
    word and the second its integer count. Completely empty lines are
    skipped. If a word occurs on several lines, the last count is the one
    kept for that word, while the normalising total still sums every line.

    Args:
        path: Path of the space-delimited ``<word> <count>`` file.

    Returns:
        A dict mapping each word to ``count / total``, where ``total`` is the
        sum of all counts read. An empty file yields an empty dict.

    Raises:
        OSError: If the file cannot be opened.
        IndexError: If a non-empty line has fewer than two fields.
        ValueError: If a count is not a valid integer.
        ZeroDivisionError: If words were read but all counts sum to zero.
    """
    word_freq = {}
    total = 0
    # Explicit UTF-8 so non-ASCII vocabulary entries decode the same way on
    # every platform; newline='' is what the csv module expects.
    with open(path, 'r', encoding='utf-8', newline='') as freq_file:
        # QUOTE_NONE: words are raw tokens, so a token beginning with a
        # quote-like character must not start a quoted, multi-line field.
        reader = csv.reader(freq_file, delimiter=' ', quoting=csv.QUOTE_NONE)
        for row in reader:
            if not row:
                # Blank line.
                continue
            count = int(row[1])
            word_freq[row[0]] = count
            total += count
    return {word: count / total for word, count in word_freq.items()}

def dataset_iter(dataset, yield_s1=False, yield_s2=False):
    """Lazily iterate over the sentences of a paired-sentence dataset.

    Args:
        dataset: Iterable of 3-item records ``(sent1, sent2, score)``.
        yield_s1: When truthy, emit the first sentence of every record.
        yield_s2: When truthy, emit the second sentence of every record.

    Yields:
        The selected sentences in record order; within one record the first
        sentence comes before the second. The score is never emitted.
    """
    for first, second, _ in dataset:
        # Pair every sentence with its switch so both are handled uniformly.
        for wanted, sentence in ((yield_s1, first), (yield_s2, second)):
            if wanted:
                yield sentence

def cosine_similarity(v1, v2):
    """Return the cosine of the angle between two vectors.

    Args:
        v1: First vector (array-like accepted by ``np.dot``).
        v2: Second vector, same length as ``v1``.

    Returns:
        ``dot(v1, v2) / (||v1|| * ||v2||)``. When either vector has zero
        norm the cosine is undefined; ``0.0`` is returned in that case
        instead of the NaN (and RuntimeWarning) a 0/0 division would give.
    """
    denominator = np.linalg.norm(v1) * np.linalg.norm(v2)
    if denominator == 0:
        # A zero vector has no direction; treat it as having no similarity.
        return 0.0
    return np.dot(v1, v2) / denominator
            
def similarity_scores(dataset, model):
    """Score every sentence pair of a dataset with cosine similarity.

    Args:
        dataset: Iterable of 3-item records ``(sent1, sent2, score)``; the
            score is ignored here.
        model: Object with a ``transform(sentences)`` method that takes a
            list of sentences and returns an indexable result holding one
            embedding per sentence.

    Returns:
        A NumPy array with one cosine similarity per record, in dataset
        order.
    """
    def embed(sentence):
        # Each sentence is transformed on its own, as a one-element batch.
        return model.transform([sentence])[0]

    return np.array([cosine_similarity(embed(left), embed(right))
                     for left, right, _ in dataset])

def pearson_r(true_scores, pred_scores):
    """Return Pearson's correlation coefficient scaled by 100.

    Args:
        true_scores: Sequence of reference scores.
        pred_scores: Sequence of predicted scores, same length.

    Returns:
        The correlation coefficient from ``scipy.stats.pearsonr`` multiplied
        by 100; the accompanying p-value is discarded.
    """
    from scipy import stats

    result = stats.pearsonr(true_scores, pred_scores)
    coefficient = result[0]
    return coefficient * 100

In [4]:
# Each split is a list of (sent1, sent2, score) tuples.
train, dev, test = load_sts_benchmark(STS_BENCHMARK_DIR)
# Maps word -> its count divided by the total count over the whole file.
word_freq = load_word_frequencies(WORD_FREQUENCIES_FILE)
# Loaded once and reused: the evaluation cells use nlp.tokenizer and nlp.vocab.
nlp = en_core_web_lg.load()

In [18]:
def evaluate_vectorizer(train, test, word_vectorizer, tokenizer=None, word_freq=None,
                        weighted=True, remove_components=1, lowercase=True):
    """Fit an EmbeddingVectorizer on ``train`` and print Pearson's r on ``test``.

    Args:
        train: Sequence of ``(sent1, sent2, score)`` records; both sentences
            of every record are fed to ``fit``.
        test: Sequence of ``(sent1, sent2, score)`` records to evaluate on;
            the third item of each record is the reference score.
        word_vectorizer, tokenizer, word_freq, weighted, remove_components,
        lowercase: Forwarded unchanged, by keyword, to ``EmbeddingVectorizer``.

    Returns:
        None. Dataset sizes, progress messages and the final correlation
        (multiplied by 100) are printed to stdout.
    """
    print('Samples in training dataset:', len(train))
    print('Samples in test dataset', len(test))

    vectorizer_options = dict(
        tokenizer=tokenizer,
        word_vectorizer=word_vectorizer,
        word_freq=word_freq,
        weighted=weighted,
        remove_components=remove_components,
        lowercase=lowercase,
    )
    sentence_encoder = EmbeddingVectorizer(**vectorizer_options)

    print('Fitting on training dataset')
    training_sentences = dataset_iter(train, yield_s1=True, yield_s2=True)
    sentence_encoder.fit(training_sentences)

    print('Computing similarity scores on test dataset')
    predicted = similarity_scores(test, sentence_encoder)
    reference = [record[2] for record in test]

    correlation = pearson_r(reference, predicted)
    print('Pearson`s r (x 100) on test dataset:', correlation)

In [23]:
# Evaluate on the dev split. It is passed as `test`, so the messages printed
# by evaluate_vectorizer still read "test dataset".
evaluate_vectorizer(train=train, test=dev,
                    # Word vector looked up in the loaded model's vocabulary.
                    word_vectorizer=lambda w: nlp.vocab[w].vector,
                    # Token texts, dropping punctuation and whitespace tokens.
                    tokenizer=lambda s: [t.text for t in nlp.tokenizer(s) if not t.is_punct and not t.is_space],
                    # Words missing from the frequency table get frequency 0.0.
                    word_freq=lambda w: word_freq.get(w, 0.0),
                    # Alternative frequency source (wordfreq package), left disabled:
                    # word_freq=lambda w: wordfreq.word_frequency(w, 'en', wordlist='large'),
                    weighted=True,
                    remove_components=15,
                    lowercase=True)

Samples in training dataset: 5749
Samples in test dataset 1500
Fitting on training dataset
Computing similarity scores on test dataset
Pearson`s r (x 100) on test dataset: 80.2393160533


In [25]:
# Evaluate on the test split; the vectorizer is still fitted on the train split.
evaluate_vectorizer(train=train, test=test,
                    # Word vector looked up in the loaded model's vocabulary.
                    word_vectorizer=lambda w: nlp.vocab[w].vector,
                    # Token texts, dropping punctuation and whitespace tokens.
                    tokenizer=lambda s: [t.text for t in nlp.tokenizer(s) if not t.is_punct and not t.is_space],
                    # Words missing from the frequency table get frequency 0.0.
                    word_freq=lambda w: word_freq.get(w, 0.0),
                    weighted=True,
                    remove_components=15,
                    lowercase=True)

Samples in training dataset: 5749
Samples in test dataset 1379
Fitting on training dataset
Computing similarity scores on test dataset
Pearson`s r (x 100) on test dataset: 71.6725415272
