# STS Benchmark Datasets

In [1]:
%load_ext autotime
import numpy as np
import logging
import pandas as pd
import pathlib

from gensim.models import Word2Vec, FastText
from gensim.models.keyedvectors import KeyedVectors, FastTextKeyedVectors

from fse.models.average import Average
from fse.models.sif import SIF
from fse.models.usif import uSIF
from fse.models.inputs import IndexedSentence

from re import sub
from scipy.stats import pearsonr

import re
from nltk import word_tokenize

# Configure root logging at INFO level; every record carries a timestamp,
# the emitting thread's name and the level name.
logging.basicConfig(
    format='%(asctime)s : %(threadName)s : %(levelname)s : %(message)s',
    level=logging.INFO,
)

# This part is taken directly from the uSIF implementation.
# The pattern matches any string that contains at least one ASCII letter or
# digit; normalize_text uses it to drop tokens that contain none.
not_punc = re.compile('.*[A-Za-z0-9].*')

def preprocess(t):
    """Normalize a single token and split it on hyphens.

    The token is lower-cased, then the characters ``';.:()`` and afterwards
    double quotes are stripped from both ends. The contraction "n't" is
    replaced by "not". The result is split on "-".

    Args:
        t: The token as a string.

    Returns:
        A list of strings: the hyphen-separated parts of the cleaned token.
    """
    cleaned = t.lower()
    cleaned = cleaned.strip("';.:()")
    cleaned = cleaned.strip('"')
    if cleaned == "n't":
        cleaned = 'not'
    return re.split(r'[-]', cleaned)

def normalize_text(sentence):
    """Tokenize a sentence and return its normalized tokens as a flat list.

    The sentence is tokenized with ``word_tokenize``. Tokens that do not
    match ``not_punc`` are dropped; every remaining token is passed through
    ``preprocess`` and the parts it returns are appended in order.

    Args:
        sentence: The raw sentence string.

    Returns:
        A list of normalized token strings.
    """
    return [
        piece
        for token in word_tokenize(sentence)
        if not_punc.match(token)
        for piece in preprocess(token)
    ]

def compute_similarities(task_length, model):
    """Compute the similarity of each sentence pair stored in a model.

    Index ``i`` is paired with index ``i + task_length`` for every ``i`` in
    ``range(task_length)``, and ``model.sv.similarity`` is called once per
    pair.

    Args:
        task_length: Number of sentence pairs.
        model: Object exposing ``sv.similarity(i, j)``.

    Returns:
        A list with one similarity value per pair, in index order.
    """
    first_half = range(task_length)
    second_half = range(task_length, 2 * task_length)
    return [model.sv.similarity(a, b) for a, b in zip(first_half, second_half)]

# Load the STS benchmark test split. Every line is split on tabs: field 4 is
# parsed as the reference similarity score, fields 5 and 6 are the two
# sentences, which are tokenized with normalize_text.
file = "data/stsbenchmark/sts-test.csv"
similarities, sent_a, sent_b = [], [], []
# Decode explicitly as UTF-8 so that parsing does not depend on the platform's
# default locale encoding (assumes the benchmark file is UTF-8 encoded).
with open(file, "r", encoding="utf-8") as f:
    for l in f:
        line = l.rstrip().split("\t")
        similarities.append(float(line[4]))
        sent_a.append(normalize_text(line[5]))
        sent_b.append(normalize_text(line[6]))
similarities = np.array(similarities)
assert len(similarities) == len(sent_a) == len(sent_b)
task_length = len(similarities)
# All first sentences come first, followed by all second sentences, so pair i
# consists of indices i and i + task_length (compute_similarities relies on
# this layout).
final_sents = sent_a + sent_b
sents = [IndexedSentence(s, i) for i, s in enumerate(final_sents)]

In [2]:
# Sentence-embedding models, keyed by name.
models = {}
# Evaluation scores, keyed by model name.
results = {}

time: 625 µs


In [3]:
# Load the stored GloVe-based Average model and reuse its word vectors (wv)
# for the three sentence-embedding variants registered below.
avg_glove = Average.load("data/vectors/glove.model")
models["CBOW-Glove"] = Average(avg_glove.wv, wv_mapfile_path="data/vectors/glove")
# NOTE(review): components=15 here, while the other SIF models in this
# notebook use components=10 — confirm this is intentional.
models["SIF-Glove"] = SIF(avg_glove.wv, components=15)
models["uSIF-Glove"] = uSIF(avg_glove.wv, length=11)

2019-08-20 16:32:52,869 : MainThread : INFO : loading Average object from data/vectors/glove.model
2019-08-20 16:32:57,377 : MainThread : INFO : loading wv recursively from data/vectors/glove.model.wv.* with mmap=None
2019-08-20 16:32:57,377 : MainThread : INFO : loading sv recursively from data/vectors/glove.model.sv.* with mmap=None
2019-08-20 16:32:57,378 : MainThread : INFO : loading prep recursively from data/vectors/glove.model.prep.* with mmap=None
2019-08-20 16:32:57,378 : MainThread : INFO : loaded data/vectors/glove.model
  "C extension not loaded, training/inferring will be slow. "
2019-08-20 16:32:57,395 : MainThread : INFO : loading pre-existing wv from /Users/oliverborchers/Library/Mobile Documents/com~apple~CloudDocs/Diss/Medium/Fast_Sentence_Embeddings/fse/data/vectors/glove_wv.vectors


time: 4.56 s


In [4]:
# Load the stored Word2Vec-based Average model and reuse its word vectors (wv)
# for the three sentence-embedding variants registered below.
avg_w2v = Average.load("data/vectors/w2v.model")
models["CBOW-W2V"] = Average(avg_w2v.wv, wv_mapfile_path="data/vectors/w2v")
models["SIF-W2V"] = SIF(avg_w2v.wv, components=10)
models["uSIF-W2V"] = uSIF(avg_w2v.wv, length=11)

2019-08-20 16:32:57,439 : MainThread : INFO : loading Average object from data/vectors/w2v.model
2019-08-20 16:33:03,187 : MainThread : INFO : loading wv recursively from data/vectors/w2v.model.wv.* with mmap=None
2019-08-20 16:33:03,188 : MainThread : INFO : loading sv recursively from data/vectors/w2v.model.sv.* with mmap=None
2019-08-20 16:33:03,189 : MainThread : INFO : loading prep recursively from data/vectors/w2v.model.prep.* with mmap=None
2019-08-20 16:33:03,189 : MainThread : INFO : loaded data/vectors/w2v.model
  "C extension not loaded, training/inferring will be slow. "
2019-08-20 16:33:03,208 : MainThread : INFO : loading pre-existing wv from /Users/oliverborchers/Library/Mobile Documents/com~apple~CloudDocs/Diss/Medium/Fast_Sentence_Embeddings/fse/data/vectors/w2v_wv.vectors


time: 5.8 s


In [5]:
# Load the stored FastText-based Average model and reuse its word vectors (wv)
# for the three sentence-embedding variants registered below.
avg_ft = Average.load("data/vectors/ft.model")
models["CBOW-FT"] = Average(avg_ft.wv, wv_mapfile_path="data/vectors/ft")
models["SIF-FT"] = SIF(avg_ft.wv, components=10)
models["uSIF-FT"] = uSIF(avg_ft.wv, length=11)

2019-08-20 16:33:03,247 : MainThread : INFO : loading Average object from data/vectors/ft.model
2019-08-20 16:33:08,000 : MainThread : INFO : loading wv recursively from data/vectors/ft.model.wv.* with mmap=None
2019-08-20 16:33:08,001 : MainThread : INFO : loading sv recursively from data/vectors/ft.model.sv.* with mmap=None
2019-08-20 16:33:08,002 : MainThread : INFO : loading prep recursively from data/vectors/ft.model.prep.* with mmap=None
2019-08-20 16:33:08,002 : MainThread : INFO : loaded data/vectors/ft.model
  "C extension not loaded, training/inferring will be slow. "
2019-08-20 16:33:08,020 : MainThread : INFO : loading pre-existing wv from /Users/oliverborchers/Library/Mobile Documents/com~apple~CloudDocs/Diss/Medium/Fast_Sentence_Embeddings/fse/data/vectors/ft_wv.vectors
2019-08-20 16:33:08,037 : MainThread : INFO : loading pre-existing vocab from data/vectors/ft_vocab.vectors
2019-08-20 16:33:08,054 : MainThread : INFO : loading pre-existing ngrams from data/vectors/ft_ng

time: 4.83 s


In [6]:
# NOTE(review): this is an absolute, machine-specific path; the notebook will
# not run elsewhere until it points at a configurable/relative location like
# the other vector files under data/vectors/.
paranmt = KeyedVectors.load("/Users/oliverborchers/Desktop/GSDEV/Models/Static/paranmt.model")
# lang_freq="en" is passed to every model here; per the logged output below,
# word frequencies are then estimated with wordfreq for English.
models["CBOW-Paranmt"] = Average(paranmt, lang_freq="en")
models["SIF-Paranmt"] = SIF(paranmt, components=10, lang_freq="en")
models["uSIF-Paranmt"] = uSIF(paranmt, length=11, lang_freq="en")

2019-08-20 16:33:08,083 : MainThread : INFO : loading Word2VecKeyedVectors object from /Users/oliverborchers/Desktop/GSDEV/Models/Static/paranmt.model
2019-08-20 16:33:08,307 : MainThread : INFO : loading vectors from /Users/oliverborchers/Desktop/GSDEV/Models/Static/paranmt.model.vectors.npy with mmap=None
2019-08-20 16:33:08,366 : MainThread : INFO : setting ignored attribute vectors_norm to None
2019-08-20 16:33:08,367 : MainThread : INFO : loaded /Users/oliverborchers/Desktop/GSDEV/Models/Static/paranmt.model
  "C extension not loaded, training/inferring will be slow. "
2019-08-20 16:33:08,369 : MainThread : INFO : no frequency mode: using wordfreq for estimation of frequency for language: en
2019-08-20 16:33:08,564 : MainThread : INFO : no frequency mode: using wordfreq for estimation of frequency for language: en
2019-08-20 16:33:08,641 : MainThread : INFO : no frequency mode: using wordfreq for estimation of frequency for language: en


time: 634 ms


In [7]:
# Train each registered model on the benchmark sentences, then score it with
# the Pearson correlation between the reference similarities and the model's
# pairwise similarities (rounded to 4 decimals and scaled by 100).
for k, m in models.items():
    parts = k.split("-")
    m_type = parts[0]
    emb_type = parts[1]
    m.train(sents)
    predicted = compute_similarities(task_length, m)
    r = pearsonr(similarities, predicted)[0].round(4) * 100
    results[f"{m_type}-{emb_type}"] = r
    print(k, f"{r:2.2f}")

2019-08-20 16:33:08,724 : MainThread : INFO : scanning all indexed sentences and their word counts
2019-08-20 16:33:08,734 : MainThread : INFO : finished scanning 2758 sentences with an average length of 9 and 27528 total words
2019-08-20 16:33:09,498 : MainThread : INFO : estimated memory for 2758 sentences with 300 dimensions and 2195875 vocabulary: 2524 MB (2 GB)
2019-08-20 16:33:09,498 : MainThread : INFO : initializing sentence vectors for 2758 sentences
2019-08-20 16:33:09,542 : MainThread : INFO : begin training
2019-08-20 16:33:09,660 : MainThread : INFO : worker thread finished; awaiting finish of 0 more threads
2019-08-20 16:33:09,661 : MainThread : INFO : training on 2758 effective sentences with 27351 effective words took 0s with 23275 sentences/s
2019-08-20 16:33:09,687 : MainThread : INFO : scanning all indexed sentences and their word counts
2019-08-20 16:33:09,695 : MainThread : INFO : finished scanning 2758 sentences with an average length of 9 and 27528 total words


CBOW-Glove 40.41


2019-08-20 16:33:10,461 : MainThread : INFO : estimated memory for 2758 sentences with 300 dimensions and 2195875 vocabulary: 2524 MB (2 GB)
2019-08-20 16:33:10,462 : MainThread : INFO : initializing sentence vectors for 2758 sentences
2019-08-20 16:33:10,471 : MainThread : INFO : pre-computing SIF weights for 2195875 words
2019-08-20 16:33:12,827 : MainThread : INFO : begin training
2019-08-20 16:33:12,932 : MainThread : INFO : worker thread finished; awaiting finish of 0 more threads
2019-08-20 16:33:12,933 : MainThread : INFO : computing 15 principal components
2019-08-20 16:33:12,975 : MainThread : INFO : removing 15 principal components
2019-08-20 16:33:12,977 : MainThread : INFO : training on 2758 effective sentences with 27351 effective words took 0s with 25962 sentences/s
2019-08-20 16:33:13,004 : MainThread : INFO : scanning all indexed sentences and their word counts
2019-08-20 16:33:13,012 : MainThread : INFO : finished scanning 2758 sentences with an average length of 9 and

SIF-Glove 71.95


2019-08-20 16:33:13,784 : MainThread : INFO : estimated memory for 2758 sentences with 300 dimensions and 2195875 vocabulary: 2524 MB (2 GB)
2019-08-20 16:33:13,784 : MainThread : INFO : initializing sentence vectors for 2758 sentences
2019-08-20 16:33:13,791 : MainThread : INFO : pre-computing uSIF weights for 2195875 words
2019-08-20 16:33:23,183 : MainThread : INFO : begin training
2019-08-20 16:33:23,311 : MainThread : INFO : worker thread finished; awaiting finish of 0 more threads
2019-08-20 16:33:23,312 : MainThread : INFO : computing 5 principal components
2019-08-20 16:33:23,351 : MainThread : INFO : training on 2758 effective sentences with 27351 effective words took 0s with 21480 sentences/s
2019-08-20 16:33:23,394 : MainThread : INFO : scanning all indexed sentences and their word counts
2019-08-20 16:33:23,407 : MainThread : INFO : finished scanning 2758 sentences with an average length of 9 and 27528 total words


uSIF-Glove 68.28


2019-08-20 16:33:24,566 : MainThread : INFO : estimated memory for 2758 sentences with 300 dimensions and 3000000 vocabulary: 3447 MB (3 GB)
2019-08-20 16:33:24,567 : MainThread : INFO : initializing sentence vectors for 2758 sentences
2019-08-20 16:33:24,621 : MainThread : INFO : begin training
2019-08-20 16:33:24,742 : MainThread : INFO : worker thread finished; awaiting finish of 0 more threads
2019-08-20 16:33:24,743 : MainThread : INFO : training on 2758 effective sentences with 23116 effective words took 0s with 22659 sentences/s
2019-08-20 16:33:24,775 : MainThread : INFO : scanning all indexed sentences and their word counts
2019-08-20 16:33:24,784 : MainThread : INFO : finished scanning 2758 sentences with an average length of 9 and 27528 total words


CBOW-W2V 61.54


2019-08-20 16:33:25,899 : MainThread : INFO : estimated memory for 2758 sentences with 300 dimensions and 3000000 vocabulary: 3447 MB (3 GB)
2019-08-20 16:33:25,900 : MainThread : INFO : initializing sentence vectors for 2758 sentences
2019-08-20 16:33:25,909 : MainThread : INFO : pre-computing SIF weights for 3000000 words
2019-08-20 16:33:29,158 : MainThread : INFO : begin training
2019-08-20 16:33:29,267 : MainThread : INFO : worker thread finished; awaiting finish of 0 more threads
2019-08-20 16:33:29,267 : MainThread : INFO : computing 10 principal components
2019-08-20 16:33:29,289 : MainThread : INFO : removing 10 principal components
2019-08-20 16:33:29,292 : MainThread : INFO : training on 2758 effective sentences with 23116 effective words took 0s with 25232 sentences/s
2019-08-20 16:33:29,330 : MainThread : INFO : scanning all indexed sentences and their word counts
2019-08-20 16:33:29,339 : MainThread : INFO : finished scanning 2758 sentences with an average length of 9 and

SIF-W2V 71.12


2019-08-20 16:33:30,414 : MainThread : INFO : estimated memory for 2758 sentences with 300 dimensions and 3000000 vocabulary: 3447 MB (3 GB)
2019-08-20 16:33:30,416 : MainThread : INFO : initializing sentence vectors for 2758 sentences
2019-08-20 16:33:30,424 : MainThread : INFO : pre-computing uSIF weights for 3000000 words
2019-08-20 16:33:43,169 : MainThread : INFO : begin training
2019-08-20 16:33:43,279 : MainThread : INFO : worker thread finished; awaiting finish of 0 more threads
2019-08-20 16:33:43,279 : MainThread : INFO : computing 5 principal components
2019-08-20 16:33:43,312 : MainThread : INFO : training on 2758 effective sentences with 23116 effective words took 0s with 24906 sentences/s
2019-08-20 16:33:43,341 : MainThread : INFO : scanning all indexed sentences and their word counts
2019-08-20 16:33:43,350 : MainThread : INFO : finished scanning 2758 sentences with an average length of 9 and 27528 total words


uSIF-W2V 67.46


2019-08-20 16:33:44,186 : MainThread : INFO : estimated memory for 2758 sentences with 300 dimensions and 2000000 vocabulary: 6877 MB (6 GB)
2019-08-20 16:33:44,187 : MainThread : INFO : initializing sentence vectors for 2758 sentences
2019-08-20 16:33:44,227 : MainThread : INFO : begin training
2019-08-20 16:33:44,528 : MainThread : INFO : worker thread finished; awaiting finish of 0 more threads
2019-08-20 16:33:44,529 : MainThread : INFO : training on 2758 effective sentences with 27528 effective words took 0s with 9127 sentences/s
2019-08-20 16:33:44,556 : MainThread : INFO : scanning all indexed sentences and their word counts
2019-08-20 16:33:44,565 : MainThread : INFO : finished scanning 2758 sentences with an average length of 9 and 27528 total words


CBOW-FT 48.53


2019-08-20 16:33:45,403 : MainThread : INFO : estimated memory for 2758 sentences with 300 dimensions and 2000000 vocabulary: 6877 MB (6 GB)
2019-08-20 16:33:45,404 : MainThread : INFO : initializing sentence vectors for 2758 sentences
2019-08-20 16:33:45,413 : MainThread : INFO : pre-computing SIF weights for 2000000 words
2019-08-20 16:33:47,824 : MainThread : INFO : begin training
2019-08-20 16:33:48,099 : MainThread : INFO : worker thread finished; awaiting finish of 0 more threads
2019-08-20 16:33:48,100 : MainThread : INFO : computing 10 principal components
2019-08-20 16:33:48,121 : MainThread : INFO : removing 10 principal components
2019-08-20 16:33:48,125 : MainThread : INFO : training on 2758 effective sentences with 27528 effective words took 0s with 10018 sentences/s
2019-08-20 16:33:48,159 : MainThread : INFO : scanning all indexed sentences and their word counts
2019-08-20 16:33:48,167 : MainThread : INFO : finished scanning 2758 sentences with an average length of 9 and

SIF-FT 73.42


2019-08-20 16:33:49,034 : MainThread : INFO : estimated memory for 2758 sentences with 300 dimensions and 2000000 vocabulary: 6877 MB (6 GB)
2019-08-20 16:33:49,036 : MainThread : INFO : initializing sentence vectors for 2758 sentences
2019-08-20 16:33:49,045 : MainThread : INFO : pre-computing uSIF weights for 2000000 words
2019-08-20 16:33:57,685 : MainThread : INFO : begin training
2019-08-20 16:33:57,981 : MainThread : INFO : worker thread finished; awaiting finish of 0 more threads
2019-08-20 16:33:57,982 : MainThread : INFO : computing 5 principal components
2019-08-20 16:33:58,014 : MainThread : INFO : training on 2758 effective sentences with 27528 effective words took 0s with 9297 sentences/s
2019-08-20 16:33:58,046 : MainThread : INFO : scanning all indexed sentences and their word counts
2019-08-20 16:33:58,055 : MainThread : INFO : finished scanning 2758 sentences with an average length of 9 and 27528 total words
2019-08-20 16:33:58,092 : MainThread : INFO : estimated memor

uSIF-FT 69.89
CBOW-Paranmt 79.85


2019-08-20 16:33:58,252 : MainThread : INFO : finished scanning 2758 sentences with an average length of 9 and 27528 total words
2019-08-20 16:33:58,297 : MainThread : INFO : estimated memory for 2758 sentences with 300 dimensions and 77224 vocabulary: 91 MB (0 GB)
2019-08-20 16:33:58,299 : MainThread : INFO : initializing sentence vectors for 2758 sentences
2019-08-20 16:33:58,307 : MainThread : INFO : pre-computing SIF weights for 77224 words
2019-08-20 16:33:58,397 : MainThread : INFO : begin training
2019-08-20 16:33:58,500 : MainThread : INFO : worker thread finished; awaiting finish of 0 more threads
2019-08-20 16:33:58,501 : MainThread : INFO : computing 10 principal components
2019-08-20 16:33:58,520 : MainThread : INFO : removing 10 principal components
2019-08-20 16:33:58,522 : MainThread : INFO : training on 2758 effective sentences with 27441 effective words took 0s with 26568 sentences/s
2019-08-20 16:33:58,553 : MainThread : INFO : scanning all indexed sentences and their

SIF-Paranmt 76.73


2019-08-20 16:33:58,914 : MainThread : INFO : begin training
2019-08-20 16:33:59,016 : MainThread : INFO : worker thread finished; awaiting finish of 0 more threads
2019-08-20 16:33:59,017 : MainThread : INFO : computing 5 principal components
2019-08-20 16:33:59,043 : MainThread : INFO : training on 2758 effective sentences with 27441 effective words took 0s with 26684 sentences/s


uSIF-Paranmt 79.20
time: 50.4 s


In [8]:
# One row per key of `results`, with its value in a single "Pearson" column.
results_table = pd.DataFrame.from_dict(
    results,
    orient="index",
    columns=["Pearson"],
)
results_table

Unnamed: 0,Pearson
CBOW-Glove,40.41
SIF-Glove,71.95
uSIF-Glove,68.28
CBOW-W2V,61.54
SIF-W2V,71.12
uSIF-W2V,67.46
CBOW-FT,48.53
SIF-FT,73.42
uSIF-FT,69.89
CBOW-Paranmt,79.85


time: 20.7 ms


# Additionally: Write glove vectors to KeyedVectors

# Read Paranmt Vectors from Disk