In [None]:
import numpy as np
from scipy.sparse import csr_matrix
import os
import subprocess


def load_cooccurrences(path, filt_path, indices, shape=None):
  """
  Usage: load_cooccurrences("cooccurrence.bin", "cooccurrence.filt.bin", [1, 2, 3])

  Load a filtered co-occurrence file as a scipy CSR matrix.

  If `filt_path` does not exist yet, it is produced by running
  ./filt-cooccur.sh with `path` and the given `indices` as arguments and
  capturing its stdout. If `filt_path` already exists it is reused as-is,
  whatever `indices` is passed (delete the file to force re-filtering).

  path:      unfiltered co-occurrence file handed to the filter script
  filt_path: filtered file to read (created on first use)
  indices:   word indices forwarded to the filter script as strings
  shape:     optional (rows, cols) of the result. When None (the default),
             scipy infers the shape from the largest indices in the file,
             which fails on an empty file and may be smaller than the
             vocabulary.

  Raises subprocess.CalledProcessError if the filter script exits non-zero.
  """
  if not os.path.isfile(filt_path):
    # Write to a temporary name and rename only after success, so that an
    # interrupted run never leaves a truncated file at filt_path.
    tmp_path = filt_path + ".tmp"
    with open(tmp_path, "wb") as f:
      print("calling filter subproces...")
      args = ["./filt-cooccur.sh", path] + [str(i) for i in indices]
      subprocess.run(args, stdout=f, check=True)
      print("subprocess exited normally.")

    os.rename(tmp_path, filt_path)

  # Each record is (int32 i, int32 j, float64 x), little-endian.
  dt = np.dtype([('i', '<i4'), ('j', '<i4'), ('x', '<f8')])
  arr = np.fromfile(filt_path, dtype=dt)
  # Indices in the file are shifted down by one to get 0-based rows/cols.
  return csr_matrix((arr['x'], (arr['i']-1, arr['j']-1)), shape=shape)


def load_vocab(path):
  """
  Usage: load_vocab("vocab.txt")

  Returns a list of tuples of (word: str, freq: int), in file order.

  Each non-blank line must be "<word> <freq>" separated by a single space;
  any other layout raises ValueError. Blank lines (e.g. a trailing empty
  line) are skipped. The file is decoded as UTF-8 rather than with the
  platform's locale encoding -- assumes the vocab was written as UTF-8,
  TODO confirm for other corpora.
  """
  res = []
  with open(path, "r", encoding="utf-8") as f:
    for line in f:
      if not line.strip():
        continue
      # int() tolerates the trailing newline left on freq.
      word, freq = line.split(' ')
      res.append((word, int(freq)))
  return res


def load_vectors(path, vector_size, vocab_size):
  """
  Usage: load_vectors("vectors.bin", 100, len(vocab))

  Reads the file as a flat array of little-endian float64 values and views
  it as 2*vocab_size rows of vector_size+1 values each: the first vocab_size
  rows are word rows, the remaining vocab_size rows are context rows, and
  the last value of every row is its bias.

  Returns (word_vectors, context_vectors, word_biases, context_biases).

  word_vectors and context_vectors are (vocab_size, vector_size) matrices
  word_biases and context_biases are (vocab_size) arrays

  Raises ValueError (from reshape) if the file size does not match.
  """
  row_len = vector_size + 1
  flat = np.fromfile(path, dtype=np.dtype('<f8'))
  table = flat.reshape((2*vocab_size, row_len))
  # First half of the rows: word side; second half: context side.
  word_rows = table[:vocab_size]
  ctx_rows = table[vocab_size:]
  return (word_rows[:, :vector_size], ctx_rows[:, :vector_size],
          word_rows[:, vector_size], ctx_rows[:, vector_size])


# NOTE(review): `home` is not referenced by any of the code visible in this
# notebook; it is kept in case other cells rely on it.
home = os.path.expanduser('~')

# Load embeddings: these are trained with the full "Wikipedia" configuration
# Files suffixed "2" are the second model; later cells label model 1 as
# "pre" and model 2 as "post". All paths are relative to the working
# directory the notebook is run from.
cooccur1_path = "../GloVe/wikipedia/paper/cooccurrence.bin"
cooccur2_path = "../GloVe/wikipedia/paper/cooccurrence2.bin"
# Filtered co-occurrence files; load_cooccurrences() creates them on first use.
cooccur1_filt_path = "../GloVe/wikipedia/paper/cooccurrence.filt.bin"
cooccur2_filt_path = "../GloVe/wikipedia/paper/cooccurrence2.filt.bin"
vocab1_path = "../GloVe/wikipedia/paper/vocab.txt"
vocab2_path = "../GloVe/wikipedia/paper/vocab2.txt"
vector1_path = "../GloVe/wikipedia/paper/vectors.bin"
vector2_path = "../GloVe/wikipedia/paper/vectors2.bin"
# Embedding dimensionality; load_vectors() reshapes the vector files with it,
# so it must match the size the vectors were saved with.
vector_size = 100

print("Loading vocab...")
vocab1 = load_vocab(vocab1_path)
vocab2 = load_vocab(vocab2_path)
# dictionaryN[i] is the word at index i of vocabulary N (frequencies dropped).
dictionary1 = [v[0] for v in vocab1]
dictionary2 = [v[0] for v in vocab2]
# Vocabulary sizes, used as the row count when reshaping the vector files.
D1 = len(dictionary1)
D2 = len(dictionary2)

print("Load vecs1...")
vecs1 = load_vectors(vector1_path, vector_size, D1)
# Word vectors, context vectors, word biases, context biases of model 1.
word1, ctx1, B1, B_ctx1 = vecs1
print("Load vecs2...")
vecs2 = load_vectors(vector2_path, vector_size, D2)
# Same four arrays for model 2.
word2, ctx2, B2, B_ctx2 = vecs2

print("Done.")


In [None]:
# Collect the (source, target) word pairs from sequences.txt.
# Only lines made of exactly two space-separated tokens are kept; blank
# lines, longer sequences and single-token lines are skipped (a single-token
# line previously aborted the cell with ValueError). A pair identical to the
# one kept immediately before it is dropped, so consecutive repeats collapse.
s_pairs = []
with open("sequences.txt", "r") as f:
  for line in f:
    tokens = line.strip().split(" ")
    if len(tokens) != 2:
      continue
    if not s_pairs or tokens != s_pairs[-1]:
      s_pairs.append(tokens)
print(len(s_pairs))
print(s_pairs)

# Note: since GloVe-paper limits the vocabulary to the 400k most common words,
# it's possible for the attacker to select a pair that was not part of the
# victim's vocabulary. In our tests this only happened once.
#
# pairs1[k] and pairs2[k] must describe the same (source, target) pair,
# because later cells walk them with zip(pairs1, pairs2). Both vocabularies
# are therefore looked up BEFORE anything is appended: a pair missing from
# either vocabulary is dropped from both lists, never from just one.
pairs1 = []
pairs2 = []
for s, t in s_pairs:
  try:
    pair1 = (dictionary1.index(s), dictionary1.index(t))
    pair2 = (dictionary2.index(s), dictionary2.index(t))
  except ValueError:
    print("WARNING: Couldn't find pair", s, t)
    continue
  pairs1.append(pair1)
  pairs2.append(pair2)

print(len(pairs1))
print(pairs1)
print(pairs2)


In [None]:
# Load the co-occurrence matrix of each model, restricted by the filter
# script to the word indices that appear in the selected pairs.
# NOTE: load_cooccurrences() reuses an existing filtered file without
# looking at the indices, so delete the *.filt.bin files if the pairs change.
print("Loading cooccurrence1...")
filter_ids1 = []
for pair in pairs1:
  filter_ids1.extend(pair)  # flatten (s, t) tuples into s, t, s, t, ...
C1 = load_cooccurrences(cooccur1_path, cooccur1_filt_path, filter_ids1)
# Row sums as a flat 1-D array (one total per row of C1).
Csum1 = C1.sum(axis=1).A1

print("Loading cooccurrence2...")
filter_ids2 = []
for pair in pairs2:
  filter_ids2.extend(pair)
C2 = load_cooccurrences(cooccur2_path, cooccur2_filt_path, filter_ids2)
Csum2 = C2.sum(axis=1).A1


In [None]:
from typing import List
from math import log, inf, sqrt, exp
from collections import defaultdict
import pandas as pd


def cos_sim(s, t, C, Csum, B, word, ctx, num):
  """
  Cosine similarity between the embeddings of words s and t.

  The embedding of a word is the sum of its word vector and its context
  vector. C, Csum, B and num are not used; they are accepted so that every
  similarity measure can be called with the same argument list.
  """
  emb_s = word[s] + ctx[s]
  emb_t = word[t] + ctx[t]
  inner = emb_s.dot(emb_t)
  scale = np.linalg.norm(emb_s)*np.linalg.norm(emb_t)
  return inner/scale


def model_f(u: int, v: int, c: float, epsilon: float, B: np.ndarray) -> float:
  """
  Return max(log(c) - B[u] - B[v], epsilon).

  log(c) is taken to be -inf whenever c is not strictly positive, so in
  that case the result is epsilon (for finite biases).
  """
  if c > 0:
    log_count = log(c)
  else:
    log_count = -inf
  shifted = log_count-B[u]-B[v]
  return max(shifted, epsilon)


def M_row(u, C, B):
  """
  Calculate a sparse row of M.

  For every entry stored in row u of C (column v, count c) the result holds
  max(log(c) - B[u] - B[v], 0), i.e. the same value as
  model_f(u, v, c, 0, B), with log(c) treated as -inf when c is not > 0.
  Columns that are not stored in C stay empty.

  The values are computed in one vectorised pass over the row's stored data
  instead of one scalar sparse lookup C[u, v] per entry. Working from the
  stored entries also keeps the values aligned with the row's index array
  when C contains explicitly stored zeros.

  Returns a 1 x C.shape[1] csr_matrix.
  """
  B = np.asarray(B)
  C_row = C.getrow(u)
  cols = C_row.indices
  counts = C_row.data
  # Start at -inf and fill in log(c) only where c > 0 (avoids log(0) warnings).
  log_counts = np.full(counts.shape, -np.inf, dtype=np.float64)
  positive = counts > 0
  log_counts[positive] = np.log(counts[positive])
  vals = np.maximum(log_counts - B[u] - B[cols], 0.0)
  return csr_matrix((vals, cols, C_row.indptr), C_row.shape)


def sim1(s, t, C, Csum, B, word, ctx, num):
  """
  First similarity measure between words s and t, computed from counts.

  Numerator: model_f applied to the co-occurrence count C[s, t] with a
  floor of 0. Denominator: square root of the product of model_f applied to
  the row totals Csum[s] and Csum[t], each with a floor of exp(-60).
  word, ctx and num are not used; they are accepted so that every
  similarity measure can be called with the same argument list.
  """
  floor = exp(-60)
  numerator = model_f(s, t, C[s, t], 0, B)
  denom_s = model_f(s, t, Csum[s], floor, B)
  denom_t = model_f(s, t, Csum[t], floor, B)
  return numerator/sqrt(denom_s*denom_t)


# sim2 is comparatively expensive and is evaluated again inside sim1+2, so
# its results are memoized here, keyed by (s, t, num).
sim2_cache = {}


def sim2(s, t, C, Csum, B, word, ctx, num):
  """
  Second similarity measure: cosine similarity of rows s and t of M.

  The rows are built with M_row(s, C, B) and M_row(t, C, B). Results are
  memoized in sim2_cache under (s, t, num); C and B are NOT part of the
  key, so callers must pass a distinct `num` for each (C, B) model.
  Csum, word and ctx are not used.
  """
  key = (s, t, num)
  if key in sim2_cache:
    return sim2_cache[key]

  row_s = M_row(s, C, B)
  row_t = M_row(t, C, B)
  inner = (row_s @ row_t.transpose())[0, 0]
  sq_norm_s = row_s.power(2).sum()
  sq_norm_t = row_t.power(2).sum()
  result = inner/sqrt(sq_norm_s*sq_norm_t)
  sim2_cache[key] = result
  return result


def sim12(*args):
  """ Arithmetic mean of sim1 and sim2, both called with the same arguments """
  first = sim1(*args)
  second = sim2(*args)
  return (first+second)/2


# (label, function) for every similarity measure; all functions share the
# signature (s, t, C, Csum, B, word, ctx, num).
measures = [("cos", cos_sim), ("sim1", sim1),
            ("sim2", sim2), ("sim1+2", sim12)]
# Column name -> list of values, one value per pair; "<label>_pre" holds the
# value under model 1 and "<label>_post" the value under model 2.
data = defaultdict(list)

for (s1, t1), (s2, t2) in zip(pairs1, pairs2):
  print("Source:", dictionary1[s1], "Target:", dictionary1[t1])
  # Iterate the single `measures` list instead of a second hand-typed copy.
  for desc, func in measures:
    # The trailing 1 / 2 identifies the model; sim2 uses it in its cache key.
    val1 = func(s1, t1, C1, Csum1, B1, word1, ctx1, 1)
    val2 = func(s2, t2, C2, Csum2, B2, word2, ctx2, 2)
    data[desc+"_pre"].append(val1)
    data[desc+"_post"].append(val2)
    print(desc+":", val1, "->", val2, "DIFF:", val2-val1)
  print()

df = pd.DataFrame(data)
df


In [None]:
import matplotlib as mpl
import matplotlib.pyplot as plt

fig, ax = plt.subplots()

# Plot the increase in distributional proximity vs. increase in embedding
# proximity, similar to Figure A.2(c) in the original paper
x = df["sim1+2_post"]-df["sim1+2_pre"]
y = df["cos_post"]-df["cos_pre"]
# Draw on the axes created above rather than through pyplot's implicit
# "current axes", and label the axes so the figure can be read on its own.
ax.scatter(x, y)
# Least-squares straight line through the points, evaluated at the sorted
# distinct x values.
xs = np.unique(x)
ax.plot(xs, np.poly1d(np.polyfit(x, y, 1))(xs))
ax.set_xlabel("Change in sim1+2 (post - pre)")
ax.set_ylabel("Change in cosine similarity (post - pre)")
print(np.corrcoef(x, y))

In [None]:
def get_coss(u, evec, norms):
  """
  Cosine similarity of every row of evec with row u.

  norms must hold the norm of each row of evec (norms[i] for row i).
  Returns one similarity per row of evec.
  """
  target = evec[u].transpose()
  scale = norms[u]*norms
  return (evec @ target) / scale


def top_ranked(u, n, evec, norms):
  """
  Indices of the n words most similar to word u, most similar first.

  The n+1 highest cosine similarities are selected and the single highest
  one is discarded -- presumably that is u itself (similarity 1); verify if
  evec may contain duplicate rows.
  """
  coss = get_coss(u, evec, norms)
  k = n+1
  # Unordered indices of the k largest similarities.
  candidates = np.argpartition(coss, -k)[-k:]
  # Order them by ascending similarity, drop the largest, then reverse.
  ascending = candidates[np.argsort(coss[candidates])]
  return ascending[:-1][::-1]


def top_ranked_str(u, n, evec, norms, dictionary):
  """ Same as top_ranked, but returns the words looked up in dictionary """
  words = []
  for idx in top_ranked(u, n, evec, norms):
    words.append(dictionary[idx])
  return words


def get_rank(s, t, evec, norms):
  """
  Rank of the source word s relative to the target word t.

  The rank is the number of words whose cosine similarity to t is strictly
  greater than that of s.
  """
  coss = get_coss(t, evec, norms)
  closer = coss > coss[s]
  return np.sum(closer)

# compute the "e" vector for each word
evec1 = word1+ctx1
evec2 = word2+ctx2
# precompute the norm of each word's "e" vector (used to find cosine similarity)
norms1 = np.linalg.norm(evec1, axis=1)
norms2 = np.linalg.norm(evec2, axis=1)
# One (rank under model 1, rank under model 2) tuple per pair.
ranks = []

# Print the top 10 highest-ranked words for the target, before and after retraining
for (s1, t1), (s2, t2) in zip(pairs1, pairs2):
  rank1 = get_rank(s1, t1, evec1, norms1)
  rank2 = get_rank(s2, t2, evec2, norms2)
  ranks.append((rank1, rank2))
  print("Source:", dictionary1[s1], "Target:", dictionary1[t1])
  print("Rank:", rank1, "->", rank2)
  # Reuse the precomputed evec1/evec2 instead of re-adding the full word and
  # context matrices on every iteration.
  print("Pre: \n" + "\n".join(top_ranked_str(t1, 10, evec1, norms1, dictionary1)))
  print()
  print("Post:\n" + "\n".join(top_ranked_str(t2, 10, evec2, norms2, dictionary2)))
  print()

rdf = pd.DataFrame(ranks, columns=["rank_pre", "rank_post"])
rdf

In [None]:
# Summary of the source-word ranks before ("rank_pre") and after
# ("rank_post") retraining.
print("Median rank:")
for label, col in (("Pre: ", "rank_pre"), ("Post:", "rank_post")):
  print(label, rdf[col].median())
print()
print("Number of pairs with source word ranked < 10:")
# Boolean-frame indexing keeps values below 10 and turns the rest into NaN;
# count() then counts only the non-NaN entries of each column.
lt_10 = rdf[rdf < 10]
for label, col in (("Pre: ", "rank_pre"), ("Post:", "rank_post")):
  print(label, lt_10[col].count())