In [None]:
!pip install git+https://github.com/AIPHES/emnlp19-moverscore.git

In [None]:
!pip install pyemd==0.5.1

In [None]:
# Needed for `moverscore`, not for `moverscore_v2`:

# !pip install pytorch_pretrained_bert
# !pip install sentencepiece

In [None]:
!pip install transformers

In [None]:
!pip install sacremoses

In [None]:
# Select the encoder that MoverScore should use. Other checkpoints that were tried:
#   'bert-base-multilingual-uncased'
#   'microsoft/deberta-v3-large'
#   'MoritzLaurer/mDeBERTa-v3-base-mnli-xnli'
# 'xlm-roberta-large' is the same model as was used in BERTScore for German.
import os

model_name = 'xlm-roberta-large'

# Set the variable from Python instead of using the `%env` magic: `%env` takes everything
# after `=` as the value, so a trailing `# comment` would end up inside the model name.
# The assignment comes before the import below because moverscore_v2 presumably reads
# MOVERSCORE_MODEL when the module is imported - TODO confirm against the installed version.
os.environ['MOVERSCORE_MODEL'] = model_name

from moverscore_v2 import word_mover_score, get_idf_dict

In [None]:
from collections import defaultdict


def _unit_weight():
    """Return the weight used for every token when idf weighting is switched off."""
    return 1.


# Uniform token weights instead of idf weights (seems to work better).
# Any key that is looked up gets the weight 1.0.
idf_dict_ref = defaultdict(_unit_weight)
idf_dict_hyp = defaultdict(_unit_weight)

# To use real idf weights instead, run these once the sentence lists are loaded:
# idf_dict_hyp = get_idf_dict(hyp_snts)
# idf_dict_ref = get_idf_dict(ref_snts)

In [None]:
from nltk import download
from nltk.corpus import stopwords

# Fetch the NLTK stop-word corpus before reading from it.
download('stopwords')

# Language of the stop-word list; switch to 'german' for German text.
STOPWORD_LANGUAGE = 'english'
stop_words = stopwords.words(STOPWORD_LANGUAGE)

### Load WMT15 data

In [None]:
# Load the WMT15 data: system translations, references and human scores,
# one entry per line in each file.
data_type = 'wmt15'

# Strip only the line terminator. Slicing with [:-1] would also cut the last real
# character from a final line that does not end in '\n'.
with open("/content/mt.txt", encoding="utf-8") as f:
    hyp_snts = [line.rstrip('\n') for line in f]

with open("/content/reference.txt", encoding="utf-8") as f:
    ref_snts = [line.rstrip('\n') for line in f]

# float() ignores surrounding whitespace, so the newline does not need to be removed.
with open("/content/newstest2015.human.de-en", encoding="utf-8") as f:
    human_scores = [float(line) for line in f]

# The three lists are used position by position, so they must have the same length.
assert len(hyp_snts) == len(ref_snts) == len(human_scores), (
    f"line counts differ: {len(hyp_snts)} hypotheses, "
    f"{len(ref_snts)} references, {len(human_scores)} human scores"
)

### Load WMT21 data

In [None]:
# Alternative data source: uncomment this cell to load the WMT21 pickles. It sets the same
# variables as the WMT15 cell (data_type, ref_snts, hyp_snts, human_scores) plus src_snts.
# NOTE(review): pickle.load can execute arbitrary code - only load files you trust.
# import pickle
# data_type = 'wmt21'
# with open('/content/all_ref_snts_21.pickle', 'rb') as fp:
#     ref_snts = pickle.load(fp)

# with open('/content/all_mt_snts_21.pickle', 'rb') as fp:
#     hyp_snts = pickle.load(fp)

# with open('/content/all_z_mqm_scores.pickle', 'rb') as fp:
#     human_scores = pickle.load(fp)

# with open('/content/all_src_snts_21.pickle', 'rb') as fp:
#     src_snts = pickle.load(fp)

In [None]:
import os

# Encoder checkpoint for MoverScore. Other checkpoints that were tried:
#   'bert-base-multilingual-uncased'
#   'MoritzLaurer/mDeBERTa-v3-base-mnli-xnli'
model_name = 'xlm-roberta-large'

# NOTE(review): if moverscore_v2 was already imported, changing the variable here may not
# change the model it loaded - confirm, and restart the kernel when switching models.
os.environ.update(MOVERSCORE_MODEL=model_name)

In [None]:
from transformers import BertModel, BertTokenizer

# Smoke test: load the multilingual BERT checkpoint and run one sentence through it.
BERT_CHECKPOINT = 'bert-base-multilingual-uncased'
tokenizer = BertTokenizer.from_pretrained(BERT_CHECKPOINT)
model = BertModel.from_pretrained(BERT_CHECKPOINT)

text = "Replace me by any text you'd like."
# return_tensors='pt' asks the tokenizer for PyTorch tensors, which are then
# unpacked as keyword arguments for the model call.
encoded_input = tokenizer(text, return_tensors='pt')
output = model(**encoded_input)

In [None]:
from collections import defaultdict

# Use uniform weights: every token that is looked up gets 1.0.
# The generator builds two independent dictionaries, one per side.
# For real idf weights, build the dictionaries with get_idf_dict(...) instead.
idf_dict_hyp, idf_dict_ref = (defaultdict(lambda: 1.0) for _ in range(2))

In [None]:
# Score the whole corpus in chunks of `step` sentence pairs.
# The loop starts at len(mover_scores), so an interrupted run can be resumed by
# re-running this cell without the reset on the next line.
mover_scores = []
total = len(ref_snts)
# Hypotheses and references are paired by position, so the lists must line up.
assert len(hyp_snts) == total, "hyp_snts and ref_snts must have the same length"
step = 256
start = len(mover_scores)
for i in range(start, total, step):
    next_part = i + step
    print(f"----- {i} out of {total} steps -----")
    print(next_part)
    # stop_words=[] means no stop-word filtering is requested.
    scores = word_mover_score(ref_snts[i:next_part], hyp_snts[i:next_part], idf_dict_ref, idf_dict_hyp,
                              stop_words=[], n_gram=1, remove_subwords=False, batch_size=step)
    # extend() appends in place instead of copying the growing list on every chunk.
    mover_scores.extend(scores)

In [None]:
# Save the computed scores to a pickle file named after the data set and the model.
import pickle  # imported here because the only other import is in a commented-out cell

curr_len = len(mover_scores)
# Hub model names may contain '/', which would be read as a directory separator
# in the file name; names without '/' are left unchanged.
safe_model_name = model_name.replace('/', '_')
with open(f'mover_scores_{data_type}_{safe_model_name}.pickle', 'wb') as f:
    pickle.dump(mover_scores, f, pickle.HIGHEST_PROTOCOL)

---
### Exploratory analysis

In [None]:
def preprocess(s):
    """Wrap a single sentence in a one-element list.

    Args:
        s: The sentence to wrap.

    Returns:
        A new list whose only element is `s`.
    """
    batch = [s]
    return batch

In [None]:
def _mover_score(ref_batch, hyp_batch):
    """Score one reference batch against one hypothesis batch and return the first score.

    Uses the notebook-level idf dictionaries and stop-word list.
    """
    return word_mover_score(ref_batch, hyp_batch, idf_dict_ref, idf_dict_hyp,
                            stop_words=stop_words, n_gram=1, remove_subwords=False, batch_size=256)[0]


# Rows of [reference text, hypothesis text, score] for the summary printed at the end.
l = []

s_obama = 'Obama speaks to the media in Illinois'
sentence_obama = preprocess(s_obama)

# The first hypothesis is printed with labels and a rounded score.
first_hyp = 'Obama meets the media in Illinois'
first_hyp_batch = preprocess(first_hyp)
first_score = _mover_score(sentence_obama, first_hyp_batch)
print("Ref:", sentence_obama, "Hyp:", first_hyp_batch, "WMD score:", round(first_score, 4))
l.append([s_obama, first_hyp, first_score])

# The remaining variants of the reference sentence are printed unrounded.
other_hyps = [
    'Obama speaks to the media',
    'in Illinois Obama to the media speaks',
    'He speaks to the media in Illinois',
    "Obama hates the media in Illinois",
]
for hyp_text in other_hyps:
    hyp_batch = preprocess(hyp_text)
    hyp_score = _mover_score(sentence_obama, hyp_batch)
    print(sentence_obama, hyp_batch, hyp_score)
    l.append([s_obama, hyp_text, hyp_score])

# Single-word comparison; it only appears in the summary below.
l.append(['speaks', 'hates', _mover_score(preprocess('speaks'), preprocess('hates'))])


for rank, (ref_text, hyp_text, score) in enumerate(l, start=1):
    print()
    print("{}. Ref: {:<42} Hyp: {:<42} MoverScore: {}".format(rank, str(ref_text), str(hyp_text), round(score, 4)))

In [None]:
# WordNet lemmatizer demo.
import nltk
from nltk.stem import WordNetLemmatizer

# Download the corpora this demo uses.
for resource in ('wordnet', 'omw-1.4'):
    nltk.download(resource)

lemmatizer = WordNetLemmatizer()

# Each entry is (word, extra keyword arguments for lemmatize()).
# pos="a" denotes adjective.
examples = [
    ("rocks", {}),
    ("corpora", {}),
    ("spoken", {"pos": "v"}),
    ("better", {"pos": "a"}),
]
for word, options in examples:
    print(f"{word} :", lemmatizer.lemmatize(word, **options))

In [None]:
# Example from RegEMT paper: one reference scored against two hypotheses.

ref = "I never wrote this article, I just edited it."
hyp1 = "It is not my article, I just edited it."
hyp2 = "I never wrote this article, I never edited it."

# The reference is repeated so that it lines up with each hypothesis.
# stop_words=[] requests no stop-word filtering; the previous `[...]` was a list
# holding the Ellipsis object, which cannot match any token.
scores = word_mover_score([ref, ref], [hyp1, hyp2], idf_dict_ref, idf_dict_hyp,
                          stop_words=[], n_gram=1, remove_subwords=False, batch_size=256)
scores

In [None]:
from moverscore_v2 import plot_example

# Plot one reference/hypothesis pair; the sentences are labelled as WMT15 examples.
ref1_wmt15 = 'The Education Ministry said about a dozen families still had not returned.'
hyp1_wmt15 = 'The Ministry of Education said, about a dozen families is not yet returned.'

# NOTE(review): the meaning of the leading `True` flag is defined by plot_example - confirm.
plot_example(
    True,
    ref1_wmt15,
    hyp1_wmt15,
)

In [None]:
# Plot a second pair: two sentences with little word overlap.
# Note that `hyp2` is also assigned in the RegEMT example cell; this replaces that value.
ref2, hyp2 = 'I am afraid of you a lot', 'I have a big fear of you'
plot_example(
    True,
    ref2,
    hyp2,
)

### Experiment with sentence embeddings

In [None]:
from sentence_transformers import SentenceTransformer

# Sentence-embedding checkpoint used in the experiments below.
SENTENCE_ENCODER_NAME = 'all-MiniLM-L6-v2'

# Note: this rebinds `model`, which the BERT smoke-test cell assigned earlier.
model = SentenceTransformer(SENTENCE_ENCODER_NAME)

In [None]:
# Sample sentences to embed; the last two differ only by a negation.
sentences = [
    'This framework generates embeddings for each input sentence',
    'Sentences are passed as a list of string.',
    'The quick brown fox jumps over the lazy dog.',
    'I know',
    "I don't know",
]

# Encode all sentences in one call and show the result as the cell output.
sentence_embeddings = model.encode(sentences)
sentence_embeddings

### Experiment with distance measure

In [None]:
def pairwise_distances(x, y=None):
    """Return the squared Euclidean distances between the rows of `x` and the rows of `y`.

    Computed as ||x_i||^2 + ||y_j||^2 - 2 * <x_i, y_j>.

    Args:
        x: 2-D tensor with one vector per row.
        y: 2-D tensor with one vector per row and the same number of columns as `x`.
            When omitted, the distances among the rows of `x` are returned.

    Returns:
        Tensor of shape (rows of x, rows of y); entry [i, j] is the squared
        distance between x[i] and y[j].
    """
    import torch  # local import: the function then works regardless of cell execution order

    if y is None:
        # Honour the documented default instead of failing on `None ** 2`.
        y = x

    x_norm = (x ** 2).sum(1).view(-1, 1)
    y_norm = (y ** 2).sum(1).view(1, -1)
    y_t = torch.transpose(y, 0, 1)
    dist = x_norm + y_norm - 2.0 * torch.mm(x, y_t)
    # Rounding can leave tiny negative values; clamp them to zero.
    # Only a lower bound is needed, so no numpy infinity is required.
    return torch.clamp(dist, min=0.0)

In [None]:
import torch
from scipy import spatial

In [None]:
# Cosine similarity between a sentence and its negated form.
embeddings = model.encode(["I will do it", "I won't do it"])
first_vec, second_vec = embeddings[0], embeddings[1]
# scipy returns the cosine distance; 1 - distance is the similarity.
1 - spatial.distance.cosine(first_vec, second_vec)

In [None]:
# Cosine similarity between two phrases that differ in one word ("just" vs "never").
embeddings = model.encode(["I just edited it", "I never edited it"])
first_vec, second_vec = embeddings[0], embeddings[1]
# scipy returns the cosine distance; 1 - distance is the similarity.
1 - spatial.distance.cosine(first_vec, second_vec)

In [None]:
# Cosine similarity for a full-sentence pair whose second clause differs in one word.
ref = 'I never wrote this article, I just edited it.'
hyp = 'I never wrote this article, I never edited it.'
embeddings = model.encode([ref, hyp])
ref_vec, hyp_vec = embeddings[0], embeddings[1]
# scipy returns the cosine distance; 1 - distance is the similarity.
1 - spatial.distance.cosine(ref_vec, hyp_vec)

In [None]:
# Baseline: cosine similarity of a sentence with an identical copy of itself.
ref = 'I never wrote this article.'
hyp = 'I never wrote this article.'
embeddings = model.encode([ref, hyp])
ref_vec, hyp_vec = embeddings[0], embeddings[1]
# scipy returns the cosine distance; 1 - distance is the similarity.
1 - spatial.distance.cosine(ref_vec, hyp_vec)