In [1]:
from word_mover_distance import model
import torchtext.vocab as vocab
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
from scipy.stats import pearsonr
from scipy.stats import kendalltau
import re
import matplotlib.pyplot as plt
from nltk import download
from nltk.corpus import stopwords
import torch



# Load data

In [2]:
# Root folder of the corpus; each language pair has its own sub-folder with a scores.csv.
# NOTE(review): absolute, machine-specific path -- kept unchanged to preserve behaviour.
path_corpus = "/Users/philippmetzger/OneDrive/PHILIPP/NOVA IMS/2nd Semester/06 Text Mining 4 ECTS/00 Project/corpus/"

# Read one scores table per language pair (same files, same order as before).
cs_en, de_en, en_fi, en_zh, ru_en, zh_en = (
    pd.read_csv(path_corpus + relative_path)
    for relative_path in (
        "cs-en/scores.csv",
        "de-en/scores.csv",
        "en-fi/scores.csv",
        "en-zh/scores.csv",
        "ru-en/scores.csv",
        "zh-en/scores.csv",
    )
)

In [3]:
# Row labels for the summary table, in the same order as `corpora` below.
# NOTE(review): "Finish" is presumably a typo for "Finnish"; the literal is left
# unchanged because it is the row label shown in the stored cell output.
descriptions = ["Russian into English", "German into English", "Czech into English", "Chinese into English", "English into Chinese", "English into Finish"]
corpora = [ru_en, de_en, cs_en, zh_en, en_zh, en_fi]

# One summary value per corpus: number of rows and the mean of each score column,
# rounded to two decimals.
rows = [corpus.shape[0] for corpus in corpora]
zscores = [np.round(corpus["z-score"].mean(), 2) for corpus in corpora]
avgscores = [np.round(corpus["avg-score"].mean(), 2) for corpus in corpora]
annots = [np.round(corpus["annotators"].mean(), 2) for corpus in corpora]

# The four lists are stacked as rows and transposed so that each corpus becomes one row.
exploration_df = (
    pd.DataFrame([rows, zscores, avgscores, annots])
    .T
    .rename(columns={0: "rows", 1: "avg z-score", 2: "avg avg-score", 3: "avg annotators"})
)
exploration_df["description"] = descriptions
exploration_df = exploration_df.set_index("description")
exploration_df

Unnamed: 0_level_0,rows,avg z-score,avg avg-score,avg annotators
description,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Russian into English,17980.0,0.01,74.5,1.3
German into English,21704.0,0.0,71.85,1.5
Czech into English,11585.0,-0.03,69.24,1.89
Chinese into English,26419.0,-0.05,66.06,1.42
English into Chinese,10221.0,-0.06,65.98,1.58
English into Finish,6748.0,-0.14,45.12,1.23


# Naive implementation inspired by Word Mover's distance

In [4]:
def get_word(word):
    """Return the embedding vector stored for ``word`` in the global ``glove`` object.

    The word is first mapped to its row index through ``glove.stoi`` and that
    index is then used to select the row from ``glove.vectors``. A word missing
    from ``glove.stoi`` makes the lookup fail, so callers must filter
    out-of-vocabulary words beforehand.
    """
    row_index = glove.stoi[word]
    return glove.vectors[row_index]

In [6]:
# Load the 50-dimensional GloVe "6B" embeddings through torchtext and report
# how many words the loaded vocabulary contains.
glove = vocab.GloVe(name='6B', dim=50)
vocabulary_size = len(glove.itos)
print('Loaded {} words'.format(vocabulary_size))

Loaded 400000 words


In [7]:
# Naive, one-directional distance between the two sentences of every de_en row:
# for each word of sentence `a`, take the Euclidean distance to its closest word
# in sentence `b` and sum these minima. Rows where either sentence has no
# in-vocabulary word are marked with -1 and patched after the loop.
distances = []

for i in tqdm(range(de_en.shape[0])):

    # Columns 1 and 2 (positional) hold the two sentences to compare
    # (presumably reference and candidate translation -- TODO confirm against scores.csv).
    # Lowercase, blank out everything that is not a-z, then split on whitespace.
    a = re.sub("[^a-z]", ' ', de_en.iloc[i, 1].lower()).split()
    b = re.sub("[^a-z]", ' ', de_en.iloc[i, 2].lower()).split()

    # Stopword removal was tried at this point and worsened the correlation,
    # so it is intentionally not applied here.

    # Drop out-of-vocabulary tokens. Membership is tested against the `stoi`
    # mapping (the one `get_word` indexes) instead of scanning the `itos` list
    # once per token.
    a = [token for token in a if token in glove.stoi]
    b = [token for token in b if token in glove.stoi]

    if not a or not b:
        # At least one sentence has no word in the vocabulary.
        distances.append(-1)
        continue

    # Look up the vectors of `b` once per row rather than once per word pair.
    vectors_b = [get_word(token) for token in b]

    dist = 0
    for token in a:
        vector_a = get_word(token)
        dist += min(torch.dist(vector_a, vector_b) for vector_b in vectors_b)

    # Store a plain Python float instead of a 0-d tensor so the list is homogeneous.
    distances.append(float(dist))

# Rows flagged with -1 are assigned the largest distance observed in the data.
max_dist = max(distances)
distances = [max_dist if dist == -1 else dist for dist in distances]

  0%|          | 0/21704 [00:00<?, ?it/s]

In [8]:
# Pearson correlation between the `avg-score` column and the naive distances;
# [0] keeps the coefficient and drops the p-value.
pearsonr(de_en['avg-score'], distances)[0]

-0.1648361559689596

In [9]:
# Kendall's tau between the `avg-score` column and the naive distances;
# [0] keeps the coefficient and drops the p-value.
kendalltau(de_en['avg-score'], distances)[0]

-0.1396207216132029

# Word Mover's Distance
Package used: https://pypi.org/project/word-mover-distance/#description

## Convert the embedding model into a dictionary and pass it to word_mover_distance

In [10]:
# Map every vocabulary word to its embedding as a NumPy array. `glove.itos`
# and `glove.vectors` are walked in parallel, so word i is paired with row i.
glove_dict = {
    token: vector.numpy()
    for token, vector in zip(glove.itos, glove.vectors)
}

In [11]:
# Wrap the word -> vector dictionary in the package's WordEmbedding class;
# `my_model.wmdistance` is what the following cells call.
my_model = model.WordEmbedding(model=glove_dict)

## Test word_mover_distance

In [12]:
# Quick check of the model on two example sentences, lower-cased and split on
# whitespace into token lists before being passed to wmdistance.
s1 = 'Obama speaks to the media in Chicago'.lower().split()
s2 = 'The president spoke to the press in Chicago'.lower().split()
wmdistance = my_model.wmdistance(s1, s2)
wmdistance

1.8119693993679309

1.8119693993679309

## Use word_mover_distance on our data

### Version 1: With stopword removal

In [13]:
# English stopword list from NLTK. On a machine where the "stopwords" corpus has
# not been installed yet, NLTK raises LookupError; download it once and retry so
# the notebook also runs from a fresh environment.
try:
    stop = stopwords.words('english')
except LookupError:
    download('stopwords')
    stop = stopwords.words('english')

In [14]:
# Word Mover's Distance for every row of `data`, with stopwords removed.
data = de_en

# Loop-invariant helpers: a set gives constant-time stopword checks (instead of
# scanning the `stop` list for every token) and the regex is compiled once.
stop_words = set(stop)
non_letters = re.compile("[^a-z]")

wmdistances = []

for row in tqdm(range(data.shape[0])):

    # Columns 1 and 2 (positional) hold the two sentences to compare.
    # Lowercase, blank out everything that is not a-z, then split on whitespace.
    s1 = non_letters.sub(' ', data.iloc[row, 1].lower()).split()
    s2 = non_letters.sub(' ', data.iloc[row, 2].lower()).split()

    s1 = [word for word in s1 if word not in stop_words]
    s2 = [word for word in s2 if word not in stop_words]

    # The result may be np.inf for some rows; those values are replaced in the
    # cells that follow this one.
    wmdistance = my_model.wmdistance(s1, s2)
    wmdistances.append(wmdistance)

  0%|          | 0/21704 [00:00<?, ?it/s]

#### Replace inf values by maximum

In [15]:
# Largest distance once infinite entries are counted as -1 ...
max_dist = max(-1 if dist == np.inf else dist for dist in wmdistances)
# ... which then stands in for every infinite (or -1) entry.
wmdistances2 = [
    max_dist if (dist == np.inf or dist == -1) else dist
    for dist in wmdistances
]

In [16]:
# Pearson coefficient ([0]; p-value dropped) of `avg-score` vs. the distances
# with infinite values replaced by the maximum.
pearsonr(de_en['avg-score'], wmdistances2)[0]

-0.27926436711081465

In [17]:
# Kendall's tau ([0]; p-value dropped) of `avg-score` vs. the distances
# with infinite values replaced by the maximum.
kendalltau(de_en['avg-score'], wmdistances2)[0]

-0.20792525828577252

#### Replace inf values by maximum/2

In [18]:
# Infinite (or -1) entries are replaced by half of `max_dist`.
# `max_dist` is not recomputed here: it is the value left by the preceding
# "replace by maximum" cell, which therefore has to run first.
wmdistances2 = [
    (max_dist/2) if (dist == np.inf or dist == -1) else dist
    for dist in wmdistances
]

In [19]:
# Pearson coefficient ([0]; p-value dropped) of `avg-score` vs. the distances
# with infinite values replaced by maximum/2.
pearsonr(de_en['avg-score'], wmdistances2)[0]

-0.2931366960034692

In [20]:
# Kendall's tau ([0]; p-value dropped) of `avg-score` vs. the distances
# with infinite values replaced by maximum/2.
kendalltau(de_en['avg-score'], wmdistances2)[0]

-0.2081883157171871

### Version 2: Without stopword removal

In [21]:
# Word Mover's Distance for every row of `data`; stopwords are kept this time.
data = de_en
non_letters = re.compile("[^a-z]")

wmdistances = []

for row in tqdm(range(data.shape[0])):
    # Tokenise the sentences in positional columns 1 and 2: lowercase, blank out
    # everything that is not a-z, split on whitespace.
    s1, s2 = (
        non_letters.sub(' ', data.iloc[row, column].lower()).split()
        for column in (1, 2)
    )

    wmdistance = my_model.wmdistance(s1, s2)
    wmdistances.append(wmdistance)

  0%|          | 0/21704 [00:00<?, ?it/s]

#### Replace inf values by maximum

In [22]:
# Largest distance once infinite entries are counted as -1 ...
max_dist = max(-1 if dist == np.inf else dist for dist in wmdistances)
# ... which then stands in for every infinite (or -1) entry.
wmdistances2 = [
    max_dist if (dist == np.inf or dist == -1) else dist
    for dist in wmdistances
]

In [23]:
# Pearson coefficient ([0]; p-value dropped) of `avg-score` vs. the distances
# with infinite values replaced by the maximum.
pearsonr(de_en['avg-score'], wmdistances2)[0]

-0.30853318447232847

In [24]:
# Kendall's tau ([0]; p-value dropped) of `avg-score` vs. the distances
# with infinite values replaced by the maximum.
kendalltau(de_en['avg-score'], wmdistances2)[0]

-0.2168328261851982

#### Replace inf values by maximum/2

In [25]:
# Infinite (or -1) entries are replaced by half of `max_dist`.
# `max_dist` is not recomputed here: it is the value left by the preceding
# "replace by maximum" cell, which therefore has to run first.
wmdistances2 = [
    (max_dist/2) if (dist == np.inf or dist == -1) else dist
    for dist in wmdistances
]

In [26]:
# Pearson coefficient ([0]; p-value dropped) of `avg-score` vs. the distances
# with infinite values replaced by maximum/2.
pearsonr(de_en['avg-score'], wmdistances2)[0]

-0.30882340005334613

In [27]:
# Kendall's tau ([0]; p-value dropped) of `avg-score` vs. the distances
# with infinite values replaced by maximum/2.
kendalltau(de_en['avg-score'], wmdistances2)[0]

-0.2168229814123602