In [39]:
import os
import re

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
import torchtext.vocab as vocab
from nltk import download
from nltk.corpus import stopwords
from scipy.stats import kendalltau
from scipy.stats import pearsonr
from tqdm.notebook import tqdm

from word_mover_distance import model

# Load data

In [2]:
# Root folder of the corpus. Set the CORPUS_DIR environment variable to point
# at another location; the original author's local folder stays the default so
# existing setups keep working.
path_corpus = os.environ.get(
    "CORPUS_DIR",
    "/Users/philippmetzger/OneDrive/PHILIPP/NOVA IMS/2nd Semester/06 Text Mining 4 ECTS/00 Project/corpus/",
)


def _read_scores(pair):
    """Read ``<path_corpus>/<pair>/scores.csv`` into a DataFrame."""
    # os.path.join works whether or not path_corpus ends with a separator.
    return pd.read_csv(os.path.join(path_corpus, pair, "scores.csv"))


cs_en = _read_scores("cs-en")
de_en = _read_scores("de-en")
en_fi = _read_scores("en-fi")
en_zh = _read_scores("en-zh")
ru_en = _read_scores("ru-en")
zh_en = _read_scores("zh-en")

In [None]:
# One summary row per language pair: number of rows plus the mean of the
# "z-score", "avg-score" and "annotators" columns, rounded to two decimals.
# Each label is stored next to its frame so the two cannot drift out of sync.
language_pairs = {
    "Russian into English": ru_en,
    "German into English": de_en,
    "Czech into English": cs_en,
    "Chinese into English": zh_en,
    "English into Chinese": en_zh,
    "English into Finnish": en_fi,
}
descriptions = list(language_pairs)

# Building from records keeps "rows" as an integer column; transposing a list
# of mixed int/float lists would turn the counts into floats.
exploration_df = pd.DataFrame(
    [
        {
            "description": description,
            "rows": frame.shape[0],
            "avg z-score": np.round(frame["z-score"].mean(), 2),
            "avg avg-score": np.round(frame["avg-score"].mean(), 2),
            "avg annotators": np.round(frame["annotators"].mean(), 2),
        }
        for description, frame in language_pairs.items()
    ]
).set_index("description")
exploration_df

# Naive implementation inspired by Word Mover's distance

In [41]:
def get_word(word):
    """Return the embedding vector stored for ``word`` in the global ``glove``.

    The word is first mapped to its row index through ``glove.stoi``; that
    index then selects the row of ``glove.vectors``. A word missing from
    ``glove.stoi`` is not handled here, so the lookup error propagates.
    """
    row_index = glove.stoi[word]
    return glove.vectors[row_index]

In [None]:
# Naive sentence distance: for every in-vocabulary token of the first text, add
# the Euclidean distance to its nearest in-vocabulary token of the second text.
# NOTE: relies on `glove`, which this notebook only loads further down
# (vocab.GloVe(...)); that cell has to run before this one.


def _glove_tokens(text):
    """Lower-case ``text``, split on anything outside a-z, keep GloVe words only."""
    tokens = re.sub("[^a-z]", ' ', text.lower()).split()
    # `glove.stoi` is keyed by word (see get_word), so this is a key lookup
    # instead of a scan through the whole `glove.itos` sequence per token.
    return [token for token in tokens if token in glove.stoi]


distances = []

for i in tqdm(range(de_en.shape[0])):

    a = _glove_tokens(de_en.iloc[i, 1])
    b = _glove_tokens(de_en.iloc[i, 2])

    if not a or not b:
        # At least one text has no in-vocabulary token: store the placeholder
        # -1, which is swapped for the largest observed distance below.
        distances.append(-1)
        continue

    # Look up the vectors of `b` once instead of once per token of `a`.
    vectors_b = [get_word(word_b) for word_b in b]

    # .item() yields plain Python floats, so `distances` ends up as a
    # homogeneous numeric list rather than a mix of 0-dim tensors and ints.
    dist = sum(
        min(torch.dist(get_word(word_a), vector_b).item() for vector_b in vectors_b)
        for word_a in a
    )
    distances.append(dist)

# Real distances are non-negative, so -1 can only be the placeholder.
max_dist = max(distances)
distances = [max_dist if dist == -1 else dist for dist in distances]

  0%|          | 0/21704 [00:00<?, ?it/s]

In [None]:
# Pearson correlation between the de-en 'avg-score' column and the naive
# distances; [0] keeps only the first element of the returned result.
pearsonr(de_en['avg-score'], distances)[0]

In [None]:
# Kendall's tau between the de-en 'avg-score' column and the naive distances;
# [0] keeps only the first element of the returned result.
kendalltau(de_en['avg-score'], distances)[0]

# Word mover's distance
https://pypi.org/project/word-mover-distance/#description

## Load the GloVe embedding model, convert it to a dictionary, and pass it to word_mover_distance

In [3]:
# Load the 50-dimensional GloVe "6B" vectors through torchtext and report how
# many words the vocabulary (glove.itos) contains.
glove = vocab.GloVe(dim=50, name='6B')
n_words = len(glove.itos)
print('Loaded {} words'.format(n_words))

Loaded 400000 words


In [4]:
# Map every vocabulary word to its embedding as a NumPy array: row `idx` of
# glove.vectors belongs to the word at glove.itos[idx].
glove_dict = {
    glove.itos[idx]: vector.numpy()
    for idx, vector in enumerate(glove.vectors)
}

In [5]:
# Wrap the word -> vector dictionary in the package's WordEmbedding object;
# its wmdistance() method is what the cells below call.
my_model = model.WordEmbedding(model=glove_dict)

## Test word_mover_distance

In [6]:
# Smoke test: distance between two lower-cased, whitespace-tokenised sentences.
s1, s2 = (
    sentence.lower().split()
    for sentence in (
        'Obama speaks to the media in Chicago',
        'The president spoke to the press in Chicago',
    )
)
wmdistance = my_model.wmdistance(s1, s2)
wmdistance

1.8119693993679309

1.8119693993679309

## Use word_mover_distance on our data

### Version 1: With stopword removal

In [30]:
# English stop-word list from NLTK. Where the corpus has never been fetched,
# stopwords.words() raises LookupError, so download it once and retry.
try:
    stop = stopwords.words('english')
except LookupError:
    download('stopwords')
    stop = stopwords.words('english')

In [31]:
# Word Mover's Distance for every de-en row, with stop words removed.
data = de_en

# stopwords.words() returns a list; a set makes each per-token membership
# test a hash lookup instead of a scan over the whole stop-word list.
stop_set = set(stop)

wmdistances = []

# Columns 1 and 2 (by position) hold the two texts that are compared. Zipping
# the columns avoids one scalar .iloc lookup per cell.
for s1, s2 in tqdm(zip(data.iloc[:, 1], data.iloc[:, 2]), total=data.shape[0]):

    # Lower-case, turn everything outside a-z into spaces, split on whitespace.
    s1 = re.sub("[^a-z]", ' ', s1.lower()).split()
    s2 = re.sub("[^a-z]", ' ', s2.lower()).split()

    s1 = [word for word in s1 if word not in stop_set]
    s2 = [word for word in s2 if word not in stop_set]

    wmdistance = my_model.wmdistance(s1, s2)
    wmdistances.append(wmdistance)

  0%|          | 0/21704 [00:00<?, ?it/s]

#### Replace inf values by maximum

In [32]:
# Impute infinite distances in two passes: flag them with -1, take the largest
# remaining value, then write that value into the flagged slots.
wmdistances2 = list(map(lambda d: d if d != np.inf else -1, wmdistances))
max_dist = max(wmdistances2)
wmdistances2 = list(map(lambda d: d if d != -1 else max_dist, wmdistances2))

In [33]:
# Pearson correlation between the de-en 'avg-score' column and the
# stop-word-filtered distances (inf replaced by the maximum).
pearsonr(de_en['avg-score'], wmdistances2)

(-0.27926436711081465, 0.0)

In [34]:
# Kendall's tau between the de-en 'avg-score' column and the
# stop-word-filtered distances (inf replaced by the maximum).
kendalltau(de_en['avg-score'], wmdistances2)

KendalltauResult(correlation=-0.20792525828577252, pvalue=0.0)

#### Replace inf values by maximum/2

In [35]:
# Impute infinite distances with half of the largest finite distance.
# The maximum is recomputed from `wmdistances` here so the cell does not rely
# on a `max_dist` left behind by whichever cell happened to run before it.
# max() raises ValueError if no distance is finite.
max_dist = max(dist for dist in wmdistances if dist != np.inf)

wmdistances2 = [(max_dist/2) if dist == np.inf else dist for dist in wmdistances]

In [36]:
# Pearson correlation between the de-en 'avg-score' column and the
# stop-word-filtered distances (inf replaced by maximum/2).
pearsonr(de_en['avg-score'], wmdistances2)

(-0.2931366960034692, 0.0)

In [37]:
# Kendall's tau between the de-en 'avg-score' column and the
# stop-word-filtered distances (inf replaced by maximum/2).
kendalltau(de_en['avg-score'], wmdistances2)

KendalltauResult(correlation=-0.2081883157171871, pvalue=0.0)

### Version 2: Without stopword removal

In [7]:
# Word Mover's Distance for every de-en row, keeping stop words.
data = de_en
wmdistances = []

for row in tqdm(range(data.shape[0])):
    # Columns 1 and 2 (by position) hold the two texts; each is lower-cased,
    # stripped of everything outside a-z, and split on whitespace.
    s1, s2 = (
        re.sub("[^a-z]", ' ', data.iloc[row, column].lower()).split()
        for column in (1, 2)
    )

    wmdistance = my_model.wmdistance(s1, s2)
    wmdistances.append(wmdistance)

  0%|          | 0/21704 [00:00<?, ?it/s]

#### Replace inf values by maximum

In [8]:
# Impute infinite distances in two passes: flag them with -1, take the largest
# remaining value, then write that value into the flagged slots.
wmdistances2 = list(map(lambda d: d if d != np.inf else -1, wmdistances))
max_dist = max(wmdistances2)
wmdistances2 = list(map(lambda d: d if d != -1 else max_dist, wmdistances2))

In [9]:
# Pearson correlation between the de-en 'avg-score' column and the distances
# computed without stop-word removal (inf replaced by the maximum).
pearsonr(de_en['avg-score'], wmdistances2)

(-0.30853318447232847, 0.0)

In [10]:
# Kendall's tau between the de-en 'avg-score' column and the distances
# computed without stop-word removal (inf replaced by the maximum).
kendalltau(de_en['avg-score'], wmdistances2)

KendalltauResult(correlation=-0.2168328261851982, pvalue=0.0)

#### Replace inf values by maximum/2

In [11]:
# Impute infinite distances with half of the largest finite distance.
# The maximum is recomputed from `wmdistances` here so the cell does not rely
# on a `max_dist` left behind by whichever cell happened to run before it.
# max() raises ValueError if no distance is finite.
max_dist = max(dist for dist in wmdistances if dist != np.inf)

wmdistances2 = [(max_dist/2) if dist == np.inf else dist for dist in wmdistances]

In [12]:
# Pearson correlation between the de-en 'avg-score' column and the distances
# computed without stop-word removal (inf replaced by maximum/2).
pearsonr(de_en['avg-score'], wmdistances2)

(-0.30882340005334613, 0.0)

In [13]:
# Kendall's tau between the de-en 'avg-score' column and the distances
# computed without stop-word removal (inf replaced by maximum/2).
kendalltau(de_en['avg-score'], wmdistances2)

KendalltauResult(correlation=-0.2168229814123602, pvalue=0.0)