## Download everything we need

We need to download WordNet using NLTK.



In [1]:
import nltk
import pandas as pd
from nltk.corpus import wordnet as wn
# Download the WordNet corpus through NLTK so that `wn` lookups work.
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\pogre\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

## Prepare the data

We load the data from a prepared file. It contains a set of word pairs (nouns only) for which expert similarity estimates are known.

We build a dictionary that maps each word pair to its expert similarity estimate.

In [10]:
import pandas as pd

# Read the prepared sample of word pairs with their expert scores.
sample_path = "Task_4_sample_4.csv"
data = pd.read_csv(sample_path)
data

Unnamed: 0,word_1,word_2,Score
0,professor,cucumber,0.31
1,monk,slave,0.92
2,psychology,discipline,5.58
3,life,death,7.88
4,announcement,production,3.38
...,...,...,...
145,energy,secretary,1.81
146,planet,moon,8.08
147,lobster,food,7.81
148,precedent,information,3.85


In [12]:
# Map each (first word, second word) pair to its score, reading the three
# leading values of every row positionally.
score_map = dict(
    ((row[0], row[1]), row[2])
    for row in data.values.tolist()
)
score_map

{('professor', 'cucumber'): 0.31,
 ('monk', 'slave'): 0.92,
 ('psychology', 'discipline'): 5.58,
 ('life', 'death'): 7.88,
 ('announcement', 'production'): 3.38,
 ('word', 'similarity'): 4.75,
 ('drink', 'car'): 3.04,
 ('precedent', 'group'): 1.77,
 ('tiger', 'cat'): 7.35,
 ('situation', 'isolation'): 3.88,
 ('dividend', 'payment'): 7.63,
 ('bird', 'cock'): 7.1,
 ('announcement', 'news'): 7.56,
 ('century', 'nation'): 3.16,
 ('cemetery', 'woodland'): 2.08,
 ('cup', 'article'): 2.4,
 ('fuck', 'sex'): 9.44,
 ('street', 'block'): 6.88,
 ('tiger', 'mammal'): 6.85,
 ('peace', 'insurance'): 2.94,
 ('smart', 'student'): 4.62,
 ('seafood', 'lobster'): 8.7,
 ('Harvard', 'Yale'): 8.13,
 ('architecture', 'century'): 3.78,
 ('peace', 'plan'): 4.75,
 ('stock', 'phone'): 1.62,
 ('president', 'medal'): 3.0,
 ('money', 'cash'): 9.15,
 ('morality', 'importance'): 3.31,
 ('Japanese', 'American'): 6.5,
 ('Arafat', 'Jackson'): 2.5,
 ('month', 'hotel'): 1.81,
 ('life', 'term'): 4.5,
 ('money', 'dollar'): 8

In [2]:
# Disabled alternative loader: reads tab-separated triples from
# "wordsim_similarity_goldstandard.txt" and builds the same kind of
# score_map (word pair -> float score). Presumably superseded by the CSV
# loading above -- confirm before deleting.
#with open("wordsim_similarity_goldstandard.txt", encoding="utf-8") as rf:
#  triples = [line.strip().split("\t") for line in rf.readlines()]
#  score_map = {tuple(triple[:2]): float(triple[2]) for triple in triples}

Note that we took only the expert similarity estimates from the original file, and only for nouns. The original set is available [here](http://alfonseca.org/pubs/ws353simrel.tar.gz)

Let's have a look at similarity measure examples. 

Some words have several different meanings (senses) in WordNet. Here, just as an example, we take the first noun sense of each word; later we will consider all noun senses of both words and keep the best-scoring pair.




In [14]:
# Sanity check on the first two word pairs: show the expert score next to
# WordNet measures computed on the first noun sense ("<word>.n.01") only.
preview_pairs = list(score_map)[:2]
for pair in preview_pairs:
  w1, w2 = pair

  header = "\nWords: %s-%s\nGround truth score: %.2f" % (w1, w2, score_map[pair])
  print(header)

  ss1, ss2 = (wn.synset(word + ".n.01") for word in pair)

  # Each measure is computed right before it is printed; end=" " plus the
  # leading "\n" of the next string reproduces the one-measure-per-line layout.
  path_line = "\nPath: %.3f" % ss1.path_similarity(ss2)
  print(path_line, end=" ")

  wup_line = "\nwup: %.3f" % ss1.wup_similarity(ss2)
  print(wup_line, end=" ")

  distance_line = "\nshortest_path: %.3f" % ss1.shortest_path_distance(ss2)
  print(distance_line)


Words: professor-cucumber
Ground truth score: 0.31

Path: 0.077 
wup: 0.500 
shortest_path: 12.000

Words: monk-slave
Ground truth score: 0.92

Path: 0.200 
wup: 0.667 
shortest_path: 4.000


## Compute several similarity measures for all word pairs

In [15]:
from itertools import product

list_pairs = list(score_map)
wup_list, true_list, path_list, lch_list = [], [], [], []


def _best_similarity(synset_pairs, method_name):
  """Return the highest score of one similarity measure over sense pairs.

  Args:
    synset_pairs: list of (synset, synset) tuples to compare.
    method_name: name of the Synset method to call, e.g. "wup_similarity".

  Returns:
    The maximum score among the pairs that produced a value.

  Raises:
    ValueError: if no pair produced a score (for example, when one of the
      words has no noun synsets, so `synset_pairs` is empty).
  """
  scores = (getattr(first, method_name)(second) for first, second in synset_pairs)
  # NLTK documents these measures as possibly returning None when no
  # connecting path exists; None cannot be compared by max(), so drop it.
  return max(score for score in scores if score is not None)


# for every word pair
for w1, w2 in list_pairs:

  try:
    # Build the cross product of noun senses once and reuse it for all measures.
    synset_pairs = list(product(wn.synsets(w1, pos="n"), wn.synsets(w2, pos="n")))

    wup = _best_similarity(synset_pairs, "wup_similarity")
    path = _best_similarity(synset_pairs, "path_similarity")
    lch = _best_similarity(synset_pairs, "lch_similarity")

  except Exception as e:
    # Report and skip the pair; nothing has been appended yet, so the four
    # result lists stay the same length and aligned index-by-index.
    print(w1, w2, "error:", e)
    continue

  # Append the metrics of interest and the expert score together, only after
  # every measure succeeded.
  wup_list.append(wup)
  path_list.append(path)
  lch_list.append(lch)
  true_list.append(score_map[(w1, w2)])

## Calculate Spearman's rank correlation

In [16]:
from scipy.stats import spearmanr

# Rank-correlate each WordNet measure with the expert scores and print one
# line per measure, in the order wup, path, lch.
report_rows = (
    ("wup  Spearman R: %.4f", wup_list),
    ("path Spearman R: %.4f", path_list),
    ("lch Spearman R: %.4f", lch_list),
)
for template, measure_scores in report_rows:
  coef, p = spearmanr(measure_scores, true_list)
  print(template % coef)

wup  Spearman R: 0.6936
path Spearman R: 0.6535
lch Spearman R: 0.6535


In [21]:
# Count the hyponyms of the first noun synset returned for "wood".
wood_first_sense = wn.synsets("wood", pos="n")[0]
len(wood_first_sense.hyponyms())

91

In [22]:
# List the hyponyms of the first noun synset returned for "wood".
wn.synsets("wood", pos="n")[0].hyponyms()

[Synset('alder.n.01'),
 Synset('ash.n.03'),
 Synset('balsa.n.01'),
 Synset('bamboo.n.01'),
 Synset('basswood.n.01'),
 Synset('beech.n.02'),
 Synset('beefwood.n.02'),
 Synset('bentwood.n.01'),
 Synset('birch.n.01'),
 Synset('black_locust.n.01'),
 Synset('blackwood.n.01'),
 Synset('boxwood.n.01'),
 Synset('brazilwood.n.01'),
 Synset('briarwood.n.01'),
 Synset('brushwood.n.01'),
 Synset('burl.n.01'),
 Synset('cabinet_wood.n.01'),
 Synset('cedar.n.02'),
 Synset('cherry.n.01'),
 Synset('chestnut.n.01'),
 Synset('citronwood.n.01'),
 Synset('cocuswood.n.01'),
 Synset('cypress.n.01'),
 Synset('dogwood.n.02'),
 Synset('driftwood.n.01'),
 Synset('dyewood.n.01'),
 Synset('ebony.n.02'),
 Synset('elm.n.02'),
 Synset('eucalyptus.n.01'),
 Synset('fir.n.01'),
 Synset('fruitwood.n.01'),
 Synset('granadilla_wood.n.01'),
 Synset('guaiac_wood.n.01'),
 Synset('gumwood.n.01'),
 Synset('hardwood.n.01'),
 Synset('hazel.n.02'),
 Synset('heartwood.n.01'),
 Synset('hemlock.n.03'),
 Synset('hickory.n.01'),
 Synse