In [2]:
import time
import plwn
import numpy as np
import networkx as nx
import pandas as pd
from itertools import product
from data_loader import load_simlex_data
from wordnet_utils.graph_operations import get_graph_with_specified_relation, \
    delete_two_cycles_from_hiperonimia_graph
from wordnet_utils.plwn_utils import load_plwn_data, get_synsets_from_lemma
from wordnet_utils.similarities import WordNetSimilarities

In [3]:
# Input artefacts, given relative to the notebook's working directory.
SIMLEX_PATH = "data/MSimLex999_Polish.txt"
GRAPH_PATH = "data/graph.gml"

# Word pairs to be scored by the cells that follow.
simlex_data = load_simlex_data(SIMLEX_PATH)
# NOTE(review): the file carries a .gml extension but is parsed as GraphML —
# confirm the on-disk format really is GraphML.
graph = nx.read_graphml(GRAPH_PATH)
# load_plwn_data() hands back a pair; the second element is the synset
# collection that later cells iterate over and pass to the lemma lookup.
wn, synsets = load_plwn_data()

In [4]:
# Graph produced by the helper for the "hiperonimia" relation
# (presumably only hypernymy edges are kept — confirm in graph_operations).
only_hiperonyms = get_graph_with_specified_relation(graph, "hiperonimia")
# Materialise every simple cycle so it can be inspected more than once.
list_of_cycles = [*nx.simple_cycles(only_hiperonyms)]

In [5]:
# Index the synsets by id once instead of rescanning the whole collection
# (and building a throwaway list) for every vertex. setdefault keeps the
# first synset seen for an id, which is the element the former "[...][0]"
# lookup selected.
synset_by_id = {}
for syn in synsets:
    synset_by_id.setdefault(syn.id, syn)

# Print each cycle as a numbered block: one "<vertex> <synset>" line per
# vertex, followed by an empty line.
for i, cycle in enumerate(list_of_cycles, start=1):
    print(f"{i} cycle")
    for vertex in cycle:
        # Vertices are converted with int() before being matched against
        # synset ids; an id missing from the index raises KeyError.
        print(vertex , synset_by_id[int(vertex)])
    print()

1 cycle
2497 {zająć_się.1(29:cumy), poświęcić_się.1(29:cumy)}
2355 {zrobić.1(39:sp)}
44782 {zacząć.3(28:zmn), rozpocząć.1(28:zmn), zainicjować.1(28:zmn), wszcząć.1(28:zmn), jąć.1(28:zmn), począć.1(28:zmn)}
2373 {podjąć.1(39:sp), przedsięwziąć.1(29:cumy)}

2 cycle
2367 {podejmować.1(39:sp), przedsiębrać.1(28:zmn)}
2496 {zajmować_się.1(29:cumy), poświęcać_się.1(29:cumy)}
55305 {robić.1(39:sp)}



In [6]:
# Remove the cycles from the hypernymy graph; the helper reports the number of
# cycles before and after removal (see the cell output).
# NOTE(review): the return value is discarded, so the helper presumably mutates
# only_hiperonyms in place — confirm in wordnet_utils.graph_operations.
delete_two_cycles_from_hiperonimia_graph(only_hiperonyms)
# Similarity calculator built on the graph passed above; it provides the
# wu_palmer / leacock_chodrow methods called later in this notebook.
wordnet_sim = WordNetSimilarities(only_hiperonyms)

Number of cycles before removal: 2
Number of cycles after removal: 0


In [6]:
# Sanity check: list every SimLex lemma for which the synset lookup yields None.
for _, entry in simlex_data.iterrows():
    lemmas = (entry['word1'], entry['word2'])
    # Resolve both lemmas of the pair before reporting anything, so lookups
    # and prints happen in the same order as a plain word1/word2 sequence.
    resolved = [
        (lemma, get_synsets_from_lemma(synsets, lemma, use_dict=True))
        for lemma in lemmas
    ]
    for lemma, found in resolved:
        if found is None:
            print(lemma)
            print(found)

obładowany
None
niedawny
None


In [9]:
# Cache of similarity scores, keyed by (word1, word2).
metrics_dict = {}

In [10]:
# Score each SimLex pair with both similarity measures and cache the result in
# metrics_dict. Pairs that already have an entry are skipped, so re-running the
# cell only retries pairs that were not stored before.
for _, entry in simlex_data.iterrows():
    word1, word2 = entry['word1'], entry['word2']
    pair = (word1, word2)
    if metrics_dict.get(pair) is not None:
        continue

    # Look up both lemmas first; a None result means the lemma was not found.
    lookups = [get_synsets_from_lemma(synsets, lemma, use_dict=True) for lemma in pair]
    if any(found is None for found in lookups):
        print(f"One of the words {word1}, {word2} has not been found in Wordnet")
        continue

    first_word_syn_id, second_word_syn_id = (found.id for found in lookups)
    wu_p = wordnet_sim.wu_palmer(first_word_syn_id, second_word_syn_id)
    # "leacock_chodrow" is the method name as this notebook calls it.
    leacock_chodorow = wordnet_sim.leacock_chodrow(first_word_syn_id, second_word_syn_id)
    print(f"Wu-palmer similiarity for word:{word1} and word:{word2} is {wu_p}")
    print(f"Leacock-Chodorow similiarity for word:{word1} and word:{word2} is {leacock_chodorow}")

    metrics_dict[pair] = {"wu_palmer": wu_p, "leacock_chodorow": leacock_chodorow}


Wu-palmer similiarity for word:stary and word:nowy is 0.3333333333333333
Leacock-Chodorow similiarity for word:stary and word:nowy is 1.2041199826559248
Wu-palmer similiarity for word:bystry and word:inteligentny is 0.75
Leacock-Chodorow similiarity for word:bystry and word:inteligentny is 1.505149978319906
Wu-palmer similiarity for word:ciężki and word:trudny is 0.2857142857142857
Leacock-Chodorow similiarity for word:ciężki and word:trudny is 1.1072099696478683
Wu-palmer similiarity for word:szczęśliwy and word:radosny is 0.25
Leacock-Chodorow similiarity for word:szczęśliwy and word:radosny is 1.0280287236002434
Wu-palmer similiarity for word:łatwy and word:męczący is 0.18181818181818182
Leacock-Chodorow similiarity for word:łatwy and word:męczący is 1.0280287236002434
Wu-palmer similiarity for word:szybki and word:gwałtowny is 0.25
Leacock-Chodorow similiarity for word:szybki and word:gwałtowny is 1.2041199826559248
Wu-palmer similiarity for word:szczęśliwy and word:zadowolony is 0

  path_length_between_nodes / self.double_max_depth_of_taxonomy)


In [53]:
# Render the SimLex frame as this cell's output.
# NOTE(review): the stored output already lists wu_palmer / leacock_chodorow,
# yet simlex_data receives those columns only in later cells — this looks like
# out-of-order execution; confirm the notebook survives Restart & Run All.
simlex_data

Unnamed: 0_level_0,word1,word2,similarity,relatedness,wu_palmer,leacock_chodorow
nr,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,stary,nowy,0.43,7.29,0.333333,1.20412
2,bystry,inteligentny,8.86,9.71,0.75,1.50515
3,ciężki,trudny,4.86,7.29,0.285714,1.10721
4,szczęśliwy,radosny,8.14,8.86,0.25,1.02803
5,łatwy,męczący,0.43,6.43,0.181818,1.02803
...,...,...,...,...,...,...
995,dołączyć,zdobyć,0.43,2.29,0.166667,1.10721
996,wysyłać,uczestniczyć,0.00,0.86,0.2,1.20412
997,zbierać,uczestniczyć,0.00,0.71,0.2,1.32906
998,pochłonąć,wycofać,0.00,0.57,0.117647,0.961082


In [54]:
# Gather one score per SimLex row, in row order. Rows whose pair has no entry
# in metrics_dict receive the "N/A" placeholder in both lists.
wu_palmers, leacock_chodorows = [], []
for _, entry in simlex_data.iterrows():
    metrics = metrics_dict.get((entry['word1'], entry['word2']))
    if metrics is None:
        wu_palmers.append("N/A")
        leacock_chodorows.append("N/A")
        continue
    wu_palmers.append(metrics["wu_palmer"])
    leacock_chodorows.append(metrics["leacock_chodorow"])

In [55]:
# Attach the per-row score lists to the frame as two columns.
for column, values in (
    ("wu_palmer", wu_palmers),
    ("leacock_chodorow", leacock_chodorows),
):
    simlex_data[column] = values

In [56]:
# Normalise special values before export:
#   * an infinite Leacock-Chodorow score is replaced by 2.0
#     (NOTE(review): 2.0 is a magic cap — document where it comes from);
#   * a score of -1 in either column is replaced by the "N/A" placeholder.
leacock_capped = simlex_data["leacock_chodorow"].replace(np.inf, 2.0)
simlex_data["leacock_chodorow"] = leacock_capped.replace(-1, "N/A")

wu_palmer_column = simlex_data["wu_palmer"]
simlex_data["wu_palmer"] = wu_palmer_column.replace(-1, "N/A")

In [57]:
# Persist the frame, including the two similarity columns, next to the notebook.
simlex_data.to_csv(path_or_buf="results_wordnet.csv")
