In [1]:
import sys
import time

import pandas as pd
from tqdm import tqdm

from wn import WordNet

from pywsd.utils import lemmatize
from pywsd.lesk import synset_signatures

# WordNet interface from the `wn` package; the next cell iterates over
# every synset it exposes via `wn.all_synsets()`.
wn = WordNet()

Warming up PyWSD (takes ~10 secs)... took 7.674057960510254 secs.


In [2]:
# Pre-processing switches passed unchanged to every signature variant below.
cleanup_opts = dict(remove_stopwords=True, to_lemmatize=True,
                    remove_numbers=True, lowercase=True)

all_signatures = []

start = time.time()
for synset in tqdm(wn.all_synsets()):
    # Identifier: the offset left-padded with zeros to 8 characters,
    # a dash, then the part-of-speech tag (e.g. "00001740-a").
    offset_pos = '-'.join((str(synset.offset()).zfill(8), synset.pos()))

    # Three flavours of signature for the same synset, computed in the
    # order simple -> adapted -> original.
    simple = synset_signatures(synset, hyperhypo=True, adapted=False,
                               **cleanup_opts)
    adapted = synset_signatures(synset, hyperhypo=True, adapted=True,
                                **cleanup_opts)
    original = synset_signatures(synset, original_lesk=True,
                                 **cleanup_opts)

    # One flat record per synset; these become DataFrame rows later on.
    record = {
        'name': synset.name(),
        'offset-pos': offset_pos,
        'original': original,
        'simple': simple,
        'adapted': adapted,
    }
    all_signatures.append(record)

# Report wall-clock time on stderr so it stays out of the cell's stdout.
elapsed = time.time() - start
print('took {}'.format(elapsed), file=sys.stderr)

106966it [00:17, 5966.92it/s]
took 18.12508487701416


In [3]:
# Turn the list of per-synset records into a table: one row per synset,
# one column per record key.
df = pd.DataFrame(data=all_signatures)

# Preview the first five rows.
df.head(n=5)

Unnamed: 0,adapted,name,offset-pos,original,simple
0,"{follow, buy, grant, skill, something, get, kn...",able.a.01,00001740-a,"{do, ), skill, something, to, know-how, (, usu...","{follow, buy, grant, skill, something, get, kn..."
1,"{fund, follow, skill, get, know-how, usually, ...",unable.a.01,00002098-a,"{), not, skill, to, know-how, (, usually, or, ...","{fund, follow, skill, get, know-how, usually, ..."
2,"{underside, stem, face, organism, away, axis, ...",abaxial.a.01,00002312-a,"{organism, away, axis, organ, or, from, facing...","{underside, stem, face, organism, away, axis, ..."
3,"{upper, face, adaxial, organism, organ, axis, ...",adaxial.a.01,00002527-a,"{to, organism, organ, axis, or, facing, toward...","{upper, face, adaxial, organism, organ, axis, ..."
4,"{face, acroscopic, toward, side, apex}",acroscopic.a.01,00002730-a,"{on, or, facing, toward, side, apex, the}","{face, acroscopic, toward, side, apex}"


In [4]:
# Key the table by synset name, then flip it so that every synset name is a
# column and the remaining fields (adapted / offset-pos / original / simple)
# are the rows.
pywsd_signatures = df.set_index(keys='name').transpose()

# Preview of the transposed table.
pywsd_signatures.head(n=5)

name,able.a.01,unable.a.01,abaxial.a.01,adaxial.a.01,acroscopic.a.01,basiscopic.a.01,abducent.a.01,adducent.a.01,nascent.a.01,dying.a.01,...,overcast.v.01,overcloud.v.01,clear_up.v.04,blight.v.01,swamp.v.01,run_dry.v.01,fog_up.v.01,char.v.01,haze.v.01,deflagrate.v.01
adapted,"{follow, buy, grant, skill, something, get, kn...","{fund, follow, skill, get, know-how, usually, ...","{underside, stem, face, organism, away, axis, ...","{upper, face, adaxial, organism, organ, axis, ...","{face, acroscopic, toward, side, apex}","{face, toward, base, side, basiscopic}","{adjacent, draw, part, abduct, away, especiall...","{adjacent, draw, adduct, part, especially, tog...","{parturient, born, emergent, emerge, begin, in...","{civilization, process, life, man, die, wish, ...",...,"{beach, cloud, darken, fall, weather, overcast...","{sky, cloud, darken, overcloud, cloud_over, cl...","{sky, clear, light_up, clear_up, storm, bright...","{blight, much, plague, smite, may, suffer, aff...","{flood, boat, drench, harbor, tsunami, every, ...","{summer, empty, water, run, run_dry, river, dr...","{windshield, cloud, get, foggy, overcast, fog,...","{everything, forest, burn, combust, coal, dren...","{cloud, overcast, cloudy, haze, hazy, dull, be...","{great, must, burn, exercise, combust, deflagr..."
offset-pos,00001740-a,00002098-a,00002312-a,00002527-a,00002730-a,00002843-a,00002956-a,00003131-a,00003356-a,00003939-a,...,02770717-v,02771020-v,02771169-v,02771320-v,02771564-v,02771756-v,02771888-v,02771997-v,02772202-v,02772310-v
original,"{do, ), skill, something, to, know-how, (, usu...","{), not, skill, to, know-how, (, usually, or, ...","{organism, away, axis, organ, or, from, facing...","{to, organism, organ, axis, or, facing, toward...","{on, or, facing, toward, side, apex, the}","{on, or, facing, toward, base, side, the}","{adjacent, ;, part, especially, away, from, or...","{adjacent, drawing, ;, part, especially, or, m...","{being, beginning, or, born}","{be, ceasing, process, life, in, to, from, pas...",...,"{make, overcast, cloudy, or}","{with, covered, clouds, become}","{clear, become}","{blight, to, suffer, cause, a}","{submerged, be, or, drench, drenched, submerge}","{water, become, empty, of}","{get, foggy}","{charcoal, to, burn}","{cloudy, or, ,, hazy, dull, become}","{great, burn, to, and, intensity, cause, with,..."
simple,"{follow, buy, grant, skill, something, get, kn...","{fund, follow, skill, get, know-how, usually, ...","{underside, stem, face, organism, away, axis, ...","{upper, face, adaxial, organism, organ, axis, ...","{face, acroscopic, toward, side, apex}","{face, toward, base, side, basiscopic}","{adjacent, draw, part, abduct, away, especiall...","{adjacent, draw, adduct, part, especially, tog...","{born, begin, insurgency, nascent, chick}","{civilization, process, life, man, die, wish, ...",...,"{beach, cloud, darken, fall, weather, overcast...","{sky, cloud, darken, overcloud, cloud_over, cl...","{sky, clear, light_up, clear_up, storm, bright...","{blight, much, plague, smite, may, suffer, aff...","{flood, boat, drench, harbor, tsunami, every, ...","{summer, empty, water, run, run_dry, river, dr...","{windshield, cloud, get, foggy, overcast, fog,...","{everything, forest, burn, combust, coal, dren...","{cloud, overcast, cloudy, haze, hazy, dull, be...","{great, must, burn, exercise, combust, deflagr..."


In [5]:
# Persist the transposed signature table (one column per synset name) to disk.
# NOTE(review): protocol=2 selects an older pickle format — presumably so the
# file can also be loaded from Python 2; confirm before changing it.
pywsd_signatures.to_pickle('signatures-wordnet-3.0.pkl',protocol=2)