# lexsub:

## Retrofitting Script

In [None]:
import math
import numpy
import re
import sys
from pymagnitude import *
from copy import deepcopy


def print_output(word_vecs, outFile):
    """Write word vectors to a text file, one word per line.

    Each output line is the word followed by every component of its vector,
    with a single space before each component and a trailing newline.

    Args:
        word_vecs: Mapping from word (str) to an iterable of vector components.
        outFile: Path of the file to write; it is opened in 'w' mode, so any
            existing content is replaced.
    """
    # Iterate the mapping passed as an argument (not a module-level variable)
    # so the function works for any caller. The context manager closes the
    # file even if a write fails part-way through.
    with open(outFile, 'w') as w_file:
        for word, vector in word_vecs.items():
            w_file.write(word)
            for component in vector:
                w_file.write(' ' + str(component))
            w_file.write('\n')

def read_wordvecs(filename):
    """Load all entries of a Magnitude embedding file into a plain dict.

    The Magnitude object is iterated directly; element 0 of each item is used
    as the key and element 1 as the value (presumably the word and its
    vector -- confirm against the pymagnitude documentation).

    Args:
        filename: Path handed to ``Magnitude``.

    Returns:
        dict mapping item[0] -> item[1] for every item yielded by the
        Magnitude object.
    """
    embeddings = Magnitude(filename)
    return {entry[0]: entry[1] for entry in embeddings}

def lex_dict(filename):
    """Read a lexicon file into a dict of word -> list of related words.

    Every line is lower-cased and split on whitespace; the first token is the
    headword and the remaining tokens are its neighbours. If a headword occurs
    on several lines, the last occurrence wins.

    Args:
        filename: Path of the lexicon text file.

    Returns:
        dict mapping each headword to the list of tokens that followed it on
        its line (possibly empty).
    """
    lexicon = {}
    # The context manager guarantees the file is closed once reading finishes.
    with open(filename, 'r') as lexicon_file:
        for line in lexicon_file:
            word_list = line.lower().strip().split()
            if not word_list:
                # Blank or whitespace-only line: there is no headword to index.
                continue
            lexicon[word_list[0]] = word_list[1:]
    return lexicon


def retrofit(wordvecs, lexicon, iter):
    """Pull word vectors towards the vectors of their lexicon neighbours.

    For ``iter`` passes, every word that is in both ``wordvecs`` and
    ``lexicon`` and has at least one neighbour in the vocabulary is replaced by

        (n * original_vector + sum(current neighbour vectors)) / (2 * n)

    where ``n`` is the number of in-vocabulary neighbours. Updates are written
    into the working copy immediately, so words processed later in a pass see
    the values updated earlier in that same pass.

    Args:
        wordvecs: Mapping word -> vector. Vectors must support multiplication
            by an int, addition and division by an int (e.g. numpy arrays).
            The mapping is deep-copied and not modified.
        lexicon: Mapping word -> iterable of neighbour words.
        iter: Number of passes over the vocabulary.

    Returns:
        A new dict with the same keys as ``wordvecs`` holding the retrofitted
        vectors; words without usable neighbours keep their copied vector.
    """
    new_vecs = deepcopy(wordvecs)
    vocab = set(new_vecs.keys())

    # The neighbour sets depend only on the inputs, never on the evolving
    # vectors, so they are computed once instead of on every pass.
    neighbourhoods = []
    for word in vocab.intersection(set(lexicon.keys())):
        w_neighbours = set(lexicon[word]).intersection(vocab)
        if w_neighbours:
            neighbourhoods.append((word, w_neighbours))

    for _ in range(iter):
        for word, w_neighbours in neighbourhoods:
            n_neighbours = len(w_neighbours)
            # Anchor on the ORIGINAL vector, weighted by the neighbour count.
            new_vector = n_neighbours * wordvecs[word]
            for pword in w_neighbours:
                # Neighbours contribute their CURRENT (possibly updated) vectors.
                new_vector = new_vector + new_vecs[pword]
            new_vecs[word] = new_vector / (2 * n_neighbours)
    return new_vecs


    
# Load the GloVe vectors and the WordNet synonym lexicon, retrofit for 10
# passes, and write the resulting vectors out as plain text.
wordvecs = read_wordvecs('data/glove.6B.100d.magnitude')
lexicon = lex_dict('data/lexicons/wordnet-synonyms.txt')
retrofitted = retrofit(wordvecs, lexicon, 10)
print_output(retrofitted, 'data/glove.6B.100d.retrofit.txt')


In [2]:
from lexsub import *
import os

## Run the solution on dev (with retrofitted embeddings)

In [5]:
# Build the substituter from the retrofitted embeddings.
lexsub = LexSub(os.path.join('../data','glove.6B.100d.retrofit.magnitude'))

# Each dev line holds two tab-separated fields: an integer (presumably the
# target word's position) and the sentence text, which is split on whitespace.
dev_path = os.path.join('../data','input','dev.txt')
output = []
with open(dev_path) as dev_file:
    for row in dev_file:
        columns = row.strip().split('\t')
        target = int(columns[0].strip())
        tokens = columns[1].strip().split()
        output.append(" ".join(lexsub.substitutes(target, tokens)))

# Show the first ten prediction lines.
print("\n".join(output[:10]))

english edge line position place front back while way point
english edge line position place front back while way point
english edge line position place front back while way point
english edge line position place front back while way point
english edge line position place front back while way point
english edge line position place front back while way point
english edge line position place front back while way point
english edge line position place front back while way point
english edge line position place front back while way point
english edge line position place front back while way point


## Evaluate the output

In [10]:
from lexsub_check import precision
# Read the reference answers, one stripped entry per line.
reference_path = os.path.join('../data','reference','dev.out')
with open(reference_path, 'rt') as refh:
    ref_data = [str(entry).strip() for entry in refh.read().splitlines()]

# Report the precision of the predictions, scaled by 100.
score = 100*precision(ref_data, output)
print("Score={:.2f}".format(score))

Score=46.56


## Documentation

Write some beautiful documentation of your program here.

## Analysis

Do some analysis of the results. What ideas did you try? What worked and what did not?

#### Lexicon file difference in output
Comparing all the different lexicon files.

In [19]:
# Build the substituter from the PPDB-retrofitted embeddings.
lexsub = LexSub(os.path.join('data','glove.6B.100d.retrofit-ppdb.magnitude'))

# Each dev line holds two tab-separated fields: an integer (presumably the
# target word's position) and the sentence text, which is split on whitespace.
dev_path = os.path.join('data','input','dev.txt')
output = []
with open(dev_path) as dev_file:
    for row in dev_file:
        columns = row.strip().split('\t')
        target = int(columns[0].strip())
        tokens = columns[1].strip().split()
        output.append(" ".join(lexsub.substitutes(target, tokens)))

# Show the first ten prediction lines.
print("\n".join(output[:10]))

place both back bottom away sides onto front edge line
both place along away back sides onto line bottom front
both place along away back line sides onto edge front
both place away back along onto bottom line sides front
both place along away back onto sides line bottom edge
back both onto away edge line place bottom front along
both place along back sides away line onto bottom front
both place along away back onto sides line bottom edge
both place along away back onto sides line bottom edge
along both place away line onto sides edge back front


In [20]:
from lexsub_check import precision
# Read the reference answers, one stripped entry per line.
reference_path = os.path.join('data','reference','dev.out')
with open(reference_path, 'rt') as refh:
    ref_data = [str(entry).strip() for entry in refh.read().splitlines()]

# Report the precision of the predictions, scaled by 100.
score = 100*precision(ref_data, output)
print("Score={:.2f}".format(score))

Score=44.69


In [21]:
# Build the substituter from the WordNet-retrofitted embeddings.
lexsub = LexSub(os.path.join('data','glove.6B.100d.retrofit-wordnet.magnitude'))

# Each dev line holds two tab-separated fields: an integer (presumably the
# target word's position) and the sentence text, which is split on whitespace.
dev_path = os.path.join('data','input','dev.txt')
output = []
with open(dev_path) as dev_file:
    for row in dev_file:
        columns = row.strip().split('\t')
        target = int(columns[0].strip())
        tokens = columns[1].strip().split()
        output.append(" ".join(lexsub.substitutes(target, tokens)))

# Show the first ten prediction lines.
print("\n".join(output[:10]))

english place while back point edge way line position along
way place while position back english along point line front
english while way point place along line position edge back
way while place back english line position point along front
while place along way english point line position back front
while way back edge line point place english position along
place way while english position line point along back front
while place along way english point line position back front
while place along way english point line position back front
along edge english line point way place while back position


In [22]:
from lexsub_check import precision
# Read the reference answers, one stripped entry per line.
reference_path = os.path.join('data','reference','dev.out')
with open(reference_path, 'rt') as refh:
    ref_data = [str(entry).strip() for entry in refh.read().splitlines()]

# Report the precision of the predictions, scaled by 100.
score = 100*precision(ref_data, output)
print("Score={:.2f}".format(score))

Score=45.51


In [23]:
# Build the substituter from the WordNet+-retrofitted embeddings.
lexsub = LexSub(os.path.join('data','glove.6B.100d.retrofit-wordnet+.magnitude'))

# Each dev line holds two tab-separated fields: an integer (presumably the
# target word's position) and the sentence text, which is split on whitespace.
dev_path = os.path.join('data','input','dev.txt')
output = []
with open(dev_path) as dev_file:
    for row in dev_file:
        columns = row.strip().split('\t')
        target = int(columns[0].strip())
        tokens = columns[1].strip().split()
        output.append(" ".join(lexsub.substitutes(target, tokens)))

# Show the first ten prediction lines.
print("\n".join(output[:10]))

then back place bottom left way corner edge line front
way then back place along line front bottom left edge
way then along place line edge back left front corner
way then back place line bottom front along left edge
along then way place left back line front edge bottom
then way back edge left line front along place bottom
then place way line back along front left bottom edge
along then way place left back line front edge bottom
along then way place left back line front edge bottom
along line edge way then left place back front corner


In [24]:
from lexsub_check import precision
# Read the reference answers, one stripped entry per line.
reference_path = os.path.join('data','reference','dev.out')
with open(reference_path, 'rt') as refh:
    ref_data = [str(entry).strip() for entry in refh.read().splitlines()]

# Report the precision of the predictions, scaled by 100.
score = 100*precision(ref_data, output)
print("Score={:.2f}".format(score))

Score=40.22


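Combining the two best lexicon files (PPDB and WordNet) produced a lower score (44.63) than using either lexicon on its own (44.69 and 45.51 respectively).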

In [27]:
# Build the substituter from the embeddings retrofitted with PPDB and WordNet.
lexsub = LexSub(os.path.join('data','glove.6B.100d.retrofit-ppdb-wordnet.magnitude'))

# Each dev line holds two tab-separated fields: an integer (presumably the
# target word's position) and the sentence text, which is split on whitespace.
dev_path = os.path.join('data','input','dev.txt')
output = []
with open(dev_path) as dev_file:
    for row in dev_file:
        columns = row.strip().split('\t')
        target = int(columns[0].strip())
        tokens = columns[1].strip().split()
        output.append(" ".join(lexsub.substitutes(target, tokens)))

# Show the first ten prediction lines.
print("\n".join(output[:10]))

place both back bottom away sides onto front edge line
both place along away back sides onto line bottom front
both place along away back line sides onto edge front
both place away back along onto bottom line sides front
both place along away back onto sides line bottom edge
back both onto away edge line place bottom front along
both place along back sides away line onto bottom front
both place along away back onto sides line bottom edge
both place along away back onto sides line bottom edge
along both place away line onto sides edge back front


In [28]:
from lexsub_check import precision
# Read the reference answers, one stripped entry per line.
reference_path = os.path.join('data','reference','dev.out')
with open(reference_path, 'rt') as refh:
    ref_data = [str(entry).strip() for entry in refh.read().splitlines()]

# Report the precision of the predictions, scaled by 100.
score = 100*precision(ref_data, output)
print("Score={:.2f}".format(score))

Score=44.63


## Attempts to incorporate context words


For the following tests, the ppdb-xl lexicon and the multiplicative combination measure balMult (from the paper mentioned) are used:
<ol>

<li> Candidates are the 20 most similar words to the target; baseline (10 candidates): 46.4474%</li>
</br>
<li>Consider a set number of words around the target (varied from 1 to the full sentence) as context: ~30%</li>
</br>
<li>Consider all words in the sentence and select those above a threshold as context.
As the threshold approaches 1, performance reaches the baseline; otherwise it is lower than the baseline.
</li>
</br>
<li>Consider a set number of words around the target (varied from 1 to the full sentence) as context, score the candidates with each context word separately, and keep the best score per word: 39.45%</li>

</ol>

### Attempts using multiple lexicons 
For the following tests, the ppdb-xl lexicon is used as the aware (retrofitted) lexicon, the original lexicon as the basic lexicon, and balMult is used. Candidates are the 20 most similar words to the target.

<ol>
<li>
Candidates: aware </br>
balMult: aware </br>
score: 37%
</li>
</br>
<li>
Candidates: aware </br> 
balMult: basic </br>
score: 32% 
</li>
</br>
<li>
Candidates: basic </br>
balMult: aware </br>
score: 21% 
</li>
</br>
</br>
<li>
Candidates: aware </br>
balMult: (context: aware, target:basic) </br>
score: 34% 
</li>
</br>
<li>
Candidates: aware </br>
balMult: (context: basic, target:aware) </br>
score: 39% 