## Imports & Settings

In [1]:
from time import time
import warnings
from collections import Counter
from pathlib import Path
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

from gensim.models import Word2Vec, KeyedVectors
from gensim.scripts.glove2word2vec import glove2word2vec

In [2]:
# Silence all warnings for the rest of the session (no category or module filter is given).
warnings.filterwarnings('ignore')

In [3]:
# Location of the analogy questions used by the evaluation further down.
analogies_path = Path('data') / 'analogies' / 'analogies-en.txt'

## Convert GloVe Vectors to gensim format

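The various GloVe vectors are available [here](https://nlp.stanford.edu/projects/glove/). Direct download link for the [Wikipedia](http://nlp.stanford.edu/data/glove.6B.zip) version; the Twitter and Common Crawl vectors used below are separate downloads from the same page. Unzip the files and store them in `data/glove`.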

### Wikipedia

In [4]:
# Directory holding the downloaded GloVe files and their converted counterparts.
glove_path = Path('data/glove')

# Wikipedia vectors: original GloVe text file and the gensim-readable output file.
glove_wiki_file = glove_path.joinpath('glove.6B.300d.txt')
word2vec_wiki_file = glove_path.joinpath('glove.wiki.gensim.txt')

In [None]:
# Convert the Wikipedia GloVe file into the word2vec text format gensim loads.
glove2word2vec(
    glove_input_file=glove_wiki_file,
    word2vec_output_file=word2vec_wiki_file,
)

### Twitter Data

In [18]:
# Twitter vectors: original GloVe text file and the gensim-readable output file.
glove_twitter_file = glove_path.joinpath('glove.twitter.27B.200d.txt')
word2vec_twitter_file = glove_path.joinpath('glove.twitter.gensim.txt')

In [19]:
# Convert the Twitter GloVe file into the word2vec text format gensim loads.
glove2word2vec(
    glove_input_file=glove_twitter_file,
    word2vec_output_file=word2vec_twitter_file,
)

(1193517, 200)

### Common Crawl

In [26]:
# Common Crawl vectors: original GloVe text file and the gensim-readable output file.
glove_crawl_file = glove_path.joinpath('glove.840B.300d.txt')
word2vec_crawl_file = glove_path.joinpath('glove.crawl.gensim.txt')

In [27]:
# Convert the Common Crawl GloVe file into the word2vec text format gensim loads.
glove2word2vec(
    glove_input_file=glove_crawl_file,
    word2vec_output_file=word2vec_crawl_file,
)

(2196018, 300)

## Evaluate embeddings

In [37]:
def eval_analogies(file_name, vocab=30000):
    """Score a word2vec-format embedding file on the analogy test set.

    Loads the vectors stored at `file_name`, evaluates them against the
    questions in the notebook-level `analogies_path`, and summarises the
    outcome for each section that gensim reports.

    Args:
        file_name: Location (str or pathlib.Path) of a text-format
            (non-binary) word2vec vector file.
        vocab: Forwarded to gensim as `restrict_vocab`.

    Returns:
        pandas.DataFrame with one row per reported section and the columns
        `category`, `samples` (correct + incorrect answers) and `average`
        (share of correct answers; NaN when a section has no samples).
    """
    # str() keeps gensim releases working whose file openers predate pathlib support.
    vectors = KeyedVectors.load_word2vec_format(str(file_name), binary=False)

    # load_word2vec_format already returns KeyedVectors, so no `.wv` hop is needed.
    # `.wv` on KeyedVectors and `accuracy()` are deprecated in gensim 3.x and no
    # longer available in 4.x; prefer the replacement and only fall back to
    # `accuracy()` on releases that lack it.
    if hasattr(vectors, 'evaluate_word_analogies'):
        _, sections = vectors.evaluate_word_analogies(str(analogies_path),
                                                      restrict_vocab=vocab,
                                                      case_insensitive=True)
    else:
        sections = vectors.accuracy(str(analogies_path),
                                    restrict_vocab=vocab,
                                    case_insensitive=True)

    counts = pd.DataFrame([[section['section'],
                            len(section['correct']),
                            len(section['incorrect'])] for section in sections],
                          columns=['category', 'correct', 'incorrect'])
    return (counts
            .assign(samples=lambda x: x.correct.add(x.incorrect))
            .assign(average=lambda x: x.correct.div(x.samples))
            .drop(['correct', 'incorrect'], axis=1))

In [40]:
# Evaluate the converted Twitter vectors on the analogy set.
result = eval_analogies(file_name=word2vec_twitter_file, vocab=100000)

### Twitter result

In [41]:
# Show the accuracy table currently bound to `result` (the Twitter evaluation assigned in the previous cell).
result

Unnamed: 0,category,samples,average
0,capital-common-countries,462,0.701299
1,capital-world,930,0.690323
2,city-in-state,3644,0.350714
3,currency,268,0.018657
4,family,342,0.824561
5,gram1-adjective-to-adverb,650,0.143077
6,gram2-opposite,342,0.365497
7,gram3-comparative,1260,0.757937
8,gram4-superlative,930,0.686022
9,gram5-present-participle,702,0.750712


### Wikipedia result

In [39]:
# NOTE(review): no visible cell evaluates the wiki vectors, and this cell's execution
# counter (39) precedes the Twitter evaluation (40). `result` presumably came from an
# earlier eval_analogies(word2vec_wiki_file, ...) run - confirm and add that call explicitly.
result

Unnamed: 0,category,samples,average
0,capital-common-countries,506,0.948617
1,capital-world,8372,0.964644
2,city-in-state,4242,0.599953
3,currency,752,0.174202
4,family,506,0.881423
5,gram1-adjective-to-adverb,992,0.225806
6,gram2-opposite,756,0.285714
7,gram3-comparative,1332,0.882132
8,gram4-superlative,1056,0.746212
9,gram5-present-participle,1056,0.699811


### Common Crawl result

In [33]:
# NOTE(review): no visible cell evaluates the Common Crawl vectors. `result` presumably
# came from an earlier eval_analogies(word2vec_crawl_file, ...) run (execution counter 33) -
# confirm and add that call explicitly so the notebook survives Restart & Run All.
result

Unnamed: 0,category,samples,average
0,capital-common-countries,506,0.94664
1,capital-world,4290,0.917483
2,city-in-state,4242,0.706742
3,currency,206,0.184466
4,family,420,0.978571
5,gram1-adjective-to-adverb,992,0.388105
6,gram2-opposite,702,0.363248
7,gram3-comparative,1332,0.876877
8,gram4-superlative,1122,0.919786
9,gram5-present-participle,1056,0.827652


In [16]:
# NOTE(review): the saved output of this cell has `correct`/`incorrect` columns, which the
# eval_analogies defined in this notebook drops. It was presumably produced by an earlier
# version of the function - confirm which embedding it refers to, or remove the cell.
result

Unnamed: 0,category,correct,incorrect,average
0,capital-common-countries,482,24,0.952569
1,capital-world,6093,227,0.964082
2,city-in-state,2472,1646,0.600291
3,currency,112,390,0.223108
4,family,392,28,0.933333
5,gram1-adjective-to-adverb,228,764,0.229839
6,gram2-opposite,205,497,0.292023
7,gram3-comparative,1175,157,0.882132
8,gram4-superlative,737,193,0.792473
9,gram5-present-participle,686,306,0.691532


In [17]:
# Persist the accuracy table alongside the GloVe files, without the index column.
result.to_csv(glove_path.joinpath('accuracy.csv'), index=False)