# Word vectors from SEC filings using Gensim: word2vec model 

## Data

<div style='direction:rtl; font-family: "B Nazanin"; font-size: 20px;'> 
در این نوت بوک، از داده‌های پیش‌پردازش شده در نوت بوک شماره ۶ (06_sec_preprocessing_julia.ipynb) استفاده شده است.
بنابراین نیاز است که ابتدا آن نوت بوک اجرا شود.

## Imports & Settings

In [2]:
using Pkg
using PyCall
using Conda

In [3]:
#Pkg.add("Glob")
#Pkg.add("TextAnalysis")
#Pkg.add("DataFrames")
#Pkg.add("Plots")
#Pkg.add("CSV")
#Pkg.add("Seaborn")
#Pkg.add("ScikitLearn")
#Pkg.add("Embeddings")
#Pkg.add("JSON")
#Pkg.add("StatsBase")

In [4]:
using Glob
using DataFrames
using Plots
using CSV
using StatsBase

In [5]:
#Conda.add("gensim")
@pyimport gensim

In [6]:
Word2Vec = gensim.models.Word2Vec
KeyedVectors = gensim.models.KeyedVectors
LineSentence = gensim.models.word2vec.LineSentence
Phrases = gensim.models.phrases.Phrases
Phraser = gensim.models.phrases.Phraser

PyObject <class 'gensim.models.phrases.FrozenPhrases'>

In [7]:
using ScikitLearn
using ScikitLearn: @sk_import
@sk_import decomposition: IncrementalPCA

PyObject <class 'sklearn.decomposition._incremental_pca.IncrementalPCA'>

In [8]:
function format_time(t)
    m = t ÷ 60
    s = t % 60
    h = m ÷ 60
    m = m % 60
    h = length("$h") == 2 ? h : "0$h"
    m = length("$m") == 2 ? m : "0$m"
    s = length("$s") == 2 ? s : "0$s"
    return "$h:$m:$s"
end

format_time (generic function with 1 method)

### Paths

In [9]:
sec_path = joinpath("..", "data", "sec-filings")
ngram_path = joinpath(sec_path, "ngrams")

"..\\data\\sec-filings\\ngrams"

In [10]:
results_path = joinpath("results", "sec-filings")

model_path = joinpath(results_path, "models")

if !(isdir(model_path))
    mkpath(model_path)
end

log_path = joinpath(results_path, "logs")

if !(isdir(log_path))
    mkpath(log_path)
end

## word2vec

In [11]:
analogies_path = joinpath("data", "analogies-en.txt")

"data\\analogies-en.txt"

### Set up Sentence Generator

In [12]:
NGRAMS = 2

2

To facilitate memory-efficient text ingestion, the LineSentence class creates a generator from individual sentences contained in the provided text file:

In [13]:
sentence_path = joinpath(ngram_path, "ngrams_$(NGRAMS).txt")
sentences = LineSentence(sentence_path)

PyObject <gensim.models.word2vec.LineSentence object at 0x000000005F213F40>

### Train word2vec Model

The [gensim.models.word2vec](https://radimrehurek.com/gensim/models/word2vec.html) class implements the skipgram and CBOW architectures.

In [14]:
start = time()
model = Word2Vec(sentences,
                 sg=1,          # 1 for skip-gram; otherwise CBOW
                 hs=0,          # hierarchical softmax if 1, negative sampling if 0
                 vector_size=300,      # Vector dimensionality
                 window=5,      # Max distance betw. current and predicted word
                 min_count=50,  # Ignore words with lower frequency
                 negative=15,    # noise word count for negative sampling
                 workers=4,     # no threads 
                 epochs=1,        # no epochs = iterations over corpus
                 alpha=0.05,   # initial learning rate
                 min_alpha=0.0001 # final learning rate
                ) 
println("Duration: $(format_time(floor(Int, time() - start)))")

Duration: 00:00:28


### Persist model & vectors

In [15]:
model.save(normpath(joinpath(model_path, "word2vec_0.model")))
model.wv.save(normpath(joinpath(model_path, "word_vectors_0.bin")))

### Load model and vectors

In [16]:
model = Word2Vec.load(normpath(joinpath(model_path, "word2vec_0.model")))

PyObject <gensim.models.word2vec.Word2Vec object at 0x00000000889BD730>

In [17]:
wv = KeyedVectors.load(normpath(joinpath(model_path, "word_vectors_0.bin")))

PyObject <gensim.models.keyedvectors.KeyedVectors object at 0x00000000889BDBE0>

### Get vocabulary

In [18]:
vocab = Any[]
for k ∈ model.wv.index_to_key
    v_index = model.wv.key_to_index[k]
    v_count = model.wv.get_vecattr(k, "count")
    push!(vocab, [k, v_index, v_count])
end

In [19]:
vocab = sort!(DataFrame(token=[v[1] for v in vocab], 
                        idx=[v[2] for v in vocab], 
                        count=[v[3] for v in vocab]), :count, rev=true)

Unnamed: 0_level_0,token,idx,count
Unnamed: 0_level_1,String,Int64,PyObject
1,million,0,PyObject 34435
2,company,1,PyObject 28594
3,financial,2,PyObject 24424
4,business,3,PyObject 22824
5,products,4,PyObject 22240
6,s,5,PyObject 20488
7,operations,6,PyObject 17778
8,net,7,PyObject 16563
9,sales,8,PyObject 16511
10,market,9,PyObject 16322


In [20]:
first(vocab, 10)

Unnamed: 0_level_0,token,idx,count
Unnamed: 0_level_1,String,Int64,PyObject
1,million,0,PyObject 34435
2,company,1,PyObject 28594
3,financial,2,PyObject 24424
4,business,3,PyObject 22824
5,products,4,PyObject 22240
6,s,5,PyObject 20488
7,operations,6,PyObject 17778
8,net,7,PyObject 16563
9,sales,8,PyObject 16511
10,market,9,PyObject 16322


In [21]:
describe(vocab["count"])
#.describe(percentiles=np.arange(.1, 1, .1)).astype(int)

Summary Stats:
Length:         5805
Type:           PyObject
Number Unique:  1497


### Evaluate Analogies

In [22]:
function accuracy_by_category(acc; detail=true)
    results = [[c["section"], length(c["correct"]), length(c["incorrect"])] for c ∈ acc]
    results = DataFrame(category=[result[1] for result ∈ results], 
                        correct=[result[2] for result ∈ results], 
                        incorrect=[result[3] for result ∈ results])
                        
    results[!, "average"] = results.correct./sum.(eachrow(results[:, ["correct", "incorrect"]]))
    if detail
        println(sort(results, :average, rev=true))
    end
    return convert(Array, filter(:category => ==("Total accuracy"), results))[2:end]
end

accuracy_by_category (generic function with 1 method)

In [23]:
detailed_accuracy = model.wv.evaluate_word_analogies(normpath(analogies_path), case_insensitive=true)[2]

15-element Vector{Dict{Any, Any}}:
 Dict("incorrect" => Any[], "correct" => Any[], "section" => "capital-common-countries")
 Dict("incorrect" => Any[], "correct" => Any[], "section" => "capital-world")
 Dict("incorrect" => [("CHICAGO", "ILLINOIS", "HOUSTON", "TEXAS"), ("CHICAGO", "ILLINOIS", "DALLAS", "TEXAS"), ("CHICAGO", "ILLINOIS", "BOSTON", "MASSACHUSETTS"), ("CHICAGO", "ILLINOIS", "SEATTLE", "WASHINGTON"), ("CHICAGO", "ILLINOIS", "CINCINNATI", "OHIO"), ("HOUSTON", "TEXAS", "CHICAGO", "ILLINOIS"), ("HOUSTON", "TEXAS", "BOSTON", "MASSACHUSETTS"), ("HOUSTON", "TEXAS", "SEATTLE", "WASHINGTON"), ("HOUSTON", "TEXAS", "CINCINNATI", "OHIO"), ("DALLAS", "TEXAS", "CHICAGO", "ILLINOIS")  …  ("SEATTLE", "WASHINGTON", "ATLANTA", "GEORGIA"), ("SEATTLE", "WASHINGTON", "CINCINNATI", "OHIO"), ("ATLANTA", "GEORGIA", "HOUSTON", "TEXAS"), ("ATLANTA", "GEORGIA", "DALLAS", "TEXAS"), ("ATLANTA", "GEORGIA", "SEATTLE", "WASHINGTON"), ("ATLANTA", "GEORGIA", "CINCINNATI", "OHIO"), ("CINCINNATI", "OHIO", "HO

In [24]:
summary = accuracy_by_category(detailed_accuracy)

15×4 DataFrame
│ Row │ category                    │ correct │ incorrect │ average   │
│     │ [90mString[39m                      │ [90mInt64[39m   │ [90mInt64[39m     │ [90mFloat64[39m   │
├─────┼─────────────────────────────┼─────────┼───────────┼───────────┤
│ 1   │ capital-common-countries    │ 0       │ 0         │ NaN       │
│ 2   │ capital-world               │ 0       │ 0         │ NaN       │
│ 3   │ family                      │ 0       │ 0         │ NaN       │
│ 4   │ city-in-state               │ 9       │ 31        │ 0.225     │
│ 5   │ gram6-nationality-adjective │ 12      │ 78        │ 0.133333  │
│ 6   │ gram8-plural                │ 4       │ 26        │ 0.133333  │
│ 7   │ gram9-plural-verbs          │ 6       │ 66        │ 0.0833333 │
│ 8   │ Total accuracy              │ 39      │ 755       │ 0.0491184 │
│ 9   │ gram3-comparative           │ 7       │ 233       │ 0.0291667 │
│ 10  │ gram5-present-participle    │ 1       │ 71        │ 0.0138889 │
│ 11  │ c

3-element Vector{Any}:
  39
 755
   0.0491183879093199

In [25]:
function eval_analogies(w2v, max_vocab=15000)
    accuracy = w2v.wv.evaluate_word_analogies(analogies_path, restrict_vocab=15000, case_insensitive=true)[2]
    
    results = [[c["section"], length(c["correct"]), length(c["incorrect"])] for c ∈ accuracy]
    results = DataFrame(category=[result[1] for result ∈ results], 
                        correct=[result[2] for result ∈ results], 
                        incorrect=[result[3] for result ∈ results])
    
    results[!, "average"] = results.correct./sum.(eachrow(results[:, ["correct", "incorrect"]]))
    return results
end

eval_analogies (generic function with 2 methods)

In [26]:
function total_accuracy(w2v)
    df = eval_analogies(w2v)
    return convert(Array, filter(:category => ==("Total accuracy"), results))[2:end]
end

total_accuracy (generic function with 1 method)

In [27]:
accuracy = eval_analogies(model)
accuracy

Unnamed: 0_level_0,category,correct,incorrect,average
Unnamed: 0_level_1,String,Int64,Int64,Float64
1,capital-common-countries,0,0,
2,capital-world,0,0,
3,city-in-state,9,31,0.225
4,currency,0,18,0.0
5,family,0,0,
6,gram1-adjective-to-adverb,0,72,0.0
7,gram2-opposite,0,30,0.0
8,gram3-comparative,7,233,0.0291667
9,gram4-superlative,0,20,0.0
10,gram5-present-participle,1,71,0.0138889


### Validate Vector Arithmetic

In [29]:
sims = model.wv.most_similar(positive=["phone"], restrict_vocab=15000)
sims_df = DataFrame(term=[pair[1] for pair ∈ sims], 
                        similarity=[pair[2] for pair ∈ sims])
println(sims_df)
#print(pd.DataFrame(sims, columns=["term", "similarity"]))

10×2 DataFrame
│ Row │ term        │ similarity │
│     │ [90mString[39m      │ [90mFloat64[39m    │
├─────┼─────────────┼────────────┤
│ 1   │ email       │ 0.837282   │
│ 2   │ portal      │ 0.836035   │
│ 3   │ voice       │ 0.802948   │
│ 4   │ touch       │ 0.791997   │
│ 5   │ communicate │ 0.78348    │
│ 6   │ transmit    │ 0.779755   │
│ 7   │ notebook    │ 0.77374    │
│ 8   │ windows     │ 0.768419   │
│ 9   │ viewing     │ 0.767883   │
│ 10  │ pcie        │ 0.761672   │


In [36]:
analogy = model.wv.most_similar(positive=["france", "london"], 
                                negative=["london"], 
                                restrict_vocab=15000)

analogy_df = DataFrame(term=[pair[1] for pair ∈ analogy], 
                        similarity=[pair[2] for pair ∈ analogy])
println(analogy_df)
#print(pd.DataFrame(analogy, columns=["term", "similarity"]))

10×2 DataFrame
│ Row │ term        │ similarity │
│     │ [90mString[39m      │ [90mFloat64[39m    │
├─────┼─────────────┼────────────┤
│ 1   │ switzerland │ 0.861892   │
│ 2   │ germany     │ 0.835466   │
│ 3   │ netherlands │ 0.82592    │
│ 4   │ zealand     │ 0.817587   │
│ 5   │ india       │ 0.816676   │
│ 6   │ thailand    │ 0.810973   │
│ 7   │ belgium     │ 0.810843   │
│ 8   │ kingdom     │ 0.808075   │
│ 9   │ australia   │ 0.798887   │
│ 10  │ sweden      │ 0.798025   │


### Check similarity for random words

In [38]:
VALID_SET = 5  # Random set of words to get nearest neighbors for
VALID_WINDOW = 100  # Most frequent words to draw validation set from
valid_examples = StatsBase.sample(1:VALID_WINDOW, VALID_SET, replace=false)
similars = DataFrame()

for id ∈ sort(valid_examples)
    word = vocab[id, "token"]
    similars[word] = [s[1] for s ∈ model.wv.most_similar(word)]
end
similars

Unnamed: 0_level_0,new,assets,value,development,significant
Unnamed: 0_level_1,String,String,String,String,String
1,existing,realizability,fair,research,substantial
2,enhancements,intangible,values,advancement,expend
3,innovations,liabilities,windset,biomaterials,considerable
4,introduce,recoverability,carrying,landec,predicted
5,hcf,existed,valuing,ideas,burden
6,biomaterials,asset,allocating,digitaloptics,perceptions
7,adapt,resolor,approximates,engineering,strain
8,eim,goodwill,implied,ew,unsuccessful
9,nfs,reversal,ars,garp,defiance
10,introductions,depreciated,difference,centennial,reputational


## Continue Training

In [39]:
accuracies = [summary]
best_accuracy = summary[end]
for i ∈ 1:14
    start = time()
    model.train(sentences, epochs=1, total_examples=model.corpus_count)
    detailed_accuracy = model.wv.evaluate_word_analogies(analogies_path)[2]
    push!(accuracies, accuracy_by_category(detailed_accuracy, detail=false))
    
    println("$(i) | Duration: $(format_time(time() - start)) | Accuracy: $(accuracies[end][end]) ")
    
    if accuracies[end][end] > best_accuracy
        model.save(normpath(joinpath(model_path, "word2vec_$(i).model")))
        model.wv.save(normpath(joinpath(model_path, "word_vectors_$(i).bin")))
        best_accuracy = accuracies[end][end]
    end
    
    accuracies_df = DataFrame(correct=[accuracy[1] for accuracy ∈ accuracies], 
                            wrong=[accuracy[2] for accuracy ∈ accuracies], 
                            average=[accuracy[3] for accuracy ∈ accuracies])
    CSV.write(joinpath(model_path, "accuracies.csv"), accuracies_df)
end
model.wv.save(normpath(joinpath(model_path, "word_vectors_final.bin")))

1 | Duration: 00.0:00.0:024.515000104904175 | Accuracy: 0.060453400503778336 
2 | Duration: 00.0:00.0:022.170000076293945 | Accuracy: 0.08060453400503778 
3 | Duration: 00.0:00.0:023.908999919891357 | Accuracy: 0.07934508816120907 
4 | Duration: 00.0:00.0:033.63499999046326 | Accuracy: 0.08438287153652393 
5 | Duration: 00.0:00.0:036.91000008583069 | Accuracy: 0.08438287153652393 
6 | Duration: 00.0:00.0:035.60199999809265 | Accuracy: 0.1070528967254408 
7 | Duration: 00.0:00.0:031.34999990463257 | Accuracy: 0.0818639798488665 
8 | Duration: 00.0:00.0:029.175000190734863 | Accuracy: 0.09445843828715365 
9 | Duration: 00.0:00.0:028.111000061035156 | Accuracy: 0.10075566750629723 
10 | Duration: 00.0:00.0:032.92199993133545 | Accuracy: 0.09193954659949623 
11 | Duration: 00.0:00.0:033.99000000953674 | Accuracy: 0.09949622166246852 
12 | Duration: 00.0:00.0:032.57200002670288 | Accuracy: 0.10957178841309824 
13 | Duration: 00.0:00.0:030.29100012779236 | Accuracy: 0.11209068010075567 
14 |

### Sample Output


|Epoch|Duration| Accuracy|
|---|---|---|
01 | 00:14:00 | 31.64% | 
02 | 00:14:21 | 31.72% | 
03 | 00:14:34 | 33.65% | 
04 | 00:16:11 | 34.03% | 
05 | 00:13:51 | 33.04% | 
06 | 00:13:46 | 33.28% | 
07 | 00:13:51 | 33.10% | 
08 | 00:13:54 | 34.11% | 
09 | 00:13:54 | 33.70% | 
10 | 00:13:55 | 34.09% | 
11 | 00:13:57 | 35.06% | 
12 | 00:13:38 | 33.79% | 
13 | 00:13:26 | 32.40% | 

In [40]:
accuracies_df = DataFrame(correct=[accuracy[1] for accuracy ∈ accuracies], 
                            wrong=[accuracy[2] for accuracy ∈ accuracies], 
                            average=[accuracy[3] for accuracy ∈ accuracies])

CSV.write(joinpath(results_path, "accuracies.csv"), accuracies_df)

"results\\sec-filings\\accuracies.csv"

In [43]:
best_model = Word2Vec.load(normpath(joinpath(model_path, "word2vec_12.model")))

PyObject <gensim.models.word2vec.Word2Vec object at 0x000000008849CF10>

In [44]:
detailed_accuracy = best_model.wv.evaluate_word_analogies(normpath(analogies_path), case_insensitive=true)[2]

15-element Vector{Dict{Any, Any}}:
 Dict("incorrect" => Any[], "correct" => Any[], "section" => "capital-common-countries")
 Dict("incorrect" => Any[], "correct" => Any[], "section" => "capital-world")
 Dict("incorrect" => [("CHICAGO", "ILLINOIS", "DALLAS", "TEXAS"), ("CHICAGO", "ILLINOIS", "SEATTLE", "WASHINGTON"), ("CHICAGO", "ILLINOIS", "CINCINNATI", "OHIO"), ("HOUSTON", "TEXAS", "SEATTLE", "WASHINGTON"), ("HOUSTON", "TEXAS", "ATLANTA", "GEORGIA"), ("HOUSTON", "TEXAS", "CINCINNATI", "OHIO"), ("DALLAS", "TEXAS", "SEATTLE", "WASHINGTON"), ("DALLAS", "TEXAS", "ATLANTA", "GEORGIA"), ("DALLAS", "TEXAS", "CINCINNATI", "OHIO"), ("BOSTON", "MASSACHUSETTS", "HOUSTON", "TEXAS")  …  ("BOSTON", "MASSACHUSETTS", "SEATTLE", "WASHINGTON"), ("BOSTON", "MASSACHUSETTS", "CINCINNATI", "OHIO"), ("SEATTLE", "WASHINGTON", "HOUSTON", "TEXAS"), ("SEATTLE", "WASHINGTON", "DALLAS", "TEXAS"), ("SEATTLE", "WASHINGTON", "ATLANTA", "GEORGIA"), ("ATLANTA", "GEORGIA", "DALLAS", "TEXAS"), ("ATLANTA", "GEORGIA", "SE

In [45]:
summary = accuracy_by_category(detailed_accuracy)
println("\nBase Accuracy: Correct $(summary[1]) | Wrong $(summary[2]) | Avg $(summary[3])")

15×4 DataFrame
│ Row │ category                    │ correct │ incorrect │ average   │
│     │ [90mString[39m                      │ [90mInt64[39m   │ [90mInt64[39m     │ [90mFloat64[39m   │
├─────┼─────────────────────────────┼─────────┼───────────┼───────────┤
│ 1   │ capital-common-countries    │ 0       │ 0         │ NaN       │
│ 2   │ capital-world               │ 0       │ 0         │ NaN       │
│ 3   │ family                      │ 0       │ 0         │ NaN       │
│ 4   │ city-in-state               │ 19      │ 21        │ 0.475     │
│ 5   │ gram6-nationality-adjective │ 27      │ 63        │ 0.3       │
│ 6   │ Total accuracy              │ 87      │ 707       │ 0.109572  │
│ 7   │ gram8-plural                │ 3       │ 27        │ 0.1       │
│ 8   │ gram9-plural-verbs          │ 7       │ 65        │ 0.0972222 │
│ 9   │ gram3-comparative           │ 23      │ 217       │ 0.0958333 │
│ 10  │ gram7-past-tense            │ 5       │ 105       │ 0.0454545 │
│ 11  │ g

In [46]:
cat_dict = Dict("capital-common-countries" => "Capitals",
            "capital-world" => "Capitals RoW",
            "city-in-state" => "City-State",
            "currency" => "Currency",
            "family" => "Famliy",
            "gram1-adjective-to-adverb" => "Adj-Adverb",
            "gram2-opposite" => "Opposite",
            "gram3-comparative" => "Comparative",
            "gram4-superlative" => "Superlative",
            "gram5-present-participle" => "Pres. Part.",
            "gram6-nationality-adjective" => "Nationality",
            "gram7-past-tense" => "Past Tense",
            "gram8-plural" => "Plural",
            "gram9-plural-verbs" => "Plural Verbs",
            "total" => "Total")

Dict{String, String} with 15 entries:
  "capital-common-countries"    => "Capitals"
  "gram5-present-participle"    => "Pres. Part."
  "family"                      => "Famliy"
  "gram7-past-tense"            => "Past Tense"
  "gram8-plural"                => "Plural"
  "gram2-opposite"              => "Opposite"
  "total"                       => "Total"
  "city-in-state"               => "City-State"
  "gram3-comparative"           => "Comparative"
  "gram1-adjective-to-adverb"   => "Adj-Adverb"
  "gram4-superlative"           => "Superlative"
  "gram6-nationality-adjective" => "Nationality"
  "currency"                    => "Currency"
  "capital-world"               => "Capitals RoW"
  "gram9-plural-verbs"          => "Plural Verbs"

In [47]:
results = [[c["section"], length(c["correct"]), length(c["incorrect"])] for c ∈ detailed_accuracy]
results = DataFrame(category=[result[1] for result ∈ results], 
                    correct=[result[2] for result ∈ results], 
                    incorrect=[result[3] for result ∈ results])

for item ∈ keys(cat_dict)
    replace!(results[!, "category"], item => cat_dict[item])
end

results[!, "average"] = results.correct./sum.(eachrow(results[:, ["correct", "incorrect"]]))
rename!(results, uppercasefirst.(names(results)))
total = filter(row -> row.Category == "Total accuracy", results)
results = filter(row -> row.Category != "Total accuracy", results)
println(results)
println("\n Total: $(total)")

14×4 DataFrame
│ Row │ Category     │ Correct │ Incorrect │ Average   │
│     │ [90mString[39m       │ [90mInt64[39m   │ [90mInt64[39m     │ [90mFloat64[39m   │
├─────┼──────────────┼─────────┼───────────┼───────────┤
│ 1   │ Capitals     │ 0       │ 0         │ NaN       │
│ 2   │ Capitals RoW │ 0       │ 0         │ NaN       │
│ 3   │ City-State   │ 19      │ 21        │ 0.475     │
│ 4   │ Currency     │ 0       │ 18        │ 0.0       │
│ 5   │ Famliy       │ 0       │ 0         │ NaN       │
│ 6   │ Adj-Adverb   │ 1       │ 71        │ 0.0138889 │
│ 7   │ Opposite     │ 0       │ 30        │ 0.0       │
│ 8   │ Comparative  │ 23      │ 217       │ 0.0958333 │
│ 9   │ Superlative  │ 0       │ 20        │ 0.0       │
│ 10  │ Pres. Part.  │ 2       │ 70        │ 0.0277778 │
│ 11  │ Nationality  │ 27      │ 63        │ 0.3       │
│ 12  │ Past Tense   │ 5       │ 105       │ 0.0454545 │
│ 13  │ Plural       │ 3       │ 27        │ 0.1       │
│ 14  │ Plural Verbs │ 7       │ 

In [49]:
most_sim = best_model.wv.most_similar(positive=["france", "london"], negative=["london"], topn=20)
most_sim_df = DataFrame(token=[pair[1] for pair ∈ most_sim], 
                        similarity=[pair[2] for pair ∈ most_sim])

Unnamed: 0_level_0,token,similarity
Unnamed: 0_level_1,String,Float64
1,sweden,0.490823
2,germany,0.489274
3,kingdom,0.486661
4,netherlands,0.462996
5,italy,0.413548
6,canada,0.409447
7,singapore,0.399187
8,spain,0.394387
9,india,0.388033
10,japan,0.385139


In [None]:
"""
fig, axes = plt.subplots(figsize=(16, 5), ncols=2)

axes[0] = results.loc[:, ["Correct", "Incorrect"]].plot.bar(stacked=True, ax=axes[0]
                                                           , title="Analogy Accuracy")
ax1 = results.loc[:, ["Average"]].plot(ax=axes[0], secondary_y=True, lw=1, c="k", rot=35)
ax1.yaxis.set_major_formatter(FuncFormatter(lambda y, _: "{:.0%}".format(y)))

(pd.DataFrame(most_sim, columns=["token", "similarity"])
 .set_index("token").similarity
 .sort_values().tail(10).plot.barh(xlim=(.3, .37), ax=axes[1], title="Closest matches for Woman + King - Man"))
fig.tight_layout();
"""