In [41]:
from gensim.models import FastText
import pandas as pd
from gensim.test.utils import datapath

In [4]:
# Read the AskWomen subreddit table stored under the "AskWomen" key of the HDF5 file.
df = pd.read_hdf(
    "../datasets/AskWomen.h5",
    key="AskWomen",
)

In [5]:
# Pull the "token_list" column into a plain Python list, then flatten it by one
# level so every element of each row's list becomes a top-level entry.
# NOTE(review): the nesting depth of "token_list" is not visible here — confirm
# that the flattened entries are what FastText (trained on flat_list below) expects.
token_list = df["token_list"].tolist()
flat_list = [entry for row in token_list for entry in row]

In [34]:
# Train model for AskWomen with old hyperparameters:
# skip-gram (sg=1), 50-dimensional vectors, context window of 6, and a
# minimum word frequency of 25.
# NOTE(review): model_sent is rebound by the later FastText.load cell, so the
# evaluation cells score whichever assignment ran most recently.
model_sent = FastText(
    sentences=flat_list,
    vector_size=50,
    window=6,
    min_count=25,
    sg=1,
    workers=12,
)


In [36]:
# Score the current model_sent on the WordSim-353 word-pair similarity
# benchmark, using the copy bundled with gensim's test data (via datapath).
similarities = model_sent.wv.evaluate_word_pairs(
    pairs=datapath('wordsim353.tsv'),
)

In [37]:
# Display the WordSim-353 result: a 3-tuple of (Pearson result, Spearman result, float).
# NOTE(review): per the gensim docs the trailing float is the share of benchmark
# pairs containing out-of-vocabulary words — confirm before quoting it.
similarities

(PearsonRResult(statistic=0.5349042705298082, pvalue=7.224140842772808e-21),
 SpearmanrResult(correlation=0.5322409875667318, pvalue=1.2192381261115325e-20),
 25.495750708215297)

In [38]:
# Score the current model_sent on the Google "questions-words" analogy
# benchmark, using the copy bundled with gensim's test data (via datapath).
analogy_scores = model_sent.wv.evaluate_word_analogies(
    analogies=datapath('questions-words.txt'),
)

In [39]:
# Display the raw analogy result: (overall accuracy, list of per-section dicts
# with "section", "correct" and "incorrect" keys).
# NOTE(review): this prints every evaluated analogy; analogy_scores[0] alone
# would show just the overall accuracy.
analogy_scores

(0.44954916273078577,
 [{'section': 'capital-common-countries',
   'correct': [],
   'incorrect': [('LONDON', 'ENGLAND', 'PARIS', 'FRANCE'),
    ('LONDON', 'ENGLAND', 'ROME', 'ITALY'),
    ('LONDON', 'ENGLAND', 'TOKYO', 'JAPAN'),
    ('PARIS', 'FRANCE', 'ROME', 'ITALY'),
    ('PARIS', 'FRANCE', 'TOKYO', 'JAPAN'),
    ('PARIS', 'FRANCE', 'LONDON', 'ENGLAND'),
    ('ROME', 'ITALY', 'TOKYO', 'JAPAN'),
    ('ROME', 'ITALY', 'LONDON', 'ENGLAND'),
    ('ROME', 'ITALY', 'PARIS', 'FRANCE'),
    ('TOKYO', 'JAPAN', 'LONDON', 'ENGLAND'),
    ('TOKYO', 'JAPAN', 'PARIS', 'FRANCE'),
    ('TOKYO', 'JAPAN', 'ROME', 'ITALY')]},
  {'section': 'capital-world',
   'correct': [],
   'incorrect': [('LONDON', 'ENGLAND', 'PARIS', 'FRANCE'),
    ('LONDON', 'ENGLAND', 'ROME', 'ITALY'),
    ('PARIS', 'FRANCE', 'ROME', 'ITALY'),
    ('PARIS', 'FRANCE', 'TOKYO', 'JAPAN'),
    ('ROME', 'ITALY', 'TOKYO', 'JAPAN')]},
  {'section': 'currency',
   'correct': [('EUROPE', 'EURO', 'CANADA', 'DOLLAR'),
    ('USA', 'DOLLAR'

In [40]:
# One line per benchmark section: "<section name> <#correct> & <#incorrect>".
for section in analogy_scores[1]:
    n_correct = len(section["correct"])
    n_incorrect = len(section["incorrect"])
    print(section["section"], n_correct, "&", n_incorrect)

capital-common-countries 0 & 12
capital-world 0 & 5
currency 2 & 16
city-in-state 0 & 25
family 197 & 109
gram1-adjective-to-adverb 383 & 267
gram2-opposite 184 & 158
gram3-comparative 19 & 71
gram4-superlative 2 & 4
gram5-present-participle 85 & 155
gram6-nationality-adjective 169 & 416
gram7-past-tense 0 & 30
gram8-plural 6 & 14
gram9-plural-verbs 0 & 0
Total accuracy 1047 & 1282


In [42]:
# Load model for AskWomen with new hyperparameters
# NOTE(review): this rebinds model_sent, replacing the model trained earlier in
# the notebook; every evaluation cell below scores the loaded model only if
# this cell ran first.
# NOTE(review): gensim model loading is pickle-based — only load files from a
# trusted source.
model_sent = FastText.load(
    "../models/subreddit_level/fasttext_AskWomen_senti.bin"
)


In [29]:
# Score the current model_sent on the WordSim-353 word-pair similarity
# benchmark, using the copy bundled with gensim's test data (via datapath).
similarities = model_sent.wv.evaluate_word_pairs(
    pairs=datapath('wordsim353.tsv'),
)

In [30]:
# Display the WordSim-353 result: a 3-tuple of (Pearson result, Spearman result, float).
# NOTE(review): per the gensim docs the trailing float is the share of benchmark
# pairs containing out-of-vocabulary words — confirm before quoting it.
similarities

(PearsonRResult(statistic=0.5650782767482803, pvalue=8.816238380546142e-26),
 SpearmanrResult(correlation=0.5784672296188463, pvalue=3.2923444761863e-27),
 18.13031161473088)

In [31]:
# Score the current model_sent on the Google "questions-words" analogy
# benchmark, using the copy bundled with gensim's test data (via datapath).
analogy_scores = model_sent.wv.evaluate_word_analogies(
    analogies=datapath('questions-words.txt'),
)

In [32]:
# Display the raw analogy result: (overall accuracy, list of per-section dicts
# with "section", "correct" and "incorrect" keys).
# NOTE(review): this prints every evaluated analogy; analogy_scores[0] alone
# would show just the overall accuracy.
analogy_scores

(0.2860536138445877,
 [{'section': 'capital-common-countries',
   'correct': [('LONDON', 'ENGLAND', 'PARIS', 'FRANCE'),
    ('PARIS', 'FRANCE', 'LONDON', 'ENGLAND'),
    ('TOKYO', 'JAPAN', 'ROME', 'ITALY')],
   'incorrect': [('ATHENS', 'GREECE', 'BERLIN', 'GERMANY'),
    ('ATHENS', 'GREECE', 'LONDON', 'ENGLAND'),
    ('ATHENS', 'GREECE', 'PARIS', 'FRANCE'),
    ('ATHENS', 'GREECE', 'ROME', 'ITALY'),
    ('ATHENS', 'GREECE', 'STOCKHOLM', 'SWEDEN'),
    ('ATHENS', 'GREECE', 'TOKYO', 'JAPAN'),
    ('BERLIN', 'GERMANY', 'LONDON', 'ENGLAND'),
    ('BERLIN', 'GERMANY', 'PARIS', 'FRANCE'),
    ('BERLIN', 'GERMANY', 'ROME', 'ITALY'),
    ('BERLIN', 'GERMANY', 'STOCKHOLM', 'SWEDEN'),
    ('BERLIN', 'GERMANY', 'TOKYO', 'JAPAN'),
    ('BERLIN', 'GERMANY', 'ATHENS', 'GREECE'),
    ('LONDON', 'ENGLAND', 'ROME', 'ITALY'),
    ('LONDON', 'ENGLAND', 'STOCKHOLM', 'SWEDEN'),
    ('LONDON', 'ENGLAND', 'TOKYO', 'JAPAN'),
    ('LONDON', 'ENGLAND', 'ATHENS', 'GREECE'),
    ('LONDON', 'ENGLAND', 'BERLIN', 'G

In [33]:
# One line per benchmark section: "<section name> <#correct> & <#incorrect>".
for section in analogy_scores[1]:
    n_correct = len(section["correct"])
    n_incorrect = len(section["incorrect"])
    print(section["section"], n_correct, "&", n_incorrect)

capital-common-countries 3 & 39
capital-world 1 & 28
currency 0 & 18
city-in-state 3 & 108
family 211 & 131
gram1-adjective-to-adverb 269 & 543
gram2-opposite 132 & 288
gram3-comparative 11 & 79
gram4-superlative 1 & 11
gram5-present-participle 65 & 277
gram6-nationality-adjective 138 & 447
gram7-past-tense 1 & 71
gram8-plural 8 & 64
gram9-plural-verbs 0 & 0
Total accuracy 843 & 2104
