In [1]:
import gensim
import logging
import os
import re
import string

In [2]:
# configure root logging: timestamp, level name and message, at INFO and above
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s : %(levelname)s : %(message)s',
)

In [25]:
# Corpus root; each paper is read from a sub-directory of this folder
base_dir = "accessible-v4.0-small"

# Word counters, both starting at zero (not referenced by the cells shown here)
prev_words = total_words = 0

In [36]:
class MyArticles(object):
    """Re-iterable stream of tokenized articles for a single paper.

    The corpus is expected to be laid out as
    ``<base_dir>/<cur_paper>/<issue>/<article>``. Directory entries whose
    names start with ``.`` are skipped at both the issue and article level.

    Every call to ``__iter__`` walks the directory tree again, so the same
    object can be iterated more than once.

    Args:
        base_dir: Path of the corpus root directory.
        cur_paper: Name of the paper sub-directory inside ``base_dir``.
    """

    def __init__(self, base_dir, cur_paper):
        self.base_dir = base_dir
        self.cur_paper = cur_paper

    def __iter__(self):
        """Yield one list of lowercased words per article file.

        Words are the pieces produced by splitting the article text on runs
        of non-word characters. Single-character pieces and pieces that
        contain a digit are dropped.

        Issues and articles are visited in sorted name order so that
        repeated passes see the documents in the same sequence.

        Raises:
            OSError: If the paper or an issue directory cannot be listed, or
                an article file cannot be opened.
        """
        logging.info("Opening paper {0}".format(self.cur_paper))

        # Use the instance's own base_dir rather than a module-level global,
        # so the constructor argument is actually honoured.
        paper_dir = os.path.join(self.base_dir, self.cur_paper)

        for issue in sorted(os.listdir(paper_dir)):
            if issue.startswith('.'):
                continue

            issue_dir = os.path.join(paper_dir, issue)

            for article in sorted(os.listdir(issue_dir)):
                if article.startswith('.'):
                    continue

                # The context manager closes the file even if read() raises.
                # NOTE(review): the file is decoded with the platform default
                # encoding, as before — confirm the corpus encoding.
                with open(os.path.join(issue_dir, article), "r") as articleFile:
                    articleText = articleFile.read()

                # create word list for the article; could refine to be sentences later
                articleWords = []

                # ignore single-char words and words with numbers in them
                for word in re.split(r'\W+', articleText):
                    if len(word) > 1 and not any(char.isdigit() for char in word):
                        # lowercase and add to list
                        articleWords.append(word.lower())

                yield articleWords

In [38]:
# stream the DouglassMonthly articles as an iterable of word lists (one list per article)
cur_paper = "DouglassMonthly"
articles = MyArticles(base_dir, cur_paper)

In [39]:
# Build the vocabulary and train a Word2Vec model in one call, reading the
# tokenized articles from `articles` (set up for `cur_paper` in the previous cell).
model = gensim.models.Word2Vec(
    articles,
    min_count=5, # default is 5; words seen fewer than 5 times are dropped from the vocabulary; up to 100 is OK
    size=100, # dimensionality of the word vectors; default is 100; higher for larger corpora
    workers=10) # number of worker threads; parallel training needs Cython

# save the trained model as "<cur_paper>-w2v-model" (path relative to the working directory)
model.save(cur_paper + "-w2v-model")


2019-01-16 22:06:39,328 : INFO : collecting all words and their counts
2019-01-16 22:06:39,329 : INFO : Opening paper DouglassMonthly
2019-01-16 22:06:39,332 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2019-01-16 22:06:41,113 : INFO : collected 105681 word types from a corpus of 1233295 raw words and 1718 sentences
2019-01-16 22:06:41,113 : INFO : Loading a fresh vocabulary
2019-01-16 22:06:41,160 : INFO : min_count=5 retains 14580 unique words (13% of original 105681, drops 91101)
2019-01-16 22:06:41,161 : INFO : min_count=5 leaves 1110087 word corpus (90% of original 1233295, drops 123208)
2019-01-16 22:06:41,193 : INFO : deleting the raw counts dictionary of 105681 items
2019-01-16 22:06:41,196 : INFO : sample=0.001 downsamples 44 most-common words
2019-01-16 22:06:41,197 : INFO : downsampling leaves estimated 807646 word corpus (72.8% of prior 1110087)
2019-01-16 22:06:41,232 : INFO : estimated required memory for 14580 words and 100 dimensions: 18954

In [41]:
# stream the FrankLesliesWeekly articles as an iterable of word lists (one list per article)
cur_paper = "FrankLesliesWeekly"
articles = MyArticles(base_dir, cur_paper)

# Build the vocabulary and train a Word2Vec model in one call.
# This rebinds `model`, replacing the model trained in any earlier cell.
model = gensim.models.Word2Vec(
    articles,
    min_count=5, # default is 5; words seen fewer than 5 times are dropped from the vocabulary; up to 100 is OK
    size=100, # dimensionality of the word vectors; default is 100; higher for larger corpora
    workers=10) # number of worker threads; parallel training needs Cython

# save the trained model as "<cur_paper>-w2v-model" (path relative to the working directory)
model.save(cur_paper + "-w2v-model")


2019-01-16 22:09:17,904 : INFO : collecting all words and their counts
2019-01-16 22:09:17,905 : INFO : Opening paper FrankLesliesWeekly
2019-01-16 22:09:17,908 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2019-01-16 22:09:35,529 : INFO : PROGRESS: at sentence #10000, processed 12438832 words, keeping 115416 word types
2019-01-16 22:09:43,344 : INFO : collected 137965 word types from a corpus of 18074868 raw words and 14430 sentences
2019-01-16 22:09:43,345 : INFO : Loading a fresh vocabulary
2019-01-16 22:09:43,475 : INFO : min_count=5 retains 48558 unique words (35% of original 137965, drops 89407)
2019-01-16 22:09:43,476 : INFO : min_count=5 leaves 17938038 word corpus (99% of original 18074868, drops 136830)
2019-01-16 22:09:43,577 : INFO : deleting the raw counts dictionary of 137965 items
2019-01-16 22:09:43,580 : INFO : sample=0.001 downsamples 45 most-common words
2019-01-16 22:09:43,582 : INFO : downsampling leaves estimated 13762113 word corpus (

2019-01-16 22:10:36,573 : INFO : EPOCH 2 - PROGRESS: at 74.12% examples, 458090 words/s, in_qsize 0, out_qsize 0
2019-01-16 22:10:37,578 : INFO : EPOCH 2 - PROGRESS: at 77.91% examples, 458904 words/s, in_qsize 0, out_qsize 0
2019-01-16 22:10:38,585 : INFO : EPOCH 2 - PROGRESS: at 81.14% examples, 459377 words/s, in_qsize 0, out_qsize 0
2019-01-16 22:10:39,590 : INFO : EPOCH 2 - PROGRESS: at 84.44% examples, 459778 words/s, in_qsize 0, out_qsize 0
2019-01-16 22:10:40,598 : INFO : EPOCH 2 - PROGRESS: at 87.88% examples, 460142 words/s, in_qsize 0, out_qsize 0
2019-01-16 22:10:41,601 : INFO : EPOCH 2 - PROGRESS: at 91.35% examples, 460530 words/s, in_qsize 0, out_qsize 0
2019-01-16 22:10:42,614 : INFO : EPOCH 2 - PROGRESS: at 94.84% examples, 460925 words/s, in_qsize 0, out_qsize 0
2019-01-16 22:10:43,621 : INFO : EPOCH 2 - PROGRESS: at 97.93% examples, 460506 words/s, in_qsize 0, out_qsize 0
2019-01-16 22:10:44,258 : INFO : worker thread finished; awaiting finish of 9 more threads
2019-

2019-01-16 22:11:31,364 : INFO : EPOCH 4 - PROGRESS: at 53.28% examples, 454032 words/s, in_qsize 0, out_qsize 0
2019-01-16 22:11:32,370 : INFO : EPOCH 4 - PROGRESS: at 56.85% examples, 454681 words/s, in_qsize 0, out_qsize 0
2019-01-16 22:11:33,371 : INFO : EPOCH 4 - PROGRESS: at 60.39% examples, 455284 words/s, in_qsize 0, out_qsize 0
2019-01-16 22:11:34,381 : INFO : EPOCH 4 - PROGRESS: at 63.67% examples, 454936 words/s, in_qsize 0, out_qsize 0
2019-01-16 22:11:35,397 : INFO : EPOCH 4 - PROGRESS: at 67.15% examples, 455161 words/s, in_qsize 0, out_qsize 0
2019-01-16 22:11:36,411 : INFO : EPOCH 4 - PROGRESS: at 70.64% examples, 456082 words/s, in_qsize 0, out_qsize 0
2019-01-16 22:11:37,416 : INFO : EPOCH 4 - PROGRESS: at 73.90% examples, 456627 words/s, in_qsize 0, out_qsize 0
2019-01-16 22:11:38,439 : INFO : EPOCH 4 - PROGRESS: at 77.65% examples, 456941 words/s, in_qsize 0, out_qsize 0
2019-01-16 22:11:39,442 : INFO : EPOCH 4 - PROGRESS: at 80.76% examples, 457231 words/s, in_qsiz

In [42]:
# stream the FrederickDouglassPaper articles as an iterable of word lists (one list per article)
cur_paper = "FrederickDouglassPaper"
articles = MyArticles(base_dir, cur_paper)

# Build the vocabulary and train a Word2Vec model in one call.
# This rebinds `model`, replacing the model trained in any earlier cell.
model = gensim.models.Word2Vec(
    articles,
    min_count=5, # default is 5; words seen fewer than 5 times are dropped from the vocabulary; up to 100 is OK
    size=100, # dimensionality of the word vectors; default is 100; higher for larger corpora
    workers=10) # number of worker threads; parallel training needs Cython

# save the trained model as "<cur_paper>-w2v-model" (path relative to the working directory)
model.save(cur_paper + "-w2v-model")

2019-01-16 22:12:23,384 : INFO : collecting all words and their counts
2019-01-16 22:12:23,384 : INFO : Opening paper FrederickDouglassPaper
2019-01-16 22:12:23,387 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2019-01-16 22:12:31,315 : INFO : PROGRESS: at sentence #10000, processed 5604256 words, keeping 57600 word types
2019-01-16 22:12:31,847 : INFO : collected 58840 word types from a corpus of 5955695 raw words and 10601 sentences
2019-01-16 22:12:31,848 : INFO : Loading a fresh vocabulary
2019-01-16 22:12:31,893 : INFO : min_count=5 retains 25130 unique words (42% of original 58840, drops 33710)
2019-01-16 22:12:31,894 : INFO : min_count=5 leaves 5899851 word corpus (99% of original 5955695, drops 55844)
2019-01-16 22:12:31,948 : INFO : deleting the raw counts dictionary of 58840 items
2019-01-16 22:12:31,950 : INFO : sample=0.001 downsamples 48 most-common words
2019-01-16 22:12:31,950 : INFO : downsampling leaves estimated 4366194 word corpus (74.0% 

2019-01-16 22:13:06,193 : INFO : EPOCH 4 - PROGRESS: at 41.40% examples, 447949 words/s, in_qsize 0, out_qsize 0
2019-01-16 22:13:07,193 : INFO : EPOCH 4 - PROGRESS: at 51.25% examples, 446076 words/s, in_qsize 0, out_qsize 0
2019-01-16 22:13:08,194 : INFO : EPOCH 4 - PROGRESS: at 61.00% examples, 442146 words/s, in_qsize 0, out_qsize 0
2019-01-16 22:13:09,194 : INFO : EPOCH 4 - PROGRESS: at 72.18% examples, 442909 words/s, in_qsize 0, out_qsize 0
2019-01-16 22:13:10,207 : INFO : EPOCH 4 - PROGRESS: at 81.75% examples, 441823 words/s, in_qsize 0, out_qsize 0
2019-01-16 22:13:11,213 : INFO : EPOCH 4 - PROGRESS: at 92.73% examples, 441986 words/s, in_qsize 0, out_qsize 0
2019-01-16 22:13:11,946 : INFO : worker thread finished; awaiting finish of 9 more threads
2019-01-16 22:13:11,946 : INFO : worker thread finished; awaiting finish of 8 more threads
2019-01-16 22:13:11,947 : INFO : worker thread finished; awaiting finish of 7 more threads
2019-01-16 22:13:11,947 : INFO : worker thread fi

In [43]:
# stream the FreedomsJournal articles as an iterable of word lists (one list per article)
cur_paper = "FreedomsJournal"
articles = MyArticles(base_dir, cur_paper)

# Build the vocabulary and train a Word2Vec model in one call.
# This rebinds `model`, replacing the model trained in any earlier cell.
model = gensim.models.Word2Vec(
    articles,
    min_count=5, # default is 5; words seen fewer than 5 times are dropped from the vocabulary; up to 100 is OK
    size=100, # dimensionality of the word vectors; default is 100; higher for larger corpora
    workers=1) # a single worker thread here, unlike the 10 used for other papers — NOTE(review): confirm this is intentional

# save the trained model as "<cur_paper>-w2v-model" (path relative to the working directory)
model.save(cur_paper + "-w2v-model")

2019-01-16 22:13:30,967 : INFO : collecting all words and their counts
2019-01-16 22:13:30,967 : INFO : Opening paper FreedomsJournal
2019-01-16 22:13:31,130 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2019-01-16 22:13:34,032 : INFO : collected 32478 word types from a corpus of 988768 raw words and 2047 sentences
2019-01-16 22:13:34,033 : INFO : Loading a fresh vocabulary
2019-01-16 22:13:34,057 : INFO : min_count=5 retains 11715 unique words (36% of original 32478, drops 20763)
2019-01-16 22:13:34,058 : INFO : min_count=5 leaves 953698 word corpus (96% of original 988768, drops 35070)
2019-01-16 22:13:34,084 : INFO : deleting the raw counts dictionary of 32478 items
2019-01-16 22:13:34,086 : INFO : sample=0.001 downsamples 43 most-common words
2019-01-16 22:13:34,087 : INFO : downsampling leaves estimated 694133 word corpus (72.8% of prior 953698)
2019-01-16 22:13:34,113 : INFO : estimated required memory for 11715 words and 100 dimensions: 15229500 byte

In [44]:
# stream the GodeysLadysBook articles as an iterable of word lists (one list per article)
cur_paper = "GodeysLadysBook"
articles = MyArticles(base_dir, cur_paper)

# Build the vocabulary and train a Word2Vec model in one call.
# This rebinds `model`, replacing the model trained in any earlier cell.
model = gensim.models.Word2Vec(
    articles,
    min_count=5, # default is 5; words seen fewer than 5 times are dropped from the vocabulary; up to 100 is OK
    size=100, # dimensionality of the word vectors; default is 100; higher for larger corpora
    workers=1) # a single worker thread here, unlike the 10 used for other papers — NOTE(review): confirm this is intentional

# save the trained model as "<cur_paper>-w2v-model" (path relative to the working directory)
model.save(cur_paper + "-w2v-model")

2019-01-16 22:14:00,581 : INFO : collecting all words and their counts
2019-01-16 22:14:00,582 : INFO : Opening paper GodeysLadysBook
2019-01-16 22:14:00,685 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2019-01-16 22:14:20,406 : INFO : PROGRESS: at sentence #10000, processed 11903162 words, keeping 97865 word types
2019-01-16 22:14:39,500 : INFO : PROGRESS: at sentence #20000, processed 23444152 words, keeping 137244 word types
2019-01-16 22:14:39,604 : INFO : collected 137484 word types from a corpus of 23511751 raw words and 20053 sentences
2019-01-16 22:14:39,605 : INFO : Loading a fresh vocabulary
2019-01-16 22:14:39,721 : INFO : min_count=5 retains 48884 unique words (35% of original 137484, drops 88600)
2019-01-16 22:14:39,721 : INFO : min_count=5 leaves 23375641 word corpus (99% of original 23511751, drops 136110)
2019-01-16 22:14:39,825 : INFO : deleting the raw counts dictionary of 137484 items
2019-01-16 22:14:39,828 : INFO : sample=0.001 downsam

2019-01-16 22:15:38,858 : INFO : EPOCH 2 - PROGRESS: at 44.94% examples, 442125 words/s, in_qsize 1, out_qsize 0
2019-01-16 22:15:39,861 : INFO : EPOCH 2 - PROGRESS: at 47.08% examples, 442193 words/s, in_qsize 1, out_qsize 0
2019-01-16 22:15:40,863 : INFO : EPOCH 2 - PROGRESS: at 49.68% examples, 442013 words/s, in_qsize 1, out_qsize 0
2019-01-16 22:15:41,876 : INFO : EPOCH 2 - PROGRESS: at 52.23% examples, 442265 words/s, in_qsize 1, out_qsize 0
2019-01-16 22:15:42,893 : INFO : EPOCH 2 - PROGRESS: at 55.05% examples, 442492 words/s, in_qsize 1, out_qsize 0
2019-01-16 22:15:43,905 : INFO : EPOCH 2 - PROGRESS: at 57.64% examples, 442077 words/s, in_qsize 1, out_qsize 0
2019-01-16 22:15:44,911 : INFO : EPOCH 2 - PROGRESS: at 60.26% examples, 441487 words/s, in_qsize 1, out_qsize 0
2019-01-16 22:15:45,916 : INFO : EPOCH 2 - PROGRESS: at 62.86% examples, 441142 words/s, in_qsize 0, out_qsize 0
2019-01-16 22:15:46,928 : INFO : EPOCH 2 - PROGRESS: at 65.36% examples, 439936 words/s, in_qsiz

2019-01-16 22:16:48,322 : INFO : EPOCH 4 - PROGRESS: at 17.15% examples, 435861 words/s, in_qsize 1, out_qsize 0
2019-01-16 22:16:49,324 : INFO : EPOCH 4 - PROGRESS: at 19.75% examples, 434652 words/s, in_qsize 1, out_qsize 0
2019-01-16 22:16:50,341 : INFO : EPOCH 4 - PROGRESS: at 21.95% examples, 432760 words/s, in_qsize 2, out_qsize 0
2019-01-16 22:16:51,341 : INFO : EPOCH 4 - PROGRESS: at 24.39% examples, 432986 words/s, in_qsize 2, out_qsize 0
2019-01-16 22:16:52,354 : INFO : EPOCH 4 - PROGRESS: at 26.96% examples, 431679 words/s, in_qsize 1, out_qsize 0
2019-01-16 22:16:53,354 : INFO : EPOCH 4 - PROGRESS: at 29.34% examples, 432017 words/s, in_qsize 1, out_qsize 0
2019-01-16 22:16:54,362 : INFO : EPOCH 4 - PROGRESS: at 31.58% examples, 431483 words/s, in_qsize 1, out_qsize 0
2019-01-16 22:16:55,374 : INFO : EPOCH 4 - PROGRESS: at 33.55% examples, 425972 words/s, in_qsize 1, out_qsize 0
2019-01-16 22:16:56,375 : INFO : EPOCH 4 - PROGRESS: at 35.87% examples, 424687 words/s, in_qsiz

2019-01-16 22:17:58,934 : INFO : EPOCH 5 - PROGRESS: at 89.07% examples, 432097 words/s, in_qsize 2, out_qsize 0
2019-01-16 22:17:59,945 : INFO : EPOCH 5 - PROGRESS: at 91.57% examples, 432465 words/s, in_qsize 1, out_qsize 0
2019-01-16 22:18:00,950 : INFO : EPOCH 5 - PROGRESS: at 93.88% examples, 432717 words/s, in_qsize 1, out_qsize 0
2019-01-16 22:18:01,955 : INFO : EPOCH 5 - PROGRESS: at 96.50% examples, 432795 words/s, in_qsize 1, out_qsize 0
2019-01-16 22:18:02,971 : INFO : EPOCH 5 - PROGRESS: at 99.01% examples, 432977 words/s, in_qsize 2, out_qsize 0
2019-01-16 22:18:03,417 : INFO : worker thread finished; awaiting finish of 0 more threads
2019-01-16 22:18:03,417 : INFO : EPOCH - 5 : training on 23511751 raw words (17671731 effective words) took 40.8s, 433201 effective words/s
2019-01-16 22:18:03,418 : INFO : training on a 117558755 raw words (88357564 effective words) took 203.1s, 435122 effective words/s
2019-01-16 22:18:03,421 : INFO : saving Word2Vec object under GodeysLady

In [45]:
# stream the NationalAntiSlaveryStandard articles as an iterable of word lists (one list per article)
cur_paper = "NationalAntiSlaveryStandard"
articles = MyArticles(base_dir, cur_paper)

# Build the vocabulary and train a Word2Vec model in one call.
# This rebinds `model`, replacing the model trained in any earlier cell.
model = gensim.models.Word2Vec(
    articles,
    min_count=5, # default is 5; words seen fewer than 5 times are dropped from the vocabulary; up to 100 is OK
    size=100, # dimensionality of the word vectors; default is 100; higher for larger corpora
    workers=10) # number of worker threads; parallel training needs Cython

# save the trained model as "<cur_paper>-w2v-model" (path relative to the working directory)
model.save(cur_paper + "-w2v-model")

2019-01-16 22:18:20,188 : INFO : collecting all words and their counts
2019-01-16 22:18:20,189 : INFO : Opening paper NationalAntiSlaveryStandard
2019-01-16 22:18:20,387 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2019-01-16 22:18:58,271 : INFO : collected 117261 word types from a corpus of 21251210 raw words and 8905 sentences
2019-01-16 22:18:58,272 : INFO : Loading a fresh vocabulary
2019-01-16 22:18:58,386 : INFO : min_count=5 retains 43767 unique words (37% of original 117261, drops 73494)
2019-01-16 22:18:58,386 : INFO : min_count=5 leaves 21134645 word corpus (99% of original 21251210, drops 116565)
2019-01-16 22:18:58,481 : INFO : deleting the raw counts dictionary of 117261 items
2019-01-16 22:18:58,484 : INFO : sample=0.001 downsamples 44 most-common words
2019-01-16 22:18:58,485 : INFO : downsampling leaves estimated 15688728 word corpus (74.2% of prior 21134645)
2019-01-16 22:18:58,585 : INFO : estimated required memory for 43767 words and 100

2019-01-16 22:19:51,378 : INFO : EPOCH 2 - PROGRESS: at 53.80% examples, 453151 words/s, in_qsize 0, out_qsize 0
2019-01-16 22:19:52,389 : INFO : EPOCH 2 - PROGRESS: at 56.96% examples, 453673 words/s, in_qsize 0, out_qsize 0
2019-01-16 22:19:53,400 : INFO : EPOCH 2 - PROGRESS: at 59.52% examples, 453847 words/s, in_qsize 1, out_qsize 0
2019-01-16 22:19:54,420 : INFO : EPOCH 2 - PROGRESS: at 63.03% examples, 454106 words/s, in_qsize 0, out_qsize 0
2019-01-16 22:19:55,433 : INFO : EPOCH 2 - PROGRESS: at 65.54% examples, 454212 words/s, in_qsize 0, out_qsize 0
2019-01-16 22:19:56,438 : INFO : EPOCH 2 - PROGRESS: at 68.61% examples, 454209 words/s, in_qsize 0, out_qsize 0
2019-01-16 22:19:57,441 : INFO : EPOCH 2 - PROGRESS: at 71.48% examples, 454406 words/s, in_qsize 0, out_qsize 0
2019-01-16 22:19:58,455 : INFO : EPOCH 2 - PROGRESS: at 74.35% examples, 453953 words/s, in_qsize 0, out_qsize 0
2019-01-16 22:19:59,460 : INFO : EPOCH 2 - PROGRESS: at 77.17% examples, 453912 words/s, in_qsiz

2019-01-16 22:20:45,804 : INFO : EPOCH 4 - PROGRESS: at 11.65% examples, 455771 words/s, in_qsize 1, out_qsize 0
2019-01-16 22:20:46,810 : INFO : EPOCH 4 - PROGRESS: at 14.87% examples, 453016 words/s, in_qsize 0, out_qsize 0
2019-01-16 22:20:47,811 : INFO : EPOCH 4 - PROGRESS: at 17.93% examples, 452394 words/s, in_qsize 0, out_qsize 0
2019-01-16 22:20:48,812 : INFO : EPOCH 4 - PROGRESS: at 20.62% examples, 451772 words/s, in_qsize 0, out_qsize 0
2019-01-16 22:20:49,812 : INFO : EPOCH 4 - PROGRESS: at 23.17% examples, 452171 words/s, in_qsize 0, out_qsize 0
2019-01-16 22:20:50,826 : INFO : EPOCH 4 - PROGRESS: at 26.69% examples, 452954 words/s, in_qsize 0, out_qsize 0
2019-01-16 22:20:51,828 : INFO : EPOCH 4 - PROGRESS: at 29.40% examples, 454192 words/s, in_qsize 0, out_qsize 0
2019-01-16 22:20:52,833 : INFO : EPOCH 4 - PROGRESS: at 32.50% examples, 454485 words/s, in_qsize 0, out_qsize 0
2019-01-16 22:20:53,834 : INFO : EPOCH 4 - PROGRESS: at 35.78% examples, 455232 words/s, in_qsiz

2019-01-16 22:21:49,351 : INFO : worker thread finished; awaiting finish of 9 more threads
2019-01-16 22:21:49,352 : INFO : worker thread finished; awaiting finish of 8 more threads
2019-01-16 22:21:49,352 : INFO : worker thread finished; awaiting finish of 7 more threads
2019-01-16 22:21:49,353 : INFO : worker thread finished; awaiting finish of 6 more threads
2019-01-16 22:21:49,354 : INFO : worker thread finished; awaiting finish of 5 more threads
2019-01-16 22:21:49,355 : INFO : worker thread finished; awaiting finish of 4 more threads
2019-01-16 22:21:49,356 : INFO : worker thread finished; awaiting finish of 3 more threads
2019-01-16 22:21:49,357 : INFO : worker thread finished; awaiting finish of 2 more threads
2019-01-16 22:21:49,358 : INFO : worker thread finished; awaiting finish of 1 more threads
2019-01-16 22:21:49,363 : INFO : worker thread finished; awaiting finish of 0 more threads
2019-01-16 22:21:49,364 : INFO : EPOCH - 5 : training on 21251210 raw words (15416426 effe

In [None]:
## LK TO HERE TODAY
# NOTE(review): this cell has no execution count, so it has not been run in the saved notebook.
# stream the ProvincialFreeman articles as an iterable of word lists (one list per article)
cur_paper = "ProvincialFreeman"
articles = MyArticles(base_dir, cur_paper)

# Build the vocabulary and train a Word2Vec model in one call.
# This rebinds `model`, replacing the model trained in any earlier cell.
model = gensim.models.Word2Vec(
    articles,
    min_count=5, # default is 5; words seen fewer than 5 times are dropped from the vocabulary; up to 100 is OK
    size=100, # dimensionality of the word vectors; default is 100; higher for larger corpora
    workers=10) # number of worker threads; parallel training needs Cython

# save the trained model as "<cur_paper>-w2v-model" (path relative to the working directory)
model.save(cur_paper + "-w2v-model")

In [20]:
# testing some basic functions
# NOTE(review): this cell's execution count (20) is lower than those of the
# training cells (39-45), so the output shown may come from a model trained in
# an earlier run — re-run after training to confirm which paper it reflects.

# basic similarity: words most similar to a single query word in the current `model`
w1 = "freedom"
model.wv.most_similar(positive=w1)

[('great', 0.9999532699584961),
 ('most', 0.9999502897262573),
 ('against', 0.9999476671218872),
 ('with', 0.9999455213546753),
 ('like', 0.9999449253082275),
 ('without', 0.9999438524246216),
 ('among', 0.9999430179595947),
 ('being', 0.9999424815177917),
 ('into', 0.9999416470527649),
 ('an', 0.999941349029541)]

In [22]:
# two-word similarity: score between "freedom" and "justice" in the current `model`
# NOTE(review): scores very close to 1.0 for most pairs may indicate the
# embeddings are not well separated — verify against a freshly trained model.

model.wv.similarity(w1="freedom",w2="justice")

0.9998831061190858

In [25]:
# two-word similarity: "freedom" vs "abolition" in the current `model`
model.wv.similarity(w1="freedom",w2="abolition")

0.9974367023618899

In [26]:
# two-word similarity: "freedom" vs "emancipation" in the current `model`
model.wv.similarity(w1="freedom",w2="emancipation")

0.9996274907535606

In [27]:
# two-word similarity: "freedom" vs "liberation" in the current `model`
model.wv.similarity(w1="freedom",w2="liberation")

0.9145890133346956

In [29]:
# analogy-style query: words similar to "freedom" and "emancipation" taken together,
# while dissimilar to "slavery" (positive terms contribute, negative terms detract)
model.wv.most_similar(positive=["freedom","emancipation"], negative=["slavery"])

[('they', 0.9811407923698425),
 ('you', 0.980480432510376),
 ('mentioned', 0.9800600409507751),
 ('us', 0.9799489974975586),
 ('beach', 0.979884147644043),
 ('have', 0.9798733592033386),
 ('would', 0.9797930717468262),
 ('information', 0.979749858379364),
 ('taught', 0.979706883430481),
 ('longer', 0.9796891212463379)]