In [6]:
import gensim
import logging
import os
import re
import string

In [38]:
# Configure the root logger: emit INFO and above, one line per record in the
# form "<timestamp> : <level> : <message>".
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [10]:
# Root directory of the article corpus; the same string is reused as the prefix
# of the saved model's filename.
base_dir = "accessible-v4.0-small"

In [11]:
class MyArticles(object):
    """Re-iterable corpus that streams one tokenized article at a time.

    Expects the on-disk layout ``base_dir/<paper>/<issue>/<article>``. Entries
    whose names start with "." are skipped at every level. Because
    ``__iter__`` is a generator, every new iteration walks the tree again, so
    the object can be consumed repeatedly without holding the whole corpus in
    memory.
    """

    # Raw string so "\W" reaches the regex engine instead of being parsed as an
    # (invalid) string escape; compiled once and shared by all articles.
    _NON_WORD = re.compile(r'\W+')

    def __init__(self, base_dir):
        """Store the corpus root; nothing is read from disk until iteration.

        Args:
            base_dir: path of the directory that contains one sub-directory
                per paper.
        """
        self.base_dir = base_dir

    @staticmethod
    def _visible_entries(directory):
        """Return the names in ``directory`` not starting with ".", sorted."""
        # Sorting makes the corpus order independent of os.listdir's
        # arbitrary, filesystem-dependent ordering.
        return sorted(
            name for name in os.listdir(directory) if not name.startswith('.')
        )

    @classmethod
    def _tokenize(cls, text):
        """Split ``text`` on non-word characters into lowercase words.

        Single-character words and words containing any digit are dropped.
        """
        return [
            word.lower()
            for word in cls._NON_WORD.split(text)
            if len(word) > 1 and not any(char.isdigit() for char in word)
        ]

    def __iter__(self):
        """Yield each article as a list of lowercase word strings.

        Yields:
            list of str: the words of one article, in document order.

        Raises:
            OSError: if a directory cannot be listed or a file cannot be read.
        """
        for paper in self._visible_entries(self.base_dir):
            logging.info("Opening paper {0}".format(paper))

            # Every path is derived from self.base_dir (not a module-level
            # global), so the instance honours the directory it was given.
            paper_dir = os.path.join(self.base_dir, paper)

            for issue in self._visible_entries(paper_dir):
                issue_dir = os.path.join(paper_dir, issue)

                for article in self._visible_entries(issue_dir):
                    # "with" guarantees the handle is closed even if read() raises.
                    with open(os.path.join(issue_dir, article), "r") as article_file:
                        article_text = article_file.read()

                    # One word list per article; could be refined to sentences later.
                    yield self._tokenize(article_text)

In [12]:
# wrap the corpus directory in a re-iterable object that yields one list of
# words per article (a list-of-lists shape); nothing is read from disk here,
# files are only opened when the object is iterated

articles = MyArticles(base_dir)

Created list of articles.


In [13]:
# Build the vocabulary and train the Word2Vec model in a single constructor call.
# NOTE(review): gensim >= 4.0 renamed `size` to `vector_size` — confirm the
# installed gensim version before re-running this cell.
w2v_options = {
    "min_count": 2,  # default is 5; 2 trims words that are only used once; up to 100 is OK
    "size": 100,     # size of NN layers; default is 100; go higher for larger corpora
    "workers": 10,   # parallel processing; needs Cython
}
model = gensim.models.Word2Vec(articles, **w2v_options)

# Save the trained model under a filename derived from the corpus directory name.
model_path = base_dir + "-w2v-model"
model.save(model_path)

In [20]:
# testing some basic functions

# basic similarity: list the vocabulary words whose vectors are closest to w1
# NOTE(review): in the recorded run every neighbour scores ~0.9999 and the list
# is dominated by function words, which suggests the vectors have barely
# separated — presumably the corpus is too small or under-trained; verify
# before interpreting these results.
w1 = "freedom"
model.wv.most_similar(positive=w1)

[('great', 0.9999532699584961),
 ('most', 0.9999502897262573),
 ('against', 0.9999476671218872),
 ('with', 0.9999455213546753),
 ('like', 0.9999449253082275),
 ('without', 0.9999438524246216),
 ('among', 0.9999430179595947),
 ('being', 0.9999424815177917),
 ('into', 0.9999416470527649),
 ('an', 0.999941349029541)]

In [22]:
# pairwise similarity between two specific words: freedom vs. justice

model.wv.similarity(w1="freedom",w2="justice")

0.9998831061190858

In [25]:
# pairwise similarity: freedom vs. abolition
model.wv.similarity(w1="freedom",w2="abolition")

0.9974367023618899

In [26]:
# pairwise similarity: freedom vs. emancipation
model.wv.similarity(w1="freedom",w2="emancipation")

0.9996274907535606

In [27]:
# pairwise similarity: freedom vs. liberation
model.wv.similarity(w1="freedom",w2="liberation")

0.9145890133346956

In [29]:
# vector-arithmetic query: words closest to freedom + emancipation - slavery
# (an analogy-style lookup that pushes results away from "slavery"; it does not
# return antonyms)
model.wv.most_similar(positive=["freedom","emancipation"], negative=["slavery"])

[('they', 0.9811407923698425),
 ('you', 0.980480432510376),
 ('mentioned', 0.9800600409507751),
 ('us', 0.9799489974975586),
 ('beach', 0.979884147644043),
 ('have', 0.9798733592033386),
 ('would', 0.9797930717468262),
 ('information', 0.979749858379364),
 ('taught', 0.979706883430481),
 ('longer', 0.9796891212463379)]