In [1]:
import gensim
import logging
import os
import re
import string

In [2]:
# basic logging setup: timestamp, level and message for everything at INFO or above
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s : %(levelname)s : %(message)s',
)

In [3]:
# Corpus directory (one text file per meeting); also reused as the prefix of the saved model's file name.
base_dir = "ccp-corpus-0.2-small"

In [4]:
class MyMeetings(object):
    """Re-iterable corpus that streams one tokenised meeting per step.

    Every entry of ``base_dir`` whose name does not start with '.' is read as
    text and turned into a list of lowercased word tokens. Each call to
    ``__iter__`` starts a fresh pass over the directory, so one instance can
    be consumed several times.
    """

    def __init__(self, base_dir):
        """Remember the corpus directory.

        Args:
            base_dir: path of the directory holding one text file per meeting.
        """
        self.base_dir = base_dir

    def __iter__(self):
        """Yield, for each meeting file, its list of lowercased word tokens.

        Tokens are the pieces between runs of non-word characters.
        Single-character tokens and tokens containing any digit are dropped.
        """
        for meeting in os.listdir(self.base_dir):
            # skip hidden entries such as .DS_Store
            if meeting.startswith('.'):
                continue

            logging.info("Opening meeting {0}".format(meeting))

            # Read from the directory this instance was built with (not a
            # module-level global), and let the context manager close the
            # handle even if read() raises.
            with open(os.path.join(self.base_dir, meeting), "r") as meeting_file:
                meeting_text = meeting_file.read()

            # create word list for the meeting; could refine to be sentences later
            # (raw string so that \W is a regex class, not a string escape)
            yield [
                word.lower()
                for word in re.split(r'\W+', meeting_text)
                if len(word) > 1 and not any(char.isdigit() for char in word)
            ]

In [5]:
# get docs into the list-of-lists format: MyMeetings yields one list of word tokens per meeting
# (nothing is read here; files are opened only when the object is iterated)

meetings = MyMeetings(base_dir)
print("Created list of meetings.")

Created list of meetings.


In [6]:
# build vocab and train model
# The logged output below shows the corpus being re-read for the vocabulary scan and again for
# every training epoch, so `meetings` must be a restartable iterable rather than a one-shot generator.
# NOTE(review): newer gensim releases (4.x) reportedly rename `size` to `vector_size` -- confirm
# against the installed version before re-running.
model = gensim.models.Word2Vec(
    meetings,
    min_count=2, # default is 5; this trims the corpus for words only used once; up to 100 is OK
    size=200, # size of NN layers; default is 100; higher for larger corpora
    workers=5) # parallel processing; needs Cython

2019-01-16 14:46:59,677 : INFO : collecting all words and their counts
2019-01-16 14:46:59,679 : INFO : Opening meeting 1848.PA-12.13.HARR.txt
2019-01-16 14:46:59,693 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2019-01-16 14:46:59,695 : INFO : Opening meeting 1832.PA-06.04.PHIL.txt
2019-01-16 14:46:59,713 : INFO : Opening meeting 1855.CT-04-18.HART.txt
2019-01-16 14:46:59,715 : INFO : Opening meeting 1865.NC-11.29.RALE.txt
2019-01-16 14:46:59,728 : INFO : Opening meeting 1863.NY-07.16.POUG.txt
2019-01-16 14:46:59,736 : INFO : Opening meeting 1844.NY-09.18.SCHE-Report.txt
2019-01-16 14:46:59,741 : INFO : Opening meeting 1865.CA-10.25.SACR.txt
2019-01-16 14:46:59,787 : INFO : Opening meeting 1841.ME-10.06.PORT.txt
2019-01-16 14:46:59,796 : INFO : Opening meeting 1851.NY-07.22.ALBA.txt
2019-01-16 14:46:59,829 : INFO : Opening meeting 1864.NY-10.04.SYRA.txt
2019-01-16 14:46:59,857 : INFO : Opening meeting 1843.MI-10.26.DETR.txt
2019-01-16 14:46:59,879 : INFO 

2019-01-16 14:47:01,676 : INFO : Opening meeting 1849.CT-09.12.NEWH.txt
2019-01-16 14:47:01,693 : INFO : Opening meeting 1851.OH-01.15.COLU.txt
2019-01-16 14:47:01,724 : INFO : Opening meeting 1865.NC-09.29.RALE.txt
2019-01-16 14:47:01,760 : INFO : Opening meeting 1857.CA-10.20.SANF.txt
2019-01-16 14:47:01,777 : INFO : Opening meeting 1854.MA-01.02.BOST.txt
2019-01-16 14:47:01,788 : INFO : Opening meeting 1865.CT-06.06.NEWH.01.txt
2019-01-16 14:47:01,797 : INFO : Opening meeting 1850.OH-01.09.COLU.txt
2019-01-16 14:47:01,813 : INFO : Opening meeting 1830.PA-09.24.PHIL.txt
2019-01-16 14:47:01,822 : INFO : Opening meeting 1858.MA-08.01.NEWB.txt
2019-01-16 14:47:01,857 : INFO : Opening meeting 1841.ME-10.06.PORT.01.txt
2019-01-16 14:47:01,861 : INFO : Opening meeting 1865.NC-09.29.RALE.02.txt
2019-01-16 14:47:01,879 : INFO : Opening meeting 1858.NY-09.14.TROY.txt
2019-01-16 14:47:01,884 : INFO : Opening meeting 1841.ME-10.06.PORT.02.txt
2019-01-16 14:47:01,894 : INFO : Opening meeting 184

2019-01-16 14:47:03,209 : INFO : worker thread finished; awaiting finish of 1 more threads
2019-01-16 14:47:03,213 : INFO : worker thread finished; awaiting finish of 0 more threads
2019-01-16 14:47:03,213 : INFO : EPOCH - 2 : training on 419914 raw words (271022 effective words) took 1.0s, 262501 effective words/s
2019-01-16 14:47:03,220 : INFO : Opening meeting 1848.PA-12.13.HARR.txt
2019-01-16 14:47:03,233 : INFO : Opening meeting 1832.PA-06.04.PHIL.txt
2019-01-16 14:47:03,247 : INFO : Opening meeting 1855.CT-04-18.HART.txt
2019-01-16 14:47:03,251 : INFO : Opening meeting 1865.NC-11.29.RALE.txt
2019-01-16 14:47:03,257 : INFO : Opening meeting 1863.NY-07.16.POUG.txt
2019-01-16 14:47:03,265 : INFO : Opening meeting 1844.NY-09.18.SCHE-Report.txt
2019-01-16 14:47:03,270 : INFO : Opening meeting 1865.CA-10.25.SACR.txt
2019-01-16 14:47:03,293 : INFO : Opening meeting 1841.ME-10.06.PORT.txt
2019-01-16 14:47:03,298 : INFO : Opening meeting 1851.NY-07.22.ALBA.txt
2019-01-16 14:47:03,322 : IN

2019-01-16 14:47:04,936 : INFO : Opening meeting 1865.NC-09.29.RALE.txt
2019-01-16 14:47:04,948 : INFO : Opening meeting 1857.CA-10.20.SANF.txt
2019-01-16 14:47:04,962 : INFO : Opening meeting 1854.MA-01.02.BOST.txt
2019-01-16 14:47:04,969 : INFO : Opening meeting 1865.CT-06.06.NEWH.01.txt
2019-01-16 14:47:04,971 : INFO : Opening meeting 1850.OH-01.09.COLU.txt
2019-01-16 14:47:04,986 : INFO : Opening meeting 1830.PA-09.24.PHIL.txt
2019-01-16 14:47:04,996 : INFO : Opening meeting 1858.MA-08.01.NEWB.txt
2019-01-16 14:47:05,014 : INFO : Opening meeting 1841.ME-10.06.PORT.01.txt
2019-01-16 14:47:05,020 : INFO : Opening meeting 1865.NC-09.29.RALE.02.txt
2019-01-16 14:47:05,039 : INFO : Opening meeting 1858.NY-09.14.TROY.txt
2019-01-16 14:47:05,045 : INFO : Opening meeting 1841.ME-10.06.PORT.02.txt
2019-01-16 14:47:05,062 : INFO : Opening meeting 1847.NY-11.06.TROY.txt
2019-01-16 14:47:05,083 : INFO : Opening meeting 1854.CT-09.27.MIDD.txt
2019-01-16 14:47:05,089 : INFO : Opening meeting 185

2019-01-16 14:47:06,508 : INFO : EPOCH - 5 : training on 419914 raw words (270809 effective words) took 1.1s, 241056 effective words/s
2019-01-16 14:47:06,508 : INFO : training on a 2099570 raw words (1354872 effective words) took 5.5s, 246895 effective words/s


In [7]:
# save model next to the corpus, named "<base_dir>-w2v-model"
model.save(base_dir + "-w2v-model")

2019-01-16 14:47:13,203 : INFO : saving Word2Vec object under ccp-corpus-0.2-small-w2v-model, separately None
2019-01-16 14:47:13,204 : INFO : not storing attribute vectors_norm
2019-01-16 14:47:13,205 : INFO : not storing attribute cum_table
2019-01-16 14:47:13,380 : INFO : saved ccp-corpus-0.2-small-w2v-model


In [8]:
# testing some basic functions

# basic similarity: nearest neighbours of a single query word (the recorded output lists ten)
# NOTE(review): every neighbour in the recorded output scores above 0.97; presumably a symptom of
# the small corpus -- treat the ranking with caution.
w1 = "freedom"
model.wv.most_similar(positive=w1)

2019-01-16 14:47:15,292 : INFO : precomputing L2-norms of word weight vectors


[('spirit', 0.9908210039138794),
 ('pursuit', 0.9882231950759888),
 ('happiness', 0.9881479740142822),
 ('human', 0.9861965179443359),
 ('influence', 0.9859877824783325),
 ('humanity', 0.9858787059783936),
 ('justice', 0.9834567308425903),
 ('character', 0.9819958209991455),
 ('institutions', 0.9817777872085571),
 ('life', 0.9799968004226685)]

In [9]:
# two-word similarity: score for one specific pair of words

model.wv.similarity(w1="freedom",w2="justice")

0.9834567804516825

In [10]:
# same pairwise check, "freedom" against "abolition"
model.wv.similarity(w1="freedom",w2="abolition")

0.928029651118336

In [11]:
# same pairwise check, "freedom" against "emancipation"
model.wv.similarity(w1="freedom",w2="emancipation")

0.912782890734075

In [12]:
# same pairwise check, "freedom" against "liberation"
model.wv.similarity(w1="freedom",w2="liberation")

0.8475150510359857

In [13]:
# analogy-style query rather than strict "opposites": words ranked as similar to the `positive`
# terms ("freedom", "emancipation") while being dissimilar to the `negative` term ("slavery")
model.wv.most_similar(positive=["freedom","emancipation"], negative=["slavery"])

[('devotion', 0.9780879020690918),
 ('abiding', 0.977773904800415),
 ('philanthropic', 0.9756821393966675),
 ('fortunes', 0.9749962687492371),
 ('cultivating', 0.9718132019042969),
 ('volunteered', 0.9711169004440308),
 ('willingness', 0.9706360697746277),
 ('ashes', 0.9696649312973022),
 ('perils', 0.9685608744621277),
 ('defending', 0.9684313535690308)]