# wiki corpus - model training
Collecting and processing text from Wikipedia data dumps.

In [1]:
import random
import os.path
from gensim.models import KeyedVectors
from gensim.test.utils import get_tmpfile
from gensim.models import translation_matrix
import gensim
import sys
from gensim.corpora import WikiCorpus
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
from sklearn import linear_model
import logging
# Configure the root logger: show INFO and above, each record prefixed as
# "timestamp:LEVEL:message".
logging.basicConfig(level=logging.INFO, format='%(asctime)s:%(levelname)s:%(message)s')

## Step 2: randomize input order
For our experiment we would like to randomize the input text stream. <br>
We will take the first 100,000 articles and write each article to a separate file. <br>
After this we can access each article directly and feed them to our model in a random order.

In [40]:
# Split the processed corpus into one file per article: each line of `out_f`
# is treated as one article and written to `subdirectory`/article_<n>.txt.
subdirectory = 'first100k'
out_f = 'enwiki_processed.txt'
out_f_shuffle = 'article_{}.txt'
n_articles = 100000  # number of leading corpus lines to split out

# Create the target directory up front so the first open() below cannot fail
# on a missing folder.
if not os.path.isdir(subdirectory):
    os.makedirs(subdirectory)

with open(out_f, 'r') as corpus_file:
    for article_num in range(n_articles):
        article = corpus_file.readline()
        if not article:
            # readline() returns '' only at end of file: the corpus holds fewer
            # lines than requested, so stop instead of writing empty files.
            break
        article_path = os.path.join(subdirectory, out_f_shuffle.format(article_num))
        # 'w' rather than 'a+': re-running the cell overwrites each article
        # file instead of appending a duplicate copy of the article to it.
        with open(article_path, 'w') as output_shuffle_file:
            # `article` still carries its own line terminator, so the extra
            # '\n' leaves a trailing blank line -- kept so the files match the
            # ones produced by earlier runs of this cell.
            output_shuffle_file.write(article + '\n')

In order not to fill our RAM and crash our computer, let's first define an iterator over the wiki articles.

In [5]:
class MySentences(object):
    """Restartable iterator over the lines of every ``.txt`` file in a directory.

    Files are read one line at a time and each line is yielded as a list of
    whitespace-separated tokens, so a whole file is never loaded at once.
    Files are visited in a random order that is drawn when the object is
    created and can be redrawn with :meth:`reshuffle`.
    """

    def __init__(self, dirname):
        """Collect the ``.txt`` file names found in ``dirname`` and shuffle them.

        Args:
            dirname: path of the directory that holds the article files.
        """
        self.dirname = dirname
        # os.listdir() returns names in arbitrary order; sorting first makes
        # the shuffled order depend only on the state of the `random` module,
        # so a run can be reproduced with random.seed().
        self.file_list = sorted(
            fname for fname in os.listdir(dirname) if fname.endswith('.txt')
        )
        random.shuffle(self.file_list)

    def __iter__(self):
        """Yield one token list per line, walking the files in ``file_list`` order."""
        for article in self.file_list:
            # The context manager closes each file as soon as it is exhausted
            # (or the generator is closed) instead of leaving the handle to the
            # garbage collector.
            with open(os.path.join(self.dirname, article)) as article_file:
                for line in article_file:
                    yield line.split()

    def reshuffle(self):
        """Draw a new random file order, used by subsequent iterations."""
        random.shuffle(self.file_list)

## Step 3: train models 
ok! now we can train the word embedding model

In [2]:
import gensim.downloader as api
# Load the 'wiki-english-20171001' dataset through gensim's downloader; the log
# recorded below this cell shows it being downloaded in four parts.
# NOTE(review): in the cells visible here `wiki_corpus` is only displayed, never
# fed to training (the models read from `subdirectory`) -- confirm it is still needed.
wiki_corpus = api.load('wiki-english-20171001')
#corpus = api.load('text8')



2019-06-27 09:47:39,253:INFO:Part 1/4 downloaded




2019-06-27 09:54:39,090:INFO:Part 2/4 downloaded




2019-06-27 10:02:09,340:INFO:Part 3/4 downloaded




2019-06-27 10:03:30,081:INFO:Part 4/4 downloaded


In [4]:
# Display the loaded dataset object.
# NOTE(review): the TypeError recorded under this cell ("'Dataset' object is not
# subscriptable") does not match this bare expression -- presumably it is left over
# from an earlier indexing attempt; re-run the cell to refresh the output.
wiki_corpus

TypeError: 'Dataset' object is not subscriptable

In [72]:
import gensim

# Build the vocabulary once over the article files and save the still-untrained
# model as 'empty_model', which the training loop reloads for every run.
sentences = MySentences(subdirectory) # a memory-friendly iterator
# NOTE(review): `size=` is the gensim 3.x keyword (renamed `vector_size` in
# gensim 4.0) -- confirm the installed gensim version before upgrading.
model = gensim.models.Word2Vec(min_count=20, size=100, workers=4)
model.build_vocab(sentences)
model.save('empty_model')

and now for the real training

In [73]:
# Train 20 models that differ only in the order the article files are fed in:
# every run reloads the saved untrained model, trains one epoch over a freshly
# shuffled file order, and keeps only the resulting word vectors.
subdirectory = 'first100k'
sentences = MySentences(subdirectory) # a memory-friendly iterator

# Output path template (loop-invariant). Create its directory before training
# starts so a missing folder cannot make save() fail after a full training pass.
fname = 'WV_dir/WV{}.kv'
wv_dir = os.path.dirname(fname)
if wv_dir and not os.path.isdir(wv_dir):
    os.makedirs(wv_dir)

for i in range(1,21):
    logging.info('starting model {}'.format(i))
    # New random file order for this run.
    sentences.reshuffle()
    # Start every run from the same saved, untrained state.
    model = gensim.models.Word2Vec.load("empty_model")
    logging.info('model training {}'.format(i))
    # corpus_count is read from the loaded model (presumably recorded when the
    # vocabulary was built in the previous cell).
    model.train(sentences, total_examples=model.corpus_count, epochs=1)
    logging.info('saving model {}'.format(i))
    word_vectors = model.wv
    word_vectors.save(fname.format(i))
    logging.info('finished model {}'.format(i))

136584684

let's check our model makes some intuitive sense

In [74]:
# Sanity check: the six words closest to "france" in the current `model`.
w1 = ["france"]
model.wv.most_similar(positive=w1, topn=6)


[('spain', 0.7594519257545471),
 ('belgium', 0.7488405704498291),
 ('bordeaux', 0.7018789052963257),
 ('portugal', 0.6989240646362305),
 ('italy', 0.6878964900970459),
 ('provence', 0.6737589836120605)]

In [75]:
# NOTE(review): neither KeyedVectors nor get_tmpfile is used in this cell.
from gensim.models import KeyedVectors
from gensim.test.utils import get_tmpfile

# Save the word vectors of the current `model` to 'WV1.kv' in the working directory.
# NOTE(review): if the cells ran top to bottom, `model` is the last one trained by
# the loop above (i = 20), whose vectors were already written to 'WV_dir/WV20.kv'
# -- confirm that the 'WV1' name is intended.
word_vectors = model.wv
fname = 'WV1.kv'
word_vectors.save(fname)