In [1]:
import multiprocessing
from gensim.models import Word2Vec 
from gensim.models.phrases import Phrases, Phraser

In [36]:
class Word2VecModelMaker:
  """Train and save gensim Word2Vec models from a pre-cleaned corpus.

  Intended to be used with a clean corpus (no stopwords or special
  characters), joined into a single string with full stops left in so that
  sentence boundaries can be recovered by splitting on ".".
  """

  def __init__(self, corpus):
    """Store the corpus and split it into trainable sentences.

    Args:
      corpus: the cleaned text as one string, sentences separated by ".".
    """
    self.corpus = corpus
    # Splitting corpus into sentences and removing very short sentences
    # (they won't have useful info on word relations). Words are counted with
    # split() rather than split(" ") so that runs of whitespace do not create
    # empty tokens that inflate the count; this also matches the tokenisation
    # used in make_model.
    stripped = (sentence.strip() for sentence in corpus.split("."))
    self.sentences = [sentence for sentence in stripped if len(sentence.split()) > 2]

  @staticmethod
  def _worker_count():
    """Return the number of training threads: all cores but one, at least 1."""
    # cpu_count() - 1 is 0 on a single-core machine, which would leave
    # Word2Vec with no worker threads at all.
    return max(1, multiprocessing.cpu_count() - 1)

  def make_model(self, vector_length, minimum_count, filename):
    """Build, train and save a Word2Vec model.

    Args:
      vector_length: dimensionality of the word vectors.
      minimum_count: words occurring fewer times than this are ignored.
      filename: output path without extension; ".model" is appended.
    """
    # process to group together bigrams with distinct meanings
    words_list = [sentence.split() for sentence in self.sentences]
    phrases = Phrases(words_list, min_count=30, progress_per=10000)
    bigram = Phraser(phrases)
    sents = bigram[words_list]

    # setting model properties
    shared_params = dict(min_count=minimum_count,
                         window=2,
                         sample=6e-5,
                         alpha=0.03,
                         min_alpha=0.0007,
                         negative=20,
                         workers=self._worker_count())
    # gensim 4.x renamed the `size` argument to `vector_size`; try the new
    # name first and fall back so the code runs on both 3.x and 4.x.
    try:
      wtv_model = Word2Vec(vector_size=vector_length, **shared_params)
    except TypeError:
      wtv_model = Word2Vec(size=vector_length, **shared_params)

    # creating and training the model as defined above
    wtv_model.build_vocab(sents, progress_per=10000)
    wtv_model.train(sents, total_examples=wtv_model.corpus_count, epochs=30, report_delay=1)
    # will save the model into current directory. if the model ends up being
    # large there will be multiple files including .npy files
    wtv_model.save(filename + ".model")

  def make_model_simple(self, filename="wordvec_simple"):
    """Extremely simplified model to give more differentiated end shapes in frontend."""
    self.make_model(vector_length=25, minimum_count=40, filename=filename)

  def make_model_complex(self, filename="wordvec_complex"):
    """Complex but not fully detailed model.

    The defaults are intended to keep the end json file of shape points
    relatively small.
    """
    self.make_model(vector_length=150, minimum_count=40, filename=filename)

  def make_model_full(self, filename="wordvec"):
    """Standard detailed model, mostly for running locally.

    Will have multiple model/npy files and the end json file will be very
    large (example run was 500mb).
    """
    self.make_model(vector_length=300, minimum_count=30, filename=filename)

In [37]:
# example of basic use:

# cleaned data loaded in from file, or raw data loaded then cleaned
# with open("cleaned_text.txt", "r") as file_:
#     corpus_cleaned = file_.read()

# # make object passing in cleaned corpus
# model_maker = Word2VecModelMaker(corpus_cleaned)
# # run whichever method fits best; the model file should appear in the current directory within a few minutes, depending on corpus size
# model_maker.make_model_simple()