In [21]:
import logging
import multiprocessing
import os

from gensim.models import Word2Vec
from gensim.models.callbacks import CallbackAny2Vec

# Enable gensim logging
logging.basicConfig(
    format="%(levelname)s - %(asctime)s: %(message)s",
    datefmt="%H:%M:%S",
    level=logging.INFO,
)


class W2VLossLogger(CallbackAny2Vec):
    """Callback to print loss after each epoch
    use by passing model.train(..., callbacks=[W2VLossLogger()])
    """

    def __init__(self):
        self.epoch = 0

    def on_epoch_end(self, model):
        loss = model.get_latest_training_loss()

        if self.epoch == 0:
            print("Loss after epoch {}: {}".format(self.epoch, loss))
        else:
            print(
                "Loss after epoch {}: {}".format(
                    self.epoch, loss - self.loss_previous_step
                )
            )
        self.epoch += 1
        self.loss_previous_step = loss


def train_w2v_model(
    sentences,
    output_file,
    window,
    embedding_dim,
    epochs,
    min_word_count,
):
    

    """Train a word2vec model based on given sentences.
    Args:
        sentences list[list[str]]: List of sentences. Each element contains a list with the words
            in the current sentence
        output_file (str): Path to save the trained w2v model
        window (int): w2v context size
        embedding_dim (int): w2v vector dimension
        epochs (int): How many epochs should the training run
        min_word_count (int): Ignore words that appear less than min_word_count times
    """
    workers = multiprocessing.cpu_count()
    
    # TODO: Instantiate gensim.models.Word2Vec class
    model = Word2Vec(sentences=sentences, vector_size=embedding_dim, window=window, min_count=min_word_count, workers=multiprocessing.cpu_count())
    model.build_vocab(sentences, progress_per=10000)
    # TODO: Build model vocabulary using sentences
    # TODO: Train word2vec model
    model.train(sentences, total_examples=model.corpus_count,epochs=epochs)
    # Save trained model
    model.save(output_file)
    # model.save(output_file)

    

    return model



    # read data/gutenberg.txt in the expected format
    f=open("tokenized.txt","r")
    sentences =eval(f.read())
    
   
    output_file = "gutenberg_w2v.hundd.model"
    window = 5
    embedding_dim = 100
    epochs = 1000
    min_word_count = 1

    gutenberg_w2v =train_w2v_model(
        sentences,
        output_file,
        window,
        embedding_dim,
        epochs,
        min_word_count)

    




In [None]:
   # read data/gutenberg.txt in the expected format
    f=open("tokenized.txt","r")
    sentences =eval(f.read())
    
   
    output_file = "gutenberg_w2v.hundd.model"
    window = 5
    embedding_dim = 100
    epochs = 1000
    min_word_count = 1

    gutenberg_w2v =train_w2v_model(
        sentences,
        output_file,
        window,
        embedding_dim,
        epochs,
        min_word_count)


In [3]:
from gensim.models import Word2Vec
from gensim.models.callbacks import CallbackAny2Vec

gutenberg_w2v = Word2Vec.load("gutenberg_w2v.hundd.model")

In [43]:
gutenberg_w2v.wv.most_similar(["bible"])


[('wrist', 0.4070826470851898),
 ('buzite', 0.3894822895526886),
 ('dreamer', 0.3729250133037567),
 ('cobbler', 0.35319337248802185),
 ('cable', 0.35270583629608154),
 ('cassius', 0.3514133393764496),
 ('auger', 0.3499419093132019),
 ('priesthood', 0.34642016887664795),
 ('loe', 0.34116289019584656),
 ('headship', 0.3361479640007019)]

In [42]:
gutenberg_w2v.wv.most_similar(["book"])


[('xxiii', 0.5053890943527222),
 ('xxxi', 0.5027292370796204),
 ('xxix', 0.5025994777679443),
 ('xxviii', 0.5024734139442444),
 ('xxv', 0.4990857243537903),
 ('xxvii', 0.4965112507343292),
 ('xxvi', 0.4925084710121155),
 ('raze', 0.490346759557724),
 ('temple', 0.4875237047672272),
 ('written', 0.4845070540904999)]

In [40]:
gutenberg_w2v.wv.most_similar(["bank"])


[('table', 0.5287594199180603),
 ('wall', 0.5040320754051208),
 ('ground', 0.5038182735443115),
 ('top', 0.5032145977020264),
 ('floor', 0.4864599108695984),
 ('side', 0.4803113639354706),
 ('bed', 0.450342059135437),
 ('hill', 0.4464374780654907),
 ('river', 0.4414878189563751),
 ('pool', 0.4404714107513428)]

In [39]:
gutenberg_w2v.wv.most_similar(["water"])

[('waters', 0.6542330384254456),
 ('wine', 0.5172027945518494),
 ('river', 0.5064811110496521),
 ('hole', 0.49640560150146484),
 ('fire', 0.4884592294692993),
 ('wood', 0.4813365340232849),
 ('rivers', 0.47593218088150024),
 ('blood', 0.4702102243900299),
 ('ground', 0.46649661660194397),
 ('fowls', 0.46397456526756287)]

In [63]:
v = gutenberg_w2v.wv["good"] - gutenberg_w2v.wv["taller"] + gutenberg_w2v.wv["tall"]
gutenberg_w2v.wv.most_similar(v)

[('good', 0.48668214678764343),
 ('sandals', 0.4110839068889618),
 ('tall', 0.40284034609794617),
 ('tout', 0.3939869999885559),
 ('festive', 0.37680643796920776),
 ('handsome', 0.3653698265552521),
 ('salamander', 0.3586299419403076),
 ('test', 0.35852161049842834),
 ('renderest', 0.3573521673679352),
 ('paire', 0.35532742738723755)]

In [None]:
v = gutenberg_w2v.wv["grils"] - gutenberg_w2v.wv["kings"] + gutenberg_w2v.wv["queens"]
gutenberg_w2v.wv.most_similar(v)

In [62]:
v = gutenberg_w2v.wv["france"] - gutenberg_w2v.wv["paris"] + gutenberg_w2v.wv["london"]
gutenberg_w2v.wv.most_similar(v)

[('france', 0.6633819937705994),
 ('london', 0.5153282284736633),
 ('inferiorities', 0.39137589931488037),
 ('plan', 0.3810945153236389),
 ('highbury', 0.380890429019928),
 ('species', 0.38061437010765076),
 ('country', 0.369165301322937),
 ('siam', 0.3610007166862488),
 ('allusion', 0.35697129368782043),
 ('ambassadors', 0.35658982396125793)]

In [2]:
from gensim.models import KeyedVectors
google_model = KeyedVectors.load_word2vec_format('/home/brewed/Desktop/GoogleNews-vectors-negative300.bin.gz', binary=True,
limit=1000000)

In [38]:
google_model.most_similar(["bible"])

[('Bible', 0.736778199672699),
 ('bibles', 0.6052598357200623),
 ('Holy_Bible', 0.5989601612091064),
 ('scriptures', 0.574568510055542),
 ('scripture', 0.5697901844978333),
 ('New_Testament', 0.5638793110847473),
 ('Scripture', 0.5502957701683044),
 ('Scriptures', 0.5411645770072937),
 ('NRSV', 0.5341106057167053),
 ('Leviticus_##:##-##', 0.5247005224227905)]

In [37]:
google_model.most_similar(["book"])

[('tome', 0.7485830783843994),
 ('books', 0.7379177808761597),
 ('memoir', 0.7302926778793335),
 ('paperback_edition', 0.6868364214897156),
 ('autobiography', 0.6741527318954468),
 ('memoirs', 0.6505153179168701),
 ('Book', 0.6479282975196838),
 ('paperback', 0.6471226811408997),
 ('novels', 0.6341459155082703),
 ('hardback', 0.6283079981803894)]

In [36]:
google_model.most_similar(["bank"])

[('banks', 0.7440759539604187),
 ('banking', 0.690161406993866),
 ('Bank', 0.6698698401451111),
 ('lender', 0.6342284679412842),
 ('banker', 0.6092953085899353),
 ('depositors', 0.6031531691551208),
 ('mortgage_lender', 0.5797975659370422),
 ('depositor', 0.5716427564620972),
 ('BofA', 0.5714625120162964),
 ('Citibank', 0.5589520335197449)]

In [13]:
google_model.most_similar(["cwater"])

[('coochie', 0.6906902194023132),
 ('vajayjay', 0.6699857711791992),
 ('p_*_ssy', 0.6656262278556824),
 ('tushy', 0.6618760824203491),
 ('titties', 0.654757022857666),
 ('d_**_k', 0.6432459950447083),
 ('Ewwwww', 0.6393193602561951),
 ('dangly_bits', 0.6388564109802246),
 ('pubes', 0.6357187628746033),
 ('Urgh', 0.6354457139968872)]

In [74]:
v = google_model["girls"] - google_model["queen"] + google_model["kings"]
google_model.most_similar(v)

[('boys', 0.6931698322296143),
 ('girls', 0.6385126709938049),
 ('kings', 0.4957888424396515),
 ('men', 0.48680540919303894),
 ('teenagers', 0.4788475036621094),
 ('schoolboys', 0.45804113149642944),
 ('pee_wees', 0.44774994254112244),
 ('Mitey_Mite', 0.44012460112571716),
 ('kids', 0.4373849332332611),
 ('youngsters', 0.43566834926605225)]

In [72]:
v = google_model["good"] - google_model["taller"] + google_model["tall"]
google_model.most_similar(v)

[('good', 0.6434131860733032),
 ('great', 0.49164238572120667),
 ('bad', 0.4760521948337555),
 ('terrific', 0.46986129879951477),
 ('wonderful', 0.4452008605003357),
 ('nice', 0.4425136148929596),
 ('fantastic', 0.43418607115745544),
 ('decent', 0.4307934641838074),
 ('excellent', 0.41867733001708984),
 ('terrible', 0.4151829481124878)]

In [70]:
v = google_model["france"] - google_model["paris"] + google_model["london"]
google_model.most_similar(v)

[('london', 0.754153847694397),
 ('france', 0.7366582751274109),
 ('england', 0.600825309753418),
 ('europe', 0.5708170533180237),
 ('birmingham', 0.5392330884933472),
 ('european', 0.5275605916976929),
 ('newcastle', 0.5263600945472717),
 ('barcelona', 0.5107599496841431),
 ('africa', 0.510517418384552),
 ('spain', 0.5082812905311584)]

In [4]:
import numpy as np

voc = gutenberg_w2v.wv.index_to_key
# get vector size
dim = gutenberg_w2v.vector_size


# Convert to numpy 2d array (n_vocab x vector_size)
def to_embeddings_Matrix(model):  
    embedding_matrix = np.zeros((len(voc), model.vector_size))
    word2idx = {}
    for i in range(len(voc)):
        embedding_matrix[i] = model.wv[model.wv.index_to_key[i]] 
    return embedding_matrix


embeddings=to_embeddings_Matrix(gutenberg_w2v)


(41465, 100)


In [15]:
# Put it in data later
import csv
with open('../data/embeddings.tsv', 'w', newline='') as f_output:
    tsv_output = csv.writer(f_output, delimiter='\t')
    for embedding in embeddings:
        tsv_output.writerow(embedding)
    
with open('../data/metadata.tsv', 'w', newline='') as f_output:
    tsv_output = csv.writer(f_output)
    for voc_rows in voc:
        tsv_output.writerow([voc_rows])
  

In [104]:


import glob
import os
import re

import numpy as np
import sklearn
from sklearn.model_selection import train_test_split

SCRIPT_DIRECTORY = os.path.realpath(os.getcwd())

data_dir = os.path.join(SCRIPT_DIRECTORY, "../data/aclImdb")
train_dir = os.path.join(data_dir, "train")
test_dir = os.path.join(data_dir, "test")
pos_train_dir = os.path.join(train_dir, "pos")
neg_train_dir = os.path.join(train_dir, "neg")
pos_test_dir = os.path.join(test_dir, "pos")
neg_test_dir = os.path.join(test_dir, "neg")

# For memory limitations. These parameters fit in 8GB of RAM.
# If you have 16G of RAM you can experiment with the full dataset / W2V
MAX_NUM_SAMPLES = 5000
# Load first 1M word embeddings. This works because GoogleNews are roughly
# sorted from most frequent to least frequent.
# It may yield much worse results for other embeddings corpora
NUM_W2V_TO_LOAD = 1000000


SEED = 42

# Fix numpy random seed for reproducibility
np.random.seed(SEED)


def strip_punctuation(s):
    return re.sub(r"[^a-zA-Z\s]", " ", s)


def preprocess(s):
    return re.sub("\s+", " ", strip_punctuation(s).lower())


def tokenize(s):
    return s.split(" ")


def preproc_tok(s):
    return tokenize(preprocess(s))


def token_proc(t_corpus):
    data=[]
    for i,ind in enumerate(t_corpus):
        proc_t_corpus=preproc_tok(train_corpus[i])
        data.append(proc_t_corpus)
    
    return data


def read_samples(folder, preprocess=lambda x: x):
    samples = glob.iglob(os.path.join(folder, "*.txt"))
    data = []

    for i, sample in enumerate(samples):
        if MAX_NUM_SAMPLES > 0 and i == MAX_NUM_SAMPLES:
            break
        with open(sample, "r") as fd:
            x = [preprocess(l) for l in fd][0]
            data.append(x)

    return data


def create_corpus(pos, neg):
    corpus = np.array(pos + neg)
    y = np.array([1 for _ in pos] + [0 for _ in neg])
    indices = np.arange(y.shape[0])
    np.random.shuffle(indices)

    return list(corpus[indices]), list(y[indices])


def extract_nbow(model,train_data,test_data):
    """Extract neural bag of words representations"""
    X_train = np.zeros((np.size(train_data), 100))
    for row, rev in enumerate(train_data):
        words_included = 0

        rev_toks = preproc_tok(rev)
    
        for tok in rev_toks:
            if tok in model.wv:
                X_train[row] += model.wv[tok]
                words_included += 1
            
        # Get the mean value
        X_train[row] = X_train[row]/words_included




    X_test = np.zeros((np.size(test_data), 100)) 
    for row, rev in enumerate(test_data):
        words_included = 0
        # Tokenize current review
        rev_toks = preproc_tok(rev)
        for tok in rev_toks:
            # For each token check if it has a w2v representation
            # and if yes add it.
            if tok in model.wv:
                X_test[row] += model.wv[tok]
                words_included += 1
        # Get the mean value
        X_test[row] = X_test[row]/words_included

    return X_train,X_test

    raise NotImplementedError("Implement nbow extractor")


def train_sentiment_analysis(train_corpus, train_labels):
    """Train a sentiment analysis classifier using NBOW + Logistic regression"""
    raise NotImplementedError("Implement sentiment analysis training")


def evaluate_sentiment_analysis(classifier, test_corpus, test_labels):
    """Evaluate classifier in the test corpus and report accuracy"""
    raise NotImplementedError("Implement sentiment analysis evaluation")


if __name__ == "__main__":
    # TODO: read Imdb corpus
    pos_train=read_samples(pos_train_dir)
    neg_train=read_samples(neg_train_dir)
    
    
    pos_test=read_samples(pos_test_dir)
    neg_test=read_samples(neg_test_dir)
    
    corpus,labels = create_corpus(pos_train,neg_train)
#     (
#             train_corpus,
#             test_corpus,
#             train_labels,
#             test_labels,
#     ) = sklearn.model_selection.train_test_split(corpus, labels)

        # TODO: train / evaluate and report accuracy


In [6]:
# def token_proc(t_corpus):
#     data=[]
#     for i,ind in enumerate(t_corpus):
#         proc_t_corpus=preproc_tok(train_corpus[i])
#         data.append(proc_t_corpus)
    
#     return data
        
        



In [162]:
# import pandas as pd


# df = pd.DataFrame (corpus)
# df.columns=['text']
# df.insert(1, "label", labels, True)
# print(df)

In [163]:


proc_tok_corpus=[]
proc_tok_rev=[]
for rev in corpus:
    proc_tok_rev=preproc_tok(rev)

    proc_tok_corpus.append(proc_tok_rev)


In [164]:
from gensim.models import Word2Vec
from gensim.models.callbacks import CallbackAny2Vec

train_w2v_model(proc_tok_corpus,
   "my_sentiment_w2v.model",
    5,
    100,
    1000,
    1,
)

INFO - 10:48:04: collecting all words and their counts
INFO - 10:48:04: PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
INFO - 10:48:05: collected 51037 word types from a corpus of 2430609 raw words and 10000 sentences
INFO - 10:48:05: Creating a fresh vocabulary
INFO - 10:48:05: Word2Vec lifecycle event {'msg': 'effective_min_count=1 retains 51037 unique words (100.00% of original 51037, drops 0)', 'datetime': '2023-04-06T10:48:05.614272', 'gensim': '4.3.1', 'python': '3.10.7 (main, Mar 10 2023, 10:47:39) [GCC 12.2.0]', 'platform': 'Linux-5.19.0-38-generic-x86_64-with-glibc2.36', 'event': 'prepare_vocab'}
INFO - 10:48:05: Word2Vec lifecycle event {'msg': 'effective_min_count=1 leaves 2430609 word corpus (100.00% of original 2430609, drops 0)', 'datetime': '2023-04-06T10:48:05.615184', 'gensim': '4.3.1', 'python': '3.10.7 (main, Mar 10 2023, 10:47:39) [GCC 12.2.0]', 'platform': 'Linux-5.19.0-38-generic-x86_64-with-glibc2.36', 'event': 'prepare_vocab'}
INFO - 10:48:05:

INFO - 10:48:28: EPOCH 8 - PROGRESS: at 72.40% examples, 1293648 words/s, in_qsize 13, out_qsize 2
INFO - 10:48:28: EPOCH 8: training on 2430609 raw words (1812400 effective words) took 1.4s, 1319112 effective words/s
INFO - 10:48:29: EPOCH 9 - PROGRESS: at 63.91% examples, 1134474 words/s, in_qsize 16, out_qsize 0
INFO - 10:48:30: EPOCH 9: training on 2430609 raw words (1813049 effective words) took 1.5s, 1213113 effective words/s
INFO - 10:48:31: EPOCH 10 - PROGRESS: at 73.63% examples, 1325112 words/s, in_qsize 16, out_qsize 1
INFO - 10:48:31: EPOCH 10: training on 2430609 raw words (1813195 effective words) took 1.4s, 1341593 effective words/s
INFO - 10:48:32: EPOCH 11 - PROGRESS: at 71.13% examples, 1281481 words/s, in_qsize 15, out_qsize 0
INFO - 10:48:33: EPOCH 11: training on 2430609 raw words (1813488 effective words) took 1.4s, 1291020 effective words/s
INFO - 10:48:34: EPOCH 12 - PROGRESS: at 67.04% examples, 1201289 words/s, in_qsize 14, out_qsize 1
INFO - 10:48:34: EPOCH 1

INFO - 10:49:22: EPOCH 45: training on 2430609 raw words (1813018 effective words) took 1.5s, 1240233 effective words/s
INFO - 10:49:23: EPOCH 46 - PROGRESS: at 64.40% examples, 1154625 words/s, in_qsize 14, out_qsize 1
INFO - 10:49:23: EPOCH 46: training on 2430609 raw words (1813393 effective words) took 1.5s, 1179880 effective words/s
INFO - 10:49:24: EPOCH 47 - PROGRESS: at 64.78% examples, 1157011 words/s, in_qsize 15, out_qsize 1
INFO - 10:49:25: EPOCH 47: training on 2430609 raw words (1813939 effective words) took 1.5s, 1186008 effective words/s
INFO - 10:49:26: EPOCH 48 - PROGRESS: at 62.68% examples, 1125048 words/s, in_qsize 15, out_qsize 0
INFO - 10:49:26: EPOCH 48: training on 2430609 raw words (1813695 effective words) took 1.5s, 1191013 effective words/s
INFO - 10:49:27: EPOCH 49 - PROGRESS: at 64.78% examples, 1156833 words/s, in_qsize 16, out_qsize 2
INFO - 10:49:28: EPOCH 49: training on 2430609 raw words (1813056 effective words) took 1.6s, 1136887 effective words/s


INFO - 10:50:23: EPOCH 82 - PROGRESS: at 71.12% examples, 1268678 words/s, in_qsize 14, out_qsize 1
INFO - 10:50:24: EPOCH 82: training on 2430609 raw words (1813474 effective words) took 1.4s, 1302207 effective words/s
INFO - 10:50:25: EPOCH 83 - PROGRESS: at 69.95% examples, 1242999 words/s, in_qsize 13, out_qsize 2
INFO - 10:50:25: EPOCH 83: training on 2430609 raw words (1812955 effective words) took 1.4s, 1266797 effective words/s
INFO - 10:50:26: EPOCH 84 - PROGRESS: at 64.29% examples, 1143755 words/s, in_qsize 16, out_qsize 0
INFO - 10:50:27: EPOCH 84: training on 2430609 raw words (1812896 effective words) took 1.6s, 1163845 effective words/s
INFO - 10:50:28: EPOCH 85 - PROGRESS: at 63.13% examples, 1131418 words/s, in_qsize 16, out_qsize 3
INFO - 10:50:28: EPOCH 85: training on 2430609 raw words (1813248 effective words) took 1.6s, 1142528 effective words/s
INFO - 10:50:29: EPOCH 86 - PROGRESS: at 68.69% examples, 1226746 words/s, in_qsize 14, out_qsize 1
INFO - 10:50:30: EPO

INFO - 10:51:21: EPOCH 119: training on 2430609 raw words (1813664 effective words) took 1.6s, 1105863 effective words/s
INFO - 10:51:22: EPOCH 120 - PROGRESS: at 49.29% examples, 869024 words/s, in_qsize 14, out_qsize 1
INFO - 10:51:23: EPOCH 120: training on 2430609 raw words (1812746 effective words) took 2.0s, 906713 effective words/s
INFO - 10:51:24: EPOCH 121 - PROGRESS: at 57.24% examples, 1024132 words/s, in_qsize 14, out_qsize 1
INFO - 10:51:25: EPOCH 121: training on 2430609 raw words (1813251 effective words) took 1.6s, 1108950 effective words/s
INFO - 10:51:26: EPOCH 122 - PROGRESS: at 63.55% examples, 1141057 words/s, in_qsize 15, out_qsize 0
INFO - 10:51:27: EPOCH 122: training on 2430609 raw words (1813421 effective words) took 1.5s, 1176691 effective words/s
INFO - 10:51:28: EPOCH 123 - PROGRESS: at 64.40% examples, 1152949 words/s, in_qsize 14, out_qsize 1
INFO - 10:51:28: EPOCH 123: training on 2430609 raw words (1813132 effective words) took 1.5s, 1187768 effective w

INFO - 10:52:20: EPOCH 156: training on 2430609 raw words (1812806 effective words) took 1.9s, 947533 effective words/s
INFO - 10:52:21: EPOCH 157 - PROGRESS: at 50.93% examples, 907006 words/s, in_qsize 15, out_qsize 0
INFO - 10:52:22: EPOCH 157: training on 2430609 raw words (1812401 effective words) took 1.8s, 995105 effective words/s
INFO - 10:52:23: EPOCH 158 - PROGRESS: at 57.65% examples, 1038371 words/s, in_qsize 14, out_qsize 1
INFO - 10:52:24: EPOCH 158: training on 2430609 raw words (1813873 effective words) took 1.9s, 979144 effective words/s
INFO - 10:52:25: EPOCH 159 - PROGRESS: at 57.24% examples, 1021582 words/s, in_qsize 15, out_qsize 0
INFO - 10:52:25: EPOCH 159: training on 2430609 raw words (1813012 effective words) took 1.6s, 1107478 effective words/s
INFO - 10:52:26: EPOCH 160 - PROGRESS: at 64.64% examples, 1156494 words/s, in_qsize 13, out_qsize 2
INFO - 10:52:27: EPOCH 160: training on 2430609 raw words (1813862 effective words) took 1.5s, 1201144 effective wor

INFO - 10:53:17: EPOCH 193: training on 2430609 raw words (1812435 effective words) took 1.6s, 1128101 effective words/s
INFO - 10:53:18: EPOCH 194 - PROGRESS: at 57.65% examples, 1038155 words/s, in_qsize 14, out_qsize 1
INFO - 10:53:19: EPOCH 194: training on 2430609 raw words (1813283 effective words) took 1.6s, 1147574 effective words/s
INFO - 10:53:20: EPOCH 195 - PROGRESS: at 63.49% examples, 1133464 words/s, in_qsize 14, out_qsize 1
INFO - 10:53:20: EPOCH 195: training on 2430609 raw words (1813028 effective words) took 1.5s, 1201155 effective words/s
INFO - 10:53:21: EPOCH 196 - PROGRESS: at 71.54% examples, 1281485 words/s, in_qsize 14, out_qsize 1
INFO - 10:53:21: EPOCH 196: training on 2430609 raw words (1812502 effective words) took 1.4s, 1272606 effective words/s
INFO - 10:53:22: EPOCH 197 - PROGRESS: at 74.41% examples, 1330046 words/s, in_qsize 13, out_qsize 2
INFO - 10:53:23: EPOCH 197: training on 2430609 raw words (1812918 effective words) took 1.4s, 1320916 effective

INFO - 10:54:13: EPOCH 230 - PROGRESS: at 56.80% examples, 1024712 words/s, in_qsize 15, out_qsize 0
INFO - 10:54:14: EPOCH 230: training on 2430609 raw words (1813492 effective words) took 1.7s, 1083094 effective words/s
INFO - 10:54:15: EPOCH 231 - PROGRESS: at 65.98% examples, 1183722 words/s, in_qsize 16, out_qsize 0
INFO - 10:54:15: EPOCH 231: training on 2430609 raw words (1812881 effective words) took 1.5s, 1242432 effective words/s
INFO - 10:54:16: EPOCH 232 - PROGRESS: at 68.66% examples, 1232151 words/s, in_qsize 14, out_qsize 1
INFO - 10:54:17: EPOCH 232: training on 2430609 raw words (1812969 effective words) took 1.4s, 1272799 effective words/s
INFO - 10:54:18: EPOCH 233 - PROGRESS: at 61.52% examples, 1101120 words/s, in_qsize 14, out_qsize 1
INFO - 10:54:18: EPOCH 233: training on 2430609 raw words (1812084 effective words) took 1.5s, 1197320 effective words/s
INFO - 10:54:19: EPOCH 234 - PROGRESS: at 66.37% examples, 1182804 words/s, in_qsize 13, out_qsize 2
INFO - 10:5

INFO - 10:55:14: EPOCH 267 - PROGRESS: at 69.54% examples, 1243239 words/s, in_qsize 16, out_qsize 1
INFO - 10:55:14: EPOCH 267: training on 2430609 raw words (1813113 effective words) took 1.5s, 1240399 effective words/s
INFO - 10:55:15: EPOCH 268 - PROGRESS: at 72.41% examples, 1292214 words/s, in_qsize 15, out_qsize 0
INFO - 10:55:16: EPOCH 268: training on 2430609 raw words (1813590 effective words) took 1.4s, 1313253 effective words/s
INFO - 10:55:17: EPOCH 269 - PROGRESS: at 67.04% examples, 1178589 words/s, in_qsize 16, out_qsize 1
INFO - 10:55:17: EPOCH 269: training on 2430609 raw words (1813055 effective words) took 1.6s, 1149748 effective words/s
INFO - 10:55:18: EPOCH 270 - PROGRESS: at 54.49% examples, 968046 words/s, in_qsize 15, out_qsize 0
INFO - 10:55:19: EPOCH 270: training on 2430609 raw words (1813444 effective words) took 1.8s, 1001457 effective words/s
INFO - 10:55:20: EPOCH 271 - PROGRESS: at 65.19% examples, 1155377 words/s, in_qsize 16, out_qsize 1
INFO - 10:55

INFO - 10:56:15: EPOCH 304 - PROGRESS: at 50.45% examples, 903850 words/s, in_qsize 13, out_qsize 2
INFO - 10:56:15: EPOCH 304: training on 2430609 raw words (1813075 effective words) took 1.8s, 986631 effective words/s
INFO - 10:56:16: EPOCH 305 - PROGRESS: at 65.54% examples, 1152120 words/s, in_qsize 16, out_qsize 0
INFO - 10:56:17: EPOCH 305: training on 2430609 raw words (1813864 effective words) took 1.7s, 1089721 effective words/s
INFO - 10:56:18: EPOCH 306 - PROGRESS: at 54.09% examples, 958166 words/s, in_qsize 14, out_qsize 1
INFO - 10:56:19: EPOCH 306: training on 2430609 raw words (1813236 effective words) took 1.8s, 1014202 effective words/s
INFO - 10:56:20: EPOCH 307 - PROGRESS: at 60.33% examples, 1061101 words/s, in_qsize 13, out_qsize 2
INFO - 10:56:21: EPOCH 307: training on 2430609 raw words (1812934 effective words) took 1.8s, 1021095 effective words/s
INFO - 10:56:22: EPOCH 308 - PROGRESS: at 64.26% examples, 1151981 words/s, in_qsize 15, out_qsize 0
INFO - 10:56:2

INFO - 10:57:11: EPOCH 341 - PROGRESS: at 68.69% examples, 1222922 words/s, in_qsize 15, out_qsize 0
INFO - 10:57:12: EPOCH 341: training on 2430609 raw words (1812650 effective words) took 1.4s, 1255984 effective words/s
INFO - 10:57:13: EPOCH 342 - PROGRESS: at 71.54% examples, 1285480 words/s, in_qsize 13, out_qsize 2
INFO - 10:57:13: EPOCH 342: training on 2430609 raw words (1813808 effective words) took 1.4s, 1293863 effective words/s
INFO - 10:57:14: EPOCH 343 - PROGRESS: at 70.74% examples, 1267135 words/s, in_qsize 13, out_qsize 2
INFO - 10:57:14: EPOCH 343: training on 2430609 raw words (1812601 effective words) took 1.4s, 1275275 effective words/s
INFO - 10:57:15: EPOCH 344 - PROGRESS: at 67.07% examples, 1207075 words/s, in_qsize 15, out_qsize 0
INFO - 10:57:16: EPOCH 344: training on 2430609 raw words (1812938 effective words) took 1.4s, 1266872 effective words/s
INFO - 10:57:17: EPOCH 345 - PROGRESS: at 74.35% examples, 1336609 words/s, in_qsize 15, out_qsize 0
INFO - 10:5

INFO - 10:58:10: EPOCH 377: training on 2430609 raw words (1813101 effective words) took 1.6s, 1105100 effective words/s
INFO - 10:58:11: EPOCH 378 - PROGRESS: at 61.94% examples, 1101552 words/s, in_qsize 14, out_qsize 1
INFO - 10:58:12: EPOCH 378: training on 2430609 raw words (1813571 effective words) took 1.6s, 1132568 effective words/s
INFO - 10:58:13: EPOCH 379 - PROGRESS: at 55.73% examples, 1002203 words/s, in_qsize 13, out_qsize 2
INFO - 10:58:14: EPOCH 379: training on 2430609 raw words (1812885 effective words) took 1.9s, 942113 effective words/s
INFO - 10:58:15: EPOCH 380 - PROGRESS: at 56.43% examples, 1014136 words/s, in_qsize 15, out_qsize 0
INFO - 10:58:16: EPOCH 380: training on 2430609 raw words (1813917 effective words) took 1.8s, 1025325 effective words/s
INFO - 10:58:17: EPOCH 381 - PROGRESS: at 53.74% examples, 964011 words/s, in_qsize 15, out_qsize 1
INFO - 10:58:18: EPOCH 381: training on 2430609 raw words (1813170 effective words) took 1.8s, 1008636 effective w

INFO - 10:59:13: EPOCH 414: training on 2430609 raw words (1813503 effective words) took 1.7s, 1093735 effective words/s
INFO - 10:59:14: EPOCH 415 - PROGRESS: at 60.30% examples, 1070895 words/s, in_qsize 14, out_qsize 1
INFO - 10:59:14: EPOCH 415: training on 2430609 raw words (1813339 effective words) took 1.6s, 1111993 effective words/s
INFO - 10:59:15: EPOCH 416 - PROGRESS: at 62.68% examples, 1124034 words/s, in_qsize 16, out_qsize 0
INFO - 10:59:16: EPOCH 416: training on 2430609 raw words (1813559 effective words) took 1.6s, 1127815 effective words/s
INFO - 10:59:17: EPOCH 417 - PROGRESS: at 58.94% examples, 1052778 words/s, in_qsize 16, out_qsize 0
INFO - 10:59:18: EPOCH 417: training on 2430609 raw words (1813009 effective words) took 1.8s, 1032353 effective words/s
INFO - 10:59:19: EPOCH 418 - PROGRESS: at 42.29% examples, 740867 words/s, in_qsize 15, out_qsize 0
INFO - 10:59:20: EPOCH 418 - PROGRESS: at 81.59% examples, 728737 words/s, in_qsize 14, out_qsize 2
INFO - 10:59:

INFO - 11:00:12: EPOCH 450 - PROGRESS: at 61.94% examples, 1096111 words/s, in_qsize 14, out_qsize 1
INFO - 11:00:12: EPOCH 450: training on 2430609 raw words (1812911 effective words) took 1.7s, 1084726 effective words/s
INFO - 11:00:13: EPOCH 451 - PROGRESS: at 53.74% examples, 956746 words/s, in_qsize 15, out_qsize 0
INFO - 11:00:14: EPOCH 451: training on 2430609 raw words (1812513 effective words) took 1.7s, 1058011 effective words/s
INFO - 11:00:15: EPOCH 452 - PROGRESS: at 53.31% examples, 943337 words/s, in_qsize 14, out_qsize 1
INFO - 11:00:16: EPOCH 452: training on 2430609 raw words (1812766 effective words) took 1.7s, 1036714 effective words/s
INFO - 11:00:17: EPOCH 453 - PROGRESS: at 49.25% examples, 867501 words/s, in_qsize 15, out_qsize 0
INFO - 11:00:18: EPOCH 453 - PROGRESS: at 98.09% examples, 878213 words/s, in_qsize 5, out_qsize 1
INFO - 11:00:18: EPOCH 453: training on 2430609 raw words (1812749 effective words) took 2.0s, 885753 effective words/s
INFO - 11:00:19: 

INFO - 11:01:14: EPOCH 482 - PROGRESS: at 69.47% examples, 1234485 words/s, in_qsize 14, out_qsize 1
INFO - 11:01:14: EPOCH 482: training on 2430609 raw words (1812277 effective words) took 1.5s, 1238505 effective words/s
INFO - 11:01:15: EPOCH 483 - PROGRESS: at 67.87% examples, 1221505 words/s, in_qsize 14, out_qsize 1
INFO - 11:01:16: EPOCH 483: training on 2430609 raw words (1812441 effective words) took 1.4s, 1258486 effective words/s
INFO - 11:01:17: EPOCH 484 - PROGRESS: at 72.82% examples, 1304473 words/s, in_qsize 15, out_qsize 0
INFO - 11:01:17: EPOCH 484: training on 2430609 raw words (1813226 effective words) took 1.4s, 1303848 effective words/s
INFO - 11:01:18: EPOCH 485 - PROGRESS: at 71.54% examples, 1287457 words/s, in_qsize 14, out_qsize 1
INFO - 11:01:18: EPOCH 485: training on 2430609 raw words (1812489 effective words) took 1.4s, 1316353 effective words/s
INFO - 11:01:19: EPOCH 486 - PROGRESS: at 71.56% examples, 1288343 words/s, in_qsize 14, out_qsize 1
INFO - 11:0

INFO - 11:02:06: EPOCH 519 - PROGRESS: at 72.39% examples, 1285136 words/s, in_qsize 14, out_qsize 1
INFO - 11:02:06: EPOCH 519: training on 2430609 raw words (1812997 effective words) took 1.4s, 1313103 effective words/s
INFO - 11:02:07: EPOCH 520 - PROGRESS: at 72.82% examples, 1297233 words/s, in_qsize 15, out_qsize 0
INFO - 11:02:08: EPOCH 520: training on 2430609 raw words (1812907 effective words) took 1.4s, 1321910 effective words/s
INFO - 11:02:09: EPOCH 521 - PROGRESS: at 71.56% examples, 1287749 words/s, in_qsize 14, out_qsize 1
INFO - 11:02:09: EPOCH 521: training on 2430609 raw words (1813129 effective words) took 1.4s, 1311744 effective words/s
INFO - 11:02:10: EPOCH 522 - PROGRESS: at 70.22% examples, 1252377 words/s, in_qsize 13, out_qsize 2
INFO - 11:02:10: EPOCH 522: training on 2430609 raw words (1813097 effective words) took 1.4s, 1269318 effective words/s
INFO - 11:02:11: EPOCH 523 - PROGRESS: at 70.22% examples, 1266591 words/s, in_qsize 14, out_qsize 1
INFO - 11:0

INFO - 11:03:00: EPOCH 556 - PROGRESS: at 46.20% examples, 814691 words/s, in_qsize 14, out_qsize 1
INFO - 11:03:01: EPOCH 556 - PROGRESS: at 93.34% examples, 838383 words/s, in_qsize 15, out_qsize 0
INFO - 11:03:02: EPOCH 556: training on 2430609 raw words (1813754 effective words) took 2.1s, 850032 effective words/s
INFO - 11:03:03: EPOCH 557 - PROGRESS: at 56.87% examples, 1023694 words/s, in_qsize 14, out_qsize 1
INFO - 11:03:03: EPOCH 557: training on 2430609 raw words (1812946 effective words) took 1.6s, 1109181 effective words/s
INFO - 11:03:04: EPOCH 558 - PROGRESS: at 68.70% examples, 1230733 words/s, in_qsize 15, out_qsize 1
INFO - 11:03:05: EPOCH 558: training on 2430609 raw words (1813217 effective words) took 1.5s, 1194221 effective words/s
INFO - 11:03:06: EPOCH 559 - PROGRESS: at 61.55% examples, 1096899 words/s, in_qsize 13, out_qsize 2
INFO - 11:03:06: EPOCH 559: training on 2430609 raw words (1813303 effective words) took 1.7s, 1087533 effective words/s
INFO - 11:03:0

INFO - 11:04:01: EPOCH 591 - PROGRESS: at 82.71% examples, 738435 words/s, in_qsize 16, out_qsize 0
INFO - 11:04:01: EPOCH 591: training on 2430609 raw words (1812967 effective words) took 2.3s, 791368 effective words/s
INFO - 11:04:02: EPOCH 592 - PROGRESS: at 39.84% examples, 703361 words/s, in_qsize 15, out_qsize 0
INFO - 11:04:03: EPOCH 592 - PROGRESS: at 94.86% examples, 855757 words/s, in_qsize 12, out_qsize 1
INFO - 11:04:03: EPOCH 592: training on 2430609 raw words (1813431 effective words) took 2.1s, 870840 effective words/s
INFO - 11:04:04: EPOCH 593 - PROGRESS: at 55.25% examples, 957216 words/s, in_qsize 15, out_qsize 0
INFO - 11:04:05: EPOCH 593 - PROGRESS: at 99.15% examples, 880919 words/s, in_qsize 2, out_qsize 1
INFO - 11:04:05: EPOCH 593: training on 2430609 raw words (1813671 effective words) took 2.0s, 886573 effective words/s
INFO - 11:04:06: EPOCH 594 - PROGRESS: at 52.59% examples, 937384 words/s, in_qsize 15, out_qsize 0
INFO - 11:04:07: EPOCH 594: training on 2

INFO - 11:04:56: EPOCH 627 - PROGRESS: at 73.63% examples, 1314074 words/s, in_qsize 13, out_qsize 2
INFO - 11:04:56: EPOCH 627: training on 2430609 raw words (1813179 effective words) took 1.4s, 1306984 effective words/s
INFO - 11:04:57: EPOCH 628 - PROGRESS: at 68.69% examples, 1220144 words/s, in_qsize 14, out_qsize 1
INFO - 11:04:58: EPOCH 628: training on 2430609 raw words (1812977 effective words) took 1.5s, 1240633 effective words/s
INFO - 11:04:59: EPOCH 629 - PROGRESS: at 70.31% examples, 1259635 words/s, in_qsize 15, out_qsize 0
INFO - 11:04:59: EPOCH 629: training on 2430609 raw words (1813254 effective words) took 1.4s, 1293640 effective words/s
INFO - 11:05:00: EPOCH 630 - PROGRESS: at 66.37% examples, 1186253 words/s, in_qsize 15, out_qsize 0
INFO - 11:05:01: EPOCH 630: training on 2430609 raw words (1812773 effective words) took 1.7s, 1089876 effective words/s
INFO - 11:05:02: EPOCH 631 - PROGRESS: at 61.88% examples, 1112132 words/s, in_qsize 15, out_qsize 0
INFO - 11:0

INFO - 11:05:56: EPOCH 664 - PROGRESS: at 60.66% examples, 1083453 words/s, in_qsize 15, out_qsize 0
INFO - 11:05:57: EPOCH 664: training on 2430609 raw words (1813289 effective words) took 1.7s, 1093706 effective words/s
INFO - 11:05:58: EPOCH 665 - PROGRESS: at 48.18% examples, 848786 words/s, in_qsize 15, out_qsize 0
INFO - 11:05:59: EPOCH 665 - PROGRESS: at 92.21% examples, 821647 words/s, in_qsize 16, out_qsize 0
INFO - 11:05:59: EPOCH 665: training on 2430609 raw words (1813450 effective words) took 2.1s, 843473 effective words/s
INFO - 11:06:00: EPOCH 666 - PROGRESS: at 63.46% examples, 1138531 words/s, in_qsize 13, out_qsize 2
INFO - 11:06:01: EPOCH 666: training on 2430609 raw words (1813140 effective words) took 1.5s, 1188324 effective words/s
INFO - 11:06:02: EPOCH 667 - PROGRESS: at 65.50% examples, 1172640 words/s, in_qsize 13, out_qsize 2
INFO - 11:06:02: EPOCH 667: training on 2430609 raw words (1812681 effective words) took 1.5s, 1174730 effective words/s
INFO - 11:06:0

INFO - 11:06:56: EPOCH 699: training on 2430609 raw words (1812797 effective words) took 1.6s, 1136258 effective words/s
INFO - 11:06:57: EPOCH 700 - PROGRESS: at 59.80% examples, 1069717 words/s, in_qsize 15, out_qsize 0
INFO - 11:06:57: EPOCH 700: training on 2430609 raw words (1812658 effective words) took 1.7s, 1040219 effective words/s
INFO - 11:06:58: EPOCH 701 - PROGRESS: at 50.91% examples, 901232 words/s, in_qsize 14, out_qsize 1
INFO - 11:06:59: EPOCH 701: training on 2430609 raw words (1812849 effective words) took 1.9s, 952293 effective words/s
INFO - 11:07:00: EPOCH 702 - PROGRESS: at 56.43% examples, 1005995 words/s, in_qsize 16, out_qsize 1
INFO - 11:07:01: EPOCH 702: training on 2430609 raw words (1813764 effective words) took 1.8s, 1022511 effective words/s
INFO - 11:07:02: EPOCH 703 - PROGRESS: at 47.77% examples, 850239 words/s, in_qsize 14, out_qsize 1
INFO - 11:07:03: EPOCH 703 - PROGRESS: at 94.88% examples, 853843 words/s, in_qsize 11, out_qsize 2
INFO - 11:07:03

INFO - 11:07:59: EPOCH 735: training on 2430609 raw words (1813669 effective words) took 1.6s, 1149230 effective words/s
INFO - 11:08:00: EPOCH 736 - PROGRESS: at 55.26% examples, 983577 words/s, in_qsize 15, out_qsize 4
INFO - 11:08:01: EPOCH 736: training on 2430609 raw words (1813059 effective words) took 1.9s, 959349 effective words/s
INFO - 11:08:02: EPOCH 737 - PROGRESS: at 47.04% examples, 833252 words/s, in_qsize 15, out_qsize 0
INFO - 11:08:03: EPOCH 737 - PROGRESS: at 94.50% examples, 845415 words/s, in_qsize 14, out_qsize 0
INFO - 11:08:03: EPOCH 737: training on 2430609 raw words (1813647 effective words) took 2.1s, 861797 effective words/s
INFO - 11:08:04: EPOCH 738 - PROGRESS: at 52.22% examples, 934742 words/s, in_qsize 14, out_qsize 1
INFO - 11:08:05: EPOCH 738: training on 2430609 raw words (1813247 effective words) took 1.9s, 951064 effective words/s
INFO - 11:08:06: EPOCH 739 - PROGRESS: at 47.77% examples, 844156 words/s, in_qsize 13, out_qsize 2
INFO - 11:08:07: EP

INFO - 11:08:58: EPOCH 768 - PROGRESS: at 67.07% examples, 1201459 words/s, in_qsize 15, out_qsize 0
INFO - 11:08:59: EPOCH 768: training on 2430609 raw words (1813447 effective words) took 1.5s, 1224666 effective words/s
INFO - 11:09:00: EPOCH 769 - PROGRESS: at 68.29% examples, 1227536 words/s, in_qsize 15, out_qsize 0
INFO - 11:09:00: EPOCH 769: training on 2430609 raw words (1812148 effective words) took 1.5s, 1232242 effective words/s
INFO - 11:09:01: EPOCH 770 - PROGRESS: at 67.07% examples, 1206524 words/s, in_qsize 15, out_qsize 0
INFO - 11:09:02: EPOCH 770: training on 2430609 raw words (1813324 effective words) took 1.5s, 1232861 effective words/s
INFO - 11:09:03: EPOCH 771 - PROGRESS: at 66.68% examples, 1192073 words/s, in_qsize 14, out_qsize 1
INFO - 11:09:03: EPOCH 771: training on 2430609 raw words (1813748 effective words) took 1.6s, 1155690 effective words/s
INFO - 11:09:04: EPOCH 772 - PROGRESS: at 62.75% examples, 1120325 words/s, in_qsize 14, out_qsize 1
INFO - 11:0

INFO - 11:09:57: EPOCH 804: training on 2430609 raw words (1813401 effective words) took 1.4s, 1317187 effective words/s
INFO - 11:09:58: EPOCH 805 - PROGRESS: at 72.39% examples, 1295745 words/s, in_qsize 15, out_qsize 0
INFO - 11:09:58: EPOCH 805: training on 2430609 raw words (1813213 effective words) took 1.4s, 1311436 effective words/s
INFO - 11:09:59: EPOCH 806 - PROGRESS: at 73.55% examples, 1324874 words/s, in_qsize 14, out_qsize 1
INFO - 11:10:00: EPOCH 806: training on 2430609 raw words (1813931 effective words) took 1.4s, 1330676 effective words/s
INFO - 11:10:01: EPOCH 807 - PROGRESS: at 71.55% examples, 1270400 words/s, in_qsize 14, out_qsize 1
INFO - 11:10:01: EPOCH 807: training on 2430609 raw words (1813051 effective words) took 1.4s, 1296953 effective words/s
INFO - 11:10:02: EPOCH 808 - PROGRESS: at 67.87% examples, 1214205 words/s, in_qsize 15, out_qsize 0
INFO - 11:10:03: EPOCH 808: training on 2430609 raw words (1812315 effective words) took 1.5s, 1245364 effective

INFO - 11:10:55: EPOCH 841: training on 2430609 raw words (1812916 effective words) took 1.8s, 1021999 effective words/s
INFO - 11:10:56: EPOCH 842 - PROGRESS: at 62.33% examples, 1096104 words/s, in_qsize 13, out_qsize 2
INFO - 11:10:56: EPOCH 842: training on 2430609 raw words (1812766 effective words) took 1.6s, 1139318 effective words/s
INFO - 11:10:57: EPOCH 843 - PROGRESS: at 63.55% examples, 1130273 words/s, in_qsize 13, out_qsize 2
INFO - 11:10:58: EPOCH 843: training on 2430609 raw words (1813559 effective words) took 1.6s, 1155167 effective words/s
INFO - 11:10:59: EPOCH 844 - PROGRESS: at 56.48% examples, 1017692 words/s, in_qsize 15, out_qsize 0
INFO - 11:11:00: EPOCH 844: training on 2430609 raw words (1812745 effective words) took 1.9s, 961455 effective words/s
INFO - 11:11:01: EPOCH 845 - PROGRESS: at 42.31% examples, 736970 words/s, in_qsize 15, out_qsize 0
INFO - 11:11:02: EPOCH 845 - PROGRESS: at 96.88% examples, 864462 words/s, in_qsize 7, out_qsize 2
INFO - 11:11:02

INFO - 11:11:54: EPOCH 878 - PROGRESS: at 60.30% examples, 1078912 words/s, in_qsize 16, out_qsize 2
INFO - 11:11:55: EPOCH 878: training on 2430609 raw words (1813055 effective words) took 1.7s, 1067424 effective words/s
INFO - 11:11:56: EPOCH 879 - PROGRESS: at 70.74% examples, 1256908 words/s, in_qsize 14, out_qsize 1
INFO - 11:11:56: EPOCH 879: training on 2430609 raw words (1812841 effective words) took 1.4s, 1287426 effective words/s
INFO - 11:11:57: EPOCH 880 - PROGRESS: at 71.98% examples, 1288432 words/s, in_qsize 15, out_qsize 0
INFO - 11:11:58: EPOCH 880: training on 2430609 raw words (1812904 effective words) took 1.4s, 1301474 effective words/s
INFO - 11:11:59: EPOCH 881 - PROGRESS: at 71.96% examples, 1282709 words/s, in_qsize 15, out_qsize 0
INFO - 11:11:59: EPOCH 881: training on 2430609 raw words (1812502 effective words) took 1.4s, 1294371 effective words/s
INFO - 11:12:00: EPOCH 882 - PROGRESS: at 69.95% examples, 1258906 words/s, in_qsize 14, out_qsize 1
INFO - 11:1

INFO - 11:12:46: EPOCH 915 - PROGRESS: at 73.55% examples, 1306387 words/s, in_qsize 14, out_qsize 1
INFO - 11:12:47: EPOCH 915: training on 2430609 raw words (1812641 effective words) took 1.4s, 1325976 effective words/s
INFO - 11:12:48: EPOCH 916 - PROGRESS: at 73.24% examples, 1306188 words/s, in_qsize 16, out_qsize 1
INFO - 11:12:48: EPOCH 916: training on 2430609 raw words (1813745 effective words) took 1.4s, 1319637 effective words/s
INFO - 11:12:49: EPOCH 917 - PROGRESS: at 71.55% examples, 1283793 words/s, in_qsize 15, out_qsize 0
INFO - 11:12:49: EPOCH 917: training on 2430609 raw words (1812735 effective words) took 1.4s, 1296004 effective words/s
INFO - 11:12:50: EPOCH 918 - PROGRESS: at 69.42% examples, 1239357 words/s, in_qsize 16, out_qsize 2
INFO - 11:12:51: EPOCH 918: training on 2430609 raw words (1813187 effective words) took 1.4s, 1275908 effective words/s
INFO - 11:12:52: EPOCH 919 - PROGRESS: at 73.96% examples, 1315404 words/s, in_qsize 15, out_qsize 0
INFO - 11:1

INFO - 11:13:37: EPOCH 952 - PROGRESS: at 73.63% examples, 1317997 words/s, in_qsize 13, out_qsize 2
INFO - 11:13:38: EPOCH 952: training on 2430609 raw words (1813842 effective words) took 1.4s, 1335831 effective words/s
INFO - 11:13:39: EPOCH 953 - PROGRESS: at 73.24% examples, 1299935 words/s, in_qsize 13, out_qsize 2
INFO - 11:13:39: EPOCH 953: training on 2430609 raw words (1812995 effective words) took 1.4s, 1305989 effective words/s
INFO - 11:13:40: EPOCH 954 - PROGRESS: at 73.13% examples, 1317186 words/s, in_qsize 15, out_qsize 0
INFO - 11:13:41: EPOCH 954: training on 2430609 raw words (1813219 effective words) took 1.4s, 1318605 effective words/s
INFO - 11:13:42: EPOCH 955 - PROGRESS: at 73.13% examples, 1316697 words/s, in_qsize 14, out_qsize 1
INFO - 11:13:42: EPOCH 955: training on 2430609 raw words (1813272 effective words) took 1.4s, 1322136 effective words/s
INFO - 11:13:43: EPOCH 956 - PROGRESS: at 71.55% examples, 1287726 words/s, in_qsize 15, out_qsize 0
INFO - 11:1

INFO - 11:14:29: EPOCH 989 - PROGRESS: at 73.96% examples, 1329767 words/s, in_qsize 15, out_qsize 0
INFO - 11:14:29: EPOCH 989: training on 2430609 raw words (1812574 effective words) took 1.4s, 1329911 effective words/s
INFO - 11:14:30: EPOCH 990 - PROGRESS: at 72.82% examples, 1301978 words/s, in_qsize 15, out_qsize 0
INFO - 11:14:31: EPOCH 990: training on 2430609 raw words (1812010 effective words) took 1.4s, 1324574 effective words/s
INFO - 11:14:32: EPOCH 991 - PROGRESS: at 73.23% examples, 1300478 words/s, in_qsize 13, out_qsize 2
INFO - 11:14:32: EPOCH 991: training on 2430609 raw words (1813492 effective words) took 1.4s, 1297887 effective words/s
INFO - 11:14:33: EPOCH 992 - PROGRESS: at 69.10% examples, 1237009 words/s, in_qsize 14, out_qsize 1
INFO - 11:14:34: EPOCH 992: training on 2430609 raw words (1813924 effective words) took 1.4s, 1269844 effective words/s
INFO - 11:14:35: EPOCH 993 - PROGRESS: at 73.96% examples, 1323158 words/s, in_qsize 15, out_qsize 0
INFO - 11:1

<gensim.models.word2vec.Word2Vec at 0x7f713eb3b280>

In [169]:
import numpy as np


my_sentiment_w2v = Word2Vec.load("my_sentiment_w2v.model")



voc = my_sentiment_w2v.wv.index_to_key
# get vector size
dim = my_sentiment_w2v.vector_size

# Convert to numpy 2d array (n_vocab x vector_size)
def to_embeddings_Matrix(model):  
    embedding_matrix = np.zeros((len(voc), model.vector_size))
    word2idx = {}
    for i in range(len(voc)):
        embedding_matrix[i] = model.wv[model.wv.index_to_key[i]] 
    return embedding_matrix


embeddings_my_sentiment=to_embeddings_Matrix(my_sentiment_w2v)

print(np.shape(embeddings_my_sentiment))

INFO - 11:18:59: loading Word2Vec object from my_sentiment_w2v.model
INFO - 11:18:59: loading wv recursively from my_sentiment_w2v.model.wv.* with mmap=None
INFO - 11:18:59: setting ignored attribute cum_table to None
INFO - 11:19:00: Word2Vec lifecycle event {'fname': 'my_sentiment_w2v.model', 'datetime': '2023-04-06T11:19:00.052988', 'gensim': '4.3.1', 'python': '3.10.7 (main, Mar 10 2023, 10:47:39) [GCC 12.2.0]', 'platform': 'Linux-5.19.0-38-generic-x86_64-with-glibc2.36', 'event': 'loaded'}


(51037, 100)


In [188]:
train_data,train_labels=create_corpus(read_samples(pos_train_dir), read_samples(neg_train_dir))
test_data,test_labels= create_corpus(read_samples(pos_test_dir), read_samples(neg_test_dir))


X_train,X_test=extract_nbow(my_sentiment_w2v,train_data,test_data)

In [191]:
from sklearn.linear_model import LogisticRegression

clf=LogisticRegression().fit(X_train,train_labels)


In [192]:
from sklearn import metrics
y_pred = clf.predict(X_test)
# Model Accuracy, how often is the classifier correct?
print("Accuracy:",metrics.accuracy_score(y_pred,test_labels))



Accuracy: 0.5


In [177]:
from gensim.models import KeyedVectors
google_sentiment_w2v = KeyedVectors.load_word2vec_format('/home/brewed/Desktop/GoogleNews-vectors-negative300.bin.gz', binary=True,
limit=1000000)

embeddings_google_sentiment = np.zeros((len(voc), google_sentiment_w2v.vector_size))
word2idx = {}
for i in range(len(voc)):
    embeddings_google_sentiment[i] = google_sentiment_w2v[google_sentiment_w2v.index_to_key[i]] 

print(np.shape(embeddings_google_sentiment))

INFO - 11:20:34: loading projection weights from /home/brewed/Desktop/GoogleNews-vectors-negative300.bin.gz
INFO - 11:20:58: KeyedVectors lifecycle event {'msg': 'loaded (1000000, 300) matrix of type float32 from /home/brewed/Desktop/GoogleNews-vectors-negative300.bin.gz', 'binary': True, 'encoding': 'utf8', 'datetime': '2023-04-06T11:20:58.411489', 'gensim': '4.3.1', 'python': '3.10.7 (main, Mar 10 2023, 10:47:39) [GCC 12.2.0]', 'platform': 'Linux-5.19.0-38-generic-x86_64-with-glibc2.36', 'event': 'load_word2vec_format'}


(51037, 300)


In [178]:
train_data,train_labels=create_corpus(read_samples(pos_train_dir), read_samples(neg_train_dir))
test_data,test_labels= create_corpus(read_samples(pos_test_dir), read_samples(neg_test_dir))


X_google_train = np.zeros((np.size(train_data), 300))
for row, rev in enumerate(train_data):
    words_included = 0
    rev_toks = preproc_tok(rev)
    
    for tok in rev_toks:
        if tok in google_sentiment_w2v:
            X_google_train[row] += google_sentiment_w2v[tok]
            words_included += 1

    # Get the mean value
    X_google_train[row] = X_google_train[row]/words_included




X_google_test = np.zeros((np.size(test_data), 300)) 
for row, rev in enumerate(test_data):
    words_included = 0
    # Tokenize current review
    rev_toks = preproc_tok(rev)
    for tok in rev_toks:
        # For each token check if it has a w2v representation
        # and if yes add it.
        if tok in google_sentiment_w2v:
            X_google_test[row] += google_sentiment_w2v[tok]
            words_included += 1
    # Get the mean value
    X_google_test[row] = X_google_test[row]/words_included


In [184]:
from sklearn.linear_model import LogisticRegression

clf=LogisticRegression().fit(X_google_train,train_labels)


In [187]:
from sklearn import metrics
y_pred = clf.predict(X_google_test)

# Model Accuracy, how often is the classifier correct?
print("Accuracy:",metrics.accuracy_score(y_pred,test_labels))


Accuracy: 0.8345
