## Import all the Libraries needed

In [1]:
import gensim
import smart_open
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from pprint import pprint
import multiprocessing
from gensim.models.callbacks import CallbackAny2Vec

import nltk
from nltk.corpus import stopwords
import numpy as np

In [2]:
# Width of each document vector: passed to Doc2Vec as vector_size, used to
# validate the rows loaded from the exported embeddings, and used as the
# input dimension of the neural network.
doc_embedding_size = 400

## Free memory using the garbage collector

In [3]:
# Run a full garbage-collection pass; the value returned by gc.collect() is
# displayed as the cell output.
import gc
gc.collect()

0

## Get the total number of cores and check that the fast version of Gensim is in use

In [4]:
# CPU count, later passed to Doc2Vec as the number of worker threads.
cores = multiprocessing.cpu_count()
# Stop right away when gensim reports FAST_VERSION == -1 (per the assertion
# message, training would be painfully slow in that case).
assert gensim.models.doc2vec.FAST_VERSION > -1, "This will be painfully slow otherwise"

## Read the whole dataset

In [5]:
# English stop-word list from NLTK; pre_process drops every token found in it.
# NOTE(review): presumably requires the NLTK "stopwords" corpus to be downloaded — confirm.
stop_words = nltk.corpus.stopwords.words('english')
def pre_process(line):
    """Turn a raw text line into a list of normalized tokens.

    The line is tokenized with ``nltk.tokenize.word_tokenize``; every token is
    stripped of surrounding whitespace and lower-cased, and tokens present in
    the module-level ``stop_words`` list are discarded.

    Args:
        line: The text to tokenize.

    Returns:
        list: The remaining normalized tokens, in their original order.
    """
    normalized = (token.strip().lower() for token in nltk.tokenize.word_tokenize(line))
    return [token for token in normalized if token not in stop_words]

def read_corpus(fname, pre_processed=True, limit=1000000):
    """Yield one ``TaggedDocument`` per line of ``fname``, tagged with its line index.

    Progress (the number of documents yielded so far) is printed every
    10,000 documents.

    Args:
        fname: Path of the corpus file; it is opened through ``smart_open``
            and decoded as ISO-8859-1.
        pre_processed: When true (the default) each line is stored as-is.
            When false the line is first tokenized with ``pre_process``.
        limit: Maximum number of documents to yield. Defaults to 1,000,000,
            the cap that used to be hard-coded. ``None`` reads the whole file
            and a value <= 0 yields nothing.

    Yields:
        gensim.models.doc2vec.TaggedDocument: ``words`` is the raw line or the
        token list, ``tags`` is ``[line_index]``.
    """
    if limit is not None and limit <= 0:
        return

    with smart_open.smart_open(fname, encoding="iso-8859-1") as f:
        for i, line in enumerate(f):
            # NOTE(review): with pre_processed=True the raw line (a string that
            # still carries its line terminator) is stored as `words`, whereas
            # Doc2Vec is normally fed a list of tokens. This is kept unchanged
            # because `get` calls `.rstrip()` on `words`; confirm whether
            # `line.split()` was intended.
            words = line if pre_processed else pre_process(line)
            yield gensim.models.doc2vec.TaggedDocument(words, [i])

            # `enumerate` already counts the lines, so no separate counter is needed.
            count = i + 1
            if count % 10000 == 0:
                print(count)

            if limit is not None and count >= limit:
                break

In [6]:
# Materialize the generator into a list: the corpus is iterated again for the
# vocabulary build and for training, and `get` indexes into it by position.
documents = list(read_corpus('./data/dataPost1.txt'))

10000
20000
30000
40000
50000
60000
70000
80000
90000
100000
110000
120000
130000
140000
150000
160000
170000
180000
190000
200000
210000
220000
230000
240000
250000
260000
270000
280000
290000
300000
310000
320000
330000
340000
350000
360000
370000
380000
390000
400000
410000
420000
430000
440000
450000
460000
470000
480000
490000
500000
510000
520000
530000
540000
550000
560000
570000
580000
590000
600000
610000
620000
630000
640000
650000
660000
670000
680000
690000
700000
710000
720000
730000
740000
750000
760000
770000
780000
790000
800000
810000
820000
830000
840000
850000
860000
870000
880000
890000
900000
910000
920000
930000
940000
950000
960000
970000
980000
990000
1000000


In [7]:
# Number of documents actually loaded.
len(documents)

1000000

## Epoch Logger for verbose print

In [8]:
class EpochLogger(CallbackAny2Vec):
    '''Training callback: prints progress messages and saves the model after each epoch.'''

    def __init__(self):
        # Number of the epoch being reported; incremented at the end of each epoch.
        self.epoch = 0
        # Set to 1 here and again in on_train_begin; not read anywhere in this class.
        self.batch = 1

    def on_train_begin(self, model):
        '''Announce the start of training and reset the batch counter.'''
        print("train started")
        self.batch = 1

    def on_epoch_begin(self, model):
        '''Announce the start of the current epoch.'''
        print("Epoch #{} start".format(self.epoch))

    def on_epoch_end(self, model):
        '''Save a per-epoch snapshot of ``model``, report it, then advance the epoch number.'''
        snapshot_path = "./data/model-epoch{}.model".format(self.epoch)
        model.save(snapshot_path)
        print("Epoch #{} end and saved".format(self.epoch))
        self.epoch += 1

    def on_train_end(self, model):
        '''Announce the end of training.'''
        print("train ended")

## Doc2Vec model (PV-DBOW method)

In [9]:
# Doc2Vec configurations to train; only the PV-DBOW variant is enabled.
models = [
    # PV-DBOW
    Doc2Vec(
        dm=0,
        dbow_words=1,
        vector_size=doc_embedding_size,
        window=8,
        min_count=2,
        epochs=10,
        workers=cores,
    ),
    # PV-DM w/average (disabled)
#     Doc2Vec(dm=1, dm_mean=1, vector_size=200, window=8, min_count=2, epochs =10, workers=cores),
]

### Build vocabulary for model

In [10]:
# Build the vocabulary of the first (PV-DBOW) model from the loaded corpus,
# then print the model's summary string.
models[0].build_vocab(documents)
print(models[0])
# Disabled: share the vocabulary with a second model.
# models[1].reset_from(models[0])
# print(str(models[1]))

Doc2Vec(dbow+w,d400,n5,w8,mc2,s0.001,t16)


### Train model with the data set

In [None]:
for index, model in enumerate(models):
    %time model.train(documents, total_examples=model.corpus_count, epochs=10, callbacks=[EpochLogger()])
    model.save('./data/DbowModel' + str(index) + ".model")

### Load the pre-trained model

In [11]:
# Replace every entry of `models` with the model previously saved for that index.
for index in range(len(models)):
    models[index] = Doc2Vec.load("./data/DbowModel{}.model".format(index))

### Functions for testing purpose

In [12]:
from pprint import pprint
def get(model, st, topn=20):
    """Print the training documents most similar to the free-text query ``st``.

    The query is tokenized with ``pre_process`` (the tokens are printed), a
    vector is inferred for it with ``model.infer_vector``, and the closest
    document vectors are looked up in ``model.docvecs``. Each hit is printed
    as ``"<document text> - <similarity>"``; the text is taken from the
    module-level ``documents`` list using the returned index.

    Args:
        model: A trained Doc2Vec model.
        st: The query string.
        topn: Number of neighbours to print. Defaults to 20, the value that
            used to be hard-coded.
    """
    tokens = pre_process(st)
    print(tokens)
    new_vector = model.infer_vector(tokens)
    sims = model.docvecs.most_similar([new_vector], topn=topn)
    for index, val in sims:
        words = documents[index].words
        if isinstance(words, (str, bytes)):
            # Raw line stored as-is: drop the trailing newline/whitespace.
            text = words.rstrip()
        else:
            # Token list: join it back into a readable line instead of crashing on .rstrip().
            text = " ".join(words)
        print(str(text) + " - " + str(val))

In [13]:
# Example query: print each model's summary, the query text, and its nearest documents.
guess_it = "Original Penguin (PENH8) Men's Smith Solid Tie"
for model in models:
    print(model)
    print("\n")
    print(guess_it)
    get(model, guess_it)

# Have to use dbow

Doc2Vec(dbow+w,d400,n5,w8,mc2,s0.001,t16)


Original Penguin (PENH8) Men's Smith Solid Tie
['original', 'penguin', '(', 'penh8', ')', 'men', "'s", 'smith', 'solid', 'tie']
original penguin men 's smith solid tie - 0.8894153833389282
original penguin men 's smith solid tie - 0.8893281817436218
original penguin men 's aliso solid tie - 0.8378052711486816
original penguin men 's zion solid tie - 0.8307269811630249
original penguin men 's zion solid tie - 0.8230248689651489
original penguin men 's pique solid tie - 0.8162338733673096
original penguin men 's pique solid tie - 0.8046824932098389
original penguin men 's bimini floral tie - 0.7894967198371887
original penguin men 's pique solid to-be-tied bowtie - 0.7888813614845276
original penguin men 's clemens plaid tie - 0.7862534523010254
original penguin men 's clemens plaid tie - 0.7828032374382019
original penguin men 's trevini stripe tie - 0.7779017686843872
original penguin men 's park check tie - 0.7688165903091431
original pengui

  if np.issubdtype(vec.dtype, np.int):


### Save the doc2vec embedding

In [14]:
!rm -r ./data/dbow.txt

In [15]:
# Export the document vectors of the first model as text (binary=False) in
# word2vec format; the loader further down skips the first line of this file.
models[0].docvecs.save_word2vec_format('./data/dbow.txt', binary=False)

### Load the Embeddings with Dimensions

In [19]:
# Exported document vectors (features) and the file holding the target values
# that are read into Y below.
EMBEDDING_FILE = "./data/dbow.txt"
DIMENSION_FILE = "./data/dataDimesions.txt"

# Line count of the export, as a quick size check.
!wc -l ./data/dbow.txt

1000001 ./data/dbow.txt


In [17]:
# Read the exported vectors into X as lists of floats, one list per document.
X = []
with open(EMBEDDING_FILE, "r") as ip:
    for line_no, line in enumerate(ip):
        # The first line is skipped (presumably the word2vec header — confirm);
        # on every other line the first column is dropped and the rest parsed as floats.
        if line_no:
            fields = line.split()
            fields.pop(0)
            X.append(list(map(float, fields)))
        # Progress report every 10,000 lines (including line 0).
        if line_no % 10000 == 0:
            print(line_no)

0
10000
20000
30000
40000
50000
60000
70000
80000
90000
100000
110000
120000
130000
140000
150000
160000
170000
180000
190000
200000
210000
220000
230000
240000
250000
260000
270000
280000
290000
300000
310000
320000
330000
340000
350000
360000
370000
380000
390000
400000
410000
420000
430000
440000
450000
460000
470000
480000
490000
500000
510000
520000
530000
540000
550000
560000
570000
580000
590000
600000
610000
620000
630000
640000
650000
660000
670000
680000
690000
700000
710000
720000
730000
740000
750000
760000
770000
780000
790000
800000
810000
820000
830000
840000
850000
860000
870000
880000
890000
900000
910000
920000
930000
940000
950000
960000
970000
980000
990000
1000000


In [18]:
# Read target rows into Y as lists of floats, stopping once Y has as many rows as X.
Y = []
with open(DIMENSION_FILE, "r") as op:
    for line in op:
        Y.append([float(field) for field in line.split()])
        if len(Y) == len(X):
            break

In [20]:
print(len(X), len(Y))

# Drop every row whose embedding does not have the expected width, and drop the
# same positions from Y so features and targets stay aligned.
# The offending indices are collected first and both lists are rebuilt in one
# pass: deleting from a list while enumerating it skips the element that
# follows each deletion, so consecutive malformed rows would survive.
malformed_rows = {index for index, doc in enumerate(X) if len(doc) != doc_embedding_size}
if malformed_rows:
    # Slice assignment keeps the same list objects, as the in-place `del` did.
    X[:] = [doc for index, doc in enumerate(X) if index not in malformed_rows]
    Y[:] = [dims for index, dims in enumerate(Y) if index not in malformed_rows]

1000000 1000000


In [21]:
# Sanity check: number of rows, then the width of the first row, for X and Y.
print(len(X), len(Y))
print(len(X[0]), len(Y[0]))

1000000 1000000
400 4


In [23]:
# Debug check: show any row of X that holds exactly one value.
print([row for row in X if len(row) == 1])

# Feature and target arrays built from the Python lists.
docs = np.array(X, ndmin=2, dtype=float)
target = np.array(Y)

# The first tenth of the rows is held out for testing; the remainder is used for training.
ratio = docs.shape[0] // 10
X_test, X_train = docs[:ratio], docs[ratio:]
y_test, y_train = target[:ratio], target[ratio:]

print(docs.ndim)
print(type(docs), type(docs[0]))

[]
2
<class 'numpy.ndarray'> <class 'numpy.ndarray'>


In [24]:
# Shapes of the feature and target arrays.
print(docs.shape, target.shape)
# docs[0:2]

(1000000, 400) (1000000, 4)


# Neural Network


In [25]:
# Maximum number of training epochs passed to model.fit; the EarlyStopping
# callback can end training before this is reached.
generations = 10000

In [26]:
from tensorflow import keras

# Fully connected regression network: a document vector of width
# doc_embedding_size goes in, four values come out.
model = keras.Sequential()

# PReLU has trainable parameters, so every use needs its own layer instance.
# Adding one shared instance several times ties those parameters across layers
# (and newer Keras versions reject the repeated add because layer names must be
# unique). The original name is kept for the first activation; the later
# activations get fresh instances.
activation_function = keras.layers.PReLU()

# First dense layer (named "input"); the only one with L2 regularization.
model.add(keras.layers.Dense(405, kernel_initializer=keras.initializers.random_uniform, input_dim=doc_embedding_size,
                             kernel_regularizer=keras.regularizers.l2(0.01),
                             bias_regularizer=keras.regularizers.l2(0.01), name="input"))
model.add(activation_function)

# Hidden layers, each followed by its own PReLU and 20% dropout.
model.add(keras.layers.Dense(405, kernel_initializer='normal', name="hidden1"))
model.add(keras.layers.PReLU())
model.add(keras.layers.Dropout(0.2))

model.add(keras.layers.Dense(405, kernel_initializer='normal', name="hidden2"))
model.add(keras.layers.PReLU())
model.add(keras.layers.Dropout(0.2))

# Output layer: four linear units.
model.add(keras.layers.Dense(4, kernel_initializer='normal', name="output"))

model.compile(loss='mean_absolute_error', optimizer="Adam", metrics=['mean_absolute_error'])

# NOTE(review): this checkpoint callback is created but never added to
# callbacks_list, so ./temp/best.h5 is not written during fit. Confirm whether
# it should be registered (the ./temp directory would then have to exist).
checkpoint = keras.callbacks.ModelCheckpoint("./temp/best.h5", monitor='val_loss', verbose=1, save_best_only=True,
                                             mode='auto')

# Stop once the monitored quantity has not improved for 16 consecutive epochs.
earlyStopping = keras.callbacks.EarlyStopping(patience=16, mode="auto")
callbacks_list = [earlyStopping]

# model = create_neural_model()
# 10% of the training rows are set aside by Keras for validation.
model.fit(X_train, y_train, epochs=generations, batch_size=256, validation_split=0.1, callbacks=callbacks_list)
print("Current one: ", model.evaluate(X_test, y_test, batch_size=256))

Train on 810000 samples, validate on 90000 samples
Epoch 1/10000
Epoch 2/10000
Epoch 3/10000
Epoch 4/10000
Epoch 5/10000
Epoch 6/10000
Epoch 7/10000
Epoch 8/10000
Epoch 9/10000
Epoch 10/10000
Epoch 11/10000
Epoch 12/10000
Epoch 13/10000
Epoch 14/10000
Epoch 15/10000
Epoch 16/10000
Epoch 17/10000
Epoch 18/10000
Epoch 19/10000
Epoch 20/10000
Epoch 21/10000
Epoch 22/10000
Epoch 23/10000
Epoch 24/10000
Epoch 25/10000
Epoch 26/10000
Epoch 27/10000
Epoch 28/10000
Epoch 29/10000
Epoch 30/10000
Epoch 31/10000
Epoch 32/10000
Epoch 33/10000
Epoch 34/10000
Epoch 35/10000
Epoch 36/10000
Epoch 37/10000
Epoch 38/10000
Epoch 39/10000
Epoch 40/10000
Epoch 41/10000
Epoch 42/10000
Epoch 43/10000
Epoch 44/10000
Epoch 45/10000
Epoch 46/10000
Epoch 47/10000
Epoch 48/10000
Epoch 49/10000
Epoch 50/10000
Epoch 51/10000
Epoch 52/10000
Epoch 53/10000
Epoch 54/10000
Epoch 55/10000
Epoch 56/10000
Epoch 57/10000
Epoch 58/10000
Epoch 59/10000
Epoch 60/10000
Epoch 61/10000
Epoch 62/10000
Epoch 63/10000
Epoch 64/1000