In [3]:
import tensorflow as tf
from keras.backend.tensorflow_backend import set_session
# Build a TensorFlow session from a default ConfigProto (no options are set
# on it here) and register it as the session used by the Keras TF backend.
config = tf.ConfigProto()
sess = tf.Session(config=config)
set_session(sess)

In [5]:
import numpy as np
from keras.models import Model # model is used for skipgram model
# Embedding : for embedding layer
# Reshape : to reshape the matrix of data
# Activation : for activation function
# Input : to recieve the data
from keras.layers import Embedding, Reshape, Activation, Input  
from keras.layers.merge import Dot # for dot product operation
from keras.utils import np_utils
from keras.utils.data_utils import get_file # to import file into the keras model
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import skipgrams # to implement our skipgram in keras
import gensim # for import saved word vectors and weights

In [18]:
import nltk
#nltk.download('punkt')
#nltk.download('wordnet')

In [6]:
# nltk methods for cleaning and preprocessing the data
from string import punctuation
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
from nltk.stem import WordNetLemmatizer

In [7]:
# Pre-processing

# A single string made of string.punctuation followed by the ten digits.
# It is used by preprocessing() to drop punctuation/number tokens.
remove_terms = punctuation + '0123456789' 

In [8]:
def preprocessing(text):
    """Tokenize, filter and lemmatize one piece of raw text.

    Steps:
      1. Split ``text`` into tokens with ``word_tokenize``.
      2. Keep only purely alphabetic tokens (``str.isalpha``), which drops
         punctuation, numbers and mixed tokens.
      3. Lemmatize every remaining token with ``WordNetLemmatizer``.

    Args:
        text: The raw string to clean (one line of the corpus).

    Returns:
        The lemmatized tokens joined by single spaces. The result is an
        empty string when no token survives the filter.
    """
    lemmatizer = WordNetLemmatizer()
    # The earlier version first filtered with ``w.lower() not in remove_terms``.
    # ``remove_terms`` is a string, so that was a *substring* test; it could
    # only ever reject tokens built solely from punctuation/digit characters,
    # and every such token also fails ``isalpha()``. One isalpha() pass
    # therefore yields exactly the same tokens without the misleading check.
    lemmas = [
        lemmatizer.lemmatize(token)
        for token in word_tokenize(text)
        if token.isalpha()
    ]
    return ' '.join(lemmas)

In [17]:
# Read the source text line by line. The context manager closes the file
# handle as soon as the lines are loaded (the bare open() left it open).
with open("History_of_Astronomy.txt", encoding="utf-8") as corpus_file:
    corpus = corpus_file.readlines()
# corpus = get_file('cosmos.txt', origin="http://www.gutenberg.org/files/8172/8172-0.txt") #saving file from the web

In [16]:
# Run preprocessing() over every line of the corpus, omitting lines that
# contain nothing but whitespace. The cleaned strings replace `corpus`.
# NOTE(review): the recorded execution counts show this cell (In [16]) ran
# before the cell that reads the file (In [17]); restart the kernel and run
# all cells in order so the tokenizer is fit on the cleaned text.
corpus = [preprocessing(line) for line in corpus if line.strip()]

In [19]:
# Tokenizer vectorizes a text corpus by turning each text into either a
# sequence of integers (each integer being the index of a token in a
# dictionary) or into a vector where the coefficient for each token could be
# binary, based on word count, based on tf-idf...

tokenizer = Tokenizer() 
tokenizer.fit_on_texts(corpus) # build the word -> index vocabulary (tokenizer.word_index) from the corpus

In [20]:
# Convert every entry of the corpus into a list of integer word indices
# (one list per corpus entry), using the vocabulary fitted above.
X_train_tokens = tokenizer.texts_to_sequences(corpus)

In [48]:
X_train_tokens # each element is the list of word indices for one corpus entry (possibly empty)

[[2754, 1860, 2755, 2756, 2, 160, 2, 50, 8, 562, 977],
 [],
 [],
 [1410, 160, 2, 50],
 [],
 [1159, 562, 977],
 [],
 [350, 135, 635, 111, 22, 1, 2757, 8, 2758, 5, 2759],
 [1160, 636],
 [],
 [160, 2, 50],
 [],
 [8],
 [],
 [562, 977],
 [351, 6, 509, 170, 153, 351, 2760, 65, 308],
 [],
 [1861, 563, 2, 739, 1411, 2761, 1160, 1412],
 [],
 [1159, 2, 1, 352, 2, 309, 2762, 57, 2, 1, 2763, 2],
 [2764, 462, 462],
 [],
 [],
 [],
 [],
 [2765],
 [],
 [1862],
 [],
 [129, 274, 1, 564, 78],
 [],
 [79, 1161, 50, 3, 978],
 [],
 [102, 275, 50, 235, 3, 637],
 [],
 [123, 275, 638, 50],
 [],
 [112, 1, 1162, 2, 259, 22, 428, 4, 130],
 [],
 [129, 565, 1, 463, 78],
 [],
 [227, 93, 2, 1, 149, 52, 70, 103, 141, 126],
 [],
 [246, 228, 3, 1, 107, 979, 2, 510, 8, 1163, 462],
 [],
 [292, 135, 635, 111, 100, 2, 353, 136],
 [],
 [374, 236, 980, 408, 740, 834, 429, 462],
 [],
 [330, 93, 2, 56, 64, 127, 639, 331, 3, 310],
 [354],
 [],
 [129, 835, 97],
 [],
 [],
 [355, 154, 2, 836, 356, 2, 1, 52, 70],
 [],
 [566, 160, 2, 

In [26]:
# Size passed as input_dim to the Embedding layers: one row per word in the
# tokenizer's vocabulary plus one extra row.
# NOTE(review): presumably the extra row is for index 0, which the Tokenizer
# does not appear to assign to any word - confirm against the Keras docs.
vocab_size = len(tokenizer.word_index) + 1
vocab_size

5527

In [29]:
items = tokenizer.word_index.items() # (word, index) pairs for every word in the vocabulary

<img src="files/word2vec.png">

<img src="files/word2vec skipgram.png">

In the skip-gram model we try to predict the context words from the target word we give to the model.<br>
1- In the first layer, the target word and the context word are each looked up in an embedding matrix to obtain the vector for that word.<br>
2- Next, we take the dot product of the two vectors to measure the similarity between them.<br>
3- Finally, we pass the similarity through a sigmoid function (close to 0 for a negative sample / close to 1 for a true context word).<br>

In [None]:
# Functional vs. Sequential Keras APIs (illustration only).
# NOTE(review): this cell has no execution count (In [None]). If it is run,
# it assigns `model`, which the skip-gram cell further down assigns again.

# --- Sequential API: layers are stacked one after another with add() ---
from keras.models import Sequential
from keras.layers import Dense

model = Sequential()
model.add(Dense(2, input_dim=1)) 
model.add(Dense(1))
#=============================================
# --- Functional API: layers are created and then connected explicitly ---
from keras.models import Model
from keras.layers import Input
from keras.layers import Dense

# Define the input
#   Unlike the Sequential model, you must create and define
#   a standalone "Input" layer that specifies the shape of the input
#   data. The input layer takes a "shape" argument, which is a
#   tuple that indicates the dimensionality of the input data.
#   When the input data is one-dimensional, such as for an MLP, the shape
#   must explicitly leave room for the mini-batch size used when
#   splitting the data during training. Hence, the shape tuple is
#   written with a hanging last dimension.
#   For instance, "(2,)", as in the example below:
visible = Input(shape=(2,))

# Connecting layers
#   The layers in the model are connected pairwise.
#   This is done by specifying where the input comes from when
#   defining each new layer. A bracket notation is used: after the
#   layer is created, the layer that feeds the current layer is
#   given in a second pair of parentheses.
#   Note how the "visible" layer connects to the "Dense" layer:
hidden = Dense(2)(visible) 

# Create the model
#   After creating all of your model layers and connecting them
#   together, you must then define the model.
#   As with the Sequential API, the model is the thing that you can
#   summarize, fit, evaluate, and use to make predictions.
#   Keras provides a "Model" class that you can use to create a model
#   from your created layers. It only requires that you specify the
#   input and output layers. For example:
model = Model(inputs=visible, outputs=hidden)

In [33]:
## creating the model ##

# The model is built with the functional API: every layer is created and
# connected first, then the input and output tensors are handed to Model.
# The statement order is kept as-is: the weight-saving cell reads
# model.get_weights()[0], so the order in which layers are created matters.

dim_embedding = 300 # size of embedding vector for each word

# target word
inputs = Input(shape=(1, ), dtype='int32') # one integer word index per sample
w = Embedding(input_dim=vocab_size, output_dim=dim_embedding)(inputs) # looks up the target word's vector; output shape (None, 1, 300)

# context word
c_inputs = Input(shape=(1, ), dtype='int32') # one integer word index per sample
c = Embedding(input_dim=vocab_size, output_dim=dim_embedding)(c_inputs) # separate embedding matrix for the context word; output shape (None, 1, 300)

# Dot product of the two embedded vectors, taken along axis 2 (the embedding dimension).
d = Dot(axes=2)([w, c])

# Reshape the dot-product output to shape (1,) per sample.
d = Reshape((1, ), input_shape=(1,1))(d)
# Sigmoid activation: squashes the score into the open interval (0, 1),
# which is compared against the 0/1 pair labels during training.
d = Activation('sigmoid')(d)

# create the model based on above layers: two inputs (target, context), one output score
model = Model(inputs=[inputs, c_inputs], outputs=d)
model.summary()

Instructions for updating:
Colocations handled automatically by placer.
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 1)            0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            (None, 1)            0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 1, 300)       1658100     input_1[0][0]                    
__________________________________________________________________________________________________
embedding_2 (Embedding)         (None, 1, 300)       1658100     input_2[0][0]                    
_____________________________________

In [34]:
model.compile(loss='binary_crossentropy', optimizer='adam') # binary cross-entropy: every (target, context) pair is labelled either 0 or 1

In [36]:
## training the model ##

# For every epoch, each tokenized corpus entry is turned into skip-gram
# training pairs and used for one gradient update:
#   * skipgrams() returns (couples, labels); couples are integer pairs
#     (word, word in the same window) with label 1 (positive samples) and
#     (word, random word from the vocabulary) with label 0 (negative samples).
#   * window_size is the number of words considered before and after the
#     target word.
#   * entries that produce no pairs are skipped.
epochs = 15
for epoch in range(epochs):
    loss = 0.
    for doc in X_train_tokens:
        data, labels = skipgrams(sequence=doc, vocabulary_size=vocab_size, window_size=4)
        if not data:
            # nothing to train on for this entry
            continue
        # split the pairs into one array of target words and one of context words,
        # matching the model's two inputs
        targets, contexts = zip(*data)
        x = [np.array(targets), np.array(contexts)]
        # 1 = true context pair, 0 = negative sample
        y = np.array(labels, dtype=np.int32)
        # train_on_batch runs a single gradient update on this batch;
        # its return value is added to the running epoch loss
        loss += model.train_on_batch(x, y)

    print("epoch loss", epoch, loss)

Instructions for updating:
Use tf.cast instead.
epoch loss 0 1948.5970275253057
epoch loss 1 1566.748060464859
epoch loss 2 1445.305544987321
epoch loss 3 1314.4776121117175
epoch loss 4 1189.0299178659916
epoch loss 5 1073.8430150114
epoch loss 6 976.7469125986099
epoch loss 7 901.4775159189012
epoch loss 8 837.401306359563
epoch loss 9 779.625175289344
epoch loss 10 738.9874407122843
epoch loss 11 704.6018037195026
epoch loss 12 675.9803998890275
epoch loss 13 658.584346789954
epoch loss 14 641.6846630402324


In [39]:
# Save the learned word vectors as text: a header line
# "<number of words> <vector size>", then one line per word containing the
# word followed by its space-separated vector components. The file is later
# read back with load_word2vec_format(..., binary=False).
# NOTE(review): get_weights()[0] is the first weight array of the model,
# presumably the target-word embedding matrix - confirm against model.summary().
weights = model.get_weights()[0]

# The context manager flushes and closes the file even if a write raises.
with open('word2vec_skipgram.txt', 'w', encoding='utf8') as f:
    f.write('{} {}\n'.format(vocab_size-1, dim_embedding))
    for word, i in items:
        # row i of the weight matrix is the vector of the word with index i
        f.write('{} {}\n'.format(word, ' '.join(map(str, weights[i, :]))))

In [52]:
# Load the vectors written to word2vec_skipgram.txt as gensim KeyedVectors
# (text format, hence binary=False).
w2v = gensim.models.KeyedVectors.load_word2vec_format("word2vec_skipgram.txt", binary=False)

In [53]:
# Query the trained vectors for the words most similar to 'solar';
# the result is a list of (word, similarity score) pairs.
w2v.most_similar(positive=['solar'])

[('supposes', 0.5485150814056396),
 ('monde', 0.5330633521080017),
 ('system', 0.4482644200325012),
 ('gaseous', 0.4340745508670807),
 ('activity', 0.40255582332611084),
 ('unit', 0.3937888443470001),
 ('viewed', 0.37305599451065063),
 ('storms', 0.3723365068435669),
 ('simultaneous', 0.37132972478866577),
 ('substitute', 0.37118256092071533)]

In [54]:
# Same query for 'system': list of (word, similarity score) pairs.
w2v.most_similar(positive=['system'])

[('monde', 0.5205080509185791),
 ('supposes', 0.49431484937667847),
 ('advocated', 0.4645763039588928),
 ('solar', 0.4482644200325012),
 ('harmful', 0.42978760600090027),
 ('ptolemean', 0.42913752794265747),
 ('effected', 0.3838561177253723),
 ('pythagorean', 0.3827584385871887),
 ('tychonic', 0.38010546565055847),
 ('answered', 0.3658300042152405)]

In [55]:
# Same query for 'kepler': list of (word, similarity score) pairs.
w2v.most_similar(positive=['kepler'])

[('johannes', 0.5839917659759521),
 ('independence', 0.5480961203575134),
 ('johann', 0.5170408487319946),
 ('strassburg', 0.4880028963088989),
 ("reitlinger's", 0.4850509464740753),
 ('wanderer', 0.4340655505657196),
 ('opened', 0.43373048305511475),
 ('contemplated', 0.41645222902297974),
 ('abolish', 0.3921545445919037),
 ('monatliche', 0.36032575368881226)]