In [1]:
!pip install keras
!pip install tensorflow
!pip install plot_keras_history
!pip install seaborn





In [2]:

from keras.utils import np_utils
from keras.preprocessing import sequence

import keras.backend as K
from keras.models import Sequential
from keras.layers import Dense, Embedding, Lambda, Reshape
from keras.layers import Input
from keras.models import Model
from keras.layers import dot
from tensorflow.keras.activations import relu
from nltk import word_tokenize, sent_tokenize
from gensim.corpora.dictionary import Dictionary
import numpy as np

from keras.preprocessing.sequence import skipgrams
import gensim


Using TensorFlow backend.


In [3]:
# using nltk tokenizer.  
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\anirban\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [4]:
#Data Preparation 

AlotOftext = """Language users never choose words randomly, and language is essentially
non-random. Statistical hypothesis testing uses a null hypothesis, which
posits randomness. Hence, when we look at linguistic phenomena in corpora, 
the null hypothesis will never be true. Moreover, where there is enough
data, we shall (almost) always be able to establish that it is not true. In
corpus studies, we frequently do have enough data, so the fact that a relation 
between two phenomena is demonstrably non-random, does not support the inference 
that it is not arbitrary. We present experimental evidence
of how arbitrary associations between word frequencies and corpora are
systematically non-random. We review literature in which hypothesis testing 
has been used, and show how it has often led to unhelpful or misleading results.""".lower()



#Tokenize text
tokenized_text = [word_tokenize(sent) for sent in sent_tokenize(AlotOftext)]
tokenized_text

#Create Vocab as a Dictionary
vocab = Dictionary(tokenized_text)
print(dict(vocab.items()))

print(vocab.token2id['corpora'])
print(vocab[2])
sent0 = tokenized_text[0]
print(vocab.doc2idx(sent0))

vocab.add_documents([['PAD']])
dict(vocab.items())
print(vocab.token2id['PAD'])

corpusByWordID = list()
for sent in  tokenized_text:
    corpusByWordID.append(vocab.doc2idx(sent))

vocab_size = len(vocab)
embed_size = 100
hidden_dim=100
window_size = 2 # context window size

print('Vocabulary Size:', vocab_size)
print('Vocabulary Sample:', list(vocab.items())[:10])

{0: ',', 1: '.', 2: 'and', 3: 'choose', 4: 'essentially', 5: 'is', 6: 'language', 7: 'never', 8: 'non-random', 9: 'randomly', 10: 'users', 11: 'words', 12: 'a', 13: 'hypothesis', 14: 'null', 15: 'posits', 16: 'randomness', 17: 'statistical', 18: 'testing', 19: 'uses', 20: 'which', 21: 'at', 22: 'be', 23: 'corpora', 24: 'hence', 25: 'in', 26: 'linguistic', 27: 'look', 28: 'phenomena', 29: 'the', 30: 'true', 31: 'we', 32: 'when', 33: 'will', 34: '(', 35: ')', 36: 'able', 37: 'almost', 38: 'always', 39: 'data', 40: 'enough', 41: 'establish', 42: 'it', 43: 'moreover', 44: 'not', 45: 'shall', 46: 'that', 47: 'there', 48: 'to', 49: 'where', 50: 'arbitrary', 51: 'between', 52: 'corpus', 53: 'demonstrably', 54: 'do', 55: 'does', 56: 'fact', 57: 'frequently', 58: 'have', 59: 'inference', 60: 'relation', 61: 'so', 62: 'studies', 63: 'support', 64: 'two', 65: 'are', 66: 'associations', 67: 'evidence', 68: 'experimental', 69: 'frequencies', 70: 'how', 71: 'of', 72: 'present', 73: 'systematically',

In [5]:
# Create CBOW Training data
def generate_cbow_context_word_pairs(corpusByID, window_size, vocab_size):
    context_length = window_size*2
    X=[]
    Y=[]
    for sent in corpusByID:
        sentence_length = len(sent)
        for index, word in enumerate(sent):
            context_words = []
            label_word   = []            
            start = index - window_size
            end = index + window_size + 1
            
            context_words.append([sent[i] 
                                 for i in range(start, end) 
                                 if 0 <= i < sentence_length 
                                 and i != index])
            label_word.append(word)
            if start<0:
                x = sequence.pad_sequences(context_words, maxlen=context_length,padding='pre',value=vocab.token2id['PAD'])
                y = np_utils.to_categorical(label_word, vocab_size)
                X.append(x)
                Y.append(y)
                continue
            if end>=sentence_length:
                x = sequence.pad_sequences(context_words, maxlen=context_length,padding='post',value=vocab.token2id['PAD'])
                y = np_utils.to_categorical(label_word, vocab_size)
                X.append(x)
                Y.append(y)
                continue
            else:
                X.append(sequence.pad_sequences(context_words, maxlen=context_length))
                y = np_utils.to_categorical(label_word, vocab_size)
                Y.append(y)
                continue
           
    return X,Y
            
# Test this out for some samples


X,Y = generate_cbow_context_word_pairs(corpusByWordID, window_size, vocab_size) 
   
for x, y in zip(X,Y):
    print('Context (X):', [vocab[w] for w in x[0]], '-> Target (Y):', vocab[np.argwhere(y[0])[0][0]])


Context (X): ['PAD', 'PAD', 'users', 'never'] -> Target (Y): language
Context (X): ['PAD', 'language', 'never', 'choose'] -> Target (Y): users
Context (X): ['language', 'users', 'choose', 'words'] -> Target (Y): never
Context (X): ['users', 'never', 'words', 'randomly'] -> Target (Y): choose
Context (X): ['never', 'choose', 'randomly', ','] -> Target (Y): words
Context (X): ['choose', 'words', ',', 'and'] -> Target (Y): randomly
Context (X): ['words', 'randomly', 'and', 'language'] -> Target (Y): ,
Context (X): ['randomly', ',', 'language', 'is'] -> Target (Y): and
Context (X): [',', 'and', 'is', 'essentially'] -> Target (Y): language
Context (X): ['and', 'language', 'essentially', 'non-random'] -> Target (Y): is
Context (X): ['language', 'is', 'non-random', '.'] -> Target (Y): essentially
Context (X): ['is', 'essentially', '.', 'PAD'] -> Target (Y): non-random
Context (X): ['essentially', 'non-random', 'PAD', 'PAD'] -> Target (Y): .
Context (X): ['PAD', 'PAD', 'hypothesis', 'testing']

In [6]:

cbow = Sequential()

cbow.add(Embedding(input_dim=vocab_size, output_dim=embed_size, input_length=4))
cbow.add(Lambda(lambda x: relu(K.mean(x, axis=1)), output_shape=(embed_size,)))

cbow.add(Dense(vocab_size, activation='softmax'))
cbow.compile(loss='categorical_crossentropy', optimizer='sgd')
cbow.summary()



Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 4, 100)            8800      
_________________________________________________________________
lambda_1 (Lambda)            (None, 100)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 88)                8888      
Total params: 17,688
Trainable params: 17,688
Non-trainable params: 0
_________________________________________________________________


In [7]:
#Train the model

for epoch in range(200):
    loss = 0.
    for x, y in zip(X,Y):
        loss += cbow.train_on_batch(x, y)
    print(epoch, loss)

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


0 644.7470927238464
1 642.842649936676
2 640.999680519104
3 639.2180123329163
4 637.4932975769043
5 635.8243727684021
6 634.2167117595673
7 632.6695871353149
8 631.1827216148376
9 629.753434419632
10 628.3805961608887
11 627.0611128807068
12 625.7937846183777
13 624.5779800415039
14 623.4111428260803
15 622.2898659706116
16 621.2124836444855
17 620.175265789032
18 619.1758034229279
19 618.2108869552612
20 617.2788717746735
21 616.3763751983643
22 615.50204205513
23 614.6517877578735
24 613.8225095272064
25 613.0132806301117
26 612.218649148941
27 611.4385242462158
28 610.6693015098572
29 609.9095468521118
30 609.1575658321381
31 608.4104459285736
32 607.6659007072449
33 606.9219174385071
34 606.1774842739105
35 605.4288029670715
36 604.6764075756073
37 603.9170689582825
38 603.1508069038391
39 602.3760697841644
40 601.5933713912964
41 600.8021206855774
42 600.0021638870239
43 599.1928384304047
44 598.3760108947754
45 597.5514776706696
46 596.7200784683228
47 595.8839212656021
48 595.04

In [8]:
## Save the wordvectors
f = open('Cbow_vectors.txt' ,'w')
f.write('{} {}\n'.format(vocab_size-1, embed_size))
vectors = cbow.get_weights()[0]
for key in vocab:
    str_vec = ' '.join(map(str, list(vectors[key, :])))
    f.write('{} {}\n'.format(vocab[key], str_vec))
f.close()

In [9]:
## Load the vectors back and validate
w2v = gensim.models.KeyedVectors.load_word2vec_format('./Cbow_vectors.txt', binary=False)

w2v.most_similar(positive=['language'])

[('non-random', 0.579158365726471),
 ('randomly', 0.42462313175201416),
 ('show', 0.4117108881473541),
 ('demonstrably', 0.34696492552757263),
 ('used', 0.34234124422073364),
 ('phenomena', 0.3087191879749298),
 ('not', 0.2967388927936554),
 ('true', 0.2952246069908142),
 ('there', 0.2752431035041809),
 ('so', 0.2749215066432953)]

In [10]:
#Create Skipgram Training data 

# generate skip-grams with both positive and negative examples
skip_grams = [skipgrams(sent, vocabulary_size=vocab_size, window_size=2) for sent in corpusByWordID]

# view sample skip-grams
pairs, labels = skip_grams[0][0], skip_grams[0][1]
for i in range(10):
    print("({:s} ({:d}), {:s} ({:d})) -> {:d}".format(
        vocab[pairs[i][0]], pairs[i][0],           
        vocab[pairs[i][1]], pairs[i][1], 
        labels[i]))

(words (11), systematically (73)) -> 0
(is (5), non-random (8)) -> 1
(never (7), uses (19)) -> 0
(choose (3), users (10)) -> 1
(randomly (9), able (36)) -> 0
(. (1), essentially (4)) -> 1
(users (10), choose (3)) -> 1
(users (10), language (6)) -> 1
(users (10), linguistic (26)) -> 0
(never (7), language (6)) -> 1


In [11]:
#define the skip-gram model

input_word = Input((1,))
input_context_word = Input((1,))

word_embedding    = Embedding(input_dim=vocab_size, output_dim=embed_size,input_length=1,name='word_embedding')
context_embedding = Embedding(input_dim=vocab_size, output_dim=embed_size,input_length=1,name='conotext_embedding')

word_embedding = word_embedding(input_word)
word_embedding_layer = Reshape((-1, 1))(word_embedding)

context_embedding = context_embedding(input_context_word)
context_embedding_layer = Reshape((-1, 1))(context_embedding)

# now perform the dot product operation  
dot_product = dot([word_embedding_layer, context_embedding_layer], axes=1)
dot_product = Reshape((1,))(dot_product)

# add the sigmoid output layer
outputLayer = Dense(1, activation='softmax')(dot_product)

model = Model(inputs=[input_word, input_context_word], outputs=outputLayer)
model.compile(loss='binary_crossentropy', optimizer='adam')

# view model summary
print(model.summary())

Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 1)            0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            (None, 1)            0                                            
__________________________________________________________________________________________________
word_embedding (Embedding)      (None, 1, 100)       8800        input_1[0][0]                    
__________________________________________________________________________________________________
conotext_embedding (Embedding)  (None, 1, 100)       8800        input_2[0][0]                    
____________________________________________________________________________________________

In [12]:
#train the model

for epoch in range(1, 100):
    loss = 0
    for i, elem in enumerate(skip_grams):
        pair_first_elem = np.array(list(zip(*elem[0]))[0], dtype='int32')
        pair_second_elem = np.array(list(zip(*elem[0]))[1], dtype='int32')
        labels = np.array(elem[1], dtype='int32')
        X = [pair_first_elem, pair_second_elem]
        Y = labels
        if i % 10000 == 0:
            print('Processed {} (skip_first, skip_second, relevance) pairs'.format(i))
        loss += model.train_on_batch(X,Y)  

    print('Epoch:', epoch, 'Loss:', loss)

Processed 0 (skip_first, skip_second, relevance) pairs


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Epoch: 1 Loss: 53.666337966918945
Processed 0 (skip_first, skip_second, relevance) pairs
Epoch: 2 Loss: 53.666337966918945
Processed 0 (skip_first, skip_second, relevance) pairs
Epoch: 3 Loss: 53.666337966918945
Processed 0 (skip_first, skip_second, relevance) pairs
Epoch: 4 Loss: 53.666337966918945
Processed 0 (skip_first, skip_second, relevance) pairs
Epoch: 5 Loss: 53.666337966918945
Processed 0 (skip_first, skip_second, relevance) pairs
Epoch: 6 Loss: 53.666337966918945
Processed 0 (skip_first, skip_second, relevance) pairs
Epoch: 7 Loss: 53.666337966918945
Processed 0 (skip_first, skip_second, relevance) pairs
Epoch: 8 Loss: 53.666337966918945
Processed 0 (skip_first, skip_second, relevance) pairs
Epoch: 9 Loss: 53.666337966918945
Processed 0 (skip_first, skip_second, relevance) pairs
Epoch: 10 Loss: 53.666337966918945
Processed 0 (skip_first, skip_second, relevance) pairs
Epoch: 11 Loss: 53.666337966918945
Processed 0 (skip_first, skip_second, relevance) pairs
Epoch: 12 Loss: 53.

Epoch: 99 Loss: 53.666337966918945


In [13]:
#get the embeding matrix
weights = model.get_weights()
## Save the wordvectors
f = open('skipgram_vectors.txt' ,'w')
f.write('{} {}\n'.format(vocab_size-1, embed_size))
vectors = model.get_weights()[0]
for key in vocab:
    str_vec = ' '.join(map(str, list(vectors[key, :])))
    f.write('{} {}\n'.format(vocab[key], str_vec))
f.close()

In [14]:
## Load the vectors back and validate
w2v = gensim.models.KeyedVectors.load_word2vec_format('./skipgram_vectors.txt', binary=False)
w2v.most_similar(positive=['language'])


[('that', 0.17893089354038239),
 ('relation', 0.1536899358034134),
 ('users', 0.14901486039161682),
 ('corpora', 0.14564503729343414),
 ('at', 0.1416715681552887),
 ('support', 0.13820742070674896),
 ('experimental', 0.13639777898788452),
 ('have', 0.1275327354669571),
 ('present', 0.11706452071666718),
 ('almost', 0.11317339539527893)]

In [15]:
#Excerise: 
#modeify the skipegram_model to share the same embeding layer between word and context
#Discussion: which is better? Why?  

Answer : On a top level ,it appears with bigger corpus and by including unknown words also making the model context dependent will increase accuracy 


---

