### New package installation
```pip install tensorflow_datasets```

```pip install ipywidgets```

```pip install nltk```
### Reference:
https://www.tensorflow.org/tutorials/text/word_embeddings

## Neural Network Language Model

In [1]:
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

In [2]:
file = open("Data/J. K. Rowling - Harry Potter 1 - Sorcerer's Stone",'r')
raw_data_1 = file.read()
file.close()

In [3]:
from nltk.tokenize import word_tokenize, wordpunct_tokenize

from nltk.corpus import stopwords 
import nltk 
#nltk.download('punkt')
#nltk.download('stopwords')

In [4]:
stop_words = set(stopwords.words('english')) 
word_tokens = wordpunct_tokenize(raw_data_1)  
word_tokens = [w.lower() for w in word_tokens if not w in stop_words] 
word_tokens = [w.lower() for w in word_tokens if w.isalpha()] 

In [5]:
vocab = set(word_tokens)
char_to_int = dict((c,i) for i,c in enumerate(vocab))
int_to_char = dict((i,c) for i,c in enumerate(vocab))

In [6]:
X = np.array([], dtype=np.float).reshape(0,4)
Y = np.array([])
Xwords=[]
Ywords=[]
window_size = 2
for i, word in enumerate(word_tokens):
    Xsub=np.zeros(2*window_size)
    Xsubwords=[]
    isetvalue=0
    for icontext in range(max(i-window_size,0), min(i+window_size, len(word_tokens)-1)+1):
        if icontext!=i:
            Xsub[isetvalue]=char_to_int[word_tokens[icontext]]
            Xsubwords.append(word_tokens[icontext])
            isetvalue=isetvalue+1
    X=np.vstack([X, Xsub])
    Xwords.append(Xsubwords)
    Y=np.append(Y,char_to_int[word])
    Ywords.append(word)

In [20]:
X[10]

array([2944.,  361., 3971., 1837.])

In [25]:
int_to_char[1837]

'dursley'

In [27]:
embedding_dim=100
vocab_size=len(vocab)
cbowNN = keras.Sequential([
    layers.Embedding(vocab_size, embedding_dim),
    layers.GlobalAveragePooling1D(),
    layers.Dense(vocab_size, activation='softmax')
])

In [28]:
cbowNN.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

In [29]:
history = cbowNN.fit(X, Y,epochs=10)

Train on 45231 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [50]:
cbowNN.layers[0].get_weights()[0].shape

(5730, 100)

In [None]:
cbowNN.layers[0].get_weights()[]

In [51]:
char_to_int["harry"]

5390

In [60]:
char_to_int["potter"]

3514

In [58]:
cbowNN.layers[0].get_weights()[0][5390]

array([ 0.36831316, -0.02912644, -0.21903703,  0.6875895 ,  0.8045879 ,
       -0.79558593, -0.12857082,  0.09913688, -0.59537804, -0.30346167,
        0.55933696, -1.485484  ,  0.29664618,  0.9509776 , -0.72287655,
       -0.50011224,  0.2952182 ,  0.24317202,  0.88044626, -0.15706061,
       -0.60950226,  0.8858659 ,  0.468372  , -0.05869386, -0.7143432 ,
       -0.9341061 ,  0.5548508 , -1.0121855 , -0.36167455,  1.2411246 ,
        0.2763438 , -0.06958745, -0.05160766,  0.09539735,  0.7949856 ,
       -0.92463213, -1.2130483 ,  0.35378164,  0.8643375 , -0.43106723,
        0.41514868, -0.6613449 ,  0.8776589 ,  0.8124357 ,  0.40135014,
        0.6651431 , -0.5510709 , -0.38966134,  0.542265  ,  0.30247587,
       -0.2986687 , -0.43880197,  0.03767639, -0.49712464,  1.7145205 ,
        0.3445807 ,  0.1759958 ,  0.05973768,  0.6984467 , -0.47396457,
       -0.10744078, -0.5577389 , -0.45284694,  0.46038765,  0.13582891,
        0.6500594 ,  1.1517085 , -0.20004109,  0.03411349, -0.26

In [None]:
char_to_int["potter"]

In [59]:
from numpy.linalg import norm
e = cbowNN.layers[0]
e(tf.constant(5390)).numpy()

array([ 0.36831316, -0.02912644, -0.21903703,  0.6875895 ,  0.8045879 ,
       -0.79558593, -0.12857082,  0.09913688, -0.59537804, -0.30346167,
        0.55933696, -1.485484  ,  0.29664618,  0.9509776 , -0.72287655,
       -0.50011224,  0.2952182 ,  0.24317202,  0.88044626, -0.15706061,
       -0.60950226,  0.8858659 ,  0.468372  , -0.05869386, -0.7143432 ,
       -0.9341061 ,  0.5548508 , -1.0121855 , -0.36167455,  1.2411246 ,
        0.2763438 , -0.06958745, -0.05160766,  0.09539735,  0.7949856 ,
       -0.92463213, -1.2130483 ,  0.35378164,  0.8643375 , -0.43106723,
        0.41514868, -0.6613449 ,  0.8776589 ,  0.8124357 ,  0.40135014,
        0.6651431 , -0.5510709 , -0.38966134,  0.542265  ,  0.30247587,
       -0.2986687 , -0.43880197,  0.03767639, -0.49712464,  1.7145205 ,
        0.3445807 ,  0.1759958 ,  0.05973768,  0.6984467 , -0.47396457,
       -0.10744078, -0.5577389 , -0.45284694,  0.46038765,  0.13582891,
        0.6500594 ,  1.1517085 , -0.20004109,  0.03411349, -0.26

In [61]:
wharry=e.get_weights()[0][5390]

In [62]:
wpotter=e.get_weights()[0][3514]

In [63]:
from sklearn.metrics.pairwise import cosine_similarity

# compute pairwise distance matrix
cosine_similarity([wharry, wpotter])

array([[ 1.0000001 , -0.20145974],
       [-0.20145974,  0.99999994]], dtype=float32)

In [64]:
for i in np.argsort(cbowNN.predict(tf.constant([5390]))[0])[::-1][:5]:
    print(int_to_char[i])

potter
drooled
felt
dodged
forehead
