### New package installation
```pip install tensorflow_datasets```

```pip install ipywidgets```

```pip install nltk```
### Reference:
https://www.tensorflow.org/tutorials/text/word_embeddings

## Neural Network Language Model

In [1]:
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

In [2]:
file = open("Data/J. K. Rowling - Harry Potter 1 - Sorcerer's Stone",'r')
raw_data_1 = file.read()
file.close()

In [3]:
from nltk.tokenize import word_tokenize, wordpunct_tokenize

from nltk.corpus import stopwords 
import nltk 
#nltk.download('punkt')
#nltk.download('stopwords')

In [4]:
stop_words = set(stopwords.words('english')) 
word_tokens = wordpunct_tokenize(raw_data_1)  
word_tokens = [w.lower() for w in word_tokens if not w in stop_words] 
word_tokens = [w.lower() for w in word_tokens if w.isalpha()] 

In [5]:
vocab = set(word_tokens)
char_to_int = dict((c,i) for i,c in enumerate(vocab))
int_to_char = dict((i,c) for i,c in enumerate(vocab))

In [6]:
X = np.array([], dtype=np.float).reshape(0,4)
Y = np.array([])
Xwords=[]
Ywords=[]
window_size = 2
for i, word in enumerate(word_tokens):
    Xsub=np.zeros(2*window_size)
    Xsubwords=[]
    isetvalue=0
    for icontext in range(max(i-window_size,0), min(i+window_size, len(word_tokens)-1)+1):
        if icontext!=i:
            Xsub[isetvalue]=char_to_int[word_tokens[icontext]]
            Xsubwords.append(word_tokens[icontext])
            isetvalue=isetvalue+1
    X=np.vstack([X, Xsub])
    Xwords.append(Xsubwords)
    Y=np.append(Y,char_to_int[word])
    Ywords.append(word)

In [7]:
embedding_dim=100
vocab_size=len(vocab)
cbowNN = keras.Sequential([
    layers.Embedding(vocab_size, embedding_dim),
    layers.GlobalAveragePooling1D(),
    layers.Dense(vocab_size, activation='softmax')
])

In [8]:
cbowNN.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

In [9]:
history = cbowNN.fit(X, Y,epochs=10)

Train on 45231 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [10]:
cbowNN.layers[0].get_weights()[0].shape

(5730, 100)

In [14]:
char_to_int["harry"]

5059

In [15]:
char_to_int["potter"]

691

In [12]:
from numpy.linalg import norm
e = cbowNN.layers[0]
e(tf.constant(1168)).numpy()

array([-0.5238643 ,  0.30421928,  0.33147296, -0.634414  , -0.5769926 ,
        0.06840047,  0.22078611, -0.08689712, -0.6194999 ,  0.28277722,
        0.44462878,  0.4402804 ,  0.37890956,  0.44396782, -0.55256784,
       -0.5538398 , -0.31861556,  0.55215275,  0.15342586,  0.6087728 ,
        0.6459889 , -0.47282505, -0.26229027, -0.6077571 ,  0.6792281 ,
       -0.2994563 ,  0.1866997 , -0.46849182, -0.33732918, -0.47328424,
        0.5500591 ,  0.33660218,  0.3826662 ,  0.42157295,  0.5949989 ,
        0.29590735,  0.4989111 ,  0.4032873 , -0.52393204, -0.25388965,
        0.3997271 , -0.62224257, -0.2616375 , -0.3902542 ,  0.42076963,
       -0.56467456,  0.04902203,  0.49545494, -0.29873022,  0.2472937 ,
        0.42883047, -0.50888944, -0.03138658,  0.10933125,  0.4736284 ,
        0.20750256, -0.42921713, -0.3238205 , -0.5687047 , -0.64927644,
       -0.4396635 , -0.5443118 ,  0.39546123, -0.46802753,  0.58282965,
       -0.30756986, -0.27993557,  0.21802768,  0.4000741 , -0.29

In [23]:
wharry=e.get_weights()[0][5059]

In [22]:
wpotter=e.get_weights()[0][691]

In [24]:
from sklearn.metrics.pairwise import cosine_similarity

# compute pairwise distance matrix
cosine_similarity([wharry, wpotter])

array([[ 0.9999998 , -0.04303762],
       [-0.04303762,  1.0000001 ]], dtype=float32)

In [28]:
for i in np.argsort(cbowNN.predict(tf.constant([5059]))[0])[::-1][:5]:
    print(int_to_char[i])

potter
felt
spun
forehead
bedspread
