### Word Embedding

### 1. Latent Semantic Analysis

In [2]:
from sklearn.feature_extraction.text import CountVectorizer

In [17]:
# Single-document corpus for the bag-of-words demo. The text is kept verbatim
# (including its trailing space); it is only split across lines for readability.
string_corpus = [
    "A story string is simply a number of beads threaded along a cord. "
    "The beads may either be chosen at random or deliberately selected and sequenced. "
]

In [18]:
# Learn the vocabulary from the corpus. fit() returns the vectorizer itself,
# so the bare `vc` on the last line displays the fitted estimator.
vc = CountVectorizer().fit(string_corpus)
vc

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=1.0, max_features=None, min_df=1,
                ngram_range=(1, 1), preprocessor=None, stop_words=None,
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)

In [19]:
# Count how often each vocabulary term occurs in each document, then show the
# result as a dense array (one row per document, one column per term).
term_counts = vc.transform(string_corpus)
print(term_counts.toarray())

[[1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]]


In [20]:
# List the learned vocabulary; the order matches the columns of the count matrix.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2,
# so prefer get_feature_names_out() whenever the installed version provides it.
if hasattr(vc, "get_feature_names_out"):
    # .tolist() converts the returned NumPy array into a plain list of str,
    # so the printed form matches what the older API produced.
    feature_names = vc.get_feature_names_out().tolist()
else:
    feature_names = vc.get_feature_names()
print(feature_names)

['along', 'and', 'at', 'be', 'beads', 'chosen', 'cord', 'deliberately', 'either', 'is', 'may', 'number', 'of', 'or', 'random', 'selected', 'sequenced', 'simply', 'story', 'string', 'the', 'threaded']


### Word2Vec

In [10]:
# One-time setup: uncomment the next line if gensim is not installed.
# %pip install -U gensim

In [13]:
from gensim.models import Word2Vec
# one_hot hashes each word of a text to an integer index; it is used in the
# "one hot encoding" section of this notebook.
from tensorflow.keras.preprocessing.text import one_hot

In [83]:
# Three short documents for the encoding demo. Strings are kept verbatim,
# including the leading and trailing spaces of the second one.
string_corpus = [
    "A story string is simply number of beads threaded along a cord.",
    " The beads may either be chosen at random or deliberately selected & sequenced. ",
    "This is not right",
]

In [84]:
# Size of the integer index range used for encoding words. It is passed to
# one_hot() as the range size and to the Embedding layer as its first argument.
# This is an upper bound chosen by hand, not a count of the unique words in the
# corpus; presumably distinct words can share an index when the range is small
# (TODO confirm against the one_hot documentation).
voc_size = 50

### one hot encoding

In [85]:
# Encode every sentence of the corpus: each sentence becomes a list of integer
# word indices produced by one_hot() with an index range of voc_size.
o_h = []
for sentence in string_corpus:
    o_h.append(one_hot(sentence, voc_size))

In [86]:
# Show the encoded corpus: one list of integer word indices per sentence.
print(o_h)

[[5, 36, 20, 28, 27, 2, 30, 21, 48, 43, 5, 30], [49, 21, 27, 14, 38, 1, 16, 32, 4, 20, 3, 22], [37, 28, 14, 5]]


### Embedding layer

In [87]:
from tensorflow.keras.layers import Embedding
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
import numpy as np

In [88]:
# Common length that every encoded sentence is padded to (passed to
# pad_sequences as maxlen and to the Embedding layer as input_length).
# The value is a free choice; 15 comfortably covers the longest encoded
# sentence in this corpus (12 indices).
sent_len = 15

In [89]:
# The encoded sentences have different lengths, but the model needs rows of
# equal size: left-pad ('pre') each index list with zeros up to sent_len.
emb = pad_sequences(o_h, maxlen=sent_len, padding='pre')
print(emb)

[[ 0  0  0  5 36 20 28 27  2 30 21 48 43  5 30]
 [ 0  0  0 49 21 27 14 38  1 16 32  4 20  3 22]
 [ 0  0  0  0  0  0  0  0  0  0  0 37 28 14  5]]


In [90]:
# Embedding vector size for this demo (the Embedding layer's output dimension is 10).
dim = 10

In [91]:
# Minimal model with a single Embedding layer: it maps each integer word index
# (range size voc_size) in a padded sentence of length sent_len to a vector of
# `dim` components. The model is never compiled or trained in this notebook,
# so predict() simply returns the layer's initial weights for each index.
model = Sequential()
model.add(Embedding(voc_size, dim, input_length=sent_len))

In [92]:
# Expect 500 trainable parameters: 50 index slots x 10 embedding components,
# with an output shape of (None, 15, 10).
model.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 15, 10)            500       
Total params: 500
Trainable params: 500
Non-trainable params: 0
_________________________________________________________________


In [93]:
# Look up the embedding vector of every index in every padded sentence.
# Padded positions (index 0) all map to the same vector.
embedded_corpus = model.predict(emb)
print(embedded_corpus)

[[[ 1.44924186e-02  3.60862166e-03  1.77858360e-02  3.57534923e-02
    1.59968995e-02  2.39090361e-02  4.28031124e-02 -2.79583931e-02
    1.84229352e-02  9.76908952e-04]
  [ 1.44924186e-02  3.60862166e-03  1.77858360e-02  3.57534923e-02
    1.59968995e-02  2.39090361e-02  4.28031124e-02 -2.79583931e-02
    1.84229352e-02  9.76908952e-04]
  [ 1.44924186e-02  3.60862166e-03  1.77858360e-02  3.57534923e-02
    1.59968995e-02  2.39090361e-02  4.28031124e-02 -2.79583931e-02
    1.84229352e-02  9.76908952e-04]
  [-2.49325112e-03  2.31614597e-02 -2.47897636e-02 -1.83684751e-03
   -2.86380053e-02  2.73682810e-02 -4.07066457e-02  3.28123309e-02
   -2.14585308e-02 -4.17534262e-03]
  [ 4.92649786e-02  2.49779858e-02 -4.34058197e-02  1.67619064e-03
   -5.10412455e-03  4.93184365e-02 -2.01661941e-02  2.10427754e-02
   -1.01449490e-02  4.03807051e-02]
  [-3.98007147e-02  2.64741071e-02 -4.53222916e-03  1.22185796e-03
    3.12054642e-02 -1.91355348e-02 -3.84402759e-02  6.87316805e-03
    4.23523672e-

In [94]:
# Padded index row of the second sentence (the three leading zeros are padding).
print(emb[1])

[ 0  0  0 49 21 27 14 38  1 16 32  4 20  3 22]


In [95]:
# Embedding vectors of the second sentence only: one vector per position of
# its padded index row.
second_sentence_vectors = model.predict(emb)[1]
print(second_sentence_vectors)

[[ 1.44924186e-02  3.60862166e-03  1.77858360e-02  3.57534923e-02
   1.59968995e-02  2.39090361e-02  4.28031124e-02 -2.79583931e-02
   1.84229352e-02  9.76908952e-04]
 [ 1.44924186e-02  3.60862166e-03  1.77858360e-02  3.57534923e-02
   1.59968995e-02  2.39090361e-02  4.28031124e-02 -2.79583931e-02
   1.84229352e-02  9.76908952e-04]
 [ 1.44924186e-02  3.60862166e-03  1.77858360e-02  3.57534923e-02
   1.59968995e-02  2.39090361e-02  4.28031124e-02 -2.79583931e-02
   1.84229352e-02  9.76908952e-04]
 [ 3.97063605e-02  4.86567132e-02  4.09346931e-02  1.98660158e-02
  -3.38624269e-02 -7.00817257e-03 -1.83076859e-02  3.77721712e-03
   4.27918918e-02  2.27375515e-02]
 [-4.82936166e-02 -1.18404850e-02 -1.08736157e-02 -4.81972359e-02
  -3.29766870e-02  7.35778734e-03 -4.44170088e-03 -2.56735217e-02
  -4.57475781e-02 -1.09298229e-02]
 [-3.34132202e-02  2.47212909e-02  1.15124807e-02  4.43231128e-02
   3.47482450e-02 -9.80661064e-03  2.93894894e-02  1.04855374e-03
  -2.41074320e-02  1.15505345e-02