# Word2Vec
- CBOW or Skip Gram

## Import Packages

In [158]:
# import tensorflow as tf
import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()
import numpy as np
from tqdm import tqdm_notebook
from nltk.corpus import stopwords

## Configurations

In [159]:
hyper = {
    "emb_size":100,
    "window_size": 5,
    "min_count":1,
    "sg":0,
    "negative_size": 5,
    "ns_exponent": 0.75,
    "cbow_mean":1,
    "iter":10, #epochs
    "alpha":0.025, # initial learning rate
    "sample": 0.00001, #defualt=0.001
    "vocab_size":0,
    "batch_size":512
}

In [160]:
stopwords_en = stopwords.words('english')

## Load Dataset
- IMDB 영화 리뷰 데이터
    - 텍스트 분류 중 감정 분류 모델 연습용으로 쓰임
    - Label: 영화 리뷰가 긍정이면 1, 부정이면 0
- word embedding을 생성하는데 적절한 데이터는 아닌것 같지만 임시로 사용한다.

In [161]:
(x_train, y_train), (x_test, y_test) = tf.keras.datasets.imdb.load_data(
    num_words=None, # max number of words to include
    skip_top=0, # skip the top N most frequently occurring words
)
print(f"X_train: {x_train.shape}, y_train: {y_train.shape}")
print(f"X_test: {x_test.shape}, y_test: {y_test.shape}")

X_train: (25000,), y_train: (25000,)
X_test: (25000,), y_test: (25000,)


In [162]:
print(f"The num of categories: {len(set(y_train))}")

The num of categories: 2


In [163]:
word2index = tf.keras.datasets.imdb.get_word_index()
index2word = {}
for word, index in word2index.items() :
    index2word[index] = word

print(f"The num of words: {len(word2index)}")
word2index["woods"], index2word[1408]

The num of words: 88584


(1408, 'woods')

In [164]:
len(word2index.values())

88584

### Change from index to word

In [165]:
def change_from_index_to_word(index_list) :
    word_list = []
    for index in index_list :
        if index in index2word :
            word_list.append(index2word[index])
        else :
            print(f"{index} is not exist in index2word!")
    return word_list

In [166]:
train_texts = []
for indexs in x_train :
    train_texts.append(change_from_index_to_word(indexs))
len(train_texts)

88585 is not exist in index2word!
88586 is not exist in index2word!


25000

## Functions

In [167]:
def get_vocabulary(texts, min_count) :

    # Calculate frequency of word appearance.
    word2count = {}
    for text in texts :
        for word in text :
            if word in word2count :
                word2count[word] += 1
            else :
                word2count[word] = 1
    
    # Save only words that meet min_count condition.
    vocab = [word for word in word2count if word2count[word] >= min_count]
    filtered_word2count = {}
    for word in vocab :
        filtered_word2count[word] = word2count[word]
    print(f"The num of words that meet min_count >= {min_count}: {len(vocab):,}")
    
    return filtered_word2count, vocab

In [168]:
min_count = hyper.get("min_count")
word2count, vocab = get_vocabulary(train_texts, min_count)

The num of words that meet min_count >= 1: 88,583


In [169]:
hyper["vocab_size"] = len(vocab)

In [170]:
def generate_word_pairs(texts, word2count, vocab, hyper) :
    '''
    return [[target word(words)], [context word(words)], [negative word(word2)]]
    '''
    sample = hyper.get("sample")
    window_size = hyper.get("window_size")
    negative_size = hyper.get("negative_size")
    ns_exponent = hyper.get("ns_exponent")
    
    filtered = 0
    sampled = 0
    training_data = []
    negative_prob = np.array(list(word2count.values()))
    negative_prob = (negative_prob ** ns_exponent) / np.sum(negative_prob ** ns_exponent)
    
    for raw_text in tqdm_notebook(texts) :
        text = [word for word in raw_text if (word in vocab)&(word not in stopwords_en)]
        
        if len(text) > 1 :
            for i in range(len(text)) :
                # get target word
                target_word = text[i]
                target_index = vocab.index(text[i])
                
                # probability of excluding word from learning
                prob = 1 - np.sqrt(sample / word2count[target_word])
                if np.random.rand() < prob :
                    sampled += 1
                    continue
                        
                # get context words
                context_words = []
                context_indexs = []
                for j in range(i - window_size if i - window_size > 0 else 0, i + window_size if i + window_size <= len(text) - 1 else len(text) - 1) :
                    context_words.append(text[j])
                    context_indexs.append(vocab.index(text[j]))
                    
                # get negative words
                negative_words = list(np.random.choice(vocab, p=negative_prob, size=negative_size))
                negative_indexs = [vocab.index(negative_word) for negative_word in negative_words]
                
                # append rows
                training_data.append([target_index, context_indexs, negative_indexs])
        else :
            filtered += 1
                
    print(f"- The final training data: {len(training_data)}.")
    print(f"- Filtered {filtered} texts.")
    print(f"- Sampled {sampled} texts.")
    return training_data

In [171]:
training_data = generate_word_pairs(train_texts, word2count, vocab, hyper)

HBox(children=(IntProgress(value=0, max=25000), HTML(value='')))

- The final training data: 834.
- Filtered 0 texts.
- Sampled 3239570 texts.


In [172]:
training_data[:5]

[[298,
  [294, 295, 296, 297, 89, 298, 299, 300, 301, 302],
  [743, 2404, 953, 727, 18444]],
 [1302,
  [387, 1299, 1300, 39, 1301, 1302, 1303, 1304, 1305, 29],
  [885, 8520, 558, 12138, 549]],
 [3114,
  [370, 3111, 89, 3112, 3113, 3114, 3115, 299, 1740, 3116],
  [3224, 29833, 15, 11437, 10045]],
 [3405,
  [3403, 1521, 89, 3404, 734, 3405, 3406, 145, 3407, 39],
  [365, 1137, 7639, 85016, 46251]],
 [201,
  [7296, 4150, 39, 7297, 7298, 201, 7299, 316, 7300, 478],
  [299, 21233, 26, 51009, 328]]]

## Define Model

In [173]:
hyper

{'emb_size': 100,
 'window_size': 5,
 'min_count': 1,
 'sg': 0,
 'negative_size': 5,
 'ns_exponent': 0.75,
 'cbow_mean': 1,
 'iter': 10,
 'alpha': 0.025,
 'sample': 1e-05,
 'vocab_size': 88583,
 'batch_size': 512}

In [185]:
window_size = hyper.get("window_size")
negative_size = hyper.get("negative_size")
batch_size = hyper.get("batch_size")
vocab_size = hyper.get("vocab_size")
emb_size = hyper.get("emb_size")
learning_rate = hyper.get("learning_rate")

In [179]:
target = tf.placeholder(dtype = tf.int32, shape = [None, 1], name = 'target')
context = tf.placeholder(dtype = tf.int32, shape = [None, 2 * window_size], name = 'context')
negative = tf.placeholder(dtype = tf.int32, shape = [None, negative_size], name = 'negative')

W = tf.Variable(
    initial_value=tf.random_normal_initializer(
        mean=0,
        stddev=0.05,
    )(
        shape=(vocab_size, emb_size), 
        dtype=tf.float32,
    ),
    trainable=True,
    name="W",
)
U = tf.Variable(
    initial_value=tf.random_normal_initializer(
        mean=0,
        stddev=0.05,
    )(
        shape=(vocab_size, emb_size),
        dtype=tf.float32,
    ),
    trainable=True,
    name="U",
)

In [200]:
target = tf.placeholder(dtype = tf.int32, shape = [batch_size, ], name = 'target')
context = tf.placeholder(dtype = tf.int32, shape = [batch_size, 2 * window_size], name = 'context')
negative = tf.placeholder(dtype = tf.int32, shape = [batch_size, negative_size], name = 'negative')

target_emb = tf.nn.embedding_lookup(U, target, name="target_embedding")
context_emb = tf.nn.embedding_lookup(W, context, name="context_embedding")
avg_context_emb = tf.reduce_mean(context_emb, axis=1)
negative_emb = tf.nn.embedding_lookup(W, negative, name="negative_embedding")

context_loss = - tf.reduce_sum(tf.multiply(avg_context_emb, target_emb), axis=1)
negative_loss = tf.math.log(tf.reduce_sum(tf.exp(tf.reduce_sum(tf.multiply(negative_emb, target_emb), axis=1))))

target_emb, context_emb, avg_context_emb, negative_emb, context_loss

(<tf.Tensor 'target_embedding_12/Identity:0' shape=(512, 100) dtype=float32>,
 <tf.Tensor 'context_embedding_8/Identity:0' shape=(512, 10, 100) dtype=float32>,
 <tf.Tensor 'Mean_10:0' shape=(512, 100) dtype=float32>,
 <tf.Tensor 'negative_embedding_8/Identity:0' shape=(512, 5, 100) dtype=float32>,
 <tf.Tensor 'Neg_7:0' shape=(512,) dtype=float32>)

In [201]:
target_neg = tf.keras.backend.repeat_elements(target, negative_size, axis=0)
target_neg

<tf.Tensor 'concat:0' shape=(2560,) dtype=int32>

In [198]:
tf.multiply(negative_emb, target_emb)

ValueError: Dimensions must be equal, but are 5 and 512 for 'Mul_16' (op: 'Mul') with input shapes: [512,5,100], [512,100].

In [177]:
target = tf.placeholder(dtype = tf.int32, shape = [None, 1], name = 'target')
context = tf.placeholder(dtype = tf.int32, shape = [None, 2 * window_size], name = 'context')
negative = tf.placeholder(dtype = tf.int32, shape = [None, negative_size], name = 'negative')

W = tf.Variable(
    initial_value=tf.random_normal_initializer(
        mean=0,
        stddev=0.05,
    )(
        shape=(vocab_size, emb_size), 
        dtype=tf.float32,
    ),
    trainable=True,
    name="W",
)
U = tf.Variable(
    initial_value=tf.random_normal_initializer(
        mean=0,
        stddev=0.05,
    )(
        shape=(vocab_size, emb_size),
        dtype=tf.float32,
    ),
    trainable=True,
    name="U",
)

target_emb = tf.nn.embedding_lookup(U, target, name="target_embedding")
context_emb = tf.nn.embedding_lookup(W, context, name="context_embedding")
avg_context_emb = tf.reduce_mean(context_emb, axis=0)
negative_emb = tf.nn.embedding_lookup(W, negative, name="negative_embedding")

context_loss = - tf.reduce_sum(tf.multiply(avg_context_emb, target_emb))
negative_loss = tf.math.log(tf.reduce_sum(tf.exp(tf.reduce_sum(tf.multiply(negative_emb, target_emb), axis=1))))
tot_loss = tf.add(context_loss, negative_loss, name="loss")

optimizer = tf.train.GradientDescentOptimizer(
    learning_rate=learning_rate,
)
train = optimizer.minimize(tot_loss)

init = tf.global_variables_initializer()

ValueError: None values not supported.

## Run

In [None]:
tf_config = tf.ConfigProto(allow_soft_placement = True) 
with tf.Session(config=tf_config) as session:
    tf.reset_default_graph()
    with tf.device(self.device_to_use):
        # 
        target = tf.placeholder(dtype = tf.int32, shape = [None], name = 'target')
        context = tf.placeholder(dtype = tf.int32, shape = [None], name = 'context')
        negative = tf.placeholder(dtype = tf.int32, shape = [None], name = 'negative')
        # 
        W = tf.Variable(
            initial_value=tf.random_normal_initializer(
                mean=0,
                stddev=0.05,
            )(
                shape=(hyper.get("vocab_size"), hyper.get("emb_size")), 
                dtype=tf.float32,
            ),
            trainable=True,
            name="W",
        )
        U = tf.Variable(
            initial_value=tf.random_normal_initializer(
                mean=0,
                stddev=0.05,
            )(
                shape=(hyper.get("vocab_size"), hyper.get("emb_size")),
                dtype=tf.float32,
            ),
            trainable=True,
            name="U",
        )

        target_emb = tf.nn.embedding_lookup(U, target, name="target_embedding")
        context_emb = tf.nn.embedding_lookup(W, context, name="context_embedding")
        avg_context_emb = tf.reduce_mean(context_emb, axis=0)
        negative_emb = tf.nn.embedding_lookup(W, negative, name="negative_embedding")

        context_loss = - tf.reduce_sum(tf.multiply(avg_context_emb, target_emb))
        negative_loss = tf.math.log(tf.reduce_sum(tf.exp(tf.reduce_sum(tf.multiply(negative_emb, target_emb), axis=1))))
        tot_loss = tf.add(context_loss, negative_loss, name="loss")

        optimizer = tf.train.GradientDescentOptimizer(learning_rate=hyper.get("learning_rate"))
        train = optimizer.minimize(tot_loss)

        init = tf.global_variables_initializer()
#         saver = tf.train.Saver()

tf_config = tf.ConfigProto(allow_soft_placement=True)
with tf.Session(config=tf_config) as session:
    session.run(init)
    
    # Train
    for target_word, context_words, negative_words in training_data :
        session.run(
            train, 
            feed_dict = {
                target:target_word,
                context:context_words,
                negative:negative_words,
            }
        )
        
    # Get W, U, Total-Loss
    weight, weight_ = session.run([W, U])
    print(weight.shape)
    print(weight_.shape)

Use tf.keras
- https://www.kdnuggets.com/2018/04/implementing-deep-learning-methods-feature-engineering-text-data-cbow.html

In [53]:
vocab_size=10000
emb_size=100
window_size=5

In [60]:
cbow = tf.keras.models.Sequential()
cbow.add(
    tf.keras.layers.Embedding(
        input_dim=vocab_size,
        output_dim=emb_size,
        embeddings_initializer="normal",
        input_length=window_size*2,
    )
)
cbow.add(
    tf.keras.layers.Lambda(
        lambda x: tf.keras.backend.mean(x, axis=1), output_shape=(emb_size,)
    )
)
cbow.add(
    tf.keras.layers.Dense(vocab_size, activation="softmax")
)
cbow.summary()

Model: "sequential_11"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_9 (Embedding)      (None, 10, 100)           1000000   
_________________________________________________________________
lambda_9 (Lambda)            (None, 100)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 10000)             1010000   
Total params: 2,010,000
Trainable params: 2,010,000
Non-trainable params: 0
_________________________________________________________________
