# Baseline Model: Skip Gram using Gensim-Word2Vec
- https://radimrehurek.com/gensim/models/word2vec.html

## Import Packages

In [34]:
# import tensorflow as tf
import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()
import numpy as np
import pandas as pd
from scipy.stats import pearsonr, spearmanr
from tqdm import tqdm_notebook
from nltk.corpus import stopwords
from gensim.models import Word2Vec

## Configurations

In [35]:
hyper = {
    "emb_size":100,
    "window_size": 5,
    "min_count":1,
    "sg":1,
    "negative_size": 5,
    "ns_exponent": 0.75,
    "cbow_mean":1,
    "iter":10, #epochs
    "learning_rate":0.025, # initial learning rate
    "sample": 0.1, #defualt=0.001
#     "vocab_size":0,
    "batch_size":10000,
#     "update_size":32,
}

In [36]:
stopwords_en = stopwords.words('english')

## Load Dataset
- reuters
    - Reuters topic classification dataset.

In [37]:
(x_train, y_train), (x_test, y_test) = tf.keras.datasets.reuters.load_data(
    num_words=None, # max number of words to include
    skip_top=0, # skip the top N most frequently occurring words
)
print(f"X_train: {x_train.shape}, y_train: {y_train.shape}")
print(f"X_test: {x_test.shape}, y_test: {y_test.shape}")

X_train: (8982,), y_train: (8982,)
X_test: (2246,), y_test: (2246,)


In [38]:
print(f"The num of categories: {len(set(y_train))}")

The num of categories: 46


In [39]:
word2index = tf.keras.datasets.imdb.get_word_index()
index2word = {}
for word, index in word2index.items() :
    index2word[index] = word

print(f"The num of words: {len(word2index)}")
word2index["woods"], index2word[1408]

The num of words: 88584


(1408, 'woods')

In [40]:
len(word2index.values())

88584

### Change from index to word

In [41]:
def change_from_index_to_word(index_list) :
    word_list = []
    for index in index_list :
        if index in index2word :
            word_list.append(index2word[index])
        else :
            print(f"{index} is not exist in index2word!")
    return word_list

In [42]:
train_texts = []
for indexs in x_train :
    train_texts.append(change_from_index_to_word(indexs))
len(train_texts)

8982

## Run

In [43]:
emb_size = hyper.get("emb_size")
window_size = hyper.get("window_size")
min_count = hyper.get("min_count")
sg = hyper.get("sg")
negative_size = hyper.get("negative_size")
ns_exponent = hyper.get("ns_exponent")
cbow_mean = hyper.get("cbow_mean")
epochs = hyper.get("iter")
learning_rate = hyper.get("learning_rate")
sample = hyper.get("sample")
batch_size = hyper.get("batch_size")
# update_size = hyper.get("update_size")

In [44]:
model = Word2Vec(
    train_texts, 
    size=emb_size, 
    window=window_size, 
    min_count=min_count, 
    sg=sg,
    negative=negative_size,
    ns_exponent=ns_exponent,
    cbow_mean=cbow_mean,
    iter=epochs,
    alpha=learning_rate,
    sample=sample,
    batch_words=batch_size,
)

In [45]:
vocab = list(model.wv.vocab.keys())
len(vocab), vocab[:5]

(28138, ['the', 'kazuo', 'operandi', 'in', 'out'])

In [46]:
model.wv[vocab[0]].shape

(100,)

## Evaluate Word Embedding

In [47]:
path_for_wordsim353 = "./EnglishWordSimilarityDataset/wordsim_353.csv"
path_for_simlex999 = "./EnglishWordSimilarityDataset/simlex_999.csv"
path_for_card660 = "./EnglishWordSimilarityDataset/card_660.csv"

In [48]:
def cos_sim(a, b) :
    return np.dot(a, b)/(np.linalg.norm(a)*np.linalg.norm(b))

In [49]:
for path_for_word_sim in [path_for_wordsim353, path_for_simlex999, path_for_card660] :
    
    # Load Word Similarity DataSet
    df_word_sim = pd.read_csv(path_for_word_sim)
    print(f"\nLoad Word Similarity Score from {path_for_word_sim}")
    
    # Calculate Cosine Similarity Score
    df_word_sim = df_word_sim[(df_word_sim['word1'].isin(vocab))&(df_word_sim["word2"].isin(vocab))]
    df_word_sim["cosim"] = df_word_sim.apply(lambda x: cos_sim(model.wv[x["word1"]], model.wv[x["word2"]]), axis=1)

    # Word Embedding Evaluation
    pearson, p_value = pearsonr(df_word_sim.score.values, df_word_sim.cosim.values)
    print(f"- Pearson Corr: {pearson:.5}, P-value: {p_value:.5}")
    spearman, p_value = spearmanr(df_word_sim.score.values, df_word_sim.cosim.values)
    print(f"- Spearman Corr: {spearman:.5}, P-value: {p_value:.5}")



Load Word Similarity Score from ./EnglishWordSimilarityDataset/wordsim_353.csv
- Pearson Corr: -0.021741, P-value: 0.71332
- Spearman Corr: -0.047402, P-value: 0.42291

Load Word Similarity Score from ./EnglishWordSimilarityDataset/simlex_999.csv
- Pearson Corr: 0.09763, P-value: 0.0026606
- Spearman Corr: 0.094544, P-value: 0.0036254

Load Word Similarity Score from ./EnglishWordSimilarityDataset/card_660.csv
- Pearson Corr: -0.074508, P-value: 0.64773
- Spearman Corr: -0.12451, P-value: 0.44399
