# Word Embeddings

### Libraries

In [None]:
# import libraries

import random

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from gensim.utils import simple_preprocess
from gensim.models import Word2Vec
from sklearn.manifold import TSNE

### Datasets

In [None]:
# Load the review sentiment data from CSV into a DataFrame.
# The path is relative to the notebook's working directory; later cells
# read the 'text' column of this frame.

review_df = pd.read_csv('data/review_sentiment.csv')

# Bare expression: the notebook renders the DataFrame as the cell output.
review_df

### Preprocessing

In [None]:
# Tokenize every review with gensim's simple_preprocess, giving one list of
# tokens per entry of the 'text' column (order is preserved).

corpus = [simple_preprocess(review) for review in review_df['text']]

In [None]:
# Preview the first five tokenized reviews, each preceded by a dashed rule

for review in corpus[:5]:
    print('-' * 50, review, sep='\n')

### Dense embeddings

In [None]:
# Documents representing reviews: one token list per review.
# Note that `documents` is another name for the same list object as
# `corpus`, not a copy.

documents = corpus

# Bare expression: the notebook shows the number of documents.
len(documents)

In [None]:
# Inspect the first document (the token list of the first review)

print(documents[0])

In [None]:
# Train a Word2Vec model on the tokenized reviews.
#   vector_size=150 -> dimensionality of each word vector
#   window=10       -> maximum distance between the current and predicted word
#   min_count=2     -> words with a total frequency below 2 are dropped
#   workers=10      -> number of training threads
#   sg=1            -> skip-gram training (0 would select CBOW)
# No seed is set here, so repeated runs are not expected to give
# identical vectors.

model = Word2Vec(documents, vector_size=150, window=10, min_count=2, workers=10, sg=1)

In [None]:
# Look up the learned embedding for the word "good" (a vector of length
# vector_size, i.e. 150 for the model trained in this notebook).
# Bare expression: the notebook displays the array.

model.wv.get_vector("good")

In [None]:
# Nearest neighbours of "good" in the embedding space, ranked by cosine
# similarity (gensim returns the top 10 (word, score) pairs by default).

model.wv.most_similar(positive="good")

In [None]:
# Nearest neighbours of "bad" in the embedding space, ranked by cosine
# similarity (gensim returns the top 10 (word, score) pairs by default).

model.wv.most_similar(positive="bad")

In [None]:
# Save the full model (including trainable vectors, so training can be
# resumed after reloading it).
# NOTE(review): presumably the "embeddings/" directory must already exist
# for this call to succeed -- confirm before running on a fresh checkout.

model.save("embeddings/reviews_model")

In [None]:
# Save only the word vectors (the model's `wv` object), without the rest of
# the training state held by the full model.
# NOTE(review): presumably the "embeddings/" directory must already exist
# for this call to succeed -- confirm before running on a fresh checkout.

model.wv.save("embeddings/reviews_wv")

### Vector visualization

In [None]:
def reduce_dimensions(model, num_dimensions=2, words=None, perplexity=4.0):
    '''
    Reduce the dimensions of the word vectors using t-SNE for plotting

    Parameters:
        model: trained model whose `wv` attribute can be indexed by word and
            exposes `index_to_key` (e.g. a gensim Word2Vec model).
        num_dimensions: number of components of the t-SNE output.
        words: iterable of words to embed. None or an empty iterable means
            "use the model's whole vocabulary".
        perplexity: t-SNE perplexity. scikit-learn expects it to be smaller
            than the number of embedded words, so lower it for short lists.

    Returns:
        (vectors, labels): `vectors` is the array returned by t-SNE, one row
        of `num_dimensions` values per word; `labels` is a numpy array with
        the corresponding words in the same order.

    Raises:
        KeyError: if a requested word is not in the model's vocabulary.
    '''
    # None replaces the former mutable `[]` default; materialising the
    # argument also lets tuples, arrays and generators be passed safely.
    words = list(words) if words is not None else []
    if not words:
        words = list(model.wv.index_to_key)

    vectors = np.asarray([model.wv[word] for word in words])
    labels = np.asarray(words)

    # A fixed random_state keeps the layout stable for the same input vectors.
    tsne = TSNE(n_components=num_dimensions, random_state=0, perplexity=perplexity)
    vectors = tsne.fit_transform(vectors)
    return vectors, labels

In [None]:
def plot_word_vectors(x_vals, y_vals, labels, words=None):
    '''
    Plot the reduced dimensions of the word vectors using matplotlib

    Parameters:
        x_vals: x coordinate of every word.
        y_vals: y coordinate of every word.
        labels: the words themselves, aligned with `x_vals` / `y_vals`.
        words: optional iterable of words to show. None or an empty iterable
            means "plot every point"; otherwise only the points whose label
            is in `words` are drawn.

    Returns:
        None. The figure is displayed with `plt.show()`.
    '''
    # Kept for backward compatibility: the function has always reset the
    # global `random` state, and code running afterwards may rely on that.
    random.seed(0)

    # A set gives O(1) membership tests; None replaces the mutable default.
    wanted = set(words) if words is not None else set()

    points = [
        (x, y, label)
        for x, y, label in zip(x_vals, y_vals, labels)
        if not wanted or label in wanted
    ]
    xs = [x for x, _, _ in points]
    ys = [y for _, y, _ in points]

    plt.figure(figsize=(12, 12))
    plt.scatter(xs, ys)
    for x, y, label in points:
        plt.annotate(label, (x, y))
    plt.show()

In [None]:
# visualize word vectors

all_words = [word for doc in documents for word in doc]
# Random sample of corpus tokens; not used by the plot below, but kept so the
# name stays available and the RNG state is consumed exactly as before.
words = random.sample(all_words, 50)
interesting_words = [
    'good', 'bad', 'excellent', 'poor', 'awesome', 'terrible', 'nice',
    'horrible', 'satisfactory', 'food', 'delicious', 'tasty',
    'disappointing', 'service', 'ambiance', 'atmosphere', 'pricey',
    'presentation', 'menu', 'variety', 'fresh', 'cooked', 'flavorful',
    'tasteless', 'overcooked', 'undercooked', 'friendly', 'rude', 'clean',
    'dirty', 'crowded', 'spacious', 'cozy', 'noisy', 'expensive',
    'affordable', 'authentic', 'traditional', 'flavor', 'love',
]

# Words rarer than the model's min_count were dropped from the vocabulary and
# would raise KeyError on lookup, so keep only the words the model knows.
plot_words = [word for word in interesting_words if word in model.wv.key_to_index]

# Ask t-SNE for exactly the two components that get plotted: taking two axes
# out of a 3-component embedding would discard part of the layout.
vectors, labels = reduce_dimensions(model, 2, plot_words)
x_vals = [v[0] for v in vectors]
y_vals = [v[1] for v in vectors]

plot_word_vectors(x_vals, y_vals, labels, plot_words)
