## Other Embeddings

It is possible to use other embeddings and then do a dimensionality reduction. 

Two other embeddings include:

- [Google's](https://drive.google.com/file/d/0B7XkCwpI5KDYNlNUTTlSS21pQmM/edit?usp=sharing) - https://code.google.com/archive/p/word2vec/
- [Stanford's](https://nlp.stanford.edu/projects/glove/) - https://nlp.stanford.edu/projects/glove/

The goal of this notebook is to use these two different embeddings to see whether PCA / t-SNE produce different results.

### Load the Google embedding:

In [48]:
from gensim.models import KeyedVectors
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
import numpy as np
import plotly as py
import plotly.graph_objs as go
py.offline.init_notebook_mode(connected=True)

In [5]:
def load_google_model(filename='data/google.bin'):
    """Load the Google word2vec vectors into the global ``google_model``.

    Args:
        filename: Path of the binary word2vec file to read. Defaults to
            ``'data/google.bin'``, the location this notebook has always
            used, so existing zero-argument calls behave as before.
    """
    # The plotting / reduction cells read the model through this global.
    global google_model
    google_model = KeyedVectors.load_word2vec_format(filename, binary=True)

#load_google_model()

In [8]:
# Get the model_words from the other active notebook:
%store -r model_words

### PCA:

To start off with, the dimensionality reduction will only be done to two dimensions.

In [35]:
def create_pca():
    """Reduce the Google vectors of the known words to two PCA components.

    Reads the notebook-level ``model_words`` and ``google_model``.

    Returns:
        A ``(result, safe_words, not_safe_words)`` tuple:

        * ``result`` -- output of ``PCA.fit_transform`` over the vectors of
          ``safe_words`` (same order).
        * ``safe_words`` -- list of ``[word, vector]`` pairs for every word
          whose lookup in ``google_model`` succeeded.
        * ``not_safe_words`` -- words whose lookup raised ``KeyError``.
    """
    safe_words = []
    not_safe_words = []

    # Figure out which words are in the Google vocab.
    # The first entry of the sorted word list is skipped.
    all_words = sorted(model_words)[1:]
    for word in all_words:
        try:
            vectors = google_model[word]
        except KeyError:
            # Only a missing vocabulary entry marks a word as "not safe";
            # any other failure (e.g. the model not being loaded) should
            # surface instead of being silently recorded here.
            not_safe_words.append(word)
        else:
            safe_words.append([word, vectors])

    # Create the PCA:
    pca = PCA(n_components=2, random_state=23)
    result = pca.fit_transform([vectors for _, vectors in safe_words])
    return result, safe_words, not_safe_words

pca_result, safe_words, not_safe_words = create_pca()

The following words are not in the Google embedding:

In [34]:
not_safe_words

['apap',
 'ciwa',
 'copd',
 'dispo',
 'dxed',
 'etoh',
 'gerd',
 'haldol',
 'hiatal',
 'hospital1',
 'hospital3',
 'hospital6',
 'ivdu',
 'lastname',
 'lfts',
 'listerine',
 'lvef',
 'medquist36',
 'micu',
 'mrsa',
 'mucomyst',
 'name11',
 'name2',
 'name8',
 'namepattern1',
 'namepattern2',
 'namepattern4',
 'oopherectomy',
 'pcwp',
 'ptsd',
 'risperdal',
 'stitle',
 'tegretol',
 'trazadone',
 'turp',
 'utis',
 'valproic']

Now that we have the embeddings for PCA, we can create an interactive 2D plot:

In [44]:
def pca_interactive_plot(save=False, show_labels=False):
    """Draw the 2D PCA projection as an interactive plotly scatter plot.

    Reads the notebook-level ``safe_words`` and ``pca_result``; row ``i`` of
    ``pca_result`` is plotted for the ``i``-th entry of ``safe_words``.

    Args:
        save: When true, additionally write the plot to
            ``output/google-pca-interactive-plot.html`` (without opening it).
        show_labels: When true, traces use ``markers+text`` mode; otherwise
            ``markers`` only.
    """
    scatter_mode = "markers+text" if show_labels else "markers"

    # One single-point trace per word, carrying the word as its text.
    traces = [
        go.Scatter(
            x=[pca_result[idx, 0]],
            y=[pca_result[idx, 1]],
            mode=scatter_mode,
            text=entry[0],
            textposition='top center',
        )
        for idx, entry in enumerate(safe_words)
    ]

    heading = "PCA for Google embedded vectors {} words"
    html_path = "output/google-pca-interactive-plot.html"

    def build_figure():
        # A fresh layout object is created for every render.
        return {
            "data": traces,
            "layout": go.Layout(title=heading.format(len(safe_words)),
                                showlegend=False),
        }

    # Save the html file
    if save:
        py.offline.plot(build_figure(), auto_open=False, filename=html_path)

    py.offline.iplot(build_figure())


pca_interactive_plot()

### T-SNE:

The same thing but now with T-SNE:

In [49]:
def create_tsne(perplexity=40):
    """Embed the Google vectors of ``safe_words`` in two dimensions via t-SNE.

    Reads the notebook-level ``safe_words`` and ``google_model``.

    Args:
        perplexity: Passed straight through to ``TSNE``.

    Returns:
        The output of ``TSNE.fit_transform`` over the vectors, in the same
        order as ``safe_words``.
    """
    vectors = [google_model[entry[0]] for entry in safe_words]

    # NOTE(review): newer scikit-learn releases appear to rename ``n_iter``
    # to ``max_iter`` -- confirm against the installed version.
    reducer = TSNE(
        perplexity=perplexity,
        n_components=2,
        init='pca',
        n_iter=2500,
        random_state=23,
    )
    return reducer.fit_transform(vectors)

tsne_result = create_tsne()

In [54]:
def tsne_interactive_plot(save=False, show_labels=False):
    """Draw the 2D t-SNE embedding as an interactive plotly scatter plot.

    Reads the notebook-level ``safe_words`` and ``tsne_result``; row ``i`` of
    ``tsne_result`` is plotted for the ``i``-th entry of ``safe_words``.

    Args:
        save: When true, additionally write the plot to
            ``output/google-tsne-interactive-plot.html`` (without opening it).
        show_labels: When true, traces use ``markers+text`` mode; otherwise
            ``markers`` only.
    """
    scatter_mode = "markers+text" if show_labels else "markers"

    # One single-point trace per word, carrying the word as its text.
    traces = [
        go.Scatter(
            x=[tsne_result[idx, 0]],
            y=[tsne_result[idx, 1]],
            mode=scatter_mode,
            text=entry[0],
            textposition='top center',
        )
        for idx, entry in enumerate(safe_words)
    ]

    heading = "T-SNE for Google embedded vectors {} words"
    html_path = "output/google-tsne-interactive-plot.html"

    def build_figure():
        # A fresh layout object is created for every render.
        return {
            "data": traces,
            "layout": go.Layout(title=heading.format(len(safe_words)),
                                showlegend=False),
        }

    # Save the html file
    if save:
        py.offline.plot(build_figure(), auto_open=False, filename=html_path)

    py.offline.iplot(build_figure())


tsne_interactive_plot()