## Other Embeddings

It is possible to use other embeddings and then do a dimensionality reduction. 

Two other embeddings include:

- [Google's](https://drive.google.com/file/d/0B7XkCwpI5KDYNlNUTTlSS21pQmM/edit?usp=sharing) - https://code.google.com/archive/p/word2vec/
- [Stanford's](https://nlp.stanford.edu/projects/glove/) - https://nlp.stanford.edu/projects/glove/

The goal of this Notebook is to use these two different embeddings to try and see if different results come about with PCA / T-SNE.

### Load the Google embedding:

In [13]:
from gensim.models import KeyedVectors
from gensim.scripts.glove2word2vec import glove2word2vec
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
import numpy as np
import plotly as py
import plotly.graph_objs as go
py.offline.init_notebook_mode(connected=True)

In [15]:
def load_google_model():
    "Load the google vectors from the given filename"
    global google_model
    filename = 'data/google.bin'
    google_model = KeyedVectors.load_word2vec_format(filename, binary=True)

load_google_model()

In [2]:
# Get the model_words from the other active notebook:
%store -r model_words

### PCA:

To start of with, the dimensionality reduction will only be done to two dimensions.

In [16]:
def create_pca(model):
    "Create the PCA model"
    safe_words = []
    not_safe_words = []
    
    # Figure out which words are int the Google vocab:
    all_words = sorted(model_words)[1:]
    for word in all_words:
        try:
            vectors = model[word]
            safe_words.append([word, vectors])
        except:
            not_safe_words.append(word)
            
    print(safe_words)
    print("AND")
    print(not_safe_words)
    
    # Create the PCA:
    pca = PCA(n_components=2, random_state=23)
    result = pca.fit_transform([word[1] for word in safe_words])
    return result, safe_words, not_safe_words

In [None]:
pca_result, safe_words, not_safe_words = create_pca(google_model)

The following words are not in the Google embedding:

In [17]:
not_safe_words

['apap',
 'ciwa',
 'copd',
 'dispo',
 'dxed',
 'etoh',
 'gerd',
 'haldol',
 'hiatal',
 'hospital1',
 'hospital3',
 'hospital6',
 'ivdu',
 'lastname',
 'lfts',
 'listerine',
 'lvef',
 'medquist36',
 'micu',
 'mrsa',
 'mucomyst',
 'name11',
 'name2',
 'name8',
 'namepattern1',
 'namepattern2',
 'namepattern4',
 'oopherectomy',
 'pcwp',
 'ptsd',
 'risperdal',
 'stitle',
 'tegretol',
 'trazadone',
 'turp',
 'utis',
 'valproic']

Now that we have the embeddings for PCA, we can create an interactive 2D plot:

In [24]:
def pca_interactive_plot(words, pca_result, title, filename, save=False, show_labels=False):
    "Creates the PCA interactive plot"
    labels = []
    data = []

    x = []
    y = []
    
    for i, word in enumerate(words):
        x.append(pca_result[i, 0])
        y.append(pca_result[i, 1])
        labels.append(word[0])

    # Should we show markers / markers & text
    if show_labels:
        mode = "markers+text"
    else:
        mode = "markers"
        
    for i in range(len(x)):
        trace0= go.Scatter(
            x= [x[i]],
            y= [y[i]],
            mode= mode,
            text= labels[i],
            textposition='top center')
        data.append(trace0);
    
    # Save the html file 
    if save:
        py.offline.plot({
            "data": data, 
            "layout": go.Layout(title=title.format(len(safe_words)), 
                                showlegend=False)
        }, auto_open=False, filename=filename)

    py.offline.iplot({
        "data": data, 
        "layout": go.Layout(title=title.format(len(safe_words)), 
                            showlegend=False)
    })
        

pca_interactive_plot(safe_words, pca_result, title = "PCA for Google embedded vectors {} words", 
                     filename = "output/google-pca-interactive-plot.html")

### T-SNE:

The same thing but now with T-SNE:

In [29]:
def create_tsne(model, words, perplexity=40):
    "Create the t-SNE model"
    
    tokens = [model[word[0]] for word in words]
    
    tsne_model = TSNE(perplexity=perplexity, n_components=2, init='pca', n_iter=2500, random_state=23)
    results = tsne_model.fit_transform(tokens)

    return results

tsne_result = create_tsne(google_model, safe_words)

In [28]:
def tsne_interactive_plot(words, tsne_result, title, filename, save=False, show_labels=False):
    "Creates the t-SNE interactive plot"
    labels = []
    data = []

    x = []
    y = []
    
    for i, word in enumerate(words):
        x.append(tsne_result[i, 0])
        y.append(tsne_result[i, 1])
        labels.append(word[0])

    # Should we show markers / markers & text
    if show_labels:
        mode = "markers+text"
    else:
        mode = "markers"
        
    for i in range(len(x)):
        trace0= go.Scatter(
            x= [x[i]],
            y= [y[i]],
            mode= mode,
            text= labels[i],
            textposition='top center')
        data.append(trace0);
    
    # Save the html file 
    if save:
        py.offline.plot({
            "data": data, 
            "layout": go.Layout(title=title.format(len(safe_words)), 
                                showlegend=False)
        }, auto_open=False, filename=filename)

    py.offline.iplot({
        "data": data, 
        "layout": go.Layout(title=title.format(len(safe_words)), 
                            showlegend=False)
    })
        

tsne_interactive_plot(safe_words, tsne_result, title = "T-SNE for Google embedded vectors {} words", 
                     filename = "output/google-tsne-interactive-plot.html")

### Stanford:

A similar analysis (PCA / T-SNE) will now be performed using the Stanford Glove embeddings:

In [5]:
def convert_2word2vec(, word2vec_output_file):
    "Takes Stanford's file format and converts it into a file that is useable by Gensim"
    
    print('Working on converting {} to {}'.format(glove_input_file, word2vec_output_file))
    glove2word2vec(glove_input_file, word2vec_output_file)
    print('Done with the conversion')
    
convert_2word2vec(glove_input_file = 'data/glove.6B.50d.txt', word2vec_output_file = 'data/glove.6B.50d.word2vec')

Working on converting data/glove.6B.50d.txt to data/glove.6B.50d.word2vec
Done with the conversion


In [10]:
def load_stanford_model(filename, binary=False):
    "Load the vectors from the given filename"
    return KeyedVectors.load_word2vec_format(filename, binary=binary)

In [None]:
stanford_model = load_stanford_model(filename = 'data/glove.6B.50d.word2vec')

In [11]:
pca_stan_result, safe_words_stan, not_safe_words_stan = create_pca(stanford_model)

The following are words which were not able to be found in the Stanford model:

In [13]:
not_safe_words_stan

['ambulating',
 'benzos',
 'ciwa',
 'colectomy',
 'dispo',
 'dxed',
 'extubated',
 'flexeril',
 'hemodynamically',
 'hospital1',
 'hospital3',
 'hospital6',
 'hypothyroid',
 'illicits',
 'ivdu',
 'lvef',
 'medquist36',
 'mucomyst',
 'name11',
 'name2',
 'name8',
 'namepattern1',
 'namepattern2',
 'namepattern4',
 'noncontributory',
 'oopherectomy',
 'pcwp',
 'polysubstance',
 'stitle',
 'trazadone']

In [26]:
pca_interactive_plot(safe_words_stan, pca_stan_result, title = "PCA for Stanford embedded vectors {} words",
                    filename = "output/stanford-pca-interactive-plot.html")

In [30]:
tsne_stan_result = create_tsne(stanford_model, safe_words_stan)

In [33]:
tsne_interactive_plot(safe_words_stan, tsne_stan_result, title = "TSNE for Stanford embedded vectors {} words",
                    filename = "output/stanford-tsne-interactive-plot.html")

## Medical Stanford Embeddings

After finding [this paper](https://web.stanford.edu/class/cs224n/reports/2744372.pdf), it makes sense to give these [medical embeddings](https://github.com/clinicalml/embeddings) a shot.

The first thing to do is figure out how much of the vocabulary exists in these vectors.

In [11]:
medical_note_embeddings = load_stanford_model(filename = 'data/stanford-medical/stanford_cuis_svd_300.w2v', binary=True)

In [20]:
# Vocab size:
len(medical_note_embeddings.vocab)

14042

In [32]:
# Test print out a. basic word:
medical_note_embeddings['4264290']

array([6.3369293e-10], dtype=float32)

In [29]:
medical_note_embeddings.vocab.keys()

dict_keys(['22705', '4264290', '00561', '0737', '04360', '9097', '7851', '03298', '0734', '11694', '5362', '29435', '3666', '04242', '7212', '00477', '0853', '01089', '1333', '00799', '8370', '0591', '10120', '8461', '08204', '9732', '12373', '1205', '02730', '5454', '11888', '3084', '10052', '2462', '12423', '7183', '63248', '6171', '00659', '00898', '8826', '3780', '14334', '9586', '15281', '03724', '1327', '37892', '64672', '5243', '0207', '8607', '44795', '09919', '6733', '6547', '3227', '43934', '0529', '03062', '1397', '20830', '6517', '56007', '2195', '73717', '2712', '01884', '4349', '09492', '8802', '1952', '9443', '07343', '10186', '20819', '9752', '03924', '8001', '33032', '6405', '09359', '1539', '70927', '9520', '02321', '9631', '48815', '01347', '7695', '07751', '07607', '02307', '6495', '32638', '01244', '03348', '14012', '5090', '67337', '12327', '04950', '1252', '1071', '55047', '40707', '3756', '7934', '40892', '7471', '0288', '07017', '1388', '58256', '04994', '12610

In [17]:
pca_med_result, safe_words_med, not_safe_words_med = create_pca(medical_note_embeddings)

[]
AND
['abdominal', 'acetaminophen', 'activated', 'actively', 'acute', 'acutely', 'adhd', 'admissions', 'admitted', 'adrenal', 'adverse', 'agitation', 'agreed', 'airway', 'alcoholic', 'alcoholism', 'alert', 'allergies', 'allergy', 'altered', 'ambien', 'ambulating', 'anemia', 'apap', 'apnea', 'appears', 'appendectomy', 'appetite', 'approx', 'artery', 'aspiration', 'aspirin', 'asthma', 'ativan', 'atrial', 'attempted', 'attempts', 'attending', 'awake', 'axis', 'barbiturates', 'baseline', 'been', 'began', 'believed', 'benadryl', 'benzo', 'benzodiazepine', 'benzodiazepines', 'benzos', 'biopsy', 'bipolar', 'bleeding', 'borderline', 'bottles', 'bowel', 'breathing', 'brought', 'bypass', 'calf', 'called', 'came', 'cardiac', 'cardiology', 'cardiothoracic', 'cardiovascular', 'cath', 'cervical', 'changes', 'charcoal', 'children', 'cholecystectomy', 'chronic', 'cirrhosis', 'ciwa', 'cleared', 'clonazepam', 'cocaine', 'codeine', 'coherent', 'colectomy', 'colon', 'colonic', 'compartment', 'complicati

ValueError: Expected 2D array, got 1D array instead:
array=[].
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.