# UMAP Analysis

Based on the following [UMAP library](https://umap-learn.readthedocs.io).

In [19]:
# Preambe
%matplotlib inline
import sys
import pickle
import numpy as np
import pandas as pd
from matplotlib import pyplot
import warnings
import plotly as py
import plotly.graph_objs as go
from gensim.models import Word2Vec
from gensim.models import KeyedVectors
from mpl_toolkits.mplot3d import Axes3D
py.offline.init_notebook_mode(connected=True)
warnings.filterwarnings('ignore')

In [2]:
import umap

Bring in all the data and the vectors.

In [3]:
%store -r model_words # List of negative model words

no stored variable # List of negative model words


In [12]:
def words_to_vectors(model, words):
    "Create the word to vector mapping"
    safe_words = []
    not_safe_words = []
    
    # Figure out which words are in the vocab:
    all_words = sorted(words)
    for word in all_words:
        try:
            vectors = model[word]
            safe_words.append([word, vectors])
        except:
            not_safe_words.append(word)
    
    return safe_words, not_safe_words

Load in the Stanford model:

In [6]:
def load_stanford_model(filename, binary=False):
    "Load the vectors from the given filename"
    return KeyedVectors.load_word2vec_format(filename, binary=binary)

In [9]:
stanford_model = load_stanford_model(filename = 'data/glove.6B.50d.w2v')

In [10]:
stanford_model

<gensim.models.keyedvectors.Word2VecKeyedVectors at 0x2b9ea719d208>

In [13]:
safe_words, not_safe_words = words_to_vectors(stanford_model, model_words)

In [34]:
df = pd.DataFrame(safe_words)

In [32]:
df.head()

Unnamed: 0,0,1
0,abdominal,"[1.1632, 0.029568, -0.7263, -0.3936, -0.52519,..."
1,acetaminophen,"[1.0553, -0.1131, -0.0013923, -1.1401, 0.13013..."
2,activated,"[0.217, -0.86111, 0.92143, 0.23773, 0.26972, -..."
3,actively,"[0.18831, -1.0536, -0.019578, -0.43794, 0.4318..."
4,acute,"[1.409, 0.51897, -0.34342, -0.57212, -0.76818,..."


### UMAP

In [11]:
fit = umap.UMAP()

In [22]:
test = np.random.rand(800, 4)

In [44]:
test[0:2]

array([[0.0157365 , 0.6377454 , 0.60186539, 0.3354363 ],
       [0.46611648, 0.8570292 , 0.65106412, 0.61984542]])

In [57]:
df[[1]].values[0]

array([array([ 1.1632  ,  0.029568, -0.7263  , -0.3936  , -0.52519 ,  1.6786  ,
        1.3173  ,  1.1164  ,  0.666   , -0.7813  ,  0.67096 , -0.62784 ,
       -0.43586 ,  0.43591 , -0.31903 ,  0.022634, -1.7691  , -0.80166 ,
       -0.056133, -0.020887, -1.1804  , -0.21833 ,  0.71093 , -0.22254 ,
       -0.34991 ,  0.26548 ,  0.21491 ,  0.7855  ,  0.23067 ,  0.39581 ,
        1.6828  ,  1.4606  ,  1.4138  ,  0.13999 , -0.58025 ,  1.1664  ,
        0.83934 ,  0.86103 ,  1.2071  ,  0.53773 , -0.13691 ,  0.64863 ,
       -1.0715  ,  0.5535  ,  0.593   , -0.099699,  1.0604  ,  0.043364,
       -0.67816 ,  0.67794 ], dtype=float32)], dtype=object)

In [52]:
f1 = fit.fit_transform(df[[1]].values[0][0])

ValueError: Expected 2D array, got 1D array instead:
array=[ 1.1632    0.029568 -0.7263   -0.3936   -0.52519   1.6786    1.3173
  1.1164    0.666    -0.7813    0.67096  -0.62784  -0.43586   0.43591
 -0.31903   0.022634 -1.7691   -0.80166  -0.056133 -0.020887 -1.1804
 -0.21833   0.71093  -0.22254  -0.34991   0.26548   0.21491   0.7855
  0.23067   0.39581   1.6828    1.4606    1.4138    0.13999  -0.58025
  1.1664    0.83934   0.86103   1.2071    0.53773  -0.13691   0.64863
 -1.0715    0.5535    0.593    -0.099699  1.0604    0.043364 -0.67816
  0.67794 ].
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.