## Loading Libraries

In [None]:
!pip install pytorch-pretrained-bert tsne
!pip install tensorflow_text
!pip install bokeh
!pip install simpleneighbors[annoy]
!pip install tqdm

#@title Setup common imports and functions
import bokeh
import bokeh.models
import bokeh.plotting
import numpy as np
import os
import pandas as pd
import tensorflow.compat.v2 as tf
import tensorflow_hub as hub
from tensorflow_text import SentencepieceTokenizer
import sklearn.metrics.pairwise

from simpleneighbors import SimpleNeighbors
from tqdm import tqdm
from tqdm import trange

import torch
from pytorch_pretrained_bert import BertTokenizer, BertModel, BertForMaskedLM

from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
import plotly.graph_objects as go
from plotly.subplots import make_subplots

import matplotlib.pyplot as plt
% matplotlib inline

#@title Load the Universal Sentence Encoder's TF Hub module
from absl import logging

import tensorflow as tf

import tensorflow_hub as hub
import numpy as np
import os
import pandas as pd
import re
import seaborn as sns

# TF Hub handle of the Universal Sentence Encoder to load; the `#@param` list holds the Colab form's choices.
module_url = "https://tfhub.dev/google/universal-sentence-encoder/4" #@param ["https://tfhub.dev/google/universal-sentence-encoder/4", "https://tfhub.dev/google/universal-sentence-encoder-large/5"]
# Load the module once at cell level; `embed` below calls this module-level `model`.
model = hub.load(module_url)
print ("module %s loaded" % module_url)
def embed(input):
  """Return whatever the loaded TF Hub module produces for `input`.

  `input` is handed to `model` unchanged; presumably a list/tensor of
  sentences -- TODO confirm against the module's documentation.
  """
  return model(input)

## Sub-Word Visualization
    
### Parameters:

*   **text** - (*string*) input sentence
*   **dimensionality_reduction** - (*string*) 'pca' for Principal Component Analysis or 'tsne' for TSNE
*   **sum** - (*boolean*) True to sum the vectors from the last four layers.
*   **concat** - (*boolean*) True to concatenate the vectors (that is, append them together) from the last four layers.

### Steps:
1.   Load pre-trained tokenizer and base model
2.   Add the special tokens [CLS] and [SEP] to the input string 
3.   Split the sentence into tokens, generating sub-words for any word that isn't in the model's vocabulary
4.   Put the model in "evaluation" mode, meaning feed-forward operation
5.   Predict hidden states features for each layer
6.   Concatenate the tensors for all layers. We use 'stack' here to create a new dimension in the tensor
7.   Store one vector per token: shape [number of tokens x 3,072] when concatenating, [number of tokens x 768] when summing
8.   Concatenate the vectors (that is, append them together) from the last four layers OR sum the vectors from the last four layers
9.   Apply dimensionality reduction using either PCA or TSNE
10.  Utilizing plotly to interactively visualize the 2D word vectors






In [None]:
 def subword_visualization(text,dimensionality_reduction, sum=False, concat=False):
    """Plot 2-D projections of BERT's contextual vector for every sub-word token of `text`.

    The sentence is wrapped in [CLS]/[SEP], split with the 'bert-base-uncased'
    WordPiece vocabulary and run through the pre-trained encoder. For each
    requested representation the per-token vectors are reduced to two
    dimensions, the resulting points are printed, and an interactive plotly
    scatter plot is shown (hover text is "(position) token"). The [CLS] and
    [SEP] vectors take part in the projection but are not plotted.

    Parameters
    ----------
    text : str
        Input sentence.
    dimensionality_reduction : str
        'pca' selects PCA; any other value selects t-SNE.
    sum : bool
        If True, plot the element-wise sum of the last four encoder layers.
        (The name shadows the builtin inside this function; it is kept so
        existing keyword calls continue to work.)
    concat : bool
        If True, plot the concatenation of the last four encoder layers.

    Returns
    -------
    None
        Nothing is drawn when both `sum` and `concat` are False.
    """
    # Load pre-trained model tokenizer (vocabulary)
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

    # Add the special tokens, then split the sentence into WordPiece tokens.
    tokenized_text = tokenizer.tokenize("[CLS] " + text + " [SEP]")
    print("Tokenized",tokenized_text)

    # Map the token strings to their vocabulary indices.
    indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)

    # Mark each of the tokens as belonging to sentence "1".
    segments_ids = [1] * len(tokenized_text)

    # Convert inputs to PyTorch tensors (batch of one sentence).
    tokens_tensor = torch.tensor([indexed_tokens])
    segments_tensors = torch.tensor([segments_ids])

    # Load pre-trained model (weights) and switch to feed-forward-only "evaluation" mode.
    model = BertModel.from_pretrained('bert-base-uncased')
    model.eval()

    # Predict hidden states features for each layer
    with torch.no_grad():
        encoded_layers, _ = model(tokens_tensor, segments_tensors)

    # Stack the per-layer outputs into a new leading dimension, drop the batch
    # dimension and put tokens first: [n_tokens x n_layers x hidden_size].
    token_embeddings = torch.squeeze(torch.stack(encoded_layers, dim=0), dim=1).permute(1,0,2)

    def project_to_2d(token_vectors):
        """Reduce one vector per token to an [n_tokens x 2] array with PCA or t-SNE."""
        # float64 matches what scikit-learn received from the former per-element `.item()` lists.
        samples = token_vectors.detach().cpu().numpy().astype(np.float64)
        if dimensionality_reduction=='pca':
            return PCA(n_components=2).fit_transform(samples)
        # scikit-learn requires perplexity < n_samples; a short sentence has far
        # fewer than 40 tokens, so cap it instead of failing.
        perplexity = min(40, len(samples) - 1)
        try:
            # Newer scikit-learn releases name the iteration budget `max_iter`.
            tsne = TSNE(n_components=2, verbose=1, perplexity=perplexity, max_iter=300)
        except TypeError:
            # Older releases only know `n_iter`.
            tsne = TSNE(n_components=2, verbose=1, perplexity=perplexity, n_iter=300)
        return tsne.fit_transform(samples)

    def plot_tokens(token_vectors):
        """Project the vectors, print the labelled points and show the scatter plot."""
        coords = project_to_2d(token_vectors)
        points = {}
        for position, token in enumerate(tokenized_text):
            # Skip [CLS] (first) and [SEP] (last).
            if 0 < position < len(tokenized_text) - 1:
                points["(" + str(position) + ") " + token] = (coords[position][0], coords[position][1])
        print(points)

        fig = go.Figure(data=go.Scatter(x=[xy[0] for xy in points.values()],
                                        y=[xy[1] for xy in points.values()],
                                        mode='markers',
                                        text=list(points))) # hover text goes here
        fig.update_traces(marker=dict(size=20,
                                      line=dict(width=2,
                                                color='DarkSlateGrey')),
                          selector=dict(mode='markers'))
        fig.show()

    if concat:
        print("\nVector Concatination Visualization")
        # Append the last four layers (last layer first) for every token:
        # [n_tokens x 4*hidden_size].
        plot_tokens(torch.cat([token_embeddings[:, layer] for layer in (-1, -2, -3, -4)], dim=1))
    if sum:
        print("Vector Summation Visualization")
        # Element-wise sum of the last four layers for every token: [n_tokens x hidden_size].
        plot_tokens(torch.sum(token_embeddings[:, -4:], dim=1))


In [None]:
# Demo: PCA projection of each sub-word token, drawn for both the concatenated and the summed representation.
subword_visualization("This is my dog",dimensionality_reduction='pca', sum=True, concat=True)

## Word Visualization
    
### Parameters:

*   **text** - (*string*) input sentence
*   **dimensionality_reduction** - (*string*) 'pca' for Principal Component Analysis or 'tsne' for TSNE
*   **sum** - (*boolean*) True to sum the vectors from the last four layers.
*   **concat** - (*boolean*) True to concatenate the vectors (that is, append them together) from the last four layers.

### Steps:
1.   Load pre-trained tokenizer and base model
2.   Add the special tokens [CLS] and [SEP] to the input string 
3.   Split the sentence into tokens, generating sub-words for any word that isn't in the model's vocabulary
4.   Put the model in "evaluation" mode, meaning feed-forward operation
5.   Predict hidden states features for each layer
6.   Concatenate the tensors for all layers. We use 'stack' here to create a new dimension in the tensor
7.   Store one vector per token: shape [number of tokens x 3,072] when concatenating, [number of tokens x 768] when summing
8.   Concatenate the vectors (that is, append them together) from the last four layers OR sum the vectors from the last four layers
9.   Apply dimensionality reduction using either PCA or TSNE
10.  Average the reduced vectors of a word's sub-words to form a single vector for the full word (the concatenation of its sub-words)
11.  Utilizing plotly to interactively visualize the 2D word vectors






In [None]:
def word_visualization(text,dimensionality_reduction, sum=False, concat=False):
    """Plot one 2-D point per word of `text`, built from BERT's contextual sub-word vectors.

    The sentence is wrapped in [CLS]/[SEP], split with the 'bert-base-uncased'
    WordPiece vocabulary and run through the pre-trained encoder. For each
    requested representation the per-token vectors are reduced to two
    dimensions; the 2-D points of a word's WordPiece tokens (a leading token
    followed by its '##' continuation pieces) are then averaged into a single
    point. The points are printed and shown as an interactive plotly scatter
    plot. [CLS] and [SEP] take part in the projection but are not plotted.

    Every point is labelled "(position) word", where position is the index of
    the word's first token and continuation pieces keep their '##' prefix
    (e.g. "(1) em##bed##ding"), so a word that occurs more than once yields
    one point per occurrence.

    Parameters
    ----------
    text : str
        Input sentence.
    dimensionality_reduction : str
        'pca' selects PCA; any other value selects t-SNE.
    sum : bool
        If True, plot the element-wise sum of the last four encoder layers.
        (The name shadows the builtin inside this function; it is kept so
        existing keyword calls continue to work.)
    concat : bool
        If True, plot the concatenation of the last four encoder layers.

    Returns
    -------
    None
        Nothing is drawn when both `sum` and `concat` are False.
    """
    # Load pre-trained model tokenizer (vocabulary)
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

    # Add the special tokens, then split the sentence into WordPiece tokens.
    tokenized_text = tokenizer.tokenize("[CLS] " + text + " [SEP]")
    print("Tokenized",tokenized_text)

    # Map the token strings to their vocabulary indices.
    indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)

    # Mark each of the tokens as belonging to sentence "1".
    segments_ids = [1] * len(tokenized_text)

    # Convert inputs to PyTorch tensors (batch of one sentence).
    tokens_tensor = torch.tensor([indexed_tokens])
    segments_tensors = torch.tensor([segments_ids])

    # Load pre-trained model (weights) and switch to feed-forward-only "evaluation" mode.
    model = BertModel.from_pretrained('bert-base-uncased')
    model.eval()

    # Predict hidden states features for each layer
    with torch.no_grad():
        encoded_layers, _ = model(tokens_tensor, segments_tensors)

    # Stack the per-layer outputs into a new leading dimension, drop the batch
    # dimension and put tokens first: [n_tokens x n_layers x hidden_size].
    token_embeddings = torch.squeeze(torch.stack(encoded_layers, dim=0), dim=1).permute(1,0,2)

    def project_to_2d(token_vectors):
        """Reduce one vector per token to an [n_tokens x 2] array with PCA or t-SNE."""
        # float64 matches what scikit-learn received from the former per-element `.item()` lists.
        samples = token_vectors.detach().cpu().numpy().astype(np.float64)
        if dimensionality_reduction=='pca':
            return PCA(n_components=2).fit_transform(samples)
        # scikit-learn requires perplexity < n_samples; a short sentence has far
        # fewer than 40 tokens, so cap it instead of failing.
        perplexity = min(40, len(samples) - 1)
        try:
            # Newer scikit-learn releases name the iteration budget `max_iter`.
            tsne = TSNE(n_components=2, verbose=1, perplexity=perplexity, max_iter=300)
        except TypeError:
            # Older releases only know `n_iter`.
            tsne = TSNE(n_components=2, verbose=1, perplexity=perplexity, n_iter=300)
        return tsne.fit_transform(samples)

    def word_points(coords):
        """Average the 2-D points of each word's WordPiece tokens, keyed by "(start) word"."""
        points = {}
        sep_index = len(tokenized_text) - 1   # [SEP] is always last
        start = 1                             # index 0 is [CLS]
        while start < sep_index:
            # Extend the word over its '##' continuation pieces. Testing the full
            # '##' prefix keeps a literal '#' token from being glued to its neighbour.
            stop = start + 1
            while stop < sep_index and tokenized_text[stop].startswith('##'):
                stop += 1
            label = "(" + str(start) + ") " + "".join(tokenized_text[start:stop])
            # Each piece contributes exactly once to the word's average.
            centre = np.average(coords[start:stop], axis=0)
            points[label] = (centre[0], centre[1])
            start = stop
        return points

    def plot_words(token_vectors):
        """Project the vectors, merge sub-words into words, print and plot the result."""
        points = word_points(project_to_2d(token_vectors))
        print(points)

        fig = go.Figure(data=go.Scatter(x=[xy[0] for xy in points.values()],
                                        y=[xy[1] for xy in points.values()],
                                        mode='markers',
                                        text=list(points))) # hover text goes here
        fig.update_traces(marker=dict(size=20,
                                      line=dict(width=2,
                                                color='DarkSlateGrey')),
                          selector=dict(mode='markers'))
        fig.show()

    if concat:
        print("\nVector Concatination Visualization")
        # Append the last four layers (last layer first) for every token:
        # [n_tokens x 4*hidden_size].
        plot_words(torch.cat([token_embeddings[:, layer] for layer in (-1, -2, -3, -4)], dim=1))

    if sum:
        print("Vector Summation Visualization")
        # Element-wise sum of the last four layers for every token: [n_tokens x hidden_size].
        plot_words(torch.sum(token_embeddings[:, -4:], dim=1))
    
        

In [None]:
# Demo: a sentence made of the same word twice, projected with PCA for both the concatenated and the summed representation.
word_visualization("john john",dimensionality_reduction='pca', sum=True, concat=True)

## Word-Subword Visualization
    
### Parameters:

*   **text** - (*string*) input sentence
*   **dimensionality_reduction** - (*string*) 'pca' for Principal Component Analysis or 'tsne' for TSNE
*   **sum** - (*boolean*) True to sum the vectors from the last four layers.
*   **concat** - (*boolean*) True to concatenate the vectors (that is, append them together) from the last four layers.

### Steps:
1.   Load pre-trained tokenizer and base model
2.   Add the special tokens [CLS] and [SEP] to the input string 
3.   Split the sentence into tokens, generating sub-words for any word that isn't in the model's vocabulary
4.   Put the model in "evaluation" mode, meaning feed-forward operation
5.   Predict hidden states features for each layer
6.   Concatenate the tensors for all layers. We use 'stack' here to create a new dimension in the tensor
7.   Store one vector per token: shape [number of tokens x 3,072] when concatenating, [number of tokens x 768] when summing
8.   Concatenate the vectors (that is, append them together) from the last four layers OR sum the vectors from the last four layers
9.   Apply dimensionality reduction using either PCA or TSNE
10.  Average the reduced vectors of a word's sub-words to form a single vector for the full word (the concatenation of its sub-words)
11.  Use plotly to interactively visualize both the merged words and the individual sub-words






In [None]:
def word_subword_visualization(text,dimensionality_reduction, sum=False, concat=False):
    """Plot BERT sub-word points and their merged whole-word points in one figure.

    The sentence is wrapped in [CLS]/[SEP], split with the 'bert-base-uncased'
    WordPiece vocabulary and run through the pre-trained encoder. For each
    requested representation the per-token vectors are reduced to two
    dimensions and drawn as two plotly traces in the same figure:

    * "sub-words": one point per token, labelled "(position) token";
    * "words": one point per word, the average of the 2-D points of its
      WordPiece tokens (a leading token followed by its '##' continuation
      pieces), labelled "(position of first token) word" with the '##'
      prefixes kept (e.g. "(1) em##bed##ding").

    Both point dictionaries are printed. [CLS] and [SEP] take part in the
    projection but are not plotted. Because word labels carry their position,
    a word that occurs more than once yields one point per occurrence.

    Parameters
    ----------
    text : str
        Input sentence.
    dimensionality_reduction : str
        'pca' selects PCA; any other value selects t-SNE.
    sum : bool
        If True, plot the element-wise sum of the last four encoder layers.
        (The name shadows the builtin inside this function; it is kept so
        existing keyword calls continue to work.)
    concat : bool
        If True, plot the concatenation of the last four encoder layers.

    Returns
    -------
    None
        Nothing is drawn when both `sum` and `concat` are False.
    """
    # Load pre-trained model tokenizer (vocabulary)
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

    # Add the special tokens, then split the sentence into WordPiece tokens.
    tokenized_text = tokenizer.tokenize("[CLS] " + text + " [SEP]")
    print("Tokenized",tokenized_text)

    # Map the token strings to their vocabulary indices.
    indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)

    # Mark each of the tokens as belonging to sentence "1".
    segments_ids = [1] * len(tokenized_text)

    # Convert inputs to PyTorch tensors (batch of one sentence).
    tokens_tensor = torch.tensor([indexed_tokens])
    segments_tensors = torch.tensor([segments_ids])

    # Load pre-trained model (weights) and switch to feed-forward-only "evaluation" mode.
    model = BertModel.from_pretrained('bert-base-uncased')
    model.eval()

    # Predict hidden states features for each layer
    with torch.no_grad():
        encoded_layers, _ = model(tokens_tensor, segments_tensors)

    # Stack the per-layer outputs into a new leading dimension, drop the batch
    # dimension and put tokens first: [n_tokens x n_layers x hidden_size].
    token_embeddings = torch.squeeze(torch.stack(encoded_layers, dim=0), dim=1).permute(1,0,2)

    def project_to_2d(token_vectors):
        """Reduce one vector per token to an [n_tokens x 2] array with PCA or t-SNE."""
        # float64 matches what scikit-learn received from the former per-element `.item()` lists.
        samples = token_vectors.detach().cpu().numpy().astype(np.float64)
        if dimensionality_reduction=='pca':
            return PCA(n_components=2).fit_transform(samples)
        # scikit-learn requires perplexity < n_samples; a short sentence has far
        # fewer than 40 tokens, so cap it instead of failing.
        perplexity = min(40, len(samples) - 1)
        try:
            # Newer scikit-learn releases name the iteration budget `max_iter`.
            tsne = TSNE(n_components=2, verbose=1, perplexity=perplexity, max_iter=300)
        except TypeError:
            # Older releases only know `n_iter`.
            tsne = TSNE(n_components=2, verbose=1, perplexity=perplexity, n_iter=300)
        return tsne.fit_transform(samples)

    def subword_points(coords):
        """One point per token except [CLS]/[SEP], keyed by "(position) token"."""
        points = {}
        for position, token in enumerate(tokenized_text):
            if 0 < position < len(tokenized_text) - 1:
                points["(" + str(position) + ") " + token] = (coords[position][0], coords[position][1])
        return points

    def word_points(coords):
        """Average the 2-D points of each word's WordPiece tokens, keyed by "(start) word"."""
        points = {}
        sep_index = len(tokenized_text) - 1   # [SEP] is always last
        start = 1                             # index 0 is [CLS]
        while start < sep_index:
            # Extend the word over its '##' continuation pieces. Testing the full
            # '##' prefix keeps a literal '#' token from being glued to its neighbour.
            stop = start + 1
            while stop < sep_index and tokenized_text[stop].startswith('##'):
                stop += 1
            label = "(" + str(start) + ") " + "".join(tokenized_text[start:stop])
            # Each piece contributes exactly once to the word's average.
            centre = np.average(coords[start:stop], axis=0)
            points[label] = (centre[0], centre[1])
            start = stop
        return points

    def plot_words_and_subwords(token_vectors):
        """Project the vectors and draw the sub-word and word traces in one figure."""
        coords = project_to_2d(token_vectors)
        fig = go.Figure()

        pieces = subword_points(coords)
        print(pieces)
        fig.add_trace(go.Scatter(x=[xy[0] for xy in pieces.values()],
                                 y=[xy[1] for xy in pieces.values()],
                                 mode='markers',
                                 name='sub-words',
                                 marker_line_color="midnightblue", marker_color="lightskyblue",
                                 text=list(pieces))) # hover text goes here

        words = word_points(coords)
        print(words)
        fig.add_trace(go.Scatter(x=[xy[0] for xy in words.values()],
                                 y=[xy[1] for xy in words.values()],
                                 mode='markers',
                                 name='words',
                                 text=list(words))) # hover text goes here

        # Applied once to both traces; as before, this sets the marker outline
        # colour of the sub-word trace as well.
        fig.update_traces(marker=dict(size=20,
                                      line=dict(width=2,
                                                color='DarkSlateGrey')),
                          selector=dict(mode='markers'))
        fig.show()

    if concat:
        print("\nVector Concatination Visualization")
        # Append the last four layers (last layer first) for every token:
        # [n_tokens x 4*hidden_size].
        plot_words_and_subwords(torch.cat([token_embeddings[:, layer] for layer in (-1, -2, -3, -4)], dim=1))

    if sum:
        print("Vector Summation Visualization")
        # Element-wise sum of the last four layers for every token: [n_tokens x hidden_size].
        plot_words_and_subwords(torch.sum(token_embeddings[:, -4:], dim=1))

In [None]:
# Demo: a repeated word that the tokenizer splits into several sub-words, projected with PCA for both representations.
word_subword_visualization("embedding embedding",dimensionality_reduction='pca', sum=True, concat=True)

Tokenized ['[CLS]', 'em', '##bed', '##ding', 'em', '##bed', '##ding', '[SEP]']

Vector Concatination Visualization
{'(1) em': (-1.0799341765454715, -16.64670363251038), '(2) ##bed': (22.79913526963101, 3.4318490269519546), '(3) ##ding': (-11.468859762713148, -0.523466751535016), '(4) em': (0.1492580271966144, -19.429030250781985), '(5) ##bed': (24.572567199802783, 4.103602650662588), '(6) ##ding': (-11.4964807491967, 1.3012014284517273)}
{'em##bed##ding': (9.449477919401371, -2.480155880251271)}


Vector Summation Visualization
{'(1) em': (-2.222928151382614, 33.88230097624589), '(2) ##bed': (42.94588158641533, -8.236856529345646), '(3) ##ding': (-22.302469345519462, -3.665674568501639), '(4) em': (-0.11904650591324657, 39.354089178920134), '(5) ##bed': (46.16601170358789, -9.205521569420137), '(6) ##ding': (-22.514714189624762, -7.472111097227203)}
{'em##bed##ding': (17.42456567790944, 3.367733735713165)}


## Sentence Visualization
    
### Parameters:

*   **sentences** - (*list*) list of sentences
*   **dimensionality_reduction** - (*string*) 'pca' for Principal Component Analysis or 'tsne' for TSNE
*   **multi-lingual** - (*boolean*) True to load multi-lingual variant of BERT or USE
*   **bert** - (*boolean*) True to load the BERT base or multi-lingual model
*   **use** - (*boolean*) True to load the USE

### Steps:
1.   Load pre-trained tokenizer and base model
2.   Add the special tokens [CLS] and [SEP] to the input string 
3.   Split the sentence into tokens, generating sub-words for any word that isn't in the model's vocabulary
4.   Put the model in "evaluation" mode, meaning feed-forward operation
5.   Predict hidden states features for each layer
6.   Calculate the average of all the token vectors
7.   Apply dimensionality reduction using either PCA or TSNE
8.   Utilizing plotly to interactively visualize the sentence vectors






In [None]:
def _scatter_labeled_points(labels, xs, ys):
    """Show an interactive plotly scatter plot with one marker per point.

    Args:
        labels: hover text for each marker.
        xs: x coordinates, aligned with ``labels``.
        ys: y coordinates, aligned with ``labels``.
    """
    fig = go.Figure(data=go.Scatter(x=xs,
                                    y=ys,
                                    mode='markers',
                                    text=labels))  # hover text goes here

    fig.update_traces(marker=dict(size=20,
                                  line=dict(width=2,
                                            color='DarkSlateGrey')),
                      selector=dict(mode='markers'))
    fig.show()


def sentence_visualization(sentences, dimensionality_reduction, multi_lingual=False, bert=False,use=False):
    """Plot one 2-D point per sentence from BERT and/or USE sentence embeddings.

    Args:
        sentences: list of sentence strings.
        dimensionality_reduction: 'pca' selects PCA for the BERT branch; any
            other value selects TSNE. The USE branch always uses PCA.
        multi_lingual: if True the BERT branch loads
            'bert-base-multilingual-uncased' instead of 'bert-base-uncased'.
            It has no effect on the USE branch, which calls the notebook-level
            ``embed`` function as-is.
        bert: if True, embed each sentence as the mean of its last-layer BERT
            token vectors and plot the reduced result.
        use: if True, embed the sentences with ``embed`` and plot the reduced
            result.

    Returns:
        None. The function prints intermediate values and shows plotly figures.
    """
    if bert:
        print("BERT VISUALIZATION")
        # Tokenizer (vocabulary) and model (weights) come from the same checkpoint.
        checkpoint = 'bert-base-multilingual-uncased' if multi_lingual else 'bert-base-uncased'
        tokenizer = BertTokenizer.from_pretrained(checkpoint)
        bert_model = BertModel.from_pretrained(checkpoint)

        # Put the model in "evaluation" mode, meaning feed-forward operation.
        bert_model.eval()

        sentences_embeddings = []
        for sentence in sentences:
            # Add the special tokens.
            marked_text = "[CLS] " + sentence + " [SEP]"

            # Split the sentence into tokens.
            tokenized_text = tokenizer.tokenize(marked_text)
            print(tokenized_text)
            # Map the token strings to their vocabulary indices.
            indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)

            # Mark each of the tokens as belonging to sentence "1".
            segments_ids = [1] * len(tokenized_text)

            # Convert inputs to PyTorch tensors
            tokens_tensor = torch.tensor([indexed_tokens])
            segments_tensors = torch.tensor([segments_ids])

            # Predict hidden states features for each layer
            with torch.no_grad():
                encoded_layers, _ = bert_model(tokens_tensor, segments_tensors)

            # Last layer, first (only) batch entry: one vector per token.
            token_vecs = encoded_layers[-1][0]

            # The sentence embedding is the average of all its token vectors.
            sentences_embeddings.append(torch.mean(token_vecs, dim=0))

        # Convert the tensors to plain lists of floats for scikit-learn.
        sentence_notensor = []
        for sentence_embedding in sentences_embeddings:
            sentence_vector = sentence_embedding.tolist()
            print(sentence_vector)
            sentence_notensor.append(sentence_vector)

        if dimensionality_reduction=='pca':
            result = PCA(n_components=2).fit_transform(sentence_notensor)
        else:
            tsne = TSNE(n_components=2, verbose=1, perplexity=40, n_iter=300)
            result = tsne.fit_transform(sentence_notensor)

        labels = ["(" + str(count) + ") " + sentence
                  for count, sentence in enumerate(sentences)]
        _scatter_labeled_points(labels,
                                [result[count][0] for count in range(len(labels))],
                                [result[count][1] for count in range(len(labels))])
    if use:
        print("USE VISUALIZATION")
        pca_result = PCA(n_components=2).fit_transform(embed(sentences))

        # Label with the `sentences` argument; the previous code read a global
        # variable here, so the labels did not follow the function's input.
        labels = []
        for count, sentence in enumerate(sentences):
            print(pca_result[count])
            labels.append("(" + str(count) + ") " + sentence)

        _scatter_labeled_points(labels,
                                [pca_result[count][0] for count in range(len(labels))],
                                [pca_result[count][1] for count in range(len(labels))])

In [None]:
# Example sentences to embed; blank lines separate the groups of related comments.
messages = [
    "This in Ruby Red would be gorgeous",
    "Agreed! This will also look good in Matte Black",
    "I disagree man, white and only white suits this car",

    "Nice Tires",
    "They still need to improve the exhaust system",
    "You should! look at those wheels!",

    "I prefer the Dodge to be honest",
    "Shut up man, the Dodge sucks compared to this beauty",
]
# Alternative mixed Arabic/English input:
# messages = [
#                 'كلب',
#                 'لدي اختبار غدا',
#                 'أستمتع بالمشي لمسافات طويلة على طول الشاطئ مع كلبي.',
#                 'dog',
#                 'i have a test tomorrow',
#                 'I enjoy taking long walks along the beach with my dog.'
#          ]

sentence_visualization(
    messages,
    dimensionality_reduction='pca',
    multi_lingual=False,
    bert=True,
    use=True,
)

BERT VISUALIZATION
['[CLS]', 'this', 'in', 'ruby', 'red', 'would', 'be', 'gorgeous', '[SEP]']
['[CLS]', 'agreed', '!', 'this', 'will', 'also', 'look', 'good', 'in', 'matt', '##e', 'black', '[SEP]']
['[CLS]', 'i', 'disagree', 'man', ',', 'white', 'and', 'only', 'white', 'suits', 'this', 'car', '[SEP]']
['[CLS]', 'nice', 'tires', '[SEP]']
['[CLS]', 'they', 'still', 'need', 'to', 'improve', 'the', 'exhaust', 'system', '[SEP]']
['[CLS]', 'you', 'should', '!', 'look', 'at', 'those', 'wheels', '!', '[SEP]']
['[CLS]', 'i', 'prefer', 'the', 'dodge', 'to', 'be', 'honest', '[SEP]']
['[CLS]', 'shut', 'up', 'man', ',', 'the', 'dodge', 'sucks', 'compared', 'to', 'this', 'beauty', '[SEP]']
[0.09864027053117752, -0.24756282567977905, -0.011182652786374092, -0.0951838493347168, 0.5318976640701294, 0.2410862147808075, 0.2611576318740845, 0.17167708277702332, 0.16763629019260406, -0.28119170665740967, 0.40656664967536926, -0.21375367045402527, 0.07283541560173035, 0.6034670472145081, 0.04117737710475921

USE VISUALIZATION
[-0.48722478 -0.12172049]
[-0.36339001 -0.2685214 ]
[-0.37675397 -0.15500826]
[ 0.63911743 -0.15484253]
[ 0.11486738 -0.37227541]
[ 0.4930136  -0.12868588]
[-0.05783772  0.5831011 ]
[0.03820806 0.61795286]


# Multi-Word Visualization

In [None]:
 def multi_word_visualization(list_of_inputs,dimensionality_reduction, sum=False, concat=True):
     """Plot the contextual BERT vector of every word of several inputs in one scatter plot.

     Each input is tokenized with 'bert-base-uncased' and every token is
     represented by a combination of its last four hidden layers. All token
     vectors of all inputs are reduced to 2-D with a single, jointly fitted
     projection so that points from different inputs share one coordinate
     system. The [CLS] and [SEP] tokens take part in the fit but are not
     plotted or returned.

     Args:
         list_of_inputs: list of strings (words or short sentences).
         dimensionality_reduction: 'pca' selects PCA; any other value selects TSNE.
         sum: if True (and ``concat`` is False), represent each token by the
             element-wise sum of its last four layers.
         concat: if True, represent each token by the concatenation of its
             last four layers. Takes precedence over ``sum``.

     Returns:
         dict mapping "<token> [sentence #i] [word #j]" to the full
         (un-reduced) token vector as a list of floats. Empty when both
         ``sum`` and ``concat`` are False.
     """
     # Load pre-trained model tokenizer (vocabulary)
     tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

     # Load pre-trained model (weights)
     model = BertModel.from_pretrained('bert-base-uncased')

     # Put the model in "evaluation" mode, meaning feed-forward operation.
     # This does not depend on the input, so it is done once.
     model.eval()

     all_vectors = []    # every token vector of every input, in order
     plotted_rows = []   # (label, row in `all_vectors`) for the non-special tokens
     dictionary = {}
     for input_count, text in enumerate(list_of_inputs):

         # Add the special tokens.
         marked_text = "[CLS] " + text + " [SEP]"

         # Split the sentence into tokens.
         tokenized_text = tokenizer.tokenize(marked_text)
         print("Tokenized",tokenized_text)

         # Map the token strings to their vocabulary indices.
         indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)

         # Mark each of the tokens as belonging to sentence "1".
         segments_ids = [1] * len(tokenized_text)

         # Convert inputs to PyTorch tensors
         tokens_tensor = torch.tensor([indexed_tokens])
         segments_tensors = torch.tensor([segments_ids])

         # Predict hidden states features for each layer
         with torch.no_grad():
             encoded_layers, _ = model(tokens_tensor, segments_tensors)

         # Stack the per-layer tensors along a new leading dimension, drop
         # the batch dimension, then put the token dimension first so that
         # iterating yields one [layers x hidden] tensor per token.
         token_embeddings = torch.stack(encoded_layers, dim=0)
         token_embeddings = torch.squeeze(token_embeddings, dim=1)
         token_embeddings = token_embeddings.permute(1,0,2)

         if concat:
             # Append the last four layer vectors of each token together.
             token_vectors = [
                 torch.cat((token[-1], token[-2], token[-3], token[-4]), dim=0).tolist()
                 for token in token_embeddings
             ]
         elif sum:
             # Add the last four layer vectors of each token element-wise.
             token_vectors = [
                 torch.sum(token[-4:], dim=0).tolist()
                 for token in token_embeddings
             ]
         else:
             # No representation requested: nothing to collect for this input.
             continue

         last_position = len(tokenized_text) - 1
         for count, token in enumerate(tokenized_text):
             row = len(all_vectors)
             all_vectors.append(token_vectors[count])
             # Skip the [CLS] (first) and [SEP] (last) tokens in the output.
             if count != 0 and count != last_position:
                 k =  token + " [sentence #" + str(input_count) + "] [word #" + str(count) + "]"
                 dictionary[k] = token_vectors[count]
                 plotted_rows.append((k, row))

     dictionary_reduced = {}
     if all_vectors:
         # Fit ONE projection over the tokens of all inputs. Fitting per input
         # would put every input in its own coordinate system, making the
         # positions in the shared plot incomparable.
         if dimensionality_reduction=='pca':
             result = PCA(n_components=2).fit_transform(all_vectors)
         else:
             tsne = TSNE(n_components=2, verbose=1, perplexity=40, n_iter=300)
             result = tsne.fit_transform(all_vectors)

         for k, row in plotted_rows:
             dictionary_reduced[k] = (result[row][0],result[row][1])

     print(dictionary_reduced)

     # Repackage the labelled coordinates into parallel lists for plotting.
     data = {"x":[], "y":[], "label":[]}
     for label, coord in dictionary_reduced.items():
         data["x"].append(coord[0])
         data["y"].append(coord[1])
         data["label"].append(label)

     fig = go.Figure(data=go.Scatter(x=data['x'],
                                     y=data['y'],
                                     mode='markers',
                                     text=data['label'])) # hover text goes here

     fig.update_traces(marker=dict(size=20,
                                 line=dict(width=2,
                                             color='DarkSlateGrey')),
                     selector=dict(mode='markers'))
     fig.show()

     return dictionary

In [None]:
multi_dict = multi_word_visualization(["bank robber","bank vault","river bank"],dimensionality_reduction='pca')

Tokenized ['[CLS]', 'bank', 'robber', '[SEP]']
Tokenized ['[CLS]', 'bank', 'vault', '[SEP]']
Tokenized ['[CLS]', 'river', 'bank', '[SEP]']
{'bank [sentence #0] [word #1]': (-17.18734933203432, -7.323111980521776), 'robber [sentence #0] [word #2]': (-6.602119659802937, -10.104363724016), 'bank [sentence #1] [word #1]': (-8.631383569285937, 20.64388918080782), 'vault [sentence #1] [word #2]': (-6.135055805600675, -0.23236043482126578), 'river [sentence #2] [word #1]': (-1.2816710963283529, -16.91785143389458), 'bank [sentence #2] [word #2]': (-8.47407342719562, -7.8684955518815185)}


In [None]:
def visualize_similarity(embeddings_1, embeddings_2, labels_1, labels_2,
                         plot_title,
                         plot_width=1200, plot_height=600,
                         xaxis_font_size='12pt', yaxis_font_size='12pt'):
  """Show a bokeh heat map of the angular similarity between two embedding sets.

  The similarity of a pair is ``1 - arccos(cosine_similarity) / pi``.

  Args:
    embeddings_1: sequence of vectors shown along the x axis.
    embeddings_2: sequence of vectors shown along the y axis.
    labels_1: one label per vector in ``embeddings_1``.
    labels_2: one label per vector in ``embeddings_2``.
    plot_title: title of the figure.
    plot_width: figure width passed to bokeh.
    plot_height: figure height passed to bokeh.
    xaxis_font_size: font size of the x axis labels.
    yaxis_font_size: font size of the y axis labels.

  Raises:
    AssertionError: if a label list and its embedding list differ in length.
  """
  assert len(embeddings_1) == len(labels_1)
  assert len(embeddings_2) == len(labels_2)

  cosine = sklearn.metrics.pairwise.cosine_similarity(embeddings_1,
                                                      embeddings_2)
  # Floating-point round-off can push the cosine of (near-)identical vectors
  # marginally outside [-1, 1], where arccos returns NaN ("invalid value
  # encountered in arccos"). Clip back into the valid domain first.
  sim = 1 - np.arccos(np.clip(cosine, -1.0, 1.0))/np.pi

  # One row per (label_1, label_2) pair, in row-major order of `sim`.
  records = [(labels_1[i], labels_2[j], sim[i][j])
             for i in range(len(embeddings_1))
             for j in range(len(embeddings_2))]
  df = pd.DataFrame(records,
                    columns=['embeddings_1', 'embeddings_2', 'sim'])

  mapper = bokeh.models.LinearColorMapper(
      palette=[*reversed(bokeh.palettes.YlOrRd[9])], low=df.sim.min(),
      high=df.sim.max())

  p = bokeh.plotting.figure(title=plot_title, x_range=labels_1,
                            x_axis_location="above",
                            y_range=[*reversed(labels_2)],
                            plot_width=plot_width, plot_height=plot_height,
                            tools="save",toolbar_location='below', tooltips=[
                                ('pair', '@embeddings_1 ||| @embeddings_2'),
                                ('sim', '@sim')])
  p.rect(x="embeddings_1", y="embeddings_2", width=1, height=1, source=df,
         fill_color={'field': 'sim', 'transform': mapper}, line_color=None)

  p.title.text_font_size = '12pt'
  p.axis.axis_line_color = None
  p.axis.major_tick_line_color = None
  p.axis.major_label_standoff = 16
  p.xaxis.major_label_text_font_size = xaxis_font_size
  p.xaxis.major_label_orientation = 0.25 * np.pi
  p.yaxis.major_label_text_font_size = yaxis_font_size
  p.min_border_right = 300

  bokeh.io.output_notebook()
  bokeh.io.show(p)


In [None]:
# Compare every token vector stored in `multi_dict` against every other one.
labels, embedding = list(multi_dict.keys()), list(multi_dict.values())
visualize_similarity(
    embedding, embedding,
    labels, labels,
    "Vector Similarity",
)



invalid value encountered in arccos

