<a href="https://colab.research.google.com/github/Alexjmsherman/test_repo/blob/master/pytorch_transformers.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
!pip install spacy-transformers

In [0]:
!python -m spacy download en_core_web_lg

In [0]:
!python -m spacy download en_trf_distilbertbaseuncased_lg

In [0]:
!python -m spacy download en_trf_robertabase_lg

In [0]:
!python -m spacy download en_trf_xlnetbasecased_lg

In [0]:
!python -m spacy download en_trf_bertbaseuncased_lg

In [0]:
from itertools import combinations
import os
import random

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics.pairwise import cosine_similarity
import spacy
from spacy.util import minibatch
import torch

In [0]:
is_using_gpu = spacy.prefer_gpu()
print(f'is_using_gpu: {is_using_gpu}')
if is_using_gpu:
    torch.set_default_tensor_type("torch.cuda.FloatTensor")

nlp = spacy.load('en_trf_distilbertbaseuncased_lg')

In [0]:
# download NIH Project Data
!wget https://exporter.nih.gov/CSVs/final/RePORTER_PRJ_C_FY2017.zip

In [0]:
REPORTER_DATA_PATH = r'RePORTER_PRJ_C_FY2017.zip'

# view the data
df = pd.read_csv(
    REPORTER_DATA_PATH,
    encoding='latin-1'  # common encoding to handle messy data
)

# filter to relevant columns
df = df[['ADMINISTERING_IC', 'FY',  'IC_NAME', 'PROJECT_TITLE']]

# convert IC counts to a dataframe
top_ic = df['IC_NAME'].value_counts().reset_index()

# filter to top ICs
top_ic_names = top_ic[top_ic.IC_NAME > 1500]['index'][0:2]

# view new data subset
df = df[df['IC_NAME'].isin(top_ic_names)]

# set the labels as a new column
df['IC_NUM'] = df['ADMINISTERING_IC'].factorize()[0]

# create a map of IC nums to names for later reference
ic_name_map = {num:name for num, name in df[['IC_NUM','IC_NAME']].drop_duplicates().values}

## Pytorch Transformers

In [0]:
#title = df['PROJECT_TITLE'].values[0]

#doc = nlp(title)
doc = nlp("Apple shares rose on the news news")

In [0]:
print(dir(doc._))

['get', 'has', 'set', 'trf_alignment', 'trf_all_attentions', 'trf_all_hidden_states', 'trf_d_all_attentions', 'trf_d_all_hidden_states', 'trf_d_last_hidden_state', 'trf_d_pooler_output', 'trf_end', 'trf_last_hidden_state', 'trf_pooler_output', 'trf_segments', 'trf_separator', 'trf_start', 'trf_word_pieces', 'trf_word_pieces_']


In [0]:
doc._.trf_alignment

[[1], [2], [3], [4], [5], [6], [7]]

In [0]:
doc._.trf_all_attentions

In [0]:
doc._.trf_all_hidden_states

In [0]:
doc._.trf_d_all_attentions

[]

In [0]:
doc._.trf_d_all_hidden_states

[]

In [0]:
doc._.trf_d_last_hidden_state

array([], shape=(0, 0), dtype=float32)

In [0]:
doc._.trf_d_pooler_output

array([], shape=(0, 0), dtype=float32)

In [0]:
doc._.trf_end

8

In [0]:
print(f'len of doc._.trf_last_hidden_state: {len(doc._.trf_last_hidden_state)}\n\n')
doc._.trf_last_hidden_state

len of doc._.trf_last_hidden_state: 9




array([[ 0.10535385,  0.06188994,  0.21569681, ..., -0.27822948,
         0.4762323 ,  0.08384846],
       [ 0.25369814, -0.14087969,  0.02343122, ..., -0.7123963 ,
        -0.11077517, -0.36199537],
       [ 1.0855564 , -0.07643908,  0.34801832, ..., -0.2726254 ,
         0.43775403,  0.36212504],
       ...,
       [ 0.2143961 , -0.1587854 ,  0.33104733, ...,  0.23896854,
         0.7185059 , -0.76811516],
       [ 0.34332028, -0.36023057,  0.05682122, ...,  0.40013903,
         0.4216599 , -0.381824  ],
       [ 0.84120077,  0.02385291, -0.19828965, ...,  0.34951913,
        -0.20228928, -0.2891109 ]], dtype=float32)

In [0]:
doc._.trf_pooler_output


array([[-8.83356214e-01, -3.59564573e-01,  1.44565597e-01,
         6.38517141e-01, -4.72972468e-02, -2.11414471e-01,
         9.13932621e-01,  2.93855518e-01, -1.75617531e-01,
        -9.99945998e-01,  6.96694255e-02,  5.73827565e-01,
         9.79522884e-01, -1.85503095e-01,  9.24535215e-01,
        -6.14944518e-01, -4.12149608e-01, -4.69880491e-01,
         4.00395721e-01, -7.36149192e-01,  7.68896401e-01,
         9.90317643e-01,  4.05995697e-01,  3.44575346e-01,
         4.97720212e-01,  7.17405498e-01, -6.62333548e-01,
         9.21310723e-01,  9.49000776e-01,  7.52894580e-01,
        -7.45440662e-01,  2.79127717e-01, -9.82349157e-01,
        -2.52880275e-01,  3.66553129e-03, -9.84831274e-01,
         2.35897869e-01, -7.80724645e-01, -7.11967275e-02,
        -7.58032203e-02, -8.94781113e-01,  3.19197297e-01,
         9.99478877e-01, -2.16222748e-01,  2.10857525e-01,
        -3.80970776e-01, -9.99996006e-01,  2.57858068e-01,
        -8.61357152e-01,  7.01683462e-02,  1.92019511e-0

In [0]:
for i in doc._.trf_segments:
  print(i, '\n')

Apple shares rose on the news news 



In [0]:
doc._.trf_separator

In [0]:
doc._.trf_start

0

In [0]:
doc._.trf_word_pieces

[101, 6207, 6661, 3123, 2006, 1996, 2739, 2739, 102]

In [0]:
doc._.trf_word_pieces_

['[CLS]', 'apple', 'shares', 'rose', 'on', 'the', 'news', 'news', '[SEP]']

In [0]:


data = []

word_emb = list(zip(doc._.trf_word_pieces_, doc._.trf_last_hidden_state))


for state in combinations(word_emb, 2):
  
  word1 = np.array(state[0][0])
  word2 = np.array(state[1][0])
  embedding1 = np.array(state[0][1]).reshape(1, -1) 
  embedding2 = np.array(state[1][1]).reshape(1, -1) 
  
  score = cosine_similarity(embedding1, embedding2)
  
  data.append([word1.item(), word2.item(), score.item()])
  
sorted(data, key=lambda x: x[2])

In [0]:
def get_wordpiece(nlp, model_name): 
  print(f'\n\nmodel name: {model_name}')
  doc = nlp("An apple is a fruit")
  print(f'len of doc._.trf_last_hidden_state: {len(doc._.trf_last_hidden_state)}')
  print(f'doc._.trf_alignment: {doc._.trf_alignment}')
  print(f'doc._.trf_word_pieces_: {doc._.trf_word_pieces_}')
  
  
nlp = spacy.load('en_trf_distilbertbaseuncased_lg')
get_wordpiece(nlp, 'en_trf_distilbertbaseuncased_lg')

nlp = spacy.load('en_trf_robertabase_lg')
get_wordpiece(nlp, 'en_trf_robertabase_lg')

nlp = spacy.load('en_trf_xlnetbasecased_lg')
get_wordpiece(nlp, 'en_trf_xlnetbasecased_lg')

nlp = spacy.load('en_trf_bertbaseuncased_lg')
get_wordpiece(nlp, 'en_trf_bertbaseuncased_lg')



model name: en_trf_distilbertbaseuncased_lg
len of doc._.trf_last_hidden_state: 7
doc._.trf_alignment: [[1], [2], [3], [4], [5]]
doc._.trf_word_pieces_: ['[CLS]', 'an', 'apple', 'is', 'a', 'fruit', '[SEP]']


model name: en_trf_robertabase_lg
len of doc._.trf_last_hidden_state: 7
doc._.trf_alignment: [[1], [2], [3], [4], [5]]
doc._.trf_word_pieces_: ['<s>', 'An', 'Ġapple', 'Ġis', 'Ġa', 'Ġfruit', '</s>']


model name: en_trf_xlnetbasecased_lg
len of doc._.trf_last_hidden_state: 7
doc._.trf_alignment: [[0], [1], [2], [3], [4]]
doc._.trf_word_pieces_: ['▁An', '▁apple', '▁is', '▁a', '▁fruit', '</s>', '<cls>']


model name: en_trf_bertbaseuncased_lg
len of doc._.trf_last_hidden_state: 7
doc._.trf_alignment: [[1], [2], [3], [4], [5]]
doc._.trf_word_pieces_: ['[CLS]', 'an', 'apple', 'is', 'a', 'fruit', '[SEP]']


['sentencizer', 'trf_wordpiecer', 'trf_tok2vec']


In [0]:
def eval_model(nlp, model_name):
  
  print(f'\n\nmodel name: {model_name}')
  apple_fruit1 = nlp("An apple is a fruit")
  apple_fruit2 = nlp("I ate the tasty apple from the tree")
  apple_company = nlp("phones made by Apple have large screens and cell service")
  
  fruit1 = np.array(apple_fruit1._.trf_last_hidden_state[0]).reshape(1, -1) 
  fruit2 = np.array(apple_fruit2._.trf_last_hidden_state[0]).reshape(1, -1) 
  company = np.array(apple_company._.trf_last_hidden_state[0]).reshape(1, -1) 
  
  print('cls similarity')
  print(f'fruit1, fruit2: {cosine_similarity(fruit1, fruit2)}')
  print(f'fruit1, company: {cosine_similarity(fruit1, company)}')
  print(f'fruit2, company: {cosine_similarity(fruit2, company)}')
        
  print('\ndoc similarity')
  print(f'fruit1, fruit2:  {apple_fruit1.similarity(apple_fruit2)}')
  print(f'fruit1, company: {apple_fruit1.similarity(apple_company)}')
  print(f'fruit2, company: {apple_fruit2.similarity(apple_company)}')

  print('\ntoken similarity')
  print(f'fruit1, fruit2:  {apple_fruit1[1].similarity(apple_fruit2[4])}')
  print(f'fruit1, company: {apple_fruit1[1].similarity(apple_company[3])}')
  print(f'fruit2, company: {apple_fruit2[4].similarity(apple_company[3])}')

#nlp = spacy.load('en_core_web_lg')
#eval_model(nlp, 'en_core_web_lg')

nlp = spacy.load('en_trf_distilbertbaseuncased_lg')
eval_model(nlp, 'en_trf_distilbertbaseuncased_lg')

nlp = spacy.load('en_trf_robertabase_lg')
eval_model(nlp, 'en_trf_robertabase_lg')

nlp = spacy.load('en_trf_xlnetbasecased_lg')
eval_model(nlp, 'en_trf_xlnetbasecased_lg')

nlp = spacy.load('en_trf_bertbaseuncased_lg')
eval_model(nlp, 'en_trf_bertbaseuncased_lg')



model name: en_trf_distilbertbaseuncased_lg
cls similarity
fruit1, fruit2: [[0.95431364]]
fruit1, company: [[0.9543853]]
fruit2, company: [[0.9451714]]

doc similarity
fruit1, fruit2:  0.7214712608055848
fruit1, company: 0.6150731677921323
fruit2, company: 0.6333489467341019

token similarity
fruit1, fruit2:  0.830289363861084
fruit1, company: 0.6710572242736816
fruit2, company: 0.6262279748916626


model name: en_trf_robertabase_lg
cls similarity
fruit1, fruit2: [[0.9982006]]
fruit1, company: [[0.997522]]
fruit2, company: [[0.99718994]]

doc similarity
fruit1, fruit2:  0.9667617999997291
fruit1, company: 0.9471189378034655
fruit2, company: 0.9599576470452009

token similarity
fruit1, fruit2:  0.9708057641983032
fruit1, company: 0.9294208288192749
fruit2, company: 0.9238879084587097


model name: en_trf_xlnetbasecased_lg
cls similarity
fruit1, fruit2: [[0.86462605]]
fruit1, company: [[0.869514]]
fruit2, company: [[0.8624132]]

doc similarity
fruit1, fruit2:  0.9847751440301509
fruit1