# Extract Embeddings from Input

In [4]:
import os
import json
import numpy as np
import pandas as pd
import tensorflow as tf
from numpy.linalg import norm
from invoke import run, exceptions

### Extract Embeddings

In [36]:
# Google Cloud Storage bucket and the folder inside it that holds the model files.
bucket_name = 'ekaba-assets'
model_dir = 'biomedbert_large_bert_weights_and_vocab'
# File names (relative to `model_dir`) of the BERT config and the vocabulary.
config = 'large_bert_config.json'
vocab = 'vocab.txt'

In [37]:
# All model artefacts live under one gs:// prefix; build it once and reuse it.
_gcs_model_root = 'gs://{}/{}'.format(bucket_name, model_dir)

# Most recent checkpoint found under the model folder.
checkpoint = tf.train.latest_checkpoint(_gcs_model_root)
# Full gs:// locations of the vocabulary and the BERT config file.
voc_fname = '{}/{}'.format(_gcs_model_root, vocab)
config_fname = '{}/{}'.format(_gcs_model_root, config)

In [38]:
# Load the question/answer table (columns `question` and `answer`, see the preview below).
# NOTE: unpickling can execute arbitrary code, so only load pickles from a trusted source.
data = pd.read_pickle('biosaq_format.pkl')

In [39]:
data.head()

Unnamed: 0,question,answer
0,inheritance pattern li fraumeni syndrome,balanced q q tp breast cancer patient li fraum...
1,inheritance pattern li fraumeni syndrome,genetic modeling li fraumeni syndrome zebrafis...
2,type lung cancer afatinib used,clinical perspective afatinib non small cell l...
3,hormone abnormalities characteristic pendred s...,doca sensitive pendrin expression kidney heart...
4,hormone abnormalities characteristic pendred s...,clinical molecular characteristics pendred syn...


In [40]:
def series_to_file(series, fname):
    """Write a pandas Series to ``<fname>.txt``, one value per line.

    Args:
        series: Series whose values are written; the index is not written.
        fname: Output path without the ``.txt`` suffix, which is appended here.

    Notes:
        ``header=False`` is passed explicitly. The default of
        ``Series.to_csv(header=...)`` changed from False to True in pandas 1.0;
        relying on it would make the series name the first line of the file,
        and that line would then be embedded as if it were a real sentence.
        Values are still written with CSV rules, so a value containing a
        comma or a quote character will be quoted in the output.
    """
    series.to_csv('{}.txt'.format(fname), index=False, header=False)

In [41]:
# Dump each text column to its own file: question.txt and answer.txt.
for column in ('question', 'answer'):
    series_to_file(data[column], column)

  This is separate from the ipykernel package so we can avoid doing imports until


In [42]:
# Show the resolved checkpoint, vocabulary and config locations, one per line.
print(checkpoint, voc_fname, config_fname, sep='\n')

gs://ekaba-assets/biomedbert_large_bert_weights_and_vocab/model.ckpt-1000000
gs://ekaba-assets/biomedbert_large_bert_weights_and_vocab/vocab.txt
gs://ekaba-assets/biomedbert_large_bert_weights_and_vocab/large_bert_config.json


In [43]:
# Text file fed to the feature extractor (written above, one text per line).
file_name = os.path.relpath('answer.txt')
# Derive the output name from the input's base name without its extension.
# splitext/basename strip only the final extension and any directory part, so
# names such as 'my.answers.txt' or '../data/answer.txt' still give a usable stem.
# NOTE: despite its name, `output_dir` is the path of a single JSONL file.
output_dir = 'output_{}.jsonl'.format(os.path.splitext(os.path.basename(file_name))[0])
print(file_name)
print(output_dir)

answer.txt
output_answer.jsonl


In [None]:
!python3 bert/extract_features.py \
--input_file={file_name} \
--output_file={output_dir} \
--vocab_file={voc_fname} \
--bert_config_file={config_fname} \
--init_checkpoint={checkpoint} \
--layers=-1 \ #,-2,-3,-4 \
--max_seq_length=128 \
--batch_size=8

### Format output JSONL

In [45]:
# Read the whole features file into memory as a single string (one JSON record per line).
# The encoding is given explicitly so decoding does not depend on the platform's locale default.
# NOTE: this rebinds `data`, which until now held the question/answer DataFrame.
with open(output_dir, encoding='utf-8') as f:
    data = f.read()

In [46]:
def _first_token_embedding_and_text(record):
    """Return ``(embedding, text)`` for one parsed feature record.

    ``embedding`` is the value vector of the record's first token, taken from
    the first layer listed for that token. ``text`` is the record's tokens
    joined by spaces with the first and last token removed (presumably the
    [CLS] / [SEP] markers -- TODO confirm against the extractor output).
    """
    features = record['features']
    embedding = np.array(features[0]['layers'][0]['values'])
    tokens = [feature['token'] for feature in features]
    return embedding, ' '.join(tokens[1:-1])


def get_sent_embed(output_jsonl):
    """Turn the JSONL text of extracted features into two DataFrames.

    Each non-blank line of ``output_jsonl`` is parsed as one JSON record. For
    every record the first token's vector is used as the embedding of the
    whole line.

    Args:
        output_jsonl: Full content of the features file as one string, with
            one JSON record per line. Blank and whitespace-only lines are
            ignored.

    Returns:
        Tuple ``(df_query, df_ans)``, both with columns ``documents`` (the
        space-joined tokens) and ``embedding`` (a numpy array):

        * ``df_query`` holds exactly one row, built from the FIRST record.
        * ``df_ans`` holds one row for every remaining record, in file order.
          The first record is therefore not part of ``df_ans``.

    Raises:
        IndexError: If ``output_jsonl`` contains no records.
    """
    # Split on '\n' only (a record never contains a raw newline) and drop
    # lines that are empty or whitespace-only, e.g. a trailing '\r'.
    json_lines = [line for line in output_jsonl.split('\n') if line.strip()]
    if not json_lines:
        raise IndexError('output_jsonl contains no JSON records')

    # ---- first record: returned on its own as the "query" ----
    query_embed, query_text = _first_token_embedding_and_text(json.loads(json_lines[0]))
    df_query = pd.DataFrame({'documents': [query_text], 'embedding': [query_embed]})

    # ---- remaining records: the "answers" ----
    sent_embed = []
    tokens = []
    # Records are parsed one at a time so only one decoded record is alive at once.
    for raw_line in json_lines[1:]:
        feat_embed, text = _first_token_embedding_and_text(json.loads(raw_line))

        # Sanity check: warn on an all-zero vector. `.any()` is used instead of
        # `sum() == 0` so that non-zero vectors whose entries cancel out do not
        # trigger a false alarm.
        if not feat_embed.any():
            print('Check_model')

        sent_embed.append(feat_embed)
        tokens.append(text)

    df_ans = pd.DataFrame({'documents': tokens, 'embedding': sent_embed})

    return df_query, df_ans

In [47]:
%%time
df_query, df_ans = get_sent_embed(data)

CPU times: user 1min 13s, sys: 1.66 s, total: 1min 15s
Wall time: 1min 15s


In [48]:
df_query

Unnamed: 0,documents,embedding
0,balanced q q t ##p breast cancer patient l ##i...,"[0.506378, 0.714099, -0.343656, -1.102875, -1...."


In [49]:
df_ans.head()

Unnamed: 0,documents,embedding
0,genetic modeling l ##i f ##ra ##ume ##ni syndr...,"[-0.476865, 0.976585, 0.59813, -0.45718, -0.93..."
1,clinical perspective a ##fa ##tin ##ib non sma...,"[0.032693, 1.321281, 0.742127, -1.232175, -0.8..."
2,do ##ca sensitive pen ##dr ##in expression kid...,"[-0.034027, 1.207761, 1.166138, -0.48594, -1.0..."
3,clinical molecular characteristics pen ##dre #...,"[-0.59723, 1.333176, 0.339485, -0.107071, -0.8..."
4,pen ##dre ##d syndrome t ##uni ##sia objective...,"[-0.020153, -0.491783, -0.226507, -0.491128, -..."


In [50]:
len(df_ans.embedding[0])

1024

In [51]:
# get_sent_embed() returns the first line of its input separately as `df_query`.
# answer.txt has no leading query line (the `df_query` output above is the first
# answer), so that row is put back in front before saving; saving `df_ans` alone
# would silently drop the first answer and shift every row by one.
pd.concat([df_query, df_ans], ignore_index=True).to_pickle('answer.pkl')