# Extract Embeddings from Input

In [29]:
import os
import json
import numpy as np
import pandas as pd
import tensorflow as tf
from numpy.linalg import norm
from invoke import run, exceptions

### Extract Embeddings

In [2]:
# Cloud Storage bucket and directory that hold the pretrained model files.
bucket_name = "ekaba-assets"
model_dir = "biomedbert_base_bert_weights_and_vocab"

# File names looked up inside ``model_dir``.
vocab = "vocab.txt"
config = "bert_config.json"

In [22]:
# Most recent checkpoint under gs://<bucket_name>/<model_dir>.
# NOTE(review): per the TensorFlow docs latest_checkpoint returns None when no
# checkpoint is found -- confirm the printed value before running extraction.
checkpoint = tf.train.latest_checkpoint('gs://{}/{}'.format(bucket_name, model_dir))

# Vocabulary and model-config files stored next to the checkpoint.
voc_fname = 'gs://{}/{}/{}'.format(bucket_name, model_dir, vocab)
config_fname = 'gs://{}/{}/{}'.format(bucket_name, model_dir, config)

# Input text file, expressed relative to the current working directory.
file_name = os.path.relpath('input_answers.txt')

# Despite its name, ``output_dir`` is the path of the output JSONL *file*.
# Derive it from the input's base name with only the final extension removed,
# so directory components or extra dots in ``file_name`` cannot truncate or
# misplace the output (``split('.')[0]`` cut at the first dot in the path).
input_stem = os.path.splitext(os.path.basename(file_name))[0]
output_dir = 'output_{}.jsonl'.format(input_stem)

In [23]:
# Echo every resolved path, one per line, so they can be checked by eye
# before the extraction script is launched.
for resolved in (checkpoint, voc_fname, config_fname, file_name, output_dir):
    print(resolved)

gs://ekaba-assets/biomedbert_base_bert_weights_and_vocab/model.ckpt-717000
gs://ekaba-assets/biomedbert_base_bert_weights_and_vocab/vocab.txt
gs://ekaba-assets/biomedbert_base_bert_weights_and_vocab/bert_config.json
input_answers.txt
output_input_answers.jsonl


In [None]:
# Run BERT's feature extraction on the input file and write one JSON record
# per input line to {output_dir}. Only layer -1 is requested here; to request
# more layers pass e.g. --layers=-1,-2,-3,-4.
# The note about extra layers used to sit inside the command as
# `\ #,-2,-3,-4 \`; the backslash escaped the space, so the text was handed to
# the script as a stray argument instead of being ignored as a comment.
!python3 bert/extract_features.py \
--input_file={file_name} \
--output_file={output_dir} \
--vocab_file={voc_fname} \
--bert_config_file={config_fname} \
--init_checkpoint={checkpoint} \
--layers=-1 \
--max_seq_length=128 \
--batch_size=8

### Format output JSONL

In [15]:
# Load the whole JSONL output into memory as a single string.
# Decode explicitly as UTF-8 rather than relying on the platform's default
# encoding, which differs between systems.
with open(output_dir, encoding='utf-8') as f:
    data = f.read()

In [30]:
def get_sent_embed(output_jsonl):
    """Split extracted-feature JSONL text into a query frame and an answers frame.

    The first non-blank line of ``output_jsonl`` is treated as the query; every
    following non-blank line is treated as an answer. For each line the
    embedding is taken from ``features[0]['layers'][0]['values']`` and the
    document text is the line's tokens joined by spaces with the first and
    last token dropped (presumably the ``[CLS]`` / ``[SEP]`` markers -- confirm
    against the extraction script).

    Args:
        output_jsonl: Text containing one JSON record per line, each with a
            ``features`` list whose items carry ``token`` and ``layers`` keys.

    Returns:
        A ``(df_query, df_ans)`` tuple of DataFrames, each with the columns
        ``documents`` (str) and ``embedding`` (numpy array). ``df_query`` has
        exactly one row; ``df_ans`` has one row per answer line.

    Raises:
        IndexError: If ``output_jsonl`` contains no non-blank line, or a record
            has an empty ``features`` / ``layers`` list.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # Records are newline-separated. Skip lines that are empty or contain only
    # whitespace (e.g. a stray '\r' or trailing spaces), which json.loads
    # would otherwise reject.
    json_lines = [line for line in output_jsonl.split('\n') if line.strip()]
    if not json_lines:
        raise IndexError('output_jsonl contains no JSON lines')

    # --------------------------------- query ---------------------------------
    query_doc, query_embed = _parse_feature_line(json_lines[0])
    df_query = _build_frame([query_doc], [query_embed])

    # -------------------------------- answers --------------------------------
    documents = []
    embeddings = []
    for json_line in json_lines[1:]:
        answer_doc, answer_embed = _parse_feature_line(json_line)

        # Sanity check: an embedding that sums to exactly zero is flagged so
        # the model output can be inspected.
        if answer_embed.sum() == 0:
            print('Check_model')

        documents.append(answer_doc)
        embeddings.append(answer_embed)

    df_ans = _build_frame(documents, embeddings)

    return df_query, df_ans


def _parse_feature_line(json_line):
    """Decode one JSONL record into a ``(document, embedding)`` pair."""
    features = json.loads(json_line)['features']

    # Embedding of the first feature, first listed layer.
    embedding = np.array(features[0]['layers'][0]['values'])

    # Re-assemble the text from the tokens, dropping the first and last one.
    pieces = [feature['token'] for feature in features]
    return ' '.join(pieces[1:-1]), embedding


def _build_frame(documents, embeddings):
    """Build a DataFrame with ``documents`` and ``embedding`` columns."""
    frame = pd.DataFrame()
    frame['documents'] = documents
    frame['embedding'] = embeddings
    return frame

In [31]:
# Parse the JSONL text: the first line becomes the query frame, the remaining
# lines become the answers frame.
df_query, df_ans = get_sent_embed(data)

In [None]:
# Show the query DataFrame as the cell's output.
df_query